tesseract-doc/output_8cpp_source.html

00001 /******************************************************************
00002  * File:        output.cpp  (Formerly output.c)
00003  * Description: Output pass
00004  * Author:                                      Phil Cheatle
00005  * Created:                                     Thu Aug  4 10:56:08 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023
00024 #include "mfcpch.h"
00025 #include <string.h>
00026 #include <ctype.h>
00027 #ifdef __UNIX__
00028 #include          <assert.h>
00029 #include          <unistd.h>
00030 #include          <errno.h>
00031 #endif
00032 #include "helpers.h"
00033 #include "tfacep.h"
00034 #include "tessvars.h"
00035 #include "control.h"
00036 #include "secname.h"
00037 #include "reject.h"
00038 #include "docqual.h"
00039 #include "output.h"
00040 #include "bestfirst.h"
00041 #include "globals.h"
00042 #include "tesseractclass.h"
00043
00044 #define EPAPER_EXT      ".ep"
00045 #define PAGE_YSIZE      3508
00046 #define CTRL_INSET      '\024'   //dc4=text inset
00047 #define CTRL_FONT       '\016'   //so=font change
00048 #define CTRL_DEFAULT      '\017' //si=default font
00049 #define CTRL_SHIFT      '\022'   //dc2=x shift
00050 #define CTRL_TAB        '\011'   //tab
00051 #define CTRL_NEWLINE      '\012' //newline
00052 #define CTRL_HARDLINE   '\015'   //cr
00053
00054 /**********************************************************************
00055  * pixels_to_pts
00056  *
00057  * Convert an integer number of pixels to the nearest integer
00058  * number of points.
00059  **********************************************************************/
00060
00061 inT32 pixels_to_pts(               //convert coords
00062                     inT32 pixels,
00063                     inT32 pix_res  //resolution
00064                    ) {
00065   float pts;                     //converted value
00066
00067   pts = pixels * 72.0 / pix_res;
00068   return (inT32) (pts + 0.5);    //round it
00069 }
00070
00071 namespace tesseract {
00072 void Tesseract::output_pass(  //Tess output pass //send to api
00073                             PAGE_RES_IT &page_res_it,
00074                             const TBOX *target_word_box) {
00075   BLOCK_RES *block_of_last_word;
00076   inT16 block_id;
00077   BOOL8 force_eol;               //During output
00078   BLOCK *nextblock;              //block of next word
00079   WERD *nextword;                //next word
00080
00081   page_res_it.restart_page ();
00082   block_of_last_word = NULL;
00083   while (page_res_it.word () != NULL) {
00084     check_debug_pt (page_res_it.word (), 120);
00085
00086         if (target_word_box)
00087         {
00088
00089                 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
00090                 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
00091                 if (!target_word_box->contains(center_pt))
00092                 {
00093                         page_res_it.forward ();
00094                         continue;
00095                 }
00096
00097         }
00098     if (tessedit_write_block_separators &&
00099     block_of_last_word != page_res_it.block ()) {
00100       block_of_last_word = page_res_it.block ();
00101       block_id = block_of_last_word->block->index();
00102     }
00103
00104     force_eol = (tessedit_write_block_separators &&
00105       (page_res_it.block () != page_res_it.next_block ())) ||
00106       (page_res_it.next_word () == NULL);
00107
00108     if (page_res_it.next_word () != NULL)
00109       nextword = page_res_it.next_word ()->word;
00110     else
00111       nextword = NULL;
00112     if (page_res_it.next_block () != NULL)
00113       nextblock = page_res_it.next_block ()->block;
00114     else
00115       nextblock = NULL;
00116                                  //regardless of tilde crunching
00117     write_results(page_res_it,
00118                   determine_newline_type(page_res_it.word()->word,
00119                                          page_res_it.block()->block,
00120                                          nextword, nextblock), force_eol);
00121     page_res_it.forward();
00122   }
00123 }
00124
00125
00126 /*************************************************************************
00127  * write_results()
00128  *
00129  * All recognition and rejection has now been done. Generate the following:
00130  *   .txt file     - giving the final best choices with NO highlighting
00131  *   .raw file     - giving the tesseract top choice output for each word
00132  *   .map file     - showing how the .txt file has been rejected in the .ep file
00133  *   epchoice list - a list of one element per word, containing the text for the
00134  *                   epaper. Reject strings are inserted.
00135  *   inset list    - a list of bounding boxes of reject insets - indexed by the
00136  *                   reject strings in the epchoice text.
00137  *************************************************************************/
00138 void Tesseract::write_results(PAGE_RES_IT &page_res_it,
00139                               char newline_type,  // type of newline
00140                               BOOL8 force_eol) {  // override tilde crunch?
00141   WERD_RES *word = page_res_it.word();
00142   const UNICHARSET &uchset = *word->uch_set;
00143   STRING repetition_code;
00144   const STRING *wordstr;
00145   STRING wordstr_lengths;
00146   int i;
00147   char unrecognised = STRING (unrecognised_char)[0];
00148   char ep_chars[32];             //Only for unlv_tilde_crunch
00149   int ep_chars_index = 0;
00150   char txt_chs[32];              //Only for unlv_tilde_crunch
00151   char map_chs[32];              //Only for unlv_tilde_crunch
00152   int txt_index = 0;
00153   BOOL8 need_reject = FALSE;
00154   UNICHAR_ID space = uchset.unichar_to_id(" ");
00155   if ((word->unlv_crunch_mode != CR_NONE ||
00156        word->best_choice->length() == 0) &&
00157       !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00158     if ((word->unlv_crunch_mode != CR_DELETE) &&
00159         (!stats_.tilde_crunch_written ||
00160          ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00161           (word->word->space () > 0) &&
00162           !word->word->flag (W_FUZZY_NON) &&
00163           !word->word->flag (W_FUZZY_SP)))) {
00164       if (!word->word->flag (W_BOL) &&
00165           (word->word->space () > 0) &&
00166           !word->word->flag (W_FUZZY_NON) &&
00167           !word->word->flag (W_FUZZY_SP)) {
00168         // Write a space to separate from preceeding good text.
00169         txt_chs[txt_index] = ' ';
00170         map_chs[txt_index++] = '1';
00171         ep_chars[ep_chars_index++] = ' ';
00172         stats_.last_char_was_tilde = false;
00173       }
00174       need_reject = TRUE;
00175     }
00176     if ((need_reject && !stats_.last_char_was_tilde) ||
00177         (force_eol && stats_.write_results_empty_block)) {
00178       /* Write a reject char - mark as rejected unless zero_rejection mode */
00179       stats_.last_char_was_tilde = TRUE;
00180       txt_chs[txt_index] = unrecognised;
00181       if (tessedit_zero_rejection || (suspect_level == 0)) {
00182         map_chs[txt_index++] = '1';
00183         ep_chars[ep_chars_index++] = unrecognised;
00184       }
00185       else {
00186         map_chs[txt_index++] = '0';
00187         /*
00188            The ep_choice string is a faked reject to allow newdiff to sync the
00189            .etx with the .txt and .map files.
00190          */
00191         ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
00192                                  //dummy reject
00193         ep_chars[ep_chars_index++] = 1;
00194                                  //dummy reject
00195         ep_chars[ep_chars_index++] = 1;
00196                                  //type
00197         ep_chars[ep_chars_index++] = 2;
00198                                  //dummy reject
00199         ep_chars[ep_chars_index++] = 1;
00200                                  //dummy reject
00201         ep_chars[ep_chars_index++] = 1;
00202       }
00203       stats_.tilde_crunch_written = true;
00204       stats_.last_char_was_newline = false;
00205       stats_.write_results_empty_block = false;
00206     }
00207
00208     if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
00209       /* Add a new line output */
00210       txt_chs[txt_index] = '\n';
00211       map_chs[txt_index++] = '\n';
00212                                  //end line
00213       ep_chars[ep_chars_index++] = newline_type;
00214
00215                                  //Cos of the real newline
00216       stats_.tilde_crunch_written = false;
00217       stats_.last_char_was_newline = true;
00218       stats_.last_char_was_tilde = false;
00219     }
00220     txt_chs[txt_index] = '\0';
00221     map_chs[txt_index] = '\0';
00222     ep_chars[ep_chars_index] = '\0';  // terminate string
00223     word->ep_choice = new WERD_CHOICE(ep_chars, uchset);
00224
00225     if (force_eol)
00226       stats_.write_results_empty_block = true;
00227     return;
00228   }
00229
00230   /* NORMAL PROCESSING of non tilde crunched words */
00231
00232   stats_.tilde_crunch_written = false;
00233   if (newline_type)
00234     stats_.last_char_was_newline = true;
00235   else
00236     stats_.last_char_was_newline = false;
00237   stats_.write_results_empty_block = force_eol;  // about to write a real word
00238
00239   if (unlv_tilde_crunching &&
00240       stats_.last_char_was_tilde &&
00241       (word->word->space() == 0) &&
00242       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
00243       (word->best_choice->unichar_id(0) == space)) {
00244     /* Prevent adjacent tilde across words - we know that adjacent tildes within
00245        words have been removed */
00246     word->best_choice->remove_unichar_id(0);
00247     if (word->best_choice->blob_choices() != NULL) {
00248       BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
00249       if (!blob_choices_it.empty()) delete blob_choices_it.extract();
00250     }
00251     word->reject_map.remove_pos (0);
00252     word->box_word->DeleteBox(0);
00253   }
00254   if (newline_type ||
00255     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00256     stats_.last_char_was_tilde = false;
00257   else {
00258     if (word->reject_map.length () > 0) {
00259       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
00260         stats_.last_char_was_tilde = true;
00261       else
00262         stats_.last_char_was_tilde = false;
00263     }
00264     else if (word->word->space () > 0)
00265       stats_.last_char_was_tilde = false;
00266     /* else it is unchanged as there are no output chars */
00267   }
00268
00269   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
00270
00271   set_unlv_suspects(word);
00272   check_debug_pt (word, 120);
00273   if (tessedit_rejection_debug) {
00274     tprintf ("Dict word: \"%s\": %d\n",
00275              word->best_choice->debug_string().string(),
00276              dict_word(*(word->best_choice)));
00277   }
00278   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
00279     repetition_code = "|^~R";
00280     wordstr_lengths = "\001\001\001\001";
00281     repetition_code += uchset.id_to_unichar(get_rep_char(word));
00282     wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
00283     wordstr = &repetition_code;
00284   } else {
00285     if (tessedit_zero_rejection) {
00286       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00287       for (i = 0; i < word->best_choice->length(); ++i) {
00288         if (word->reject_map[i].rejected())
00289           word->reject_map[i].setrej_minimal_rej_accept();
00290       }
00291     }
00292     if (tessedit_minimal_rejection) {
00293       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00294       for (i = 0; i < word->best_choice->length(); ++i) {
00295         if ((word->best_choice->unichar_id(i) != space) &&
00296             word->reject_map[i].rejected())
00297           word->reject_map[i].setrej_minimal_rej_accept();
00298       }
00299     }
00300   }
00301 }
00302 }  // namespace tesseract
00303
00304 /**********************************************************************
00305  * determine_newline_type
00306  *
00307  * Find whether we have a wrapping or hard newline.
00308  * Return FALSE if not at end of line.
00309  **********************************************************************/
00310
00311 char determine_newline_type(                   //test line ends
00312                             WERD *word,        //word to do
00313                             BLOCK *block,      //current block
00314                             WERD *next_word,   //next word
00315                             BLOCK *next_block  //block of next word
00316                            ) {
00317   inT16 end_gap;                 //to right edge
00318   inT16 width;                   //of next word
00319   TBOX word_box;                  //bounding
00320   TBOX next_box;                  //next word
00321   TBOX block_box;                 //block bounding
00322
00323   if (!word->flag (W_EOL))
00324     return FALSE;                //not end of line
00325   if (next_word == NULL || next_block == NULL || block != next_block)
00326     return CTRL_NEWLINE;
00327   if (next_word->space () > 0)
00328     return CTRL_HARDLINE;        //it is tabbed
00329   word_box = word->bounding_box ();
00330   next_box = next_word->bounding_box ();
00331   block_box = block->bounding_box ();
00332                                  //gap to eol
00333   end_gap = block_box.right () - word_box.right ();
00334   end_gap -= (inT32) block->space ();
00335   width = next_box.right () - next_box.left ();
00336   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
00337   //              block_box.right(),word_box.right(),end_gap,
00338   //              next_box.right(),next_box.left(),width,
00339   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
00340   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00341 }
00342
00343 /*************************************************************************
00344  * get_rep_char()
00345  * Return the first accepted character from the repetition string. This is the
00346  * character which is repeated - as determined earlier by fix_rep_char()
00347  *************************************************************************/
00348 namespace tesseract {
00349 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
00350   int i;
00351   for (i = 0; ((i < word->reject_map.length()) &&
00352                (word->reject_map[i].rejected())); ++i);
00353
00354   if (i < word->reject_map.length()) {
00355     return word->best_choice->unichar_id(i);
00356   } else {
00357     return word->uch_set->unichar_to_id(unrecognised_char.string());
00358   }
00359 }
00360
00361 /*************************************************************************
00362  * SUSPECT LEVELS
00363  *
00364  * 0 - dont reject ANYTHING
00365  * 1,2 - partial rejection
00366  * 3 - BEST
00367  *
00368  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
00369  * tessedit_minimal_rejection.
00370  *************************************************************************/
00371 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
00372   int len = word_res->reject_map.length();
00373   const WERD_CHOICE &word = *(word_res->best_choice);
00374   const UNICHARSET &uchset = *word.unicharset();
00375   int i;
00376   float rating_per_ch;
00377
00378   if (suspect_level == 0) {
00379     for (i = 0; i < len; i++) {
00380       if (word_res->reject_map[i].rejected())
00381         word_res->reject_map[i].setrej_minimal_rej_accept();
00382     }
00383     return;
00384   }
00385
00386   if (suspect_level >= 3)
00387     return;                      //Use defaults
00388
00389   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
00390
00391   if (safe_dict_word(word_res) &&
00392       (count_alphas(word) > suspect_short_words)) {
00393     /* Unreject alphas in dictionary words */
00394     for (i = 0; i < len; ++i) {
00395       if (word_res->reject_map[i].rejected() &&
00396           uchset.get_isalpha(word.unichar_id(i)))
00397         word_res->reject_map[i].setrej_minimal_rej_accept();
00398     }
00399   }
00400
00401   rating_per_ch = word.rating() / word_res->reject_map.length();
00402
00403   if (rating_per_ch >= suspect_rating_per_ch)
00404     return;                      //Dont touch bad ratings
00405
00406   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
00407     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
00408     for (i = 0; i < len; ++i) {
00409       if (word_res->reject_map[i].rejected() &&
00410           (!uchset.eq(word.unichar_id(i), " ")))
00411         word_res->reject_map[i].setrej_minimal_rej_accept();
00412     }
00413   }
00414
00415   for (i = 0; i < len; i++) {
00416     if (word_res->reject_map[i].rejected()) {
00417       if (word_res->reject_map[i].flag(R_DOC_REJ))
00418         word_res->reject_map[i].setrej_minimal_rej_accept();
00419       if (word_res->reject_map[i].flag(R_BLOCK_REJ))
00420         word_res->reject_map[i].setrej_minimal_rej_accept();
00421       if (word_res->reject_map[i].flag(R_ROW_REJ))
00422         word_res->reject_map[i].setrej_minimal_rej_accept();
00423     }
00424   }
00425
00426   if (suspect_level == 2)
00427     return;
00428
00429   if (!suspect_constrain_1Il ||
00430       (word_res->reject_map.length() <= suspect_short_words)) {
00431     for (i = 0; i < len; i++) {
00432       if (word_res->reject_map[i].rejected()) {
00433         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
00434           word_res->reject_map[i].flag(R_POSTNN_1IL)))
00435           word_res->reject_map[i].setrej_minimal_rej_accept();
00436
00437         if (!suspect_constrain_1Il &&
00438           word_res->reject_map[i].flag(R_MM_REJECT))
00439           word_res->reject_map[i].setrej_minimal_rej_accept();
00440       }
00441     }
00442   }
00443
00444   if (acceptable_word_string(*word_res->uch_set,
00445                              word.unichar_string().string(),
00446                              word.unichar_lengths().string()) !=
00447                                  AC_UNACCEPTABLE ||
00448       acceptable_number_string(word.unichar_string().string(),
00449                                word.unichar_lengths().string())) {
00450     if (word_res->reject_map.length() > suspect_short_words) {
00451       for (i = 0; i < len; i++) {
00452         if (word_res->reject_map[i].rejected() &&
00453           (!word_res->reject_map[i].perm_rejected() ||
00454            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
00455            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
00456            word_res->reject_map[i].flag (R_MM_REJECT))) {
00457           word_res->reject_map[i].setrej_minimal_rej_accept();
00458         }
00459       }
00460     }
00461   }
00462 }
00463
00464 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
00465   int count = 0;
00466   for (int i = 0; i < word.length(); ++i) {
00467     if (word.unicharset()->get_isalpha(word.unichar_id(i)))
00468       count++;
00469   }
00470   return count;
00471 }
00472
00473
00474 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
00475   int count = 0;
00476   for (int i = 0; i < word.length(); ++i) {
00477     if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
00478         word.unicharset()->get_isdigit(word.unichar_id(i)))
00479       count++;
00480   }
00481   return count;
00482 }
00483
00484
00485 BOOL8 Tesseract::acceptable_number_string(const char *s,
00486                                           const char *lengths) {
00487   BOOL8 prev_digit = FALSE;
00488
00489   if (*lengths == 1 && *s == '(')
00490     s++;
00491
00492   if (*lengths == 1 &&
00493       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
00494     s++;
00495
00496   for (; *s != '\0'; s += *(lengths++)) {
00497     if (unicharset.get_isdigit(s, *lengths))
00498       prev_digit = TRUE;
00499     else if (prev_digit &&
00500              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
00501       prev_digit = FALSE;
00502     else if (prev_digit && *lengths == 1 &&
00503              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
00504       return TRUE;
00505     else if (prev_digit &&
00506              *lengths == 1 && (*s == '%') &&
00507              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
00508              (*(s + *lengths + *(lengths + 1)) == '\0'))
00509       return TRUE;
00510     else
00511       return FALSE;
00512   }
00513   return TRUE;
00514 }
00515 }  // namespace tesseract