Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: pageres.h (Formerly page_res.h) 00003 * Description: Results classes used by control.c 00004 * Author: Phil Cheatle 00005 * Created: Tue Sep 22 08:42:49 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #ifndef PAGERES_H 00020 #define PAGERES_H 00021 00022 #include "blobs.h" 00023 #include "boxword.h" 00024 #include "elst.h" 00025 #include "genericvector.h" 00026 #include "normalis.h" 00027 #include "ocrblock.h" 00028 #include "ocrrow.h" 00029 #include "params_training_featdef.h" 00030 #include "ratngs.h" 00031 #include "rejctmap.h" 00032 #include "seam.h" 00033 #include "werd.h" 00034 00035 namespace tesseract { 00036 struct FontInfo; 00037 class Tesseract; 00038 } 00039 using tesseract::FontInfo; 00040 00041 static const inT16 kBlamerBoxTolerance = 5; 00042 00043 // Enum for expressing the source of error. 00044 // Note: Please update kIncorrectResultReasonNames when modifying this enum. 00045 enum IncorrectResultReason { 00046 // The text recorded in best choice == truth text 00047 IRR_CORRECT, 00048 // Either: Top choice is incorrect and is a dictionary word (language model 00049 // is unlikely to help correct such errors, so blame the classifier). 00050 // Or: the correct unichar was not included in shortlist produced by the 00051 // classifier at all. 00052 IRR_CLASSIFIER, 00053 // Chopper have not found one or more splits that correspond to the correct 00054 // character bounding boxes recorded in BlamerBundle::truth_word. 00055 IRR_CHOPPER, 00056 // Classifier did include correct unichars for each blob in the correct 00057 // segmentation, however its rating could have been too bad to allow the 00058 // language model to pull out the correct choice. On the other hand the 00059 // strength of the language model might have been too weak to favor the 00060 // correct answer, this we call this case a classifier-language model 00061 // tradeoff error. 00062 IRR_CLASS_LM_TRADEOFF, 00063 // Page layout failed to produce the correct bounding box. Blame page layout 00064 // if the truth was not found for the word, which implies that the bounding 00065 // box of the word was incorrect (no truth word had a similar bounding box). 00066 IRR_PAGE_LAYOUT, 00067 // SegSearch heuristic prevented one or more blobs from the correct 00068 // segmentation state to be classified (e.g. the blob was too wide). 00069 IRR_SEGSEARCH_HEUR, 00070 // The correct segmentaiton state was not explored because of poor SegSearch 00071 // pain point prioritization. We blame SegSearch pain point prioritization 00072 // if the best rating of a choice constructed from correct segmentation is 00073 // better than that of the best choice (i.e. if we got to explore the correct 00074 // segmentation state, language model would have picked the correct choice). 00075 IRR_SEGSEARCH_PP, 00076 // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word, 00077 00078 // and thus use the old language model (permuters). 00079 // TODO(antonova): integrate the new language mode with chopper 00080 IRR_CLASS_OLD_LM_TRADEOFF, 00081 // If there is an incorrect adaptive template match with a better score than 00082 // a correct one (either pre-trained or adapted), mark this as adaption error. 00083 IRR_ADAPTION, 00084 // split_and_recog_word() failed to find a suitable split in truth. 00085 IRR_NO_TRUTH_SPLIT, 00086 // Truth is not available for this word (e.g. when words in corrected content 00087 // file are turned into ~~~~ because an appropriate alignment was not found. 00088 IRR_NO_TRUTH, 00089 // The text recorded in best choice != truth text, but none of the above 00090 // reasons are set. 00091 IRR_UNKNOWN, 00092 00093 IRR_NUM_REASONS 00094 }; 00095 00096 // Blamer-related information to determine the source of errors. 00097 struct BlamerBundle { 00098 static const char *IncorrectReasonName(IncorrectResultReason irr); 00099 BlamerBundle() : truth_has_char_boxes(false), 00100 incorrect_result_reason(IRR_CORRECT), 00101 lattice_data(NULL) { ClearResults(); } 00102 ~BlamerBundle() { delete[] lattice_data; } 00103 void ClearResults() { 00104 norm_truth_word.DeleteAllBoxes(); 00105 norm_box_tolerance = 0; 00106 if (!NoTruth()) incorrect_result_reason = IRR_CORRECT; 00107 debug = ""; 00108 segsearch_is_looking_for_blame = false; 00109 best_correctly_segmented_rating = WERD_CHOICE::kBadRating; 00110 correct_segmentation_cols.clear(); 00111 correct_segmentation_rows.clear(); 00112 best_choice_is_dict_and_top_choice = false; 00113 delete[] lattice_data; 00114 lattice_data = NULL; 00115 lattice_size = 0; 00116 } 00117 void CopyTruth(const BlamerBundle &other) { 00118 truth_has_char_boxes = other.truth_has_char_boxes; 00119 truth_word = other.truth_word; 00120 truth_text = other.truth_text; 00121 incorrect_result_reason = 00122 (other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT); 00123 } 00124 void CopyResults(const BlamerBundle &other) { 00125 norm_truth_word = other.norm_truth_word; 00126 norm_box_tolerance = other.norm_box_tolerance; 00127 incorrect_result_reason = other.incorrect_result_reason; 00128 segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame; 00129 best_correctly_segmented_rating =other.best_correctly_segmented_rating; 00130 correct_segmentation_cols = other.correct_segmentation_cols; 00131 correct_segmentation_rows = other.correct_segmentation_rows; 00132 best_choice_is_dict_and_top_choice = 00133 other.best_choice_is_dict_and_top_choice; 00134 if (other.lattice_data != NULL) { 00135 lattice_data = new char[other.lattice_size]; 00136 memcpy(lattice_data, other.lattice_data, other.lattice_size); 00137 lattice_size = other.lattice_size; 00138 } else { 00139 lattice_data = NULL; 00140 } 00141 } 00142 BlamerBundle(const BlamerBundle &other) { 00143 this->CopyTruth(other); 00144 this->CopyResults(other); 00145 } 00146 const char *IncorrectReason() const; 00147 bool NoTruth() const { 00148 return (incorrect_result_reason == IRR_NO_TRUTH || 00149 incorrect_result_reason == IRR_PAGE_LAYOUT); 00150 } 00151 void SetBlame(IncorrectResultReason irr, 00152 const STRING &msg, const WERD_CHOICE *choice, bool debug) { 00153 this->incorrect_result_reason = irr; 00154 this->debug = this->IncorrectReason(); 00155 this->debug += " to blame: "; 00156 this->FillDebugString(msg, choice, &(this->debug)); 00157 if (debug) tprintf("SetBlame(): %s", this->debug.string()); 00158 } 00159 // Appends choice and truth details to the given debug string. 00160 void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, 00161 STRING *debug); 00162 00163 // Set to true when bounding boxes for individual unichars are recorded. 00164 bool truth_has_char_boxes; 00165 // The true_word (in the original image coordinate space) contains ground 00166 // truth bounding boxes for this WERD_RES. 00167 tesseract::BoxWord truth_word; 00168 // Same as above, but in normalized coordinates 00169 // (filled in by WERD_RES::SetupForRecognition()). 00170 tesseract::BoxWord norm_truth_word; 00171 // Tolerance for bounding box comparisons in normalized space. 00172 int norm_box_tolerance; 00173 // Contains ground truth unichar for each of the bounding boxes in truth_word. 00174 GenericVector<STRING> truth_text; 00175 // The reason for incorrect OCR result. 00176 IncorrectResultReason incorrect_result_reason; 00177 // Debug text associated with the blame. 00178 STRING debug; 00179 // Misadaption debug information (filled in if this word was misadapted to). 00180 STRING misadaption_debug; 00181 // Variables used by the segmentation search when looking for the blame. 00182 // Set to true while segmentation search is continued after the usual 00183 // termination condition in order to look for the blame. 00184 bool segsearch_is_looking_for_blame; 00185 // Best rating for correctly segmented path 00186 // (set and used by SegSearch when looking for blame). 00187 float best_correctly_segmented_rating; 00188 // Vectors populated by SegSearch to indicate column and row indices that 00189 // correspond to blobs with correct bounding boxes. 00190 GenericVector<int> correct_segmentation_cols; 00191 GenericVector<int> correct_segmentation_rows; 00192 // Set to true if best choice is a dictionary word and 00193 // classifier's top choice. 00194 bool best_choice_is_dict_and_top_choice; 00195 // Serialized segmentation search lattice. 00196 char *lattice_data; 00197 int lattice_size; // size of lattice_data in bytes 00198 // Information about hypotheses (paths) explored by the segmentation search. 00199 tesseract::ParamsTrainingBundle params_training_bundle; 00200 }; 00201 00202 /* Forward declarations */ 00203 00204 class BLOCK_RES; 00205 00206 ELISTIZEH (BLOCK_RES) CLISTIZEH (BLOCK_RES) 00207 class 00208 ROW_RES; 00209 00210 ELISTIZEH (ROW_RES) 00211 class WERD_RES; 00212 00213 ELISTIZEH (WERD_RES) 00214 00215 /************************************************************************* 00216 * PAGE_RES - Page results 00217 *************************************************************************/ 00218 class PAGE_RES { // page result 00219 public: 00220 inT32 char_count; 00221 inT32 rej_count; 00222 BLOCK_RES_LIST block_res_list; 00223 BOOL8 rejected; 00224 // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to 00225 // the next word. This pointer is not owned by PAGE_RES class. 00226 WERD_CHOICE **prev_word_best_choice; 00227 // Sums of blame reasons computed by the blamer. 00228 GenericVector<int> blame_reasons; 00229 // Debug information about all the misadaptions on this page. 00230 // Each BlamerBundle contains an index into this vector, so that words that 00231 // caused misadaption could be marked. However, since words could be 00232 // deleted/split/merged, the log is stored on the PAGE_RES level. 00233 GenericVector<STRING> misadaption_log; 00234 00235 inline void Init() { 00236 char_count = 0; 00237 rej_count = 0; 00238 rejected = FALSE; 00239 prev_word_best_choice = NULL; 00240 blame_reasons.init_to_size(IRR_NUM_REASONS, 0); 00241 } 00242 00243 PAGE_RES() { Init(); } // empty constructor 00244 00245 PAGE_RES(BLOCK_LIST *block_list, // real blocks 00246 WERD_CHOICE **prev_word_best_choice_ptr); 00247 00248 ~PAGE_RES () { // destructor 00249 } 00250 }; 00251 00252 /************************************************************************* 00253 * BLOCK_RES - Block results 00254 *************************************************************************/ 00255 00256 class BLOCK_RES:public ELIST_LINK { 00257 public: 00258 BLOCK * block; // real block 00259 inT32 char_count; // chars in block 00260 inT32 rej_count; // rejected chars 00261 inT16 font_class; // 00262 inT16 row_count; 00263 float x_height; 00264 BOOL8 font_assigned; // block already 00265 // processed 00266 BOOL8 bold; // all bold 00267 BOOL8 italic; // all italic 00268 00269 ROW_RES_LIST row_res_list; 00270 00271 BLOCK_RES() { 00272 } // empty constructor 00273 00274 BLOCK_RES(BLOCK *the_block); // real block 00275 00276 ~BLOCK_RES () { // destructor 00277 } 00278 }; 00279 00280 /************************************************************************* 00281 * ROW_RES - Row results 00282 *************************************************************************/ 00283 00284 class ROW_RES:public ELIST_LINK { 00285 public: 00286 ROW * row; // real row 00287 inT32 char_count; // chars in block 00288 inT32 rej_count; // rejected chars 00289 inT32 whole_word_rej_count; // rejs in total rej wds 00290 WERD_RES_LIST word_res_list; 00291 00292 ROW_RES() { 00293 } // empty constructor 00294 00295 ROW_RES(ROW *the_row); // real row 00296 00297 ~ROW_RES() { // destructor 00298 } 00299 }; 00300 00301 /************************************************************************* 00302 * WERD_RES - Word results 00303 *************************************************************************/ 00304 enum CRUNCH_MODE 00305 { 00306 CR_NONE, 00307 CR_KEEP_SPACE, 00308 CR_LOOSE_SPACE, 00309 CR_DELETE 00310 }; 00311 00312 // WERD_RES is a collection of publicly accessible members that gathers 00313 // information about a word result. 00314 class WERD_RES : public ELIST_LINK { 00315 public: 00316 // Which word is which? 00317 // There are 3 coordinate spaces in use here: a possibly rotated pixel space, 00318 // the original image coordinate space, and the BLN space in which the 00319 // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight, 00320 // and the x-middle of the word is at 0. 00321 // In the rotated pixel space, coordinates correspond to the input image, 00322 // but may be rotated about the origin by a multiple of 90 degrees, 00323 // and may therefore be negative. 00324 // In any case a rotation by denorm.block()->re_rotation() will take them 00325 // back to the original image. 00326 // The other differences between words all represent different stages of 00327 // processing during recognition. 00328 00329 // ---------------------------INPUT------------------------------------- 00330 00331 // The word is the input C_BLOBs in the rotated pixel space. 00332 // word is NOT owned by the WERD_RES unless combination is true. 00333 // All the other word pointers ARE owned by the WERD_RES. 00334 WERD* word; // Input C_BLOB word. 00335 00336 // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------ 00337 00338 // The bln_boxes contains the bounding boxes (only) of the input word, in the 00339 // BLN space. The lengths of word and bln_boxes 00340 // match as they are both before any chopping. 00341 // TODO(rays) determine if docqual does anything useful and delete bln_boxes 00342 // if it doesn't. 00343 tesseract::BoxWord* bln_boxes; // BLN input bounding boxes. 00344 // The denorm provides the transformation to get back to the rotated image 00345 // coords from the chopped_word/rebuild_word BLN coords. 00346 DENORM denorm; // For use on chopped_word. 00347 // Unicharset used by the classifier output in best_choice and raw_choice. 00348 const UNICHARSET* uch_set; // For converting back to utf8. 00349 00350 // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION---- 00351 // ----Setup to a (different!) state expected by the various classifiers---- 00352 // TODO(rays) Tidy and make more consistent. 00353 00354 // The chopped_word is also in BLN space, and represents the fully chopped 00355 // character fragments that make up the word. 00356 // The length of chopped_word matches length of seam_array + 1 (if set). 00357 TWERD* chopped_word; // BLN chopped fragments output. 00358 SEAMS seam_array; // Seams matching chopped_word. 00359 WERD_CHOICE *best_choice; // tess output 00360 WERD_CHOICE *raw_choice; // top choice permuter 00361 // Alternative paths found during chopping/segmentation search stages 00362 // (the first entry being a slim copy of best_choice). 00363 GenericVector<WERD_CHOICE *> alt_choices; 00364 GenericVector<GenericVector<int> > alt_states; 00365 00366 // Truth bounding boxes, text and incorrect choice reason. 00367 BlamerBundle *blamer_bundle; 00368 00369 // --------------OUTPUT FROM RECOGNITION------------------------------- 00370 // --------------Not all fields are necessarily set.------------------- 00371 // ---best_choice, raw_choice *must* end up set, with a box_word------- 00372 // ---In complete output, the number of blobs in rebuild_word matches--- 00373 // ---the number of boxes in box_word, the number of unichar_ids in--- 00374 // ---best_choice, the number of ints in best_state, and the number--- 00375 // ---of strings in correct_text-------------------------------------- 00376 // ---SetupFake Sets everything to appropriate values if the word is--- 00377 // ---known to be bad before recognition.------------------------------ 00378 00379 // The rebuild_word is also in BLN space, but represents the final best 00380 // segmentation of the word. Its length is therefore the same as box_word. 00381 TWERD* rebuild_word; // BLN best segmented word. 00382 // The box_word is in the original image coordinate space. It is the 00383 // bounding boxes of the rebuild_word, after denormalization. 00384 // The length of box_word matches rebuild_word, best_state (if set) and 00385 // correct_text (if set), as well as best_choice and represents the 00386 // number of classified units in the output. 00387 tesseract::BoxWord* box_word; // Denormalized output boxes. 00388 // The best_state stores the relationship between chopped_word and 00389 // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] 00390 // adjacent blobs in chopped_word. The seams in seam_array are hidden 00391 // within a rebuild_word blob and revealed between them. 00392 GenericVector<int> best_state; // Number of blobs in each best blob. 00393 // The correct_text is used during training and adaption to carry the 00394 // text to the training system without the need for a unicharset. There 00395 // is one entry in the vector for each blob in rebuild_word and box_word. 00396 GenericVector<STRING> correct_text; 00397 // The Tesseract that was used to recognize this word. Just a borrowed 00398 // pointer. Note: Tesseract's class definition is in a higher-level library. 00399 // We avoid introducing a cyclic dependency by not using the Tesseract 00400 // within WERD_RES. We are just storing it to provide access to it 00401 // for the top-level multi-language controller, and maybe for output of 00402 // the recognized language. 00403 tesseract::Tesseract* tesseract; 00404 00405 // Less-well documented members. 00406 // TODO(rays) Add more documentation here. 00407 WERD_CHOICE *ep_choice; // ep text TODO(rays) delete this. 00408 REJMAP reject_map; // best_choice rejects 00409 BOOL8 tess_failed; 00410 /* 00411 If tess_failed is TRUE, one of the following tests failed when Tess 00412 returned: 00413 - The outword blob list was not the same length as the best_choice string; 00414 - The best_choice string contained ALL blanks; 00415 - The best_choice string was zero length 00416 */ 00417 BOOL8 tess_accepted; // Tess thinks its ok? 00418 BOOL8 tess_would_adapt; // Tess would adapt? 00419 BOOL8 done; // ready for output? 00420 bool small_caps; // word appears to be small caps 00421 inT8 italic; 00422 inT8 bold; 00423 // The fontinfos are pointers to data owned by the classifier. 00424 const FontInfo* fontinfo; 00425 const FontInfo* fontinfo2; 00426 inT8 fontinfo_id_count; // number of votes 00427 inT8 fontinfo_id2_count; // number of votes 00428 BOOL8 guessed_x_ht; 00429 BOOL8 guessed_caps_ht; 00430 CRUNCH_MODE unlv_crunch_mode; 00431 float x_height; // post match estimate 00432 float caps_height; // post match estimate 00433 00434 /* 00435 To deal with fuzzy spaces we need to be able to combine "words" to form 00436 combinations when we suspect that the gap is a non-space. The (new) text 00437 ord code generates separate words for EVERY fuzzy gap - flags in the word 00438 indicate whether the gap is below the threshold (fuzzy kern) and is thus 00439 NOT a real word break by default, or above the threshold (fuzzy space) and 00440 this is a real word break by default. 00441 00442 The WERD_RES list contains all these words PLUS "combination" words built 00443 out of (copies of) the words split by fuzzy kerns. The separate parts have 00444 their "part_of_combo" flag set true and should be IGNORED on a default 00445 reading of the list. 00446 00447 Combination words are FOLLOWED by the sequence of part_of_combo words 00448 which they combine. 00449 */ 00450 BOOL8 combination; //of two fuzzy gap wds 00451 BOOL8 part_of_combo; //part of a combo 00452 BOOL8 reject_spaces; //Reject spacing? 00453 // FontInfo ids for each unichar in best_choice. 00454 GenericVector<inT8> best_choice_fontinfo_ids; 00455 00456 WERD_RES() { 00457 InitNonPointers(); 00458 InitPointers(); 00459 } 00460 WERD_RES(WERD *the_word) { 00461 InitNonPointers(); 00462 InitPointers(); 00463 word = the_word; 00464 } 00465 WERD_RES(const WERD_RES &source) { 00466 InitPointers(); 00467 *this = source; // see operator= 00468 } 00469 00470 ~WERD_RES(); 00471 00472 // Returns the UTF-8 string for the given blob index in the best_choice word, 00473 // given that we know whether we are in a right-to-left reading context. 00474 // This matters for mirrorable characters such as parentheses. We recognize 00475 // characters purely based on their shape on the page, and by default produce 00476 // the corresponding unicode for a left-to-right context. 00477 const char* const BestUTF8(int blob_index, bool in_rtl_context) const { 00478 if (blob_index < 0 || blob_index >= best_choice->length()) 00479 return NULL; 00480 UNICHAR_ID id = best_choice->unichar_id(blob_index); 00481 if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID) 00482 return NULL; 00483 UNICHAR_ID mirrored = uch_set->get_mirror(id); 00484 if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID) 00485 id = mirrored; 00486 return uch_set->id_to_unichar_ext(id); 00487 } 00488 // Returns the UTF-8 string for the given blob index in the raw_choice word. 00489 const char* const RawUTF8(int blob_index) const { 00490 if (blob_index < 0 || blob_index >= raw_choice->length()) 00491 return NULL; 00492 UNICHAR_ID id = raw_choice->unichar_id(blob_index); 00493 if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID) 00494 return NULL; 00495 return uch_set->id_to_unichar(id); 00496 } 00497 00498 UNICHARSET::Direction SymbolDirection(int blob_index) const { 00499 if (best_choice == NULL || 00500 blob_index >= best_choice->length() || 00501 blob_index < 0) 00502 return UNICHARSET::U_OTHER_NEUTRAL; 00503 return uch_set->get_direction(best_choice->unichar_id(blob_index)); 00504 } 00505 00506 bool AnyRtlCharsInWord() const { 00507 if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1) 00508 return false; 00509 for (int id = 0; id < best_choice->length(); id++) { 00510 int unichar_id = best_choice->unichar_id(id); 00511 if (unichar_id < 0 || unichar_id >= uch_set->size()) 00512 continue; // Ignore illegal chars. 00513 UNICHARSET::Direction dir = 00514 uch_set->get_direction(unichar_id); 00515 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00516 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || 00517 dir == UNICHARSET::U_ARABIC_NUMBER) 00518 return true; 00519 } 00520 return false; 00521 } 00522 00523 bool AnyLtrCharsInWord() const { 00524 if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1) 00525 return false; 00526 for (int id = 0; id < best_choice->length(); id++) { 00527 int unichar_id = best_choice->unichar_id(id); 00528 if (unichar_id < 0 || unichar_id >= uch_set->size()) 00529 continue; // Ignore illegal chars. 00530 UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); 00531 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) 00532 return true; 00533 } 00534 return false; 00535 } 00536 00537 // Return whether the blobs in this WERD_RES 0, 1,... come from an engine 00538 // that gave us the unichars in reading order (as opposed to strict left 00539 // to right). 00540 bool UnicharsInReadingOrder() const { 00541 return best_choice->unichars_in_script_order(); 00542 } 00543 00544 void InitNonPointers(); 00545 void InitPointers(); 00546 void Clear(); 00547 void ClearResults(); 00548 00549 WERD_RES& operator=(const WERD_RES& source); //from this 00550 00551 void CopySimpleFields(const WERD_RES& source); 00552 00553 // Initializes a blank (default constructed) WERD_RES from one that has 00554 // already been recognized. 00555 // Use SetupFor*Recognition afterwards to complete the setup and make 00556 // it ready for a retry recognition. 00557 void InitForRetryRecognition(const WERD_RES& source); 00558 00559 // Sets up the members used in recognition: bln_boxes, chopped_word, 00560 // seam_array, denorm, best_choice, raw_choice. Returns false if 00561 // the word is empty and sets up fake results. If use_body_size is 00562 // true and row->body_size is set, then body_size will be used for 00563 // blob normalization instead of xheight + ascrise. This flag is for 00564 // those languages that are using CJK pitch model and thus it has to 00565 // be true if and only if tesseract->textord_use_cjk_fp_model is 00566 // true. 00567 bool SetupForTessRecognition(const UNICHARSET& unicharset_in, 00568 tesseract::Tesseract* tesseract, Pix* pix, 00569 bool numeric_mode, bool use_body_size, 00570 ROW *row, BLOCK* block); 00571 00572 // Sets up the members used in recognition: 00573 // bln_boxes, chopped_word, seam_array, denorm. 00574 // Returns false if the word is empty and sets up fake results. 00575 bool SetupForCubeRecognition(const UNICHARSET& unicharset_in, 00576 tesseract::Tesseract* tesseract, 00577 const BLOCK* block); 00578 00579 // Sets up the members used in recognition for an empty recognition result: 00580 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00581 void SetupFake(const UNICHARSET& uch); 00582 00583 // Set the word as having the script of the input unicharset. 00584 void SetupWordScript(const UNICHARSET& unicharset_in); 00585 00586 // Sets up the blamer_bundle if it is not null, using the initialized denorm. 00587 void SetupBlamerBundle(); 00588 00589 // Moves the results fields from word to this. This takes ownership of all 00590 // the data, so src can be destructed. 00591 // word1.ConsumeWordResult(word); 00592 // delete word; 00593 // is simpler and faster than: 00594 // word1 = *word; 00595 // delete word; 00596 // as it doesn't need to copy and reallocate anything. 00597 void ConsumeWordResults(WERD_RES* word); 00598 00599 // Replace the best choice and rebuild box word. 00600 void ReplaceBestChoice(const WERD_CHOICE& choice, 00601 const GenericVector<int> &segmentation_state); 00602 00603 // Builds the rebuild_word from the chopped_word and the best_state. 00604 void RebuildBestState(); 00605 00606 // Copies the chopped_word to the rebuild_word, faking a best_state as well. 00607 // Also sets up the output box_word. 00608 void CloneChoppedToRebuild(); 00609 00610 // Sets/replaces the box_word with one made from the rebuild_word. 00611 void SetupBoxWord(); 00612 00613 // Sets up the script positions in the output boxword using the best_choice 00614 // to get the unichars, and the unicharset to get the target positions. 00615 void SetScriptPositions(); 00616 00617 // Returns the indices [start, end) containing the core of the word, stripped 00618 // of any superscript digits on either side. 00619 // (i.e., the non-footnote part of the word). 00620 // Assumes that BoxWord is all set up for best_choice. 00621 void WithoutFootnoteSpan(int *start, int *end) const; 00622 00623 // Given an alternate word choice and segmentation state, yield the indices 00624 // [start, end) containig the core of the word, stripped of any superscript 00625 // digits on either side. (i.e. stripping off the footnote parts). 00626 void WithoutFootnoteSpan( 00627 const WERD_CHOICE &choice, const GenericVector<int> &state, 00628 int *start, int *end) const; 00629 00630 // Classifies the word with some already-calculated BLOB_CHOICEs. 00631 // The choices are an array of blob_count pointers to BLOB_CHOICE, 00632 // providing a single classifier result for each blob. 00633 // The BLOB_CHOICEs are consumed and the word takes ownership. 00634 // The number of blobs in the outword must match blob_count. 00635 void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices); 00636 00637 // Copies the best_choice strings to the correct_text for adaption/training. 00638 void BestChoiceToCorrectText(); 00639 00640 // Merges 2 adjacent blobs in the result if the permanent callback 00641 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent 00642 // callback box_cb is NULL or returns true, setting the merged blob 00643 // result to the class returned from class_cb. 00644 // Returns true if anything was merged. 00645 bool ConditionalBlobMerge( 00646 TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb, 00647 TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb, 00648 BLOB_CHOICE_LIST_CLIST *blob_choices); 00649 00650 // Callback helper for fix_quotes returns a double quote if both 00651 // arguments are quote, otherwise INVALID_UNICHAR_ID. 00652 UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); 00653 void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices); 00654 00655 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 00656 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 00657 UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); 00658 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 00659 // (assuming both on the same textline, are in order and a chopped em dash.) 00660 bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2); 00661 void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices); 00662 00663 // Callback helper for merge_tess_fails returns a space if both 00664 // arguments are space, otherwise INVALID_UNICHAR_ID. 00665 UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); 00666 void merge_tess_fails(); 00667 00668 static WERD_RES* deep_copy(const WERD_RES* src) { 00669 return new WERD_RES(*src); 00670 } 00671 00672 // Copy blobs from word_res onto this word (eliminating spaces between). 00673 // Since this may be called bidirectionally OR both the BOL and EOL flags. 00674 void copy_on(WERD_RES *word_res) { //from this word 00675 word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL)); 00676 word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL)); 00677 word->copy_on(word_res->word); 00678 } 00679 00680 // Returns true if the collection of count pieces, starting at start, are all 00681 // natural connected components, ie there are no real chops involved. 00682 bool PiecesAllNatural(int start, int count) const; 00683 }; 00684 00685 /************************************************************************* 00686 * PAGE_RES_IT - Page results iterator 00687 *************************************************************************/ 00688 00689 class PAGE_RES_IT { 00690 public: 00691 PAGE_RES * page_res; // page being iterated 00692 00693 PAGE_RES_IT() { 00694 } // empty contructor 00695 00696 PAGE_RES_IT(PAGE_RES *the_page_res) { // page result 00697 page_res = the_page_res; 00698 restart_page(); // ready to scan 00699 } 00700 00701 // Do two PAGE_RES_ITs point at the same word? 00702 // This is much cheaper than cmp(). 00703 bool operator ==(const PAGE_RES_IT &other) const; 00704 00705 bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); } 00706 00707 // Given another PAGE_RES_IT to the same page, 00708 // this before other: -1 00709 // this equal to other: 0 00710 // this later than other: 1 00711 int cmp(const PAGE_RES_IT &other) const; 00712 00713 WERD_RES *restart_page() { 00714 return start_page(false); // Skip empty blocks. 00715 } 00716 WERD_RES *restart_page_with_empties() { 00717 return start_page(true); // Allow empty blocks. 00718 } 00719 WERD_RES *start_page(bool empty_ok); 00720 00721 WERD_RES *restart_row(); 00722 00723 // ============ Methods that mutate the underling structures =========== 00724 // Note that these methods will potentially invalidate other PAGE_RES_ITs 00725 // and are intended to be used only while a single PAGE_RES_IT is active. 00726 // This problem needs to be taken into account if these mutation operators 00727 // are ever provided to PageIterator or its subclasses. 00728 00729 // Inserts the new_word and a corresponding WERD_RES before the current 00730 // position. The simple fields of the WERD_RES are copied from clone_res and 00731 // the resulting WERD_RES is returned for further setup with best_choice etc. 00732 WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word); 00733 00734 // Deletes the current WERD_RES and its underlying WERD. 00735 void DeleteCurrentWord(); 00736 00737 WERD_RES *forward() { // Get next word. 00738 return internal_forward(false, false); 00739 } 00740 // Move forward, but allow empty blocks to show as single NULL words. 00741 WERD_RES *forward_with_empties() { 00742 return internal_forward(false, true); 00743 } 00744 00745 WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph 00746 WERD_RES *forward_block(); // get first word in next non-empty block 00747 00748 WERD_RES *prev_word() const { // previous word 00749 return prev_word_res; 00750 } 00751 ROW_RES *prev_row() const { // row of prev word 00752 return prev_row_res; 00753 } 00754 BLOCK_RES *prev_block() const { // block of prev word 00755 return prev_block_res; 00756 } 00757 WERD_RES *word() const { // current word 00758 return word_res; 00759 } 00760 ROW_RES *row() const { // row of current word 00761 return row_res; 00762 } 00763 BLOCK_RES *block() const { // block of cur. word 00764 return block_res; 00765 } 00766 WERD_RES *next_word() const { // next word 00767 return next_word_res; 00768 } 00769 ROW_RES *next_row() const { // row of next word 00770 return next_row_res; 00771 } 00772 BLOCK_RES *next_block() const { // block of next word 00773 return next_block_res; 00774 } 00775 void rej_stat_word(); // for page/block/row 00776 00777 private: 00778 void ResetWordIterator(); 00779 WERD_RES *internal_forward(bool new_block, bool empty_ok); 00780 00781 WERD_RES * prev_word_res; // previous word 00782 ROW_RES *prev_row_res; // row of prev word 00783 BLOCK_RES *prev_block_res; // block of prev word 00784 00785 WERD_RES *word_res; // current word 00786 ROW_RES *row_res; // row of current word 00787 BLOCK_RES *block_res; // block of cur. word 00788 00789 WERD_RES *next_word_res; // next word 00790 ROW_RES *next_row_res; // row of next word 00791 BLOCK_RES *next_block_res; // block of next word 00792 00793 BLOCK_RES_IT block_res_it; // iterators 00794 ROW_RES_IT row_res_it; 00795 WERD_RES_IT word_res_it; 00796 }; 00797 #endif