tesseract-doc/pageres_8h_source.html

00001 /**********************************************************************
00002  * File:        pageres.h  (Formerly page_res.h)
00003  * Description: Results classes used by control.c
00004  * Author:              Phil Cheatle
00005  * Created:     Tue Sep 22 08:42:49 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #ifndef           PAGERES_H
00020 #define           PAGERES_H
00021
00022 #include "blobs.h"
00023 #include "boxword.h"
00024 #include "elst.h"
00025 #include "genericvector.h"
00026 #include "normalis.h"
00027 #include "ocrblock.h"
00028 #include "ocrrow.h"
00029 #include "params_training_featdef.h"
00030 #include "ratngs.h"
00031 #include "rejctmap.h"
00032 #include "seam.h"
00033 #include "werd.h"
00034
00035 namespace tesseract {
00036 struct FontInfo;
00037 class Tesseract;
00038 }
00039 using tesseract::FontInfo;
00040
00041 static const inT16 kBlamerBoxTolerance = 5;  // NOTE(review): presumably the blamer's box-matching slack; not used in this header - confirm units.
00042
00043 // Enum for expressing the source of error.
00044 // Note: Please update kIncorrectResultReasonNames when modifying this enum.
00045 enum IncorrectResultReason {
00046   // The text recorded in best choice == truth text.
00047   IRR_CORRECT,
00048   // Either: Top choice is incorrect and is a dictionary word (language model
00049   // is unlikely to help correct such errors, so blame the classifier).
00050   // Or: the correct unichar was not included in the shortlist produced by
00051   // the classifier at all.
00052   IRR_CLASSIFIER,
00053   // Chopper has not found one or more splits that correspond to the correct
00054   // character bounding boxes recorded in BlamerBundle::truth_word.
00055   IRR_CHOPPER,
00056   // Classifier did include correct unichars for each blob in the correct
00057   // segmentation, however its rating could have been too bad to allow the
00058   // language model to pull out the correct choice. On the other hand the
00059   // strength of the language model might have been too weak to favor the
00060   // correct answer, thus we call this case a classifier-language model
00061   // tradeoff error.
00062   IRR_CLASS_LM_TRADEOFF,
00063   // Page layout failed to produce the correct bounding box. Blame page layout
00064   // if the truth was not found for the word, which implies that the bounding
00065   // box of the word was incorrect (no truth word had a similar bounding box).
00066   IRR_PAGE_LAYOUT,
00067   // SegSearch heuristic prevented one or more blobs from the correct
00068   // segmentation state from being classified (e.g. the blob was too wide).
00069   IRR_SEGSEARCH_HEUR,
00070   // The correct segmentation state was not explored because of poor SegSearch
00071   // pain point prioritization. We blame SegSearch pain point prioritization
00072   // if the best rating of a choice constructed from correct segmentation is
00073   // better than that of the best choice (i.e. if we got to explore the correct
00074   // segmentation state, language model would have picked the correct choice).
00075   IRR_SEGSEARCH_PP,
00076   // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a
00077   // word, and thus use the old language model (permuters).
00078   //
00079   // TODO(antonova): integrate the new language model with chopper.
00080   IRR_CLASS_OLD_LM_TRADEOFF,
00081   // If there is an incorrect adaptive template match with a better score than
00082   // a correct one (either pre-trained or adapted), mark this as adaption error.
00083   IRR_ADAPTION,
00084   // split_and_recog_word() failed to find a suitable split in truth.
00085   IRR_NO_TRUTH_SPLIT,
00086   // Truth is not available for this word (e.g. when words in corrected content
00087   // file are turned into ~~~~ because an appropriate alignment was not found).
00088   IRR_NO_TRUTH,
00089   // The text recorded in best choice != truth text, but none of the above
00090   // reasons are set.
00091   IRR_UNKNOWN,
00092   // Number of reasons above; PAGE_RES::Init() sizes blame_reasons with it.
00093   IRR_NUM_REASONS
00094 };
00095
00096 // Blamer-related information to determine the source of errors.
00097 struct BlamerBundle {
00098   static const char *IncorrectReasonName(IncorrectResultReason irr);
00099   BlamerBundle() : truth_has_char_boxes(false),
00100       incorrect_result_reason(IRR_CORRECT),
00101       lattice_data(NULL) { ClearResults(); }
00102   ~BlamerBundle() { delete[] lattice_data; }
00103   void ClearResults() {  // Resets the result fields; truth fields are kept.
00104     norm_truth_word.DeleteAllBoxes();
00105     norm_box_tolerance = 0;
00106     if (!NoTruth()) incorrect_result_reason = IRR_CORRECT;
00107     debug = "";
00108     segsearch_is_looking_for_blame = false;
00109     best_correctly_segmented_rating = WERD_CHOICE::kBadRating;
00110     correct_segmentation_cols.clear();
00111     correct_segmentation_rows.clear();
00112     best_choice_is_dict_and_top_choice = false;
00113     delete[] lattice_data;
00114     lattice_data = NULL;
00115     lattice_size = 0;
00116   }
00117   void CopyTruth(const BlamerBundle &other) {  // Copies truth fields only.
00118     truth_has_char_boxes = other.truth_has_char_boxes;
00119     truth_word = other.truth_word;
00120     truth_text = other.truth_text;
00121     incorrect_result_reason =
00122         (other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
00123   }
00124   void CopyResults(const BlamerBundle &other) {  // Deep-copies the lattice.
00125     norm_truth_word = other.norm_truth_word;
00126     norm_box_tolerance = other.norm_box_tolerance;
00127     incorrect_result_reason = other.incorrect_result_reason;
00128     segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame;
00129     best_correctly_segmented_rating = other.best_correctly_segmented_rating;
00130     correct_segmentation_cols = other.correct_segmentation_cols;
00131     correct_segmentation_rows = other.correct_segmentation_rows;
00132     best_choice_is_dict_and_top_choice =
00133         other.best_choice_is_dict_and_top_choice;
00134     char *lattice_copy =  // Allocated before the old buffer is released.
00135         other.lattice_data != NULL ? new char[other.lattice_size] : NULL;
00136     if (lattice_copy != NULL)
00137       memcpy(lattice_copy, other.lattice_data, other.lattice_size);
00138     lattice_size = lattice_copy != NULL ? other.lattice_size : 0;
00139     delete[] lattice_data;  // Previously leaked when a lattice was held.
00140     lattice_data = lattice_copy;
00141   }
00142   BlamerBundle(const BlamerBundle &other) : lattice_data(NULL) {
00143     this->CopyTruth(other);
00144     this->CopyResults(other);  // Relies on lattice_data being initialized.
00145   }
00146   const char *IncorrectReason() const;
00147   bool NoTruth() const {  // True for IRR_NO_TRUTH and IRR_PAGE_LAYOUT.
00148     return (incorrect_result_reason == IRR_NO_TRUTH ||
00149              incorrect_result_reason == IRR_PAGE_LAYOUT);
00150   }
00151   void SetBlame(IncorrectResultReason irr,
00152                 const STRING &msg, const WERD_CHOICE *choice, bool debug) {
00153     this->incorrect_result_reason = irr;
00154     this->debug = this->IncorrectReason();  // NB: param 'debug' hides member.
00155     this->debug += " to blame: ";
00156     this->FillDebugString(msg, choice, &(this->debug));
00157     if (debug) tprintf("SetBlame(): %s", this->debug.string());
00158   }
00159   // Appends choice and truth details to the given debug string.
00160   void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
00161                        STRING *debug);
00162
00163   // Set to true when bounding boxes for individual unichars are recorded.
00164   bool truth_has_char_boxes;
00165   // The truth_word (in the original image coordinate space) contains ground
00166   // truth bounding boxes for this WERD_RES.
00167   tesseract::BoxWord truth_word;
00168   // Same as above, but in normalized coordinates
00169   // (filled in by WERD_RES::SetupForRecognition()).
00170   tesseract::BoxWord norm_truth_word;
00171   // Tolerance for bounding box comparisons in normalized space.
00172   int norm_box_tolerance;
00173   // Contains ground truth unichar for each of the bounding boxes in truth_word.
00174   GenericVector<STRING> truth_text;
00175   // The reason for incorrect OCR result.
00176   IncorrectResultReason incorrect_result_reason;
00177   // Debug text associated with the blame.
00178   STRING debug;
00179   // Misadaption debug information (filled in if this word was misadapted to).
00180   STRING misadaption_debug;
00181   // Variables used by the segmentation search when looking for the blame.
00182   // Set to true while segmentation search is continued after the usual
00183   // termination condition in order to look for the blame.
00184   bool segsearch_is_looking_for_blame;
00185   // Best rating for correctly segmented path
00186   // (set and used by SegSearch when looking for blame).
00187   float best_correctly_segmented_rating;
00188   // Vectors populated by SegSearch to indicate column and row indices that
00189   // correspond to blobs with correct bounding boxes.
00190   GenericVector<int> correct_segmentation_cols;
00191   GenericVector<int> correct_segmentation_rows;
00192   // Set to true if best choice is a dictionary word and
00193   // classifier's top choice.
00194   bool best_choice_is_dict_and_top_choice;
00195   // Serialized segmentation search lattice.
00196   char *lattice_data;
00197   int lattice_size;  // size of lattice_data in bytes
00198   // Information about hypotheses (paths) explored by the segmentation search.
00199   tesseract::ParamsTrainingBundle params_training_bundle;
00200 };
00201
00202 /* Forward declarations */
00203
00204 class BLOCK_RES;
00205 class ROW_RES;
00206 class WERD_RES;
00207
00208 // List-type declarations (presumably what provides BLOCK_RES_LIST,
00209 // ROW_RES_LIST and WERD_RES_LIST, the member types used in the classes below).
00210 ELISTIZEH (BLOCK_RES)
00211 CLISTIZEH (BLOCK_RES)
00212 ELISTIZEH (ROW_RES)
00213 ELISTIZEH (WERD_RES)
00214
00215 /*************************************************************************
00216  * PAGE_RES - Page results
00217  *************************************************************************/
00218 class PAGE_RES {                 // Results for one whole page.
00219  public:
00220   inT32 char_count;
00221   inT32 rej_count;
00222   BLOCK_RES_LIST block_res_list;
00223   BOOL8 rejected;
00224   // Borrowed pointer (never owned by PAGE_RES); it is refreshed whenever a
00225   // PAGE_RES_IT walking this PAGE_RES advances to the next word.
00226   WERD_CHOICE **prev_word_best_choice;
00227   // Per-reason totals accumulated by the blamer.
00228   GenericVector<int> blame_reasons;
00229   // Log of every misadaption seen on the page. It is kept at PAGE_RES level
00230   // because words can be deleted/split/merged; each BlamerBundle holds an
00231   // index into this vector so that the words which caused a misadaption
00232   // can be marked.
00233   GenericVector<STRING> misadaption_log;
00234
00235   // Resets counters, flag and pointer; re-inits blame_reasons (see last line).
00236   void Init() {
00237     prev_word_best_choice = NULL;
00238     rejected = FALSE;
00239     rej_count = 0;
00240     char_count = 0;
00241     blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
00242   }
00243
00244   PAGE_RES() { Init(); }  // Default: just the Init() state.
00245
00246   PAGE_RES(BLOCK_LIST *block_list,   // real blocks
00247            WERD_CHOICE **prev_word_best_choice_ptr);
00248
00249   ~PAGE_RES() {}  // Nothing to release explicitly.
00250 };
00251
00252 /*************************************************************************
00253  * BLOCK_RES - Block results
00254  *************************************************************************/
00255
00256 class BLOCK_RES:public ELIST_LINK {
00257  public:
00258   BLOCK * block;               // real block
00259   inT32 char_count;            // chars in block
00260   inT32 rej_count;             // rejected chars
00261   inT16 font_class;            //
00262   inT16 row_count;
00263   float x_height;
00264   BOOL8 font_assigned;         // block already processed
00265   BOOL8 bold;                  // all bold
00266   BOOL8 italic;                // all italic
00267
00268   ROW_RES_LIST row_res_list;
00269
00270   // Default constructor: zero/NULL every scalar (they used to be left unset).
00271   BLOCK_RES()
00272       : block(NULL), char_count(0), rej_count(0), font_class(0), row_count(0),
00273         x_height(0.0f), font_assigned(FALSE), bold(FALSE), italic(FALSE) {}
00274
00275   BLOCK_RES(BLOCK *the_block);  // real block
00276
00277   ~BLOCK_RES() {}  // Nothing to release explicitly.
00278 };
00279
00280 /*************************************************************************
00281  * ROW_RES - Row results
00282  *************************************************************************/
00283
00284 class ROW_RES:public ELIST_LINK {
00285  public:
00286   ROW * row;                   // real row
00287   inT32 char_count;            // chars in row
00288   inT32 rej_count;             // rejected chars
00289   inT32 whole_word_rej_count;  // rejs in total rej wds
00290   WERD_RES_LIST word_res_list;
00291
00292   // Default constructor: zero/NULL every scalar (they used to be left unset).
00293   ROW_RES()
00294       : row(NULL), char_count(0), rej_count(0), whole_word_rej_count(0) {}
00295
00296   ROW_RES(ROW *the_row);  // real row
00297
00298   ~ROW_RES() {}  // Nothing to release explicitly.
00299 };
00300
00301 /*************************************************************************
00302  * WERD_RES - Word results
00303  *************************************************************************/
00304 enum CRUNCH_MODE  // Type of WERD_RES::unlv_crunch_mode.
00305 {  // NOTE(review): the meaning of each mode is not documented in this header.
00306   CR_NONE,
00307   CR_KEEP_SPACE,
00308   CR_LOOSE_SPACE,
00309   CR_DELETE
00310 };
00311
00312 // WERD_RES is a collection of publicly accessible members that gathers
00313 // information about a word result.
00314 class WERD_RES : public ELIST_LINK {
00315  public:
00316   // Which word is which?
00317   // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
00318   // the original image coordinate space, and the BLN space in which the
00319   // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
00320   // and the x-middle of the word is at 0.
00321   // In the rotated pixel space, coordinates correspond to the input image,
00322   // but may be rotated about the origin by a multiple of 90 degrees,
00323   // and may therefore be negative.
00324   // In any case a rotation by denorm.block()->re_rotation() will take them
00325   // back to the original image.
00326   // The other differences between words all represent different stages of
00327   // processing during recognition.
00328
00329   // ---------------------------INPUT-------------------------------------
00330
00331   // The word is the input C_BLOBs in the rotated pixel space.
00332   // word is NOT owned by the WERD_RES unless combination is true.
00333   // All the other word pointers ARE owned by the WERD_RES.
00334   WERD* word;                     // Input C_BLOB word.
00335
00336   // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
00337
00338   // The bln_boxes contains the bounding boxes (only) of the input word, in the
00339   // BLN space. The lengths of word and bln_boxes
00340   // match as they are both before any chopping.
00341   // TODO(rays) determine if docqual does anything useful and delete bln_boxes
00342   // if it doesn't.
00343   tesseract::BoxWord* bln_boxes;  // BLN input bounding boxes.
00344   // The denorm provides the transformation to get back to the rotated image
00345   // coords from the chopped_word/rebuild_word BLN coords.
00346   DENORM denorm;                  // For use on chopped_word.
00347   // Unicharset used by the classifier output in best_choice and raw_choice.
00348   const UNICHARSET* uch_set;  // For converting back to utf8.
00349
00350   // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
00351   // ----Setup to a (different!) state expected by the various classifiers----
00352   // TODO(rays) Tidy and make more consistent.
00353
00354   // The chopped_word is also in BLN space, and represents the fully chopped
00355   // character fragments that make up the word.
00356   // The length of chopped_word matches length of seam_array + 1 (if set).
00357   TWERD* chopped_word;            // BLN chopped fragments output.
00358   SEAMS seam_array;               // Seams matching chopped_word.
00359   WERD_CHOICE *best_choice;       // tess output
00360   WERD_CHOICE *raw_choice;        // top choice permuter
00361   // Alternative paths found during chopping/segmentation search stages
00362   // (the first entry being a slim copy of best_choice).
00363   GenericVector<WERD_CHOICE *> alt_choices;
00364   GenericVector<GenericVector<int> > alt_states;
00365
00366   // Truth bounding boxes, text and incorrect choice reason.
00367   BlamerBundle *blamer_bundle;
00368
00369   // --------------OUTPUT FROM RECOGNITION-------------------------------
00370   // --------------Not all fields are necessarily set.-------------------
00371   // ---best_choice, raw_choice *must* end up set, with a box_word-------
00372   // ---In complete output, the number of blobs in rebuild_word matches---
00373   // ---the number of boxes in box_word, the number of unichar_ids in---
00374   // ---best_choice, the number of ints in best_state, and the number---
00375   // ---of strings in correct_text--------------------------------------
00376   // ---SetupFake Sets everything to appropriate values if the word is---
00377   // ---known to be bad before recognition.------------------------------
00378
00379   // The rebuild_word is also in BLN space, but represents the final best
00380   // segmentation of the word. Its length is therefore the same as box_word.
00381   TWERD* rebuild_word;            // BLN best segmented word.
00382   // The box_word is in the original image coordinate space. It is the
00383   // bounding boxes of the rebuild_word, after denormalization.
00384   // The length of box_word matches rebuild_word, best_state (if set) and
00385   // correct_text (if set), as well as best_choice and represents the
00386   // number of classified units in the output.
00387   tesseract::BoxWord* box_word;   // Denormalized output boxes.
00388   // The best_state stores the relationship between chopped_word and
00389   // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
00390   // adjacent blobs in chopped_word. The seams in seam_array are hidden
00391   // within a rebuild_word blob and revealed between them.
00392   GenericVector<int> best_state;  // Number of blobs in each best blob.
00393   // The correct_text is used during training and adaption to carry the
00394   // text to the training system without the need for a unicharset. There
00395   // is one entry in the vector for each blob in rebuild_word and box_word.
00396   GenericVector<STRING> correct_text;
00397   // The Tesseract that was used to recognize this word. Just a borrowed
00398   // pointer. Note: Tesseract's class definition is in a higher-level library.
00399   // We avoid introducing a cyclic dependency by not using the Tesseract
00400   // within WERD_RES. We are just storing it to provide access to it
00401   // for the top-level multi-language controller, and maybe for output of
00402   // the recognized language.
00403   tesseract::Tesseract* tesseract;
00404
00405   // Less-well documented members.
00406   // TODO(rays) Add more documentation here.
00407   WERD_CHOICE *ep_choice;      // ep text TODO(rays) delete this.
00408   REJMAP reject_map;           // best_choice rejects
00409   BOOL8 tess_failed;
00410   /*
00411     If tess_failed is TRUE, one of the following tests failed when Tess
00412     returned:
00413     - The outword blob list was not the same length as the best_choice string;
00414     - The best_choice string contained ALL blanks;
00415     - The best_choice string was zero length
00416   */
00417   BOOL8 tess_accepted;          // Tess thinks its ok?
00418   BOOL8 tess_would_adapt;       // Tess would adapt?
00419   BOOL8 done;                   // ready for output?
00420   bool small_caps;             // word appears to be small caps
00421   inT8 italic;
00422   inT8 bold;
00423   // The fontinfos are pointers to data owned by the classifier.
00424   const FontInfo* fontinfo;
00425   const FontInfo* fontinfo2;
00426   inT8 fontinfo_id_count;       // number of votes
00427   inT8 fontinfo_id2_count;      // number of votes
00428   BOOL8 guessed_x_ht;
00429   BOOL8 guessed_caps_ht;
00430   CRUNCH_MODE unlv_crunch_mode;
00431   float x_height;              // post match estimate
00432   float caps_height;           // post match estimate
00433
00434   /*
00435     To deal with fuzzy spaces we need to be able to combine "words" to form
00436     combinations when we suspect that the gap is a non-space. The (new) text
00437     ord code generates separate words for EVERY fuzzy gap - flags in the word
00438     indicate whether the gap is below the threshold (fuzzy kern) and is thus
00439     NOT a real word break by default, or above the threshold (fuzzy space) and
00440     this is a real word break by default.
00441
00442     The WERD_RES list contains all these words PLUS "combination" words built
00443     out of (copies of) the words split by fuzzy kerns. The separate parts have
00444     their "part_of_combo" flag set true and should be IGNORED on a default
00445     reading of the list.
00446
00447     Combination words are FOLLOWED by the sequence of part_of_combo words
00448     which they combine.
00449   */
00450   BOOL8 combination;           //of two fuzzy gap wds
00451   BOOL8 part_of_combo;         //part of a combo
00452   BOOL8 reject_spaces;         //Reject spacing?
00453   // FontInfo ids for each unichar in best_choice.
00454   GenericVector<inT8> best_choice_fontinfo_ids;
00455
00456   WERD_RES() {
00457     InitNonPointers();
00458     InitPointers();
00459   }
00460   WERD_RES(WERD *the_word) {
00461     InitNonPointers();
00462     InitPointers();
00463     word = the_word;
00464   }
00465   WERD_RES(const WERD_RES &source) {
00466     InitPointers();
00467     *this = source;            // see operator=
00468   }
00469
00470   ~WERD_RES();
00471
00472   // Returns the UTF-8 string for the given blob index in the best_choice word,
00473   // given that we know whether we are in a right-to-left reading context.
00474   // This matters for mirrorable characters such as parentheses.  We recognize
00475   // characters purely based on their shape on the page, and by default produce
00476   // the corresponding unicode for a left-to-right context.
00477   const char* const BestUTF8(int blob_index, bool in_rtl_context) const {
00478     if (blob_index < 0 || blob_index >= best_choice->length())
00479       return NULL;
00480     UNICHAR_ID id = best_choice->unichar_id(blob_index);
00481     if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
00482       return NULL;
00483     UNICHAR_ID mirrored = uch_set->get_mirror(id);
00484     if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID)
00485       id = mirrored;
00486     return uch_set->id_to_unichar_ext(id);
00487   }
00488   // Returns the UTF-8 string for the given blob index in the raw_choice word.
00489   const char* const RawUTF8(int blob_index) const {
00490     if (blob_index < 0 || blob_index >= raw_choice->length())
00491       return NULL;
00492     UNICHAR_ID id = raw_choice->unichar_id(blob_index);
00493     if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
00494       return NULL;
00495     return uch_set->id_to_unichar(id);
00496   }
00497
00498   UNICHARSET::Direction SymbolDirection(int blob_index) const {
00499     if (best_choice == NULL ||
00500         blob_index >= best_choice->length() ||
00501         blob_index < 0)
00502       return UNICHARSET::U_OTHER_NEUTRAL;
00503     return uch_set->get_direction(best_choice->unichar_id(blob_index));
00504   }
00505
00506   bool AnyRtlCharsInWord() const {
00507     if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
00508       return false;
00509     for (int id = 0; id < best_choice->length(); id++) {
00510       int unichar_id = best_choice->unichar_id(id);
00511       if (unichar_id < 0 || unichar_id >= uch_set->size())
00512         continue;  // Ignore illegal chars.
00513       UNICHARSET::Direction dir =
00514           uch_set->get_direction(unichar_id);
00515       if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00516           dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
00517           dir == UNICHARSET::U_ARABIC_NUMBER)
00518         return true;
00519     }
00520     return false;
00521   }
00522
00523   bool AnyLtrCharsInWord() const {
00524     if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
00525       return false;
00526     for (int id = 0; id < best_choice->length(); id++) {
00527       int unichar_id = best_choice->unichar_id(id);
00528       if (unichar_id < 0 || unichar_id >= uch_set->size())
00529         continue;  // Ignore illegal chars.
00530       UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
00531       if (dir == UNICHARSET::U_LEFT_TO_RIGHT)
00532         return true;
00533     }
00534     return false;
00535   }
00536
00537   // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
00538   // that gave us the unichars in reading order (as opposed to strict left
00539   // to right).
00540   bool UnicharsInReadingOrder() const {
00541     return best_choice->unichars_in_script_order();
00542   }
00543
00544   void InitNonPointers();
00545   void InitPointers();
00546   void Clear();
00547   void ClearResults();
00548
00549   WERD_RES& operator=(const WERD_RES& source);  //from this
00550
00551   void CopySimpleFields(const WERD_RES& source);
00552
00553   // Initializes a blank (default constructed) WERD_RES from one that has
00554   // already been recognized.
00555   // Use SetupFor*Recognition afterwards to complete the setup and make
00556   // it ready for a retry recognition.
00557   void InitForRetryRecognition(const WERD_RES& source);
00558
00559   // Sets up the members used in recognition: bln_boxes, chopped_word,
00560   // seam_array, denorm, best_choice, raw_choice.  Returns false if
00561   // the word is empty and sets up fake results.  If use_body_size is
00562   // true and row->body_size is set, then body_size will be used for
00563   // blob normalization instead of xheight + ascrise. This flag is for
00564   // those languages that are using CJK pitch model and thus it has to
00565   // be true if and only if tesseract->textord_use_cjk_fp_model is
00566   // true.
00567   bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
00568                                tesseract::Tesseract* tesseract, Pix* pix,
00569                                bool numeric_mode, bool use_body_size,
00570                                ROW *row, BLOCK* block);
00571
00572   // Sets up the members used in recognition:
00573   // bln_boxes, chopped_word, seam_array, denorm.
00574   // Returns false if the word is empty and sets up fake results.
00575   bool SetupForCubeRecognition(const UNICHARSET& unicharset_in,
00576                                tesseract::Tesseract* tesseract,
00577                                const BLOCK* block);
00578
00579   // Sets up the members used in recognition for an empty recognition result:
00580   // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00581   void SetupFake(const UNICHARSET& uch);
00582
00583   // Set the word as having the script of the input unicharset.
00584   void SetupWordScript(const UNICHARSET& unicharset_in);
00585
00586   // Sets up the blamer_bundle if it is not null, using the initialized denorm.
00587   void SetupBlamerBundle();
00588
00589   // Moves the results fields from word to this. This takes ownership of all
00590   // the data, so src can be destructed.
00591   // word1.ConsumeWordResult(word);
00592   // delete word;
00593   // is simpler and faster than:
00594   // word1 = *word;
00595   // delete word;
00596   // as it doesn't need to copy and reallocate anything.
00597   void ConsumeWordResults(WERD_RES* word);
00598
00599   // Replace the best choice and rebuild box word.
00600   void ReplaceBestChoice(const WERD_CHOICE& choice,
00601                          const GenericVector<int> &segmentation_state);
00602
00603   // Builds the rebuild_word from the chopped_word and the best_state.
00604   void RebuildBestState();
00605
00606   // Copies the chopped_word to the rebuild_word, faking a best_state as well.
00607   // Also sets up the output box_word.
00608   void CloneChoppedToRebuild();
00609
00610   // Sets/replaces the box_word with one made from the rebuild_word.
00611   void SetupBoxWord();
00612
00613   // Sets up the script positions in the output boxword using the best_choice
00614   // to get the unichars, and the unicharset to get the target positions.
00615   void SetScriptPositions();
00616
00617   // Returns the indices [start, end) containing the core of the word, stripped
00618   // of any superscript digits on either side.
00619   // (i.e., the non-footnote part of the word).
00620   // Assumes that BoxWord is all set up for best_choice.
00621   void WithoutFootnoteSpan(int *start, int *end) const;
00622
00623   // Given an alternate word choice and segmentation state, yield the indices
00624   // [start, end) containig the core of the word, stripped of any superscript
00625   // digits on either side.  (i.e. stripping off the footnote parts).
00626   void WithoutFootnoteSpan(
00627       const WERD_CHOICE &choice, const GenericVector<int> &state,
00628       int *start, int *end) const;
00629
00630   // Classifies the word with some already-calculated BLOB_CHOICEs.
00631   // The choices are an array of blob_count pointers to BLOB_CHOICE,
00632   // providing a single classifier result for each blob.
00633   // The BLOB_CHOICEs are consumed and the word takes ownership.
00634   // The number of blobs in the outword must match blob_count.
00635   void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
00636
00637   // Copies the best_choice strings to the correct_text for adaption/training.
00638   void BestChoiceToCorrectText();
00639
00640   // Merges 2 adjacent blobs in the result if the permanent callback
00641   // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
00642   // callback box_cb is NULL or returns true, setting the merged blob
00643   // result to the class returned from class_cb.
00644   // Returns true if anything was merged.
00645   bool ConditionalBlobMerge(
00646       TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
00647       TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
00648       BLOB_CHOICE_LIST_CLIST *blob_choices);
00649
00650   // Callback helper for fix_quotes returns a double quote if both
00651   // arguments are quote, otherwise INVALID_UNICHAR_ID.
00652   UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
00653   void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
00654
00655   // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
00656   // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
00657   UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
00658   // Callback helper for fix_hyphens returns true if box1 and box2 overlap
00659   // (assuming both on the same textline, are in order and a chopped em dash.)
00660   bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
00661   void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
00662
00663   // Callback helper for merge_tess_fails returns a space if both
00664   // arguments are space, otherwise INVALID_UNICHAR_ID.
00665   UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
00666   void merge_tess_fails();  // NOTE(review): undocumented here; presumably merges adjacent space results using BothSpaces - confirm
00667
00668   static WERD_RES* deep_copy(const WERD_RES* src) {  // src is dereferenced unconditionally, so it must not be NULL
00669     return new WERD_RES(*src);  // heap copy via the WERD_RES copy constructor; NOTE(review): caller presumably owns it - confirm
00670   }
00671
00672   // Copies blobs from word_res onto this word (eliminating spaces between).
00673   // As this may be called in either direction, the BOL and EOL flags are ORed.
00674   void copy_on(WERD_RES *word_res) {  // word_res: the word to copy from
00675     word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));  // BOL is kept if either word has it
00676     word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));  // EOL is kept if either word has it
00677     word->copy_on(word_res->word);  // the copy itself is delegated to word->copy_on()
00678   }
00679
00680   // Returns true if the collection of count pieces, starting at start, are all
00681   // natural connected components, i.e. there are no real chops involved.
00682   bool PiecesAllNatural(int start, int count) const;
00683 };
00684
00685 /*************************************************************************
00686  * PAGE_RES_IT - Page results iterator
00687  *************************************************************************/
00688
00689 class PAGE_RES_IT {  // Iterator over a PAGE_RES that tracks the previous, current and next word/row/block.
00690  public:
00691   PAGE_RES * page_res;         // page being iterated
00692
00693   PAGE_RES_IT() {
00694   }                            // empty constructor: assigns nothing, so page_res and the pointer members are left uninitialized
00695
00696   PAGE_RES_IT(PAGE_RES *the_page_res) {    // the_page_res: page result to iterate
00697     page_res = the_page_res;
00698     restart_page();  // ready to scan (starts the page, skipping empty blocks)
00699   }
00700
00701   // Do two PAGE_RES_ITs point at the same word?
00702   // This is much cheaper than cmp().
00703   bool operator ==(const PAGE_RES_IT &other) const;
00704
00705   bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); }  // negation of operator==
00706
00707   // Given another PAGE_RES_IT to the same page, returns:
00708   //  this before other:     -1
00709   //  this equal to other:    0
00710   //  this later than other:  1
00711   int cmp(const PAGE_RES_IT &other) const;
00712
00713   WERD_RES *restart_page() {
00714     return start_page(false);  // Skip empty blocks.
00715   }
00716   WERD_RES *restart_page_with_empties() {
00717     return start_page(true);  // Allow empty blocks.
00718   }
00719   WERD_RES *start_page(bool empty_ok);  // empty_ok: true allows empty blocks, false skips them
00720
00721   WERD_RES *restart_row();  // NOTE(review): undocumented here; presumably moves back to the first word of the current row - confirm
00722
00723   // ============ Methods that mutate the underlying structures ===========
00724   // Note that these methods will potentially invalidate other PAGE_RES_ITs
00725   // and are intended to be used only while a single PAGE_RES_IT is active.
00726   // This problem needs to be taken into account if these mutation operators
00727   // are ever provided to PageIterator or its subclasses.
00728
00729   // Inserts the new_word and a corresponding WERD_RES before the current
00730   // position. The simple fields of the WERD_RES are copied from clone_res and
00731   // the resulting WERD_RES is returned for further setup with best_choice etc.
00732   WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
00733
00734   // Deletes the current WERD_RES and its underlying WERD.
00735   void DeleteCurrentWord();
00736
00737   WERD_RES *forward() {  // Get next word.
00738     return internal_forward(false, false);  // new_block = false, empty_ok = false
00739   }
00740   // Move forward, but allow empty blocks to show as single NULL words.
00741   WERD_RES *forward_with_empties() {
00742     return internal_forward(false, true);  // new_block = false, empty_ok = true
00743   }
00744
00745   WERD_RES *forward_paragraph();  // get first word in next non-empty paragraph
00746   WERD_RES *forward_block();  // get first word in next non-empty block
00747
00748   WERD_RES *prev_word() const {  // previous word
00749     return prev_word_res;
00750   }
00751   ROW_RES *prev_row() const {  // row of previous word
00752     return prev_row_res;
00753   }
00754   BLOCK_RES *prev_block() const {  // block of previous word
00755     return prev_block_res;
00756   }
00757   WERD_RES *word() const {  // current word
00758     return word_res;
00759   }
00760   ROW_RES *row() const {  // row of current word
00761     return row_res;
00762   }
00763   BLOCK_RES *block() const {  // block of current word
00764     return block_res;
00765   }
00766   WERD_RES *next_word() const {  // next word
00767     return next_word_res;
00768   }
00769   ROW_RES *next_row() const {  // row of next word
00770     return next_row_res;
00771   }
00772   BLOCK_RES *next_block() const {  // block of next word
00773     return next_block_res;
00774   }
00775   void rej_stat_word();  // for page/block/row (NOTE(review): presumably updates reject statistics at those levels - confirm)
00776
00777  private:
00778   void ResetWordIterator();  // NOTE(review): undocumented here - confirm purpose against the definition
00779   WERD_RES *internal_forward(bool new_block, bool empty_ok);  // shared implementation behind forward() and forward_with_empties()
00780
00781   WERD_RES * prev_word_res;    // previous word
00782   ROW_RES *prev_row_res;       // row of previous word
00783   BLOCK_RES *prev_block_res;   // block of previous word
00784
00785   WERD_RES *word_res;          // current word
00786   ROW_RES *row_res;            // row of current word
00787   BLOCK_RES *block_res;        // block of current word
00788
00789   WERD_RES *next_word_res;     // next word
00790   ROW_RES *next_row_res;       // row of next word
00791   BLOCK_RES *next_block_res;   // block of next word
00792
00793   BLOCK_RES_IT block_res_it;   // iterators over blocks,
00794   ROW_RES_IT row_res_it;       // rows,
00795   WERD_RES_IT word_res_it;     // and words
00796 };
00797 #endif