Tesseract  3.02
tesseract-ocr/wordrec/wordrec.h
Go to the documentation of this file.
00001 
00002 // File:        wordrec.h
00003 // Description: wordrec class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_WORDREC_WORDREC_H__
00020 #define TESSERACT_WORDREC_WORDREC_H__
00021 
00022 #include "associate.h"
00023 #include "classify.h"
00024 #include "dict.h"
00025 #include "language_model.h"
00026 #include "ratngs.h"
00027 #include "matrix.h"
00028 #include "matchtab.h"
00029 #include "oldheap.h"
00030 #include "gradechop.h"
00031 #include "seam.h"
00032 #include "states.h"
00033 #include "findseam.h"
00034 #include "callcpp.h"
00035 
00036 struct CHUNKS_RECORD;
00037 struct SEARCH_RECORD;
00038 class WERD_RES;
00039 
00040 // A struct for storing child/parent pairs of the BLOB_CHOICE_LISTs
00041 // to be processed by the segmentation search.
00042 struct SEG_SEARCH_PENDING : public ELIST_LINK {
00043   SEG_SEARCH_PENDING(int child_row_arg,
00044                      BLOB_CHOICE_LIST *parent_arg,
00045                      tesseract::LanguageModelFlagsType changed_arg) :
00046     child_row(child_row_arg), parent(parent_arg), changed(changed_arg) {}
00047 
00048   // Comparator function for add_sorted().
00049   static int compare(const void *p1, const void *p2) {
00050     const SEG_SEARCH_PENDING *e1 = *reinterpret_cast<
00051       const SEG_SEARCH_PENDING * const *>(p1);
00052     const SEG_SEARCH_PENDING *e2 = *reinterpret_cast<
00053       const SEG_SEARCH_PENDING * const *>(p2);
00054     if (e1->child_row == e2->child_row &&
00055         e1->parent == e2->parent) return 0;
00056     return (e1->child_row < e2->child_row) ? -1 : 1;
00057   }
00058 
00059   int child_row;  // row of the child in the ratings matrix
00060   BLOB_CHOICE_LIST *parent;  // pointer to the parent BLOB_CHOICE_LIST
00061   // Flags that indicate which language model components are still active
00062   // on the parent path (i.e. recorded some changes to the language model
00063   // state) and need to be invoked for this pending entry.
00064   // This field is used as an argument to LanguageModel::UpdateState()
00065   // in Wordrec::UpdateSegSearchNodes().
00066   tesseract::LanguageModelFlagsType changed;
00067 };
00068 
00069 ELISTIZEH(SEG_SEARCH_PENDING);
00070 
00071 
00072 namespace tesseract {
00073 
00074 /* ccmain/tstruct.cpp *********************************************************/
00075 class FRAGMENT:public ELIST_LINK
00076 {
00077   public:
00078     FRAGMENT() {  //constructor
00079     }
00080     FRAGMENT(EDGEPT *head_pt,   //start
00081              EDGEPT *tail_pt);  //end
00082 
00083     ICOORD head;                 //coords of start
00084     ICOORD tail;                 //coords of end
00085     EDGEPT *headpt;              //start point
00086     EDGEPT *tailpt;              //end point
00087 };
00088 ELISTIZEH(FRAGMENT)
00089 
00090 
00091 class Wordrec : public Classify {
00092  public:
00093   // config parameters *******************************************************
00094   BOOL_VAR_H(merge_fragments_in_matrix, TRUE,
00095              "Merge the fragments in the ratings matrix and delete them "
00096              "after merging");
00097   BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
00098   BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
00099   BOOL_VAR_H(force_word_assoc, FALSE,
00100              "force associator to run regardless of what enable_assoc is."
00101              "This is used for CJK where component grouping is necessary.");
00102   INT_VAR_H(wordrec_num_seg_states, 30, "Segmentation states");
00103   double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
00104   BOOL_VAR_H(fragments_guide_chopper, FALSE,
00105              "Use information from fragments to guide chopping process");
00106   INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
00107   double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
00108   INT_VAR_H(chop_debug, 0, "Chop debug");
00109   BOOL_VAR_H(chop_enable, 1, "Chop enable");
00110   BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
00111   INT_VAR_H(chop_split_length, 10000, "Split Length");
00112   INT_VAR_H(chop_same_distance, 2, "Same distance");
00113   INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
00114   INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
00115   INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
00116   double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
00117   double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
00118   double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
00119   double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
00120   double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
00121   double_VAR_H(chop_ok_split, 100.0, "OK split limit");
00122   double_VAR_H(chop_good_split, 50.0, "Good split limit");
00123   INT_VAR_H(chop_x_y_weight, 3, "X / Y  length weight");
00124   INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
00125   BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
00126              "include fixed-pitch heuristics in char segmentation");
00127   BOOL_VAR_H(use_new_state_cost, FALSE,
00128              "use new state cost heuristics for segmentation state evaluation");
00129   double_VAR_H(heuristic_segcost_rating_base, 1.25,
00130                "base factor for adding segmentation cost into word rating."
00131                "It's a multiplying factor, the larger the value above 1, "
00132                "the bigger the effect of segmentation cost.");
00133   double_VAR_H(heuristic_weight_rating, 1,
00134                "weight associated with char rating in combined cost of state");
00135   double_VAR_H(heuristic_weight_width, 0,
00136                "weight associated with width evidence in combined cost of state");
00137   double_VAR_H(heuristic_weight_seamcut, 0,
00138                "weight associated with seam cut in combined cost of state");
00139   double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
00140                "max char width-to-height ratio allowed in segmentation");
00141   INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
00142   BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages");
00143   BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors");
00144   BOOL_VAR_H(enable_new_segsearch, false,
00145              "Enable new segmentation search path.");
00146   INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
00147   INT_VAR_H(segsearch_max_pain_points, 2000,
00148             "Maximum number of pain points stored in the queue");
00149   INT_VAR_H(segsearch_max_futile_classifications, 10,
00150             "Maximum number of pain point classifications per word.");
00151   double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
00152                "Maximum character width-to-height ratio");
00153   double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
00154                "Maximum character width-to-height ratio for"
00155                "fixed pitch fonts");
00156   BOOL_VAR_H(save_alt_choices, false,
00157              "Save alternative paths found during chopping "
00158              "and segmentation search");
00159 
00160   // methods from wordrec/*.cpp ***********************************************
00161   Wordrec();
00162   virtual ~Wordrec();
00163 
00164   void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from,
00165                        BLOB_CHOICE_LIST_VECTOR *to);
00166 
00167   // Returns true if text recorded in choice is the same as truth_text.
00168   bool ChoiceIsCorrect(const UNICHARSET& uni_set,
00169                        const WERD_CHOICE *choice,
00170                        const GenericVector<STRING> &truth_text);
00171 
00172   // Fills word->alt_choices with alternative paths found during
00173   // chopping/segmentation search that are kept in best_choices.
00174   // TODO(antonova): the function currently saves unchar ids, rating and
00175   // certainty information for each alternative choice.
00176   // We might need to add saving blob choices and segmentation state
00177   // associated with each alt choice if needed.
00178   void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
00179 
00180   // Fills character choice lattice in the given BlamerBundle
00181   // using the given ratings matrix and best choice list.
00182   void FillLattice(const MATRIX &ratings, const LIST &best_choices,
00183                    const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
00184 
00185   // Calls fill_lattice_ member function
00186   // (assumes that fill_lattice_ is not NULL).
00187   void CallFillLattice(const MATRIX &ratings, const LIST &best_choices,
00188                        const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
00189     (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
00190   }
00191 
00192   // tface.cpp
00193   void program_editup(const char *textbase,
00194                       bool init_classifier,
00195                       bool init_permute);
00196   BLOB_CHOICE_LIST_VECTOR *cc_recog(WERD_RES *word);
00197   void program_editdown(inT32 elasped_time);
00198   void set_pass1();
00199   void set_pass2();
00200   int end_recog();
00201   BLOB_CHOICE_LIST *call_matcher(const DENORM* denorm, TBLOB* blob);
00202   int dict_word(const WERD_CHOICE &word);
00203   // wordclass.cpp
00204   BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
00205                                   const DENORM& denorm,
00206                                   const char *string,
00207                                   C_COL color,
00208                                   BlamerBundle *blamer_bundle);
00209   BLOB_CHOICE_LIST *fake_classify_blob(UNICHAR_ID class_id,
00210                                        float rating, float certainty);
00211   void update_blob_classifications(TWERD *word,
00212                                    const BLOB_CHOICE_LIST_VECTOR &choices);
00213 
00214   // bestfirst.cpp
00215   BLOB_CHOICE_LIST_VECTOR *evaluate_chunks(CHUNKS_RECORD *chunks_record,
00216                                            SEARCH_STATE search_state,
00217                                            BlamerBundle *blamer_bundle);
00218   void update_ratings(const BLOB_CHOICE_LIST_VECTOR &new_choices,
00219                       const CHUNKS_RECORD *chunks_record,
00220                       const SEARCH_STATE search_state);
00221   inT16 evaluate_state(CHUNKS_RECORD *chunks_record,
00222                        SEARCH_RECORD *the_search,
00223                        DANGERR *fixpt,
00224                        BlamerBundle *blamer_bundle);
00225   SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
00226                             int num_joints,
00227                             BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00228                             WERD_CHOICE *best_choice,
00229                             WERD_CHOICE *raw_choice,
00230                             STATE *state);
00231   void best_first_search(CHUNKS_RECORD *chunks_record,
00232                          BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00233                          WERD_RES *word,
00234                          STATE *state,
00235                          DANGERR *fixpt,
00236                          STATE *best_state);
00237   void delete_search(SEARCH_RECORD *the_search);
00238   void expand_node(FLOAT32 worst_priority,
00239                    CHUNKS_RECORD *chunks_record,
00240                    SEARCH_RECORD *the_search);
00241   void replace_char_widths(CHUNKS_RECORD *chunks_record,
00242                            SEARCH_STATE state);
00243   // Transfers the given state to the word's output fields: rebuild_word,
00244   // best_state, box_word, and returns the corresponding blob choices.
00245   BLOB_CHOICE_LIST_VECTOR *rebuild_current_state(
00246       WERD_RES *word,
00247       STATE *state,
00248       BLOB_CHOICE_LIST_VECTOR *char_choices,
00249       MATRIX *ratings);
00250   // Creates a fake blob choice from the combination of the given fragments.
00251   // unichar is the class to be made from the combination,
00252   // expanded_fragment_lengths[choice_index] is the number of fragments to use.
00253   // old_choices[choice_index] has the classifier output for each fragment.
00254   // choice index initially indexes the last fragment and should be decremented
00255   // expanded_fragment_lengths[choice_index] times to get the earlier fragments.
00256   // Guarantees to return something non-null, or abort!
00257   BLOB_CHOICE* rebuild_fragments(
00258       const char* unichar,
00259       const char* expanded_fragment_lengths,
00260       int choice_index,
00261       BLOB_CHOICE_LIST_VECTOR *old_choices);
00262   // Creates a joined copy of the blobs between x and y (inclusive) and
00263   // insert into the rebuild_word in word.
00264   // Returns a deep copy of the classifier results for the blob.
00265   BLOB_CHOICE_LIST *join_blobs_and_classify(
00266       WERD_RES* word, int x, int y, int choice_index, MATRIX *ratings,
00267       BLOB_CHOICE_LIST_VECTOR *old_choices);
00268   STATE *pop_queue(HEAP *queue);
00269   void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
00270                   FLOAT32 priority, bool debug);
00271 
00272   // segsearch.cpp
00273   // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
00274   // Each entry in the matrix represents the classification choice
00275   // for a chunk, i.e. an entry in row 2, column 1 represents the list
00276   // of ratings for the chunks 1 and 2 classified as a single blob.
00277   // The entries on the diagonal of the matrix are classifier choice lists
00278   // for a single chunk from the maximal segmentation.
00279   //
00280   // The ratings matrix given to SegSearch represents the segmentation
00281   // graph / trellis for the current word. The nodes in the graph are the
00282   // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
00283   // matrix. The children of each node (nodes connected by outgoing links)
00284   // are the entries in the column that is equal to node's row+1. The parents
00285   // (nodes connected by the incoming links) are the entries in the row that
00286   // is equal to the node's column-1. Here is an example ratings matrix:
00287   //
00288   //    0    1    2   3   4
00289   //  -------------------------
00290   // 0| c,(                   |
00291   // 1| d    l,1              |
00292   // 2|           o           |
00293   // 3|              c,(      |
00294   // 4|              g,y  l,1 |
00295   //  -------------------------
00296   //
00297   // In the example above node "o" has children (outgoing connection to nodes)
00298   // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
00299   //
00300   // The objective of the search is to find the least cost path, where the cost
00301   // is determined by the language model components and the properties of the
00302   // cut between the blobs on the path. SegSearch starts by populating the
00303   // matrix with the all the entries that were classified by the chopper and
00304   // finding the initial best path. Based on the classifier ratings, language
00305   // model scores and the properties of each cut, a list of "pain points" is
00306   // constructed - those are the points on the path where the choices do not
00307   // look consistent with the neighboring choices, the cuts look particularly
00308   // problematic, or the certainties of the blobs are low. The most troublesome
00309   // "pain point" is picked from the list and the new entry in the ratings
00310   // matrix corresponding to this "pain point" is filled in. Then the language
00311   // model state is updated to reflect the new classification and the new
00312   // "pain points" are added to the list and the next most troublesome
00313   // "pain point" is determined. This continues until either the word choice
00314   // composed from the best paths in the segmentation graph is "good enough"
00315   // (e.g. above a certain certainty threshold, is an unambiguous dictionary
00316   // word, etc) or there are no more "pain points" to explore.
00317   void SegSearch(CHUNKS_RECORD *chunks_record,
00318                  WERD_CHOICE *best_choice,
00319                  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00320                  WERD_CHOICE *raw_choice,
00321                  STATE *output_best_state,
00322                  BlamerBundle *blamer_bundle);
00323 
00324   // chop.cpp
00325   PRIORITY point_priority(EDGEPT *point);
00326   void add_point_to_list(POINT_GROUP point_list, EDGEPT *point);
00327   int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
00328   int is_little_chunk(EDGEPT *point1, EDGEPT *point2);
00329   int is_small_area(EDGEPT *point1, EDGEPT *point2);
00330   EDGEPT *pick_close_point(EDGEPT *critical_point,
00331                            EDGEPT *vertical_point,
00332                            int *best_dist);
00333   void prioritize_points(TESSLINE *outline, POINT_GROUP points);
00334   void new_min_point(EDGEPT *local_min, POINT_GROUP points);
00335   void new_max_point(EDGEPT *local_max, POINT_GROUP points);
00336   void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
00337                                  EDGEPT** best_point,
00338                                  EDGEPT_CLIST *new_points);
00339 
00340   // chopper.cpp
00341   SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
00342                           bool italic_blob, SEAMS seam_list);
00343   SEAM *chop_numbered_blob(TWERD *word, inT32 blob_number,
00344                            bool italic_blob, SEAMS seam_list);
00345   SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes,
00346                               WERD_RES *word_res, inT32 *blob_number,
00347                               bool italic_blob, SEAMS seam_list);
00348   bool improve_one_blob(WERD_RES *word_res,
00349                         BLOB_CHOICE_LIST_VECTOR *char_choices,
00350                         inT32 *blob_number,
00351                         SEAMS *seam_list,
00352                         DANGERR *fixpt,
00353                         bool split_next_to_fragment,
00354                         BlamerBundle *blamer_bundle);
00355   void modify_blob_choice(BLOB_CHOICE_LIST *answer,
00356                           int chop_index);
00357   bool chop_one_blob(TWERD *word,
00358                      BLOB_CHOICE_LIST_VECTOR *char_choices,
00359                      inT32 *blob_number,
00360                      SEAMS *seam_list,
00361                      int *right_chop_index);
00362   bool chop_one_blob2(const GenericVector<TBOX>& boxes,
00363                       WERD_RES *word_res, SEAMS *seam_list);
00364   BLOB_CHOICE_LIST_VECTOR *chop_word_main(WERD_RES *word);
00365   void improve_by_chopping(WERD_RES *word,
00366                            BLOB_CHOICE_LIST_VECTOR *char_choices,
00367                            STATE *best_state,
00368                            BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00369                            DANGERR *fixpt,
00370                            bool *updated_best_choice);
00371   MATRIX *word_associator(bool only_create_ratings_matrtix,
00372                           WERD_RES *word,
00373                           STATE *state,
00374                           BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00375                           DANGERR *fixpt,
00376                           STATE *best_state);
00377   inT16 select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00378                              float rating_ceiling,
00379                              bool split_next_to_fragment);
00380   inT16 select_blob_to_split_from_fixpt(DANGERR *fixpt);
00381   void set_chopper_blame(WERD_RES *word);
00382 
00383   // findseam.cpp
00384   void junk_worst_seam(SEAM_QUEUE seams, SEAM *new_seam, float new_priority);
00385   void choose_best_seam(SEAM_QUEUE seam_queue,
00386                         SEAM_PILE *seam_pile,
00387                         SPLIT *split,
00388                         PRIORITY priority,
00389                         SEAM **seam_result,
00390                         TBLOB *blob);
00391   void combine_seam(SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam);
00392   inT16 constrained_split(SPLIT *split, TBLOB *blob);
00393   void delete_seam_pile(SEAM_PILE seam_pile);
00394   SEAM *pick_good_seam(TBLOB *blob);
00395   PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax);
00396   void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS],
00397                         inT16 num_points,
00398                         SEAM_QUEUE seam_queue,
00399                         SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00400   void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS],
00401                            inT16 num_points,
00402                            EDGEPT_CLIST *new_points,
00403                            SEAM_QUEUE seam_queue,
00404                            SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00405 
00406   // gradechop.cpp
00407   PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax);
00408   PRIORITY grade_center_of_blob(register BOUNDS_RECT rect);
00409   PRIORITY grade_overlap(register BOUNDS_RECT rect);
00410   PRIORITY grade_split_length(register SPLIT *split);
00411   PRIORITY grade_sharpness(register SPLIT *split);
00412   PRIORITY grade_width_change(register BOUNDS_RECT rect);
00413   void set_outline_bounds(register EDGEPT *point1,
00414                           register EDGEPT *point2,
00415                           BOUNDS_RECT rect);
00416 
00417   // outlines.cpp
00418   int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline);
00419   int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1);
00420   int is_same_edgept(EDGEPT *p1, EDGEPT *p2);
00421   bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1,
00422                   EDGEPT **near_pt);
00423   void reverse_outline(EDGEPT *outline);
00424 
00425   // pieces.cpp
00426   virtual BLOB_CHOICE_LIST *classify_piece(TBLOB *pieces,
00427                                            const DENORM& denorm,
00428                                            SEAMS seams,
00429                                            inT16 start,
00430                                            inT16 end,
00431                                            BlamerBundle *blamer_bundle);
00432   // Try to merge fragments in the ratings matrix and put the result in
00433   // the corresponding row and column
00434   void merge_fragments(MATRIX *ratings,
00435                        inT16 num_blobs);
00436   // Recursively go through the ratings matrix to find lists of fragments
00437   // to be merged in the function merge_and_put_fragment_lists.
00438   // current_frag is the postion of the piece we are looking for.
00439   // current_row is the row in the rating matrix we are currently at.
00440   // start is the row we started initially, so that we can know where
00441   // to append the results to the matrix. num_frag_parts is the total
00442   // number of pieces we are looking for and num_blobs is the size of the
00443   // ratings matrix.
00444   void get_fragment_lists(inT16 current_frag,
00445                           inT16 current_row,
00446                           inT16 start,
00447                           inT16 num_frag_parts,
00448                           inT16 num_blobs,
00449                           MATRIX *ratings,
00450                           BLOB_CHOICE_LIST *choice_lists);
00451   // Merge the fragment lists in choice_lists and append it to the
00452   // ratings matrix
00453   void merge_and_put_fragment_lists(inT16 row,
00454                                     inT16 column,
00455                                     inT16 num_frag_parts,
00456                                     BLOB_CHOICE_LIST *choice_lists,
00457                                     MATRIX *ratings);
00458   // Filter the fragment list so that the filtered_choices only contain
00459   // fragments that are in the correct position. choices is the list
00460   // that we are going to filter. fragment_pos is the position in the
00461   // fragment that we are looking for and num_frag_parts is the the
00462   // total number of pieces. The result will be appended to
00463   // filtered_choices.
00464   void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
00465                                    int fragment_pos,
00466                                    int num_frag_parts,
00467                                    BLOB_CHOICE_LIST *filtered_choices);
00468   BLOB_CHOICE_LIST *get_piece_rating(MATRIX *ratings,
00469                                      TBLOB *blobs,
00470                                      const DENORM& denorm,
00471                                      SEAMS seams,
00472                                      inT16 start,
00473                                      inT16 end,
00474                                      BlamerBundle *blamer_bundle);
00475   // returns an array of bounding boxes for the given list of blobs.
00476   TBOX *record_blob_bounds(TBLOB *blobs);
00477   MATRIX *record_piece_ratings(TBLOB *blobs);
00478 
00479   // heuristic.cpp
00480   WIDTH_RECORD* state_char_widths(WIDTH_RECORD *chunk_widths,
00481                                   STATE *state,
00482                                   int num_joints);
00483   FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height);
00484   FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height);
00485   FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
00486                            SEARCH_RECORD *the_search);
00487   FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
00488                          STATE *state,
00489                          int num_joints);
00490   FLOAT32 seamcut_priority(SEAMS seams,
00491                            STATE *state,
00492                            int num_joints);
00493   FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
00494                           STATE *state,
00495                           int num_joints);
00496 
00497   // Member variables.
00498 
00499   LanguageModel *language_model_;
00500   PRIORITY pass2_ok_split;
00501   int pass2_seg_states;
00502   int num_joints;
00503   int num_pushed;
00504   int num_popped;
00505   BlobMatchTable blob_match_table;
00506   EVALUATION_ARRAY last_segmentation;
00507   // Stores the best choice for the previous word in the paragraph.
00508   // This variable is modified by PAGE_RES_IT when iterating over
00509   // words to OCR on the page.
00510   WERD_CHOICE *prev_word_best_choice_;
00511   // Sums of blame reasons computed by the blamer.
00512   GenericVector<int> blame_reasons_;
00513   // Function used to fill char choice lattices.
00514   void (Wordrec::*fill_lattice_)(const MATRIX &ratings,
00515                                   const LIST &best_choices,
00516                                   const UNICHARSET &unicharset,
00517                                   BlamerBundle *blamer_bundle);
00518 
00519  protected:
00520   inline bool SegSearchDone(int num_futile_classifications) {
00521     return (language_model_->AcceptableChoiceFound() ||
00522             num_futile_classifications >=
00523             segsearch_max_futile_classifications);
00524   }
00525 
00526   // Updates the language model state recorded for the child entries specified
00527   // in pending[starting_col]. Enqueues the children of the updated entries
00528   // into pending and proceeds to update (and remove from pending) all the
00529   // remaining entries in pending[col] (col >= starting_col). Upon termination
00530   // of this function all the pending[col] lists will be empty.
00531   //
00532   // The arguments:
00533   //
00534   // starting_col: index of the column in chunks_record->ratings from
00535   // which the update should be started
00536   //
00537   // pending: list of entries listing chunks_record->ratings entries
00538   // that should be updated
00539   //
00540   // pain_points: priority heap listing the pain points generated by
00541   // the language model
00542   //
00543   // temp_pain_points: temporary storage for tentative pain points generated
00544   // by the language model after a single call to LanguageModel::UpdateState()
00545   // (the argument is passed in rather than created before each
00546   // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
00547   //
00548   // best_choice_bundle: a collection of variables that should be updated
00549   // if a new best choice is found
00550   //
00551   void UpdateSegSearchNodes(int starting_col,
00552                             SEG_SEARCH_PENDING_LIST *pending[],
00553                             BestPathByColumn *best_path_by_column[],
00554                             CHUNKS_RECORD *chunks_record,
00555                             HEAP *pain_points,
00556                             BestChoiceBundle *best_choice_bundle,
00557                             BlamerBundle *blamer_bundle);
00558 
00559   // Process the given pain point: classify the corresponding blob, enqueue
00560   // new pain points to join the newly classified blob with its neighbors.
00561   void ProcessSegSearchPainPoint(float pain_point_priority,
00562                                  const MATRIX_COORD &pain_point,
00563                                  const WERD_CHOICE *best_choice,
00564                                  SEG_SEARCH_PENDING_LIST *pending[],
00565                                  CHUNKS_RECORD *chunks_record,
00566                                  HEAP *pain_points,
00567                                  BlamerBundle *blamer_bundle);
00568 
00569   // Add pain points for classifying blobs on the correct segmentation path
00570   // (so that we can evaluate correct segmentation path and discover the reason
00571   // for incorrect result).
00572   void InitBlamerForSegSearch(const WERD_CHOICE *best_choice,
00573                               CHUNKS_RECORD *chunks_record,
00574                               HEAP *pain_points,
00575                               BlamerBundle *blamer_bundle,
00576                               STRING *blamer_debug);
00577 
00578   // Analyze the contents of BlamerBundle and set incorrect result reason.
00579   void FinishBlamerForSegSearch(const WERD_CHOICE *best_choice,
00580                                 BlamerBundle *blamer_bundle,
00581                                 STRING *blamer_debug);
00582 
00583 };
00584 
00585 
00586 }  // namespace tesseract
00587 
00588 #endif  // TESSERACT_WORDREC_WORDREC_H__