Tesseract 3.02 — wordrec/language_model.h
00001 00002 // File: language_model.h 00003 // Description: Functions that utilize the knowledge about the properties, 00004 // structure and statistics of the language to help recognition. 00005 // Author: Daria Antonova 00006 // Created: Mon Nov 11 11:26:43 PST 2009 00007 // 00008 // (C) Copyright 2009, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00022 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00023 00024 #include "associate.h" 00025 #include "dawg.h" 00026 #include "dict.h" 00027 #include "fontinfo.h" 00028 #include "intproto.h" 00029 #include "matrix.h" 00030 #include "oldheap.h" 00031 #include "params.h" 00032 #include "pageres.h" 00033 00034 namespace tesseract { 00035 00036 // Used for expressing various language model flags. 00037 typedef unsigned char LanguageModelFlagsType; 00038 00039 // Struct for keeping track of the consistency of the path. 00040 struct LanguageModelConsistencyInfo { 00041 LanguageModelConsistencyInfo() 00042 : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false), 00043 num_non_first_upper(0), num_lower(0), 00044 script_id(0), inconsistent_script(false), 00045 num_alphas(0), num_digits(0), num_other(0), 00046 num_inconsistent_spaces(0), inconsistent_font(false) {} 00047 inline int NumInconsistentPunc() const { 00048 return invalid_punc ? 
num_punc : 0; 00049 } 00050 inline int NumInconsistentCase() const { 00051 return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper; 00052 } 00053 inline int NumInconsistentChartype() const { 00054 return (NumInconsistentPunc() + num_other + 00055 ((num_alphas > num_digits) ? num_digits : num_alphas)); 00056 } 00057 inline bool Consistent() const { 00058 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 && 00059 NumInconsistentChartype() == 0 && !inconsistent_script); 00060 } 00061 inline int NumInconsistentSpaces() const { 00062 return num_inconsistent_spaces; 00063 } 00064 00065 EDGE_REF punc_ref; 00066 int num_punc; 00067 bool invalid_punc; 00068 int num_non_first_upper; 00069 int num_lower; 00070 int script_id; 00071 bool inconsistent_script; 00072 int num_alphas; 00073 int num_digits; 00074 int num_other; 00075 int num_inconsistent_spaces; 00076 bool inconsistent_font; 00077 }; 00078 00079 00080 // The following structs are used for storing the state of the language model 00081 // in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs 00082 // and the links are the replationships between the underlying blobs (see 00083 // segsearch.h for a more detailed description). 00084 // Each of the BLOB_CHOICEs contains LanguageModelState struct, which has 00085 // a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi 00086 // search leading up to and including this BLOB_CHOICE. 00087 // Each ViterbiStateEntry contains information from various components of the 00088 // language model: dawgs in which the path is found, character ngram model 00089 // probability of the path, script/chartype/font consistency info, state for 00090 // language-specific heuristics (e.g. hyphenated and compund words, lower/upper 00091 // case preferences, etc). 
00092 // Each ViterbiStateEntry also contains the parent pointer, so that the path 00093 // that it represents (WERD_CHOICE) can be constructed by following these 00094 // parent pointers. 00095 00096 // Struct for storing additional information used by Dawg language model 00097 // component. It stores the set of active dawgs in which the sequence of 00098 // letters on a path can be found and the constraints that have to be 00099 // satisfied at the end of the word (e.g. beginning/ending punctuation). 00100 struct LanguageModelDawgInfo { 00101 LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c, 00102 PermuterType pt) : permuter(pt) { 00103 active_dawgs = new DawgInfoVector(*a); 00104 constraints = new DawgInfoVector(*c); 00105 } 00106 ~LanguageModelDawgInfo() { 00107 delete active_dawgs; 00108 delete constraints; 00109 } 00110 DawgInfoVector *active_dawgs; 00111 DawgInfoVector *constraints; 00112 PermuterType permuter; 00113 }; 00114 00115 // Struct for storing additional information used by Ngram language model 00116 // component. 00117 struct LanguageModelNgramInfo { 00118 LanguageModelNgramInfo(const char *c, int l, bool p, float np, float nc) 00119 : context(c), context_unichar_step_len(l), pruned(p), ngram_prob(np), 00120 ngram_cost(nc) {} 00121 STRING context; // context string 00122 // Length of the context measured by advancing using UNICHAR::utf8_step() 00123 // (should be at most the order of the character ngram model used). 00124 int context_unichar_step_len; 00125 // The paths with pruned set are pruned out from the perspective of the 00126 // character ngram model. They are explored further because they represent 00127 // a dictionary match or a top choice. Thus ngram_info is still computed 00128 // for them in order to calculate the combined cost. 
00129 bool pruned; 00130 // -ln(P_ngram_model(path)) 00131 float ngram_prob; 00132 // -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ] 00133 float ngram_cost; 00134 }; 00135 00136 // Struct for storing the information about a path in the segmentation graph 00137 // explored by Viterbi search. 00138 struct ViterbiStateEntry : public ELIST_LINK { 00139 ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe, 00140 BLOB_CHOICE *b, float c, float ol, 00141 const LanguageModelConsistencyInfo &ci, 00142 const AssociateStats &as, 00143 LanguageModelFlagsType tcf, 00144 LanguageModelDawgInfo *d, LanguageModelNgramInfo *n) 00145 : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()), 00146 min_certainty(b->certainty()), adapted(b->adapted()), length(1), 00147 outline_length(ol), consistency_info(ci), associate_stats(as), 00148 top_choice_flags(tcf), dawg_info(d), ngram_info(n), updated(true) { 00149 if (pe != NULL) { 00150 ratings_sum += pe->ratings_sum; 00151 if (pe->min_certainty < min_certainty) { 00152 min_certainty = pe->min_certainty; 00153 } 00154 adapted += pe->adapted; 00155 length += pe->length; 00156 outline_length += pe->outline_length; 00157 } 00158 } 00159 ~ViterbiStateEntry() { 00160 delete dawg_info; 00161 delete ngram_info; 00162 } 00163 // Comparator function for sorting ViterbiStateEntry_LISTs in 00164 // non-increasing order of costs. 00165 static int Compare(const void *e1, const void *e2) { 00166 const ViterbiStateEntry *ve1 = 00167 *reinterpret_cast<const ViterbiStateEntry * const *>(e1); 00168 const ViterbiStateEntry *ve2 = 00169 *reinterpret_cast<const ViterbiStateEntry * const *>(e2); 00170 return (ve1->cost < ve2->cost) ? 
-1 : 1; 00171 } 00172 inline bool Consistent() const { 00173 if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) { 00174 return true; 00175 } 00176 return consistency_info.Consistent(); 00177 } 00178 00179 // The cost is an adjusted ratings sum, that is adjusted by all the language 00180 // model components that use Viterbi search. 00181 float cost; 00182 00183 // Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this). 00184 BLOB_CHOICE *parent_b; 00185 ViterbiStateEntry *parent_vse; 00186 00187 // Various information about the characters on the path represented 00188 // by this ViterbiStateEntry. 00189 float ratings_sum; // sum of ratings of character on the path 00190 float min_certainty; // minimum certainty on the path 00191 int adapted; // number of BLOB_CHOICES from adapted templates 00192 int length; // number of characters on the path 00193 float outline_length; // length of the outline so far 00194 LanguageModelConsistencyInfo consistency_info; // path consistency info 00195 AssociateStats associate_stats; // character widths/gaps/seams 00196 00197 // Flags for marking the entry as a top choice path with 00198 // the smallest rating or lower/upper case letters). 00199 LanguageModelFlagsType top_choice_flags; 00200 00201 // Extra information maintained by Dawg laguage model component 00202 // (owned by ViterbiStateEntry). 00203 LanguageModelDawgInfo *dawg_info; 00204 00205 // Extra information maintained by Ngram laguage model component 00206 // (owned by ViterbiStateEntry). 00207 LanguageModelNgramInfo *ngram_info; 00208 00209 bool updated; // set to true if the entry has just been created/updated 00210 }; 00211 00212 ELISTIZEH(ViterbiStateEntry); 00213 00214 // Struct to store information maintained by various language model components. 
00215 struct LanguageModelState { 00216 LanguageModelState(int col, int row) : contained_in_col(col), 00217 contained_in_row(row), viterbi_state_entries_prunable_length(0), 00218 viterbi_state_entries_length(0), 00219 viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {} 00220 ~LanguageModelState() {} 00221 00222 // Ratings matrix cell that holds this LanguageModelState 00223 // (needed to construct best STATE for rebuild_current_state() 00224 // and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()). 00225 int contained_in_col; 00226 int contained_in_row; 00227 00228 // Storage for the Viterbi state. 00229 ViterbiStateEntry_LIST viterbi_state_entries; 00230 // Number and max cost of prunable paths in viterbi_state_entries. 00231 int viterbi_state_entries_prunable_length; 00232 // Total number of entries in viterbi_state_entries. 00233 int viterbi_state_entries_length; 00234 float viterbi_state_entries_prunable_max_cost; 00235 00236 // TODO(daria): add font consistency checking. 00237 }; 00238 00239 // Bundle together all the things pertaining to the best choice/state. 00240 struct BestChoiceBundle { 00241 BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc, 00242 BLOB_CHOICE_LIST_VECTOR *bcc) 00243 : best_state(s), best_choice(bc), raw_choice(rc), 00244 best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {} 00245 00246 STATE *best_state; 00247 WERD_CHOICE *best_choice; 00248 WERD_CHOICE *raw_choice; 00249 BLOB_CHOICE_LIST_VECTOR *best_char_choices; 00250 bool updated; 00251 DANGERR fixpt; 00252 ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE 00253 BLOB_CHOICE *best_b; // at the end of the best choice path 00254 }; 00255 00256 struct BestPathByColumn { 00257 float avg_cost; 00258 ViterbiStateEntry *best_vse; 00259 BLOB_CHOICE *best_b; 00260 }; 00261 00262 // This class that contains the data structures and functions necessary 00263 // to represent and use the knowledge about the language. 
00264 class LanguageModel { 00265 public: 00266 // Adjustments to pain point priority. 00267 static const float kInitialPainPointPriorityAdjustment; 00268 static const float kDefaultPainPointPriorityAdjustment; 00269 static const float kBestChoicePainPointPriorityAdjustment; 00270 static const float kCriticalPainPointPriorityAdjustment; 00271 00272 // Denominator for normalizing per-letter ngram cost when deriving 00273 // penalty adjustments. 00274 static const float kMaxAvgNgramCost; 00275 // Minimum word length for fixed length dawgs. 00276 // TODO(daria): check in the new chi/jpn.traineddata without the 00277 // fixed length dawg of length 1 and delete this variable. 00278 static const int kMinFixedLengthDawgLength; 00279 // If there is a significant drop in character ngram probability or a 00280 // dangerous ambiguity make the thresholds on what blob combinations 00281 // can be classified looser. 00282 static const float kLooseMaxCharWhRatio; 00283 00284 // Masks for interpreting which language model components 00285 // were changed by the call to UpdateState(). 
00286 static const LanguageModelFlagsType kSmallestRatingFlag = 0x1; 00287 static const LanguageModelFlagsType kLowerCaseFlag = 0x2; 00288 static const LanguageModelFlagsType kUpperCaseFlag = 0x4; 00289 static const LanguageModelFlagsType kConsistentFlag = 0x8; 00290 static const LanguageModelFlagsType kDawgFlag = 0x10; 00291 static const LanguageModelFlagsType kNgramFlag = 0x20; 00292 static const LanguageModelFlagsType kJustClassifiedFlag = 0x80; 00293 static const LanguageModelFlagsType kAllChangedFlag = 0xff; 00294 00295 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict); 00296 ~LanguageModel(); 00297 00298 // Updates data structures that are used for the duration of the segmentation 00299 // search on the current word; 00300 void InitForWord(const WERD_CHOICE *prev_word, 00301 bool fixed_pitch, float best_choice_cert, 00302 float max_char_wh_ratio, float rating_cert_scale, 00303 HEAP *pain_points, CHUNKS_RECORD *chunks_record, 00304 BlamerBundle *blamer_bundle, bool debug_blamer); 00305 // Resets all the "updated" flags used by the Viterbi search that were 00306 // "registered" during the update of the ratings matrix. 00307 void CleanUp(); 00308 // Deletes and sets to NULL language model states of each of the 00309 // BLOB_CHOICEs in the given BLOB_CHOICE_LIST. 00310 void DeleteState(BLOB_CHOICE_LIST *choices); 00311 00312 // Updates language model state of the given BLOB_CHOICE_LIST (from 00313 // the ratings matrix) a its parent. Updates pain_points if new 00314 // problematic points are found in the segmentation graph. 00315 // 00316 // At most language_model_viterbi_list_size are kept in each 00317 // LanguageModelState.viterbi_state_entries list. 00318 // At most language_model_viterbi_list_max_num_prunable of those are prunable 00319 // (non-dictionary) paths. 00320 // The entries that represent dictionary word paths are kept at the front 00321 // of the list. 
00322 // The list ordered by cost that is computed collectively by several 00323 // language model components (currently dawg and ngram components). 00324 // 00325 // best_path_by_column records the lowest cost path found so far for each 00326 // column of the chunks_record->ratings matrix over all the rows. This 00327 // array is updated if a lower cost ViterbiStateEntry is created in curr_col. 00328 LanguageModelFlagsType UpdateState( 00329 LanguageModelFlagsType changed, 00330 int curr_col, int curr_row, 00331 BLOB_CHOICE_LIST *curr_list, 00332 BLOB_CHOICE_LIST *parent_list, 00333 HEAP *pain_points, 00334 BestPathByColumn *best_path_by_column[], 00335 CHUNKS_RECORD *chunks_record, 00336 BestChoiceBundle *best_choice_bundle, 00337 BlamerBundle *blamer_bundle); 00338 00339 // Generates pain points from the problematic top choice paths when the 00340 // segmentation search is guided by the character ngram model. 00341 // It is necessary to consider problematic the top choice paths instead of 00342 // the problematic lowest cost paths because the character ngram model 00343 // might assign a very high cost to very improbably paths. For example, 00344 // "liot" might have a much lower cost than "llot", and the character ngram 00345 // model might detect a dip in probability for p(t|lio) at the end of the 00346 // word, but not at the beginning (p(i|l) would be ok). However, looking at 00347 // the dips in character ngram probability of the top choices we would be 00348 // able to stop the problematic points (p(l| l) would be low). 00349 void GenerateNgramModelPainPointsFromColumn(int col, int row, 00350 HEAP *pain_points, 00351 CHUNKS_RECORD *chunks_record); 00352 00353 // Generates pain points from the problematic lowest cost paths that are 00354 // "promising" (i.e. 
would have the cost lower than the one recorded in 00355 // best_path_by_column if the problematic ending of the path is removed 00356 // and after being combined with another blob the certainty of the last 00357 // blob is improved). 00358 void GenerateProblematicPathPainPointsFromColumn( 00359 int col, int row, float best_choice_cert, 00360 HEAP *pain_points, BestPathByColumn *best_path_by_column[], 00361 CHUNKS_RECORD *chunks_record); 00362 00363 // This function can be called after processing column col of the 00364 // chunks_record->ratings matrix in order to find the promising paths 00365 // that were terminated or made inconsistent by the character choices 00366 // in column col. If such paths are identified, this function generates 00367 // pain points to combine the problematic cells of the matrix. 00368 void GeneratePainPointsFromColumn( 00369 int col, 00370 const GenericVector<int> &non_empty_rows, 00371 float best_choice_cert, 00372 HEAP *pain_points, 00373 BestPathByColumn *best_path_by_column[], 00374 CHUNKS_RECORD *chunks_record); 00375 00376 // Generates a pain point for each problematic point on the best choice 00377 // path. Such problematic points could be a termination of a dicionary 00378 // word, dip in ngram probability, invalid punctuation, inconsistent 00379 // case/chartype/script or punctuation in the middle of a word. 00380 void GeneratePainPointsFromBestChoice( 00381 HEAP *pain_points, 00382 CHUNKS_RECORD *chunks_record, 00383 BestChoiceBundle *best_choice_bundle); 00384 00385 // Adds a pain point to the given pain_points queue that will cause 00386 // the entry at chunks_record->ratings(col, row) to be classified. 00387 // The priority of the pain point is set to be: 00388 // 00389 // priority_adjustment * sqrt(avg_parent_cost) 00390 // ---------------------------------------------------- 00391 // sqrt(dict_parent_path_length) * |worst_piece_cert| 00392 // 00393 // The priority is further lowered if fragmented is true. 
00394 // Reurns true if a new pain point was added to pain_points. 00395 bool GeneratePainPoint(int col, int row, bool ok_to_extend, 00396 float priority_adjustment, 00397 float worst_piece_cert, 00398 bool fragmented, 00399 float best_choice_cert, 00400 float max_char_wh_ratio, 00401 BLOB_CHOICE *parent_b, 00402 ViterbiStateEntry *parent_vse, 00403 CHUNKS_RECORD *chunks_record, 00404 HEAP *pain_points); 00405 00406 // Returns true if an acceptable best choice was discovered. 00407 inline bool AcceptableChoiceFound() { return acceptable_choice_found_; } 00408 00409 // Fills cert with the worst certainty of the top non-fragmented choice 00410 // of the left and right neighbor of the given col,row. 00411 // Sets fragmented if any of the neighbors have a fragmented character 00412 // as the top choice. 00413 inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, 00414 float *cert, bool *fragmented) { 00415 *cert = 0.0f; 00416 *fragmented = false; 00417 if (row > 0) { 00418 GetPieceCertainty(ratings->get(col, row-1), cert, fragmented); 00419 } 00420 if (col+1 < ratings->dimension()) { 00421 GetPieceCertainty(ratings->get(col+1, row), cert, fragmented); 00422 } 00423 ASSERT_HOST(*cert < 0.0f); 00424 } 00425 00426 // Returns outline length of the given blob is computed as: 00427 // rating_cert_scale * rating / certainty 00428 // Since from Wordrec::SegSearch() in segsearch.cpp 00429 // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale 00430 // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp 00431 // Rating = Certainty = next.rating 00432 // Rating *= rating_scale * Results->BlobLength 00433 // Certainty *= -(getDict().certainty_scale) 00434 inline float ComputeOutlineLength(BLOB_CHOICE *b) { 00435 return rating_cert_scale_ * b->rating() / b->certainty(); 00436 } 00437 00438 protected: 00439 00440 inline float CertaintyScore(float cert) { 00441 if (language_model_use_sigmoidal_certainty) { 00442 // cert is assumed to be 
between 0 and -dict_->certainty_scale. 00443 // If you enable language_model_use_sigmoidal_certainty, you 00444 // need to adjust language_model_ngram_nonmatch_score as well. 00445 cert = -cert / dict_->certainty_scale; 00446 return 1.0f / (1.0f + exp(10.0f * cert)); 00447 } else { 00448 return (-1.0f / cert); 00449 } 00450 } 00451 00452 inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension, 00453 UNICHAR_ID unichar_id) { 00454 return (!dict_->getUnicharset().get_isalpha(unichar_id) && 00455 !dict_->getUnicharset().get_isdigit(unichar_id) && 00456 col > 0 && row+1 < dimension); 00457 } 00458 00459 inline bool IsFragment(BLOB_CHOICE *b) { 00460 return dict_->getUnicharset().get_fragment(b->unichar_id()); 00461 } 00462 00463 inline bool IsHan(int script_id) { 00464 return ((dict_->getUnicharset().han_sid() != 00465 dict_->getUnicharset().null_sid()) && 00466 (script_id == dict_->getUnicharset().han_sid())); 00467 } 00468 00469 // Finds the first non-fragmented character in the given BLOB_CHOICE_LIST 00470 // and updates cert if its certainty is less than the one recorded in cert. 00471 // Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment. 00472 inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist, 00473 float *cert, bool *fragmented) { 00474 if (blist == NOT_CLASSIFIED || blist->empty()) return; 00475 BLOB_CHOICE_IT bit(blist); 00476 while (!bit.at_last() && IsFragment(bit.data())) { 00477 *fragmented = true; 00478 bit.forward(); // skip fragments 00479 } 00480 // Each classification must have at least one non-fragmented choice. 
00481 ASSERT_HOST(!IsFragment(bit.data())); 00482 if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty(); 00483 } 00484 00485 inline float ComputeAdjustment(int num_problems, float penalty) { 00486 if (num_problems == 0) return 0.0f; 00487 if (num_problems == 1) return penalty; 00488 return (penalty + (language_model_penalty_increment * 00489 static_cast<float>(num_problems-1))); 00490 } 00491 00492 // Computes the adjustment to the ratings sum based on the given 00493 // consistency_info. The paths with invalid punctuation, inconsistent 00494 // case and character type are penalized proportionally to the number 00495 // of inconsistencies on the path. 00496 inline float ComputeConsistencyAdjustment( 00497 const LanguageModelDawgInfo *dawg_info, 00498 const LanguageModelConsistencyInfo &consistency_info) { 00499 if (dawg_info != NULL) { 00500 return ComputeAdjustment(consistency_info.NumInconsistentCase(), 00501 language_model_penalty_case); 00502 } 00503 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), 00504 language_model_penalty_punc) + 00505 ComputeAdjustment(consistency_info.NumInconsistentCase(), 00506 language_model_penalty_case) + 00507 ComputeAdjustment(consistency_info.NumInconsistentChartype(), 00508 language_model_penalty_chartype) + 00509 ComputeAdjustment(consistency_info.NumInconsistentSpaces(), 00510 language_model_penalty_spacing) + 00511 (consistency_info.inconsistent_script ? 00512 language_model_penalty_script : 0.0f) + 00513 (consistency_info.inconsistent_font ? 00514 language_model_penalty_font : 0.0f)); 00515 } 00516 00517 // Returns an adjusted ratings sum that includes inconsistency penalties. 
00518 inline float ComputeConsistencyAdjustedRatingsSum( 00519 float ratings_sum, 00520 const LanguageModelDawgInfo *dawg_info, 00521 const LanguageModelConsistencyInfo &consistency_info) { 00522 return (ratings_sum * (1.0f + ComputeConsistencyAdjustment( 00523 dawg_info, consistency_info))); 00524 } 00525 00526 // Returns an adjusted ratings sum that includes inconsistency penalties, 00527 // penalties for non-dictionary paths and paths with dips in ngram 00528 // probability. 00529 float ComputeAdjustedPathCost( 00530 float ratings_sum, int length, float dawg_score, 00531 const LanguageModelDawgInfo *dawg_info, 00532 const LanguageModelNgramInfo *ngram_info, 00533 const LanguageModelConsistencyInfo &consistency_info, 00534 const AssociateStats &associate_stats, 00535 ViterbiStateEntry *parent_vse); 00536 00537 // Returns true if the given ViterbiStateEntry represents a problematic 00538 // path. A path is considered problematic if the last unichar makes it 00539 // inconsistent, introduces a dip in ngram probability or transforms a 00540 // dictionary path into a non-dictionary one. 00541 bool ProblematicPath(const ViterbiStateEntry &vse, 00542 UNICHAR_ID unichar_id, bool word_end); 00543 00544 // Finds the first lower and upper case character in curr_list. 00545 // If none found, chooses the first character in the list. 00546 void GetTopChoiceLowerUpper(LanguageModelFlagsType changed, 00547 BLOB_CHOICE_LIST *curr_list, 00548 BLOB_CHOICE **first_lower, 00549 BLOB_CHOICE **first_upper); 00550 00551 // Helper function that computes the cost of the path composed of the 00552 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE. 00553 // Adds a new ViterbiStateEntry to the list of viterbi entries 00554 // in the given BLOB_CHOICE if the new path looks good enough. 00555 // Returns LanguageModelFlagsType that indicates which language 00556 // model components were involved in creating the new entry. 
00557 LanguageModelFlagsType AddViterbiStateEntry( 00558 LanguageModelFlagsType top_choice_flags, 00559 float denom, 00560 bool word_end, 00561 int curr_col, int curr_row, 00562 BLOB_CHOICE *b, 00563 BLOB_CHOICE *parent_b, 00564 ViterbiStateEntry *parent_vse, 00565 HEAP *pain_points, 00566 BestPathByColumn *best_path_by_column[], 00567 CHUNKS_RECORD *chunks_record, 00568 BestChoiceBundle *best_choice_bundle, 00569 BlamerBundle *blamer_bundle); 00570 00571 // Pretty print information in the given ViterbiStateEntry. 00572 void PrintViterbiStateEntry(const char *msg, 00573 ViterbiStateEntry *vse, 00574 BLOB_CHOICE *b, 00575 CHUNKS_RECORD *chunks_record); 00576 00577 // Determines whether a potential entry is a true top choice and 00578 // updates changed accordingly. 00579 // 00580 // Note: The function assumes that b, top_choice_flags and changed 00581 // are not NULL. 00582 void GenerateTopChoiceInfo( 00583 float ratings_sum, 00584 const LanguageModelDawgInfo *dawg_info, 00585 const LanguageModelConsistencyInfo &consistency_info, 00586 const ViterbiStateEntry *parent_vse, 00587 BLOB_CHOICE *b, 00588 LanguageModelFlagsType *top_choice_flags, 00589 LanguageModelFlagsType *changed); 00590 00591 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and 00592 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo 00593 // with updated active dawgs, constraints and permuter. 00594 // 00595 // Note: the caller is responsible for deleting the returned pointer. 00596 LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id, 00597 int curr_col, int curr_row, 00598 const BLOB_CHOICE &b, 00599 const ViterbiStateEntry *parent_vse, 00600 LanguageModelFlagsType *changed); 00601 00602 // Computes p(unichar | parent context) and records it in ngram_cost. 00603 // If b.unichar_id() is an unlikely continuation of the parent context 00604 // sets found_small_prob to true and returns NULL. 
00605 // Otherwise creates a new LanguageModelNgramInfo entry containing the 00606 // updated context (that includes b.unichar_id() at the end) and returns it. 00607 // 00608 // Note: the caller is responsible for deleting the returned pointer. 00609 LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, 00610 float certainty, float denom, 00611 int curr_col, int curr_row, 00612 const ViterbiStateEntry *parent_vse, 00613 BLOB_CHOICE *parent_b, 00614 LanguageModelFlagsType *changed); 00615 00616 // Computes -(log(prob(classifier)) + log(prob(ngram model))) 00617 // for the given unichar in the given context. If there are multiple 00618 // unichars at one position - takes the average of their probabilities. 00619 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters, 00620 // since probability_in_context() can only handle one at a time (while 00621 // unicharset might contain ngrams and glyphs composed from multiple UTF8 00622 // characters). 00623 float ComputeNgramCost(const char *unichar, float certainty, float denom, 00624 const char *context, int *unichar_step_len, 00625 bool *found_small_prob, float *ngram_prob); 00626 00627 // Computes the normalization factors for the classifier confidences 00628 // (used by ComputeNgramCost()). 00629 float ComputeDenom(BLOB_CHOICE_LIST *curr_list); 00630 00631 // Fills the given consistenty_info based on parent_vse.consistency_info 00632 // and on the consistency of the given unichar_id with parent_vse. 00633 void FillConsistencyInfo( 00634 int curr_col, bool word_end, BLOB_CHOICE *b, 00635 ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, 00636 CHUNKS_RECORD *chunks_record, 00637 LanguageModelConsistencyInfo *consistency_info); 00638 00639 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs 00640 // on the path represented by the given BLOB_CHOICE and language model 00641 // state entries (lmse, dse). 
The path is re-constructed by following 00642 // the parent pointers in the the lang model state entries). If the 00643 // constructed WERD_CHOICE is better than the best/raw choice recorded 00644 // in the best_choice_bundle, this function updates the corresponding 00645 // fields and sets best_choice_bunldle->updated to true. 00646 void UpdateBestChoice(BLOB_CHOICE *b, 00647 ViterbiStateEntry *vse, 00648 HEAP *pain_points, 00649 CHUNKS_RECORD *chunks_record, 00650 BestChoiceBundle *best_choice_bundle, 00651 BlamerBundle *blamer_bundle); 00652 00653 // Fills the given floats array with raw features extracted from the 00654 // path represented by the given ViterbiStateEntry. 00655 // See ccstruct/params_training_featdef.h for feature information. 00656 void ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse, 00657 float *features); 00658 00659 // Constructs a WERD_CHOICE by tracing parent pointers starting with 00660 // the given LanguageModelStateEntry. Returns the constructed word. 00661 // Updates best_char_choices, certainties and state if they are not 00662 // NULL (best_char_choices and certainties are assumed to have the 00663 // length equal to lmse->length). 00664 // The caller is resposible for freeing memory associated with the 00665 // returned WERD_CHOICE. 00666 WERD_CHOICE *ConstructWord(BLOB_CHOICE *b, 00667 ViterbiStateEntry *vse, 00668 CHUNKS_RECORD *chunks_record, 00669 BLOB_CHOICE_LIST_VECTOR *best_char_choices, 00670 float certainties[], 00671 float *dawg_score, 00672 STATE *state, 00673 BlamerBundle *blamer_bundle, 00674 bool *truth_path); 00675 00676 // This function is used for non-space delimited languages when looking 00677 // for word endings recorded while trying to separate the path into words. 
00678 // 00679 // The function increments covered if a valid word ending is found in 00680 // active_dawgs (if covered is incremented, skip is set to the number 00681 // of unichars that should be skipped because they are covered by the 00682 // word whose ending was just discovered). 00683 // 00684 // dawg_score and dawg_score_done are updated if: 00685 // -- at the end of the path we discover a valid word ending from a 00686 // non-fixed length dawg (this means that the whole word is a 00687 // valid word, so dawg_score is set to 1.0f 00688 // -- word_start is true (dawg_score is set to covered / word length) 00689 // 00690 // Note: this function assumes that skip, covered, dawg_score and 00691 // dawg_score_done are not NULL. 00692 void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs, 00693 int word_index, int word_length, 00694 int *skip, int *covered, 00695 float *dawg_score, 00696 bool *dawg_score_done); 00697 00698 // Wrapper around AssociateUtils::ComputeStats(). 00699 inline void ComputeAssociateStats(int col, int row, 00700 float max_char_wh_ratio, 00701 ViterbiStateEntry *parent_vse, 00702 CHUNKS_RECORD *chunks_record, 00703 AssociateStats *associate_stats) { 00704 AssociateUtils::ComputeStats( 00705 col, row, 00706 (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL, 00707 (parent_vse != NULL) ? parent_vse->length : 0, 00708 fixed_pitch_, max_char_wh_ratio, 00709 chunks_record->word_res != NULL ? &chunks_record->word_res->denorm : NULL, 00710 chunks_record, language_model_debug_level, associate_stats); 00711 } 00712 00713 // Returns true if the path with such top_choice_flags and dawg_info 00714 // could be pruned out (i.e. is neither a system/user/frequent dictionary 00715 // nor a top choice path). 00716 // In non-space delimited languages all paths can be "somewhat" dictionary 00717 // words. In such languages we can not do dictionary-driven path prunning, 00718 // so paths with non-empty dawg_info are considered prunable. 
00719 inline bool PrunablePath(LanguageModelFlagsType top_choice_flags, 00720 const LanguageModelDawgInfo *dawg_info) { 00721 if (top_choice_flags) return false; 00722 if (dawg_info != NULL && 00723 (dawg_info->permuter == SYSTEM_DAWG_PERM || 00724 dawg_info->permuter == USER_DAWG_PERM || 00725 dawg_info->permuter == FREQ_DAWG_PERM) && 00726 dict_->GetMaxFixedLengthDawgIndex() < 0) return false; 00727 return true; 00728 } 00729 00730 // Returns true if the given ViterbiStateEntry represents an acceptable path. 00731 inline bool AcceptablePath(const ViterbiStateEntry &vse) { 00732 return (vse.dawg_info != NULL || vse.Consistent() || 00733 (vse.ngram_info != NULL && !vse.ngram_info->pruned)); 00734 } 00735 00736 public: 00737 // Parameters. 00738 INT_VAR_H(language_model_debug_level, 0, "Language model debug level"); 00739 BOOL_VAR_H(language_model_ngram_on, false, 00740 "Turn on/off the use of character ngram model"); 00741 INT_VAR_H(language_model_ngram_order, 8, 00742 "Maximum order of the character ngram model"); 00743 INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10, 00744 "Maximum number of prunable (those for which PrunablePath() is true)" 00745 "entries in each viterbi list recorded in BLOB_CHOICEs"); 00746 INT_VAR_H(language_model_viterbi_list_max_size, 500, 00747 "Maximum size of viterbi lists recorded in BLOB_CHOICEs"); 00748 double_VAR_H(language_model_ngram_small_prob, 0.000001, 00749 "To avoid overly small denominators use this as the floor" 00750 " of the probability returned by the ngram model"); 00751 double_VAR_H(language_model_ngram_nonmatch_score, -40.0, 00752 "Average classifier score of a non-matching unichar"); 00753 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false, 00754 "Use only the first UTF8 step of the given string" 00755 " when computing log probabilities"); 00756 double_VAR_H(language_model_ngram_scale_factor, 0.03, 00757 "Strength of the character ngram model relative to the" 00758 " character classifier "); 
  BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
             "Words are delimited by space");

  INT_VAR_H(language_model_min_compound_length, 3,
            "Minimum length of compound words");
  INT_VAR_H(language_model_fixed_length_choices_depth, 3,
            "Depth of blob choice lists to explore"
            " when fixed length dawgs are on");
  // Penalties used for adjusting path costs and final word rating.
  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
               "Penalty for words not in the frequent word dictionary");
  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
               "Penalty for non-dictionary words");
  double_VAR_H(language_model_penalty_punc, 0.2,
               "Penalty for inconsistent punctuation");
  double_VAR_H(language_model_penalty_case, 0.1,
               "Penalty for inconsistent case");
  double_VAR_H(language_model_penalty_script, 0.5,
               "Penalty for inconsistent script");
  double_VAR_H(language_model_penalty_chartype, 0.3,
               "Penalty for inconsistent character type");
  double_VAR_H(language_model_penalty_font, 0.00,
               "Penalty for inconsistent font");
  double_VAR_H(language_model_penalty_spacing, 0.05,
               "Penalty for inconsistent spacing");
  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
  BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
             "Use sigmoidal score for certainty");

 protected:
  // Member Variables.

  // Temporary DawgArgs struct that is re-used across different words to
  // avoid dynamic memory re-allocation (should be cleared before each use).
  DawgArgs *dawg_args_;
  // List of pointers to updated flags used by Viterbi search to mark
  // recently updated ViterbiStateEntries.
  GenericVector<bool *> updated_flags_;
  // Scaling for recovering blob outline length from rating and certainty.
  float rating_cert_scale_;

  // The following variables are set at construction time.

  // Pointer to fontinfo table (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Pointer to Dict class, that is used for querying the dictionaries
  // (the pointer is not owned by LanguageModel).
  Dict *dict_;

  // TODO(daria): the following variables should become LanguageModel params
  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
  //
  // Set to true if we are dealing with fixed pitch text
  // (set to assume_fixed_pitch_char_segment).
  bool fixed_pitch_;
  // Max char width-to-height ratio allowed
  // (set to segsearch_max_char_wh_ratio).
  float max_char_wh_ratio_;

  // The following variables are initialized with InitForWord().

  // String representation of the classification of the previous word
  // (since this is only used by the character ngram model component,
  // only the last language_model_ngram_order of the word are stored).
  STRING prev_word_str_;
  // NOTE(review): presumably the number of UTF-8 unichar steps stored in
  // prev_word_str_ — confirm against InitForWord() in the .cpp.
  int prev_word_unichar_step_len_;
  // Active dawg and constraints vector.
  DawgInfoVector *beginning_active_dawgs_;
  DawgInfoVector *beginning_constraints_;
  DawgInfoVector *fixed_length_beginning_active_dawgs_;
  DawgInfoVector *empty_dawg_info_vec_;
  // Maximum adjustment factor for character ngram choices.
  float max_penalty_adjust_;
  // Set to true if acceptable choice was discovered.
  // Note: it would be nice to use this to terminate the search once an
  // acceptable choice is found. However we do not do that and once an
  // acceptable choice is found we finish looking for alternative choices
  // in the current segmentation graph and then exit the search (no more
  // classifications are done after an acceptable choice is found).
  // This is needed in order to let the search find the words very close to
  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
  // choices. This way the stopper will know that the best choice is not
  // ambiguous (i.e. there are best choices in the best choice list that have
  // ratings close to the very best one) and will be less likely to mis-adapt.
  bool acceptable_choice_found_;
  // Set to true if a choice representing correct segmentation was explored.
  bool correct_segmentation_explored_;

};

}  // namespace tesseract

#endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_