Tesseract 3.02
///////////////////////////////////////////////////////////////////////
// File:        language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help recognition.
// Author:      Daria Antonova
// Created:     Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include <math.h>

#include "language_model.h"

#include "dawg.h"
#include "intproto.h"
#include "matrix.h"
#include "params.h"
#include "params_training_featdef.h"

namespace tesseract {

ELISTIZE(ViterbiStateEntry);

const float LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f;
const float LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f;
const float LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f;
const float LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f;
const float LanguageModel::kMaxAvgNgramCost = 25.0f;
const int LanguageModel::kMinFixedLengthDawgLength = 2;
const float LanguageModel::kLooseMaxCharWhRatio = 2.5f;

LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
                             Dict *dict)
  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
               dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_ngram_on, false,
                     "Turn on/off the use of character ngram model",
                     dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_ngram_order, 8,
               "Maximum order of the character ngram model",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
               "Maximum number of prunable (those for which"
               " PrunablePath() is true) entries in each viterbi list"
               " recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_size, 500,
               "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_small_prob, 0.000001,
                  "To avoid overly small denominators use this as the "
                  "floor of the probability returned by the ngram model.",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                  "Average classifier score of a non-matching unichar.",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                "Use only the first UTF8 step of the given string"
                " when computing log probabilities.",
                dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_scale_factor, 0.03,
                  "Strength of the character ngram model relative to the"
                  " character classifier ",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                "Words are delimited by space",
                dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_min_compound_length, 3,
               "Minimum length of compound words",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_fixed_length_choices_depth, 3,
               "Depth of blob choice lists to explore"
               " when fixed length dawgs are on",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                  "Penalty for words not in the frequent word dictionary",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                  "Penalty for non-dictionary words",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_punc, 0.2,
                  "Penalty for inconsistent punctuation",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_case, 0.1,
                  "Penalty for inconsistent case",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_script, 0.5,
                  "Penalty for inconsistent script",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_chartype, 0.3,
                  "Penalty for inconsistent character type",
                  dict->getImage()->getCCUtil()->params()),
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    double_MEMBER(language_model_penalty_font, 0.00,
                  "Penalty for inconsistent font",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_spacing, 0.05,
                  "Penalty for inconsistent spacing",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_increment, 0.01,
                  "Penalty increment",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                     "Use sigmoidal score for certainty",
                     dict->getImage()->getCCUtil()->params()),
    fontinfo_table_(fontinfo_table), dict_(dict),
    fixed_pitch_(false), max_char_wh_ratio_(0.0),
    acceptable_choice_found_(false) {
  ASSERT_HOST(dict_ != NULL);
  dawg_args_ = new DawgArgs(NULL, NULL, new DawgInfoVector(),
                            new DawgInfoVector(),
                            0.0, NO_PERM, kAnyWordLength, -1);
  beginning_active_dawgs_ = new DawgInfoVector();
  beginning_constraints_ = new DawgInfoVector();
  fixed_length_beginning_active_dawgs_ = new DawgInfoVector();
  empty_dawg_info_vec_ = new DawgInfoVector();
}

LanguageModel::~LanguageModel() {
  delete beginning_active_dawgs_;
  delete beginning_constraints_;
  delete fixed_length_beginning_active_dawgs_;
  delete empty_dawg_info_vec_;
  delete dawg_args_->updated_active_dawgs;
  delete dawg_args_->updated_constraints;
  delete dawg_args_;
}

void LanguageModel::InitForWord(
    const WERD_CHOICE *prev_word,
    bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio,
    float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record,
    BlamerBundle *blamer_bundle, bool debug_blamer) {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // For each cell, generate a "pain point" if the cell is
  // not classified and has a left or right neighbor that was classified.
  MATRIX *ratings = chunks_record->ratings;
  for (int col = 0; col < ratings->dimension(); ++col) {
    for (int row = col+1; row < ratings->dimension(); ++row) {
      if ((row > 0 && ratings->get(col, row-1) != NOT_CLASSIFIED) ||
          (col+1 < ratings->dimension() &&
           ratings->get(col+1, row) != NOT_CLASSIFIED)) {
        float worst_piece_cert;
        bool fragmented;
        GetWorstPieceCertainty(col, row, chunks_record->ratings,
                               &worst_piece_cert, &fragmented);
        GeneratePainPoint(col, row, true, kInitialPainPointPriorityAdjustment,
                          worst_piece_cert, fragmented, best_choice_cert,
                          max_char_wh_ratio_, NULL, NULL,
                          chunks_record, pain_points);
      }
    }
  }

  // Initialize vectors with beginning DawgInfos.
  beginning_active_dawgs_->clear();
  dict_->init_active_dawgs(kAnyWordLength, beginning_active_dawgs_, false);
  beginning_constraints_->clear();
  dict_->init_constraints(beginning_constraints_);
  if (dict_->GetMaxFixedLengthDawgIndex() >= 0) {
    fixed_length_beginning_active_dawgs_->clear();
    for (int i = 0; i < beginning_active_dawgs_->size(); ++i) {
      int dawg_index = (*beginning_active_dawgs_)[i].dawg_index;
      if (dawg_index <= dict_->GetMaxFixedLengthDawgIndex() &&
          dawg_index >= kMinFixedLengthDawgLength) {
        *fixed_length_beginning_active_dawgs_ += (*beginning_active_dawgs_)[i];
      }
    }
  }

  max_penalty_adjust_ = (dict_->segment_penalty_dict_nonword -
                         dict_->segment_penalty_dict_case_ok);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ += ' ';
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL &&
      blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
      blamer_bundle->truth_has_char_boxes) {
    STRING blamer_debug;
    blamer_debug += "Blamer computing correct_segmentation_cols\n";
    int curr_box_col = 0;
    int next_box_col = 0;
    TBLOB *blob = chunks_record->chunks;
    inT16 next_box_x = (blob != NULL) ? blob->bounding_box().right() : 0;
    for (int truth_idx = 0;
         blob != NULL && truth_idx < blamer_bundle->norm_truth_word.length();
         blob = blob->next) {
      ++next_box_col;
      inT16 curr_box_x = next_box_x;
      if (blob->next != NULL) next_box_x = blob->next->bounding_box().right();
      inT16 truth_x = blamer_bundle->norm_truth_word.BlobBox(truth_idx).right();
truth: ", curr_box_x); 00232 blamer_debug.add_str_int(" ", truth_x); 00233 blamer_debug += "\n"; 00234 if (curr_box_x > (truth_x + blamer_bundle->norm_box_tolerance)) { 00235 break; // failed to find a matching box 00236 } else if (curr_box_x >= 00237 (truth_x - blamer_bundle->norm_box_tolerance) && // matched 00238 (blob->next == NULL || // next box can't be included 00239 next_box_x > truth_x + blamer_bundle->norm_box_tolerance)) { 00240 blamer_bundle->correct_segmentation_cols.push_back(curr_box_col); 00241 blamer_bundle->correct_segmentation_rows.push_back(next_box_col-1); 00242 ++truth_idx; 00243 blamer_debug.add_str_int("col=", curr_box_col); 00244 blamer_debug.add_str_int(" row=", next_box_col-1); 00245 blamer_debug += "\n"; 00246 curr_box_col = next_box_col; 00247 } 00248 } 00249 if (blob != NULL || // trailing blobs 00250 blamer_bundle->correct_segmentation_cols.length() != 00251 blamer_bundle->norm_truth_word.length()) { 00252 blamer_debug.add_str_int("Blamer failed to find correct segmentation" 00253 " (tolerance=", blamer_bundle->norm_box_tolerance); 00254 if (blob == NULL) blamer_debug += " blob == NULL"; 00255 blamer_debug += ")\n"; 00256 blamer_debug.add_str_int( 00257 " path length ", blamer_bundle->correct_segmentation_cols.length()); 00258 blamer_debug.add_str_int(" vs. truth ", 00259 blamer_bundle->norm_truth_word.length()); 00260 blamer_debug += "\n"; 00261 blamer_bundle->SetBlame(IRR_UNKNOWN, blamer_debug, NULL, debug_blamer); 00262 blamer_bundle->correct_segmentation_cols.clear(); 00263 blamer_bundle->correct_segmentation_rows.clear(); 00264 } 00265 } // end if (blamer_bundle != NULL) 00266 00267 // Start a new hypothesis list for this run of segmentation search. 00268 if (blamer_bundle != NULL) { 00269 blamer_bundle->params_training_bundle.StartHypothesisList(); 00270 } 00271 } 00272 00273 void LanguageModel::CleanUp() { 00274 for (int i = 0; i < updated_flags_.size(); ++i) { 00275 *(updated_flags_[i]) = false; 00276 } 00277 updated_flags_.clear(); 00278 } 00279 00280 void LanguageModel::DeleteState(BLOB_CHOICE_LIST *choices) { 00281 BLOB_CHOICE_IT b_it(choices); 00282 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00283 if (b_it.data()->language_model_state() != NULL) { 00284 LanguageModelState *state = reinterpret_cast<LanguageModelState *>( 00285 b_it.data()->language_model_state()); 00286 delete state; 00287 b_it.data()->set_language_model_state(NULL); 00288 } 00289 } 00290 } 00291 00292 LanguageModelFlagsType LanguageModel::UpdateState( 00293 LanguageModelFlagsType changed, 00294 int curr_col, int curr_row, 00295 BLOB_CHOICE_LIST *curr_list, 00296 BLOB_CHOICE_LIST *parent_list, 00297 HEAP *pain_points, 00298 BestPathByColumn *best_path_by_column[], 00299 CHUNKS_RECORD *chunks_record, 00300 BestChoiceBundle *best_choice_bundle, 00301 BlamerBundle *blamer_bundle) { 00302 if (language_model_debug_level > 0) { 00303 tprintf("\nUpdateState: col=%d row=%d (changed=0x%x parent=%p)\n", 00304 curr_col, curr_row, changed, parent_list); 00305 } 00306 // Initialize helper variables. 00307 bool word_end = (curr_row+1 >= chunks_record->ratings->dimension()); 00308 bool just_classified = (changed & kJustClassifiedFlag); 00309 LanguageModelFlagsType new_changed = 0x0; 00310 float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f; 00311 00312 // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry. 
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  int c_it_counter = 0;
  bool first_iteration = true;
  BLOB_CHOICE *first_lower = NULL;
  BLOB_CHOICE *first_upper = NULL;
  GetTopChoiceLowerUpper(changed, curr_list, &first_lower, &first_upper);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    if (dict_->GetMaxFixedLengthDawgIndex() >= 0 &&
        c_it_counter++ >= language_model_fixed_length_choices_depth) {
      break;
    }
    // Skip NULL unichars unless it is the only choice.
    if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    if (dict_->getUnicharset().get_fragment(c_it.data()->unichar_id())) {
      continue;  // skip fragments
    }
    // Set top choice flags.
    LanguageModelFlagsType top_choice_flags = 0x0;
    if (first_iteration && (changed & kSmallestRatingFlag)) {
      top_choice_flags |= kSmallestRatingFlag;
    }
    if (first_lower == c_it.data()) top_choice_flags |= kLowerCaseFlag;
    if (first_upper == c_it.data()) top_choice_flags |= kUpperCaseFlag;

    if (parent_list == NULL) {  // process the beginning of a word
      new_changed |= AddViterbiStateEntry(
          top_choice_flags, denom, word_end, curr_col, curr_row,
          c_it.data(), NULL, NULL, pain_points, best_path_by_column,
          chunks_record, best_choice_bundle, blamer_bundle);
    } else {  // get viterbi entries from each of the parent BLOB_CHOICEs
      BLOB_CHOICE_IT p_it(parent_list);
      for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
        LanguageModelState *parent_lms =
            reinterpret_cast<LanguageModelState *>(
                p_it.data()->language_model_state());
        if (parent_lms == NULL || parent_lms->viterbi_state_entries.empty()) {
          continue;
        }
        vit.set_to_list(&(parent_lms->viterbi_state_entries));
        int vit_counter = 0;
        for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
          // Skip pruned entries and do not look at prunable entries if already
          // examined language_model_viterbi_list_max_num_prunable of those.
          if (PrunablePath(vit.data()->top_choice_flags,
                           vit.data()->dawg_info) &&
              (++vit_counter > language_model_viterbi_list_max_num_prunable ||
               (language_model_ngram_on && vit.data()->ngram_info->pruned))) {
            continue;
          }
          // Only consider the parent if it has been updated or
          // if the current ratings cell has just been classified.
          if (!just_classified && !vit.data()->updated) continue;
          // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
          // looks good according to the Dawgs or character ngram model.
          new_changed |= AddViterbiStateEntry(
              top_choice_flags, denom, word_end, curr_col, curr_row,
              c_it.data(), p_it.data(), vit.data(), pain_points,
              best_path_by_column, chunks_record,
              best_choice_bundle, blamer_bundle);
        }
      }  // done looking at parents for this c_it.data()
    }
    first_iteration = false;
  }
  return new_changed;
}

bool LanguageModel::ProblematicPath(const ViterbiStateEntry &vse,
                                    UNICHAR_ID unichar_id, bool word_end) {
  // The path is problematic if it is inconsistent and has a parent that
  // is consistent (or a NULL parent).
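  // (The same pattern repeats in all four checks below: a path is flagged
  //  only at the unichar that introduced the problem, i.e. where quality
  //  drops relative to the parent.)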
  if (!vse.Consistent() && (vse.parent_vse == NULL ||
                            vse.parent_vse->Consistent())) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: inconsistent\n");
    }
    return true;
  }
  // The path is problematic if it does not represent a dictionary word,
  // while its parent does.
  if (vse.dawg_info == NULL &&
      (vse.parent_vse == NULL || vse.parent_vse->dawg_info != NULL)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: dict word terminated\n");
    }
    return true;
  }
  // The path is problematic if ngram info indicates that this path is
  // an unlikely sequence of characters, while its parent does not have
  // extreme dips in ngram probabilities.
  if (vse.ngram_info != NULL && vse.ngram_info->pruned &&
      (vse.parent_vse == NULL || !vse.parent_vse->ngram_info->pruned)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: small ngram prob\n");
    }
    return true;
  }
  // The path is problematic if there is a non-alpha character in the
  // middle of the path (unless it is a digit in the middle of a path
  // that represents a number).
  if ((vse.parent_vse != NULL) && !word_end &&  // is middle
      !(dict_->getUnicharset().get_isalpha(unichar_id) ||  // alpha
        (dict_->getUnicharset().get_isdigit(unichar_id) &&  // ok digit
         vse.dawg_info != NULL && vse.dawg_info->permuter == NUMBER_PERM))) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: non-alpha middle\n");
    }
    return true;
  }
  return false;
}

void LanguageModel::GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
                                           BLOB_CHOICE_LIST *curr_list,
                                           BLOB_CHOICE **first_lower,
                                           BLOB_CHOICE **first_upper) {
  if (!(changed & kLowerCaseFlag || changed & kUpperCaseFlag)) return;
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = NULL;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == NULL) first_unichar = c_it.data();
    if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == NULL && unicharset.get_isupper(unichar_id)) {
      *first_upper = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != NULL);
  if (*first_lower == NULL) *first_lower = first_unichar;
  if (*first_upper == NULL) *first_upper = first_unichar;
}

LanguageModelFlagsType LanguageModel::AddViterbiStateEntry(
    LanguageModelFlagsType top_choice_flags,
    float denom,
    bool word_end,
    int curr_col, int curr_row,
    BLOB_CHOICE *b,
    BLOB_CHOICE *parent_b,
    ViterbiStateEntry *parent_vse,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 0) {
    tprintf("\nAddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x parent_vse=%p\n",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags, parent_vse);
    if (language_model_debug_level > 3 && b->language_model_state() != NULL) {
      tprintf("Existing viterbi list:\n");
      vit.set_to_list(&(reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }
  // Check whether the list is full.
  if (b->language_model_state() != NULL &&
      (reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries_length >=
       language_model_viterbi_list_max_size)) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return 0x0;
  }

  LanguageModelFlagsType changed = 0x0;
  float optimistic_cost = 0.0f;
  if (!language_model_ngram_on) optimistic_cost += b->rating();
  if (parent_vse != NULL) optimistic_cost += parent_vse->cost;
  // Discard this entry if it will not beat the best choice.
  if (optimistic_cost >= best_choice_bundle->best_choice->rating()) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %.4f"
              " best_choice->rating()=%.4f\n", optimistic_cost,
              best_choice_bundle->best_choice->rating());
    }
    return 0x0;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LanguageModelConsistencyInfo consistency_info;
  FillConsistencyInfo(curr_col, word_end, b, parent_vse, parent_b,
                      chunks_record, &consistency_info);

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
      GenerateDawgInfo(word_end, consistency_info.script_id,
                       curr_col, curr_row, *b, parent_vse, &changed);

  // Invoke TopChoice language model component.
  float ratings_sum = b->rating();
  if (parent_vse != NULL) ratings_sum += parent_vse->ratings_sum;
  GenerateTopChoiceInfo(ratings_sum, dawg_info, consistency_info,
                        parent_vse, b, &top_choice_flags, &changed);

  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = NULL;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, parent_vse, parent_b, &changed);
    ASSERT_HOST(ngram_info != NULL);
  }

  // Prune non-top choice paths with inconsistent scripts.
  if (consistency_info.inconsistent_script) {
    if (!(top_choice_flags & kSmallestRatingFlag)) changed = 0x0;
    if (ngram_info != NULL) ngram_info->pruned = true;
  }

  // If none of the language model components liked this unichar, discard it.
  if (!changed) {
    if (language_model_debug_level > 0) {
      tprintf("Language model components did not like this entry\n");
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, chunks_record, &associate_stats);
  if (parent_vse != NULL) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Compute the aggregate cost (adjusted ratings sum).
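  // Note: dawg_score is passed as a binary value here - 0.0 if the path has
  // no dawg_info (treated as a non-dictionary word) and 1.0 otherwise. The
  // fractional scores in between are only computed later, in ConstructWord(),
  // for non-space delimited languages.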
  float cost = ComputeAdjustedPathCost(
      ratings_sum,
      (parent_vse == NULL) ? 1 : (parent_vse->length+1),
      (dawg_info == NULL) ? 0.0f : 1.0f,
      dawg_info, ngram_info, consistency_info, associate_stats, parent_vse);

  if (b->language_model_state() == NULL) {
    b->set_language_model_state(new LanguageModelState(curr_col, curr_row));
  }
  LanguageModelState *lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(top_choice_flags, dawg_info) &&
      (lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      cost >= lms->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              cost, lms->viterbi_state_entries_prunable_max_cost);
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Create the new ViterbiStateEntry and add it to lms->viterbi_state_entries.
  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
      parent_b, parent_vse, b, cost, ComputeOutlineLength(b), consistency_info,
      associate_stats, top_choice_flags, dawg_info, ngram_info);
  updated_flags_.push_back(&(new_vse->updated));
  lms->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                        false, new_vse);
  lms->viterbi_state_entries_length++;
  if (PrunablePath(top_choice_flags, dawg_info)) {
    lms->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with ratings_sum higher than
  // new_vse->ratings_sum.
  if ((lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) || top_choice_flags) {
    ASSERT_HOST(!lms->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have ratings_sum higher than new_entry->ratings_sum
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          ComputeConsistencyAdjustedRatingsSum(
              curr_vse->ratings_sum, curr_vse->dawg_info,
              curr_vse->consistency_info) >
          ComputeConsistencyAdjustedRatingsSum(
              new_vse->ratings_sum, new_vse->dawg_info,
              new_vse->consistency_info)) {
        curr_vse->top_choice_flags &= ~(top_choice_flags);
      }
      if (prunable_counter > 0 &&
          PrunablePath(curr_vse->top_choice_flags, curr_vse->dawg_info)) {
        --prunable_counter;
      }
      // Update lms->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        lms->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %.4f\n",
                  lms->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    PrintViterbiStateEntry("New", new_vse, b, chunks_record);
    if (language_model_debug_level > 3) {
      tprintf("Updated viterbi list (max cost %g):\n",
              lms->viterbi_state_entries_prunable_max_cost);
      vit.set_to_list(&(lms->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(b, new_vse, pain_points, chunks_record,
                     best_choice_bundle, blamer_bundle);
  }

  // Update stats in best_path_by_column.
  if (new_vse->Consistent() || new_vse->dawg_info != NULL ||
      (new_vse->ngram_info != NULL && !new_vse->ngram_info->pruned)) {
    float avg_cost = new_vse->cost / static_cast<float>(curr_row+1);
    for (int c = curr_col; c <= curr_row; ++c) {
      if (avg_cost < (*best_path_by_column)[c].avg_cost) {
        (*best_path_by_column)[c].avg_cost = avg_cost;
        (*best_path_by_column)[c].best_vse = new_vse;
        (*best_path_by_column)[c].best_b = b;
        if (language_model_debug_level > 0) {
          tprintf("Set best_path_by_column[%d]=(%g %p)\n",
                  c, avg_cost, new_vse);
        }
      }
    }
  }
  return changed;
}

void LanguageModel::PrintViterbiStateEntry(
    const char *msg, ViterbiStateEntry *vse,
    BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record) {
  tprintf("%s ViterbiStateEntry %p with ratings_sum=%.4f length=%d cost=%.4f",
          msg, vse, vse->ratings_sum, vse->length, vse->cost);
  if (vse->top_choice_flags) {
    tprintf(" top_choice_flags=0x%x", vse->top_choice_flags);
  }
  if (!vse->Consistent()) {
    tprintf(" inconsistent=(punc %d case %d chartype %d script %d)\n",
            vse->consistency_info.NumInconsistentPunc(),
            vse->consistency_info.NumInconsistentCase(),
            vse->consistency_info.NumInconsistentChartype(),
            vse->consistency_info.inconsistent_script);
  }
  if (vse->dawg_info) tprintf(" permuter=%d", vse->dawg_info->permuter);
  if (vse->ngram_info) {
    tprintf(" ngram_cost=%g context=%s ngram pruned=%d",
            vse->ngram_info->ngram_cost,
            vse->ngram_info->context.string(),
            vse->ngram_info->pruned);
  }
  if (vse->associate_stats.shape_cost > 0.0f) {
    tprintf(" shape_cost=%g", vse->associate_stats.shape_cost);
  }
  if (language_model_debug_level > 3) {
    STRING wd_str;
    WERD_CHOICE *wd = ConstructWord(b, vse, chunks_record,
                                    NULL, NULL, NULL, NULL, NULL, NULL);
    wd->string_and_lengths(&wd_str, NULL);
    delete wd;
    tprintf(" str=%s", wd_str.string());
  }
  tprintf("\n");
}

void LanguageModel::GenerateTopChoiceInfo(
    float ratings_sum,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *b,
    LanguageModelFlagsType *top_choice_flags,
    LanguageModelFlagsType *changed) {
  ratings_sum = ComputeConsistencyAdjustedRatingsSum(
      ratings_sum, dawg_info, consistency_info);
  // Clear flags that do not agree with parent_vse->top_choice_flags.
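  // Note: a path can remain a "top choice" path (smallest rating, first
  // lower case, first upper case) only if every unichar on it carried the
  // corresponding flag, hence the AND with the parent's flags below.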
  if (parent_vse != NULL) *top_choice_flags &= parent_vse->top_choice_flags;
  if (consistency_info.Consistent()) *top_choice_flags |= kConsistentFlag;
  if (*top_choice_flags == 0x0) return;
  LanguageModelState *lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());
  if (lms != NULL && !lms->viterbi_state_entries.empty()) {
    ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      if (ratings_sum >= ComputeConsistencyAdjustedRatingsSum(
              vit.data()->ratings_sum, vit.data()->dawg_info,
              vit.data()->consistency_info)) {
        // Clear the appropriate flags if the list already contains
        // a top choice entry with a lower cost.
        *top_choice_flags &= ~(vit.data()->top_choice_flags);
      }
    }
  }
  if (language_model_debug_level > 0) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            *top_choice_flags);
  }
  *changed |= *top_choice_flags;
}

LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    bool word_end, int script_id,
    int curr_col, int curr_row,
    const BLOB_CHOICE &b,
    const ViterbiStateEntry *parent_vse,
    LanguageModelFlagsType *changed) {
  bool use_fixed_length_dawgs = !fixed_length_beginning_active_dawgs_->empty();

  // Initialize active_dawgs and constraints from parent_vse if it is not NULL,
  // otherwise use beginning_active_dawgs_ and beginning_constraints_.
  if (parent_vse == NULL ||
      (use_fixed_length_dawgs && parent_vse->dawg_info == NULL)) {
    dawg_args_->active_dawgs = beginning_active_dawgs_;
    dawg_args_->constraints = beginning_constraints_;
    dawg_args_->permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
    dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
    dawg_args_->constraints = parent_vse->dawg_info->constraints;
    dawg_args_->permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (!use_fixed_length_dawgs && word_end &&
      dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }

  // Deal with compound words.
  if (!use_fixed_length_dawgs && dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length.
    if (parent_vse == NULL || word_end ||
        dawg_args_->permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length) return NULL;

    int i;
    // Check that the path terminated before the current character is a word.
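    // Note: the loop below scans the parent's active dawgs and accepts the
    // compound marker only if at least one word-type dawg is at a legal
    // end-of-word edge, i.e. the text to the left of the marker is itself
    // a complete dictionary word.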
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
      const DawgInfo &info = (*parent_vse->dawg_info->active_dawgs)[i];
      const Dawg *pdawg = dict_->GetDawg(info.dawg_index);
      assert(pdawg != NULL);
      if (pdawg->type() == DAWG_TYPE_WORD && info.ref != NO_EDGE &&
          pdawg->end_of_word(info.ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return NULL;

    // Return LanguageModelDawgInfo with active_dawgs set to
    // the beginning dawgs of type DAWG_TYPE_WORD.
    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    DawgInfoVector beginning_word_dawgs;
    for (i = 0; i < beginning_active_dawgs_->size(); ++i) {
      const Dawg *bdawg =
          dict_->GetDawg((*beginning_active_dawgs_)[i].dawg_index);
      if (bdawg->type() == DAWG_TYPE_WORD) {
        beginning_word_dawgs += (*beginning_active_dawgs_)[i];
      }
    }
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(&(beginning_word_dawgs),
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = NULL;

  // Call LetterIsOkay().
  dict_->LetterIsOkay(dawg_args_, b.unichar_id(), word_end);
  if (dawg_args_->permuter != NO_PERM) {
    *changed |= kDawgFlag;
    dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_active_dawgs,
                                          dawg_args_->updated_constraints,
                                          dawg_args_->permuter);
  }

  // For non-space delimited languages: since every letter could be
  // a valid word, a new word could start at every unichar. Thus append
  // fixed_length_beginning_active_dawgs_ to dawg_info->active_dawgs.
  if (use_fixed_length_dawgs) {
    if (dawg_info == NULL) {
      *changed |= kDawgFlag;
      dawg_info = new LanguageModelDawgInfo(
          fixed_length_beginning_active_dawgs_,
          empty_dawg_info_vec_, SYSTEM_DAWG_PERM);
    } else {
      *(dawg_info->active_dawgs) += *(fixed_length_beginning_active_dawgs_);
    }
  }  // done dealing with fixed-length dawgs

  return dawg_info;
}

LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    LanguageModelFlagsType *changed) {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == NULL) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
        parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_prob;
  float ngram_cost = ComputeNgramCost(unichar, certainty, denom,
                                      pcontext_ptr, &unichar_step_len,
                                      &pruned, &ngram_prob);
  // First attempt to normalize ngram_cost for strings of different
  // lengths - we multiply ngram_cost by P(char | context) as many times
  // as the number of chunks occupied by char. This makes the ngram costs
  // of all the paths ending at the current BLOB_CHOICE comparable.
  // TODO(daria): it would be worth looking at different ways of normalization.
  if (curr_row > curr_col) {
    ngram_cost += (curr_row - curr_col) * ngram_cost;
    ngram_prob += (curr_row - curr_col) * ngram_prob;
  }
  // Add the ngram_cost of the parent.
  if (parent_vse != NULL) {
    ngram_cost += parent_vse->ngram_info->ngram_cost;
    ngram_prob += parent_vse->ngram_info->ngram_prob;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
  if (!pruned) *changed |= kNgramFlag;

  // Construct and return the new LanguageModelNgramInfo.
  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_prob, ngram_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

float LanguageModel::ComputeNgramCost(const char *unichar,
                                      float certainty,
                                      float denom,
                                      const char *context,
                                      int *unichar_step_len,
                                      bool *found_small_prob,
                                      float *ngram_prob) {
  const char *context_ptr = context;
  char *modified_context = NULL;
  char *modified_context_end = NULL;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from str,
    // unless use_only_first_uft8_step is true.
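    // Note: modified_context is allocated lazily on the first multi-step
    // unichar; it holds the original context followed by the UTF8 characters
    // of unichar consumed so far, so that each subsequent
    // ProbabilityInContext() call conditions on everything to its left.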
    if (unichar_ptr < unichar_end) {
      if (modified_context == NULL) {
        int context_len = strlen(context);
        modified_context =
            new char[context_len + strlen(unichar_ptr) + step + 1];
        strncpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
  }
  *ngram_prob = -1.0*log(prob);
  float cost = -1.0*log(CertaintyScore(certainty)/denom) +
      *ngram_prob * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty)/denom, prob, cost);
  }
  if (modified_context != NULL) delete[] modified_context;
  return cost;
}

float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != NULL);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
      CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}

void LanguageModel::FillConsistencyInfo(
    int curr_col,
    bool word_end,
    BLOB_CHOICE *b,
    ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    CHUNKS_RECORD *chunks_record,
    LanguageModelConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  if (parent_vse != NULL) *consistency_info = parent_vse->consistency_info;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      UNICHAR_ID pattern_unichar_id =
          (unicharset.get_isalpha(unichar_id) ||
           unicharset.get_isdigit(unichar_id)) ?
          Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
            (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
                node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  } else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != NULL &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
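  // Note: each BLOB_CHOICE carries its two most likely fonts (fontinfo_id
  // and fontinfo_id2). The block below first looks for a font shared by the
  // parent and the current blob; if one exists its spacing table supplies
  // the expected gap, otherwise the font pair is flagged inconsistent and
  // the expected gap is averaged over all four candidate fonts.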
  if (parent_b != NULL) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id2()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    bool expected_gap_found = false;
    float expected_gap;
    int temp_gap;
    if (fontinfo_id >= 0) {  // found a common font
      if (fontinfo_table_->get(fontinfo_id).get_spacing(
          parent_b->unichar_id(), unichar_id, &temp_gap)) {
        expected_gap = temp_gap;
        expected_gap_found = true;
      }
    } else {
      consistency_info->inconsistent_font = true;
      // Get an average of the expected gaps in each font.
      int num_addends = 0;
      expected_gap = 0;
      int temp_fid;
      for (int i = 0; i < 4; ++i) {
        if (i == 0) {
          temp_fid = parent_b->fontinfo_id();
        } else if (i == 1) {
          temp_fid = parent_b->fontinfo_id2();
        } else if (i == 2) {
          temp_fid = b->fontinfo_id();
        } else {
          temp_fid = b->fontinfo_id2();
        }
        if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap += temp_gap;
          num_addends++;
        }
      }
      expected_gap_found = (num_addends > 0);
      if (num_addends > 0) expected_gap /= static_cast<float>(num_addends);
    }
    if (expected_gap_found) {
      float actual_gap =
          static_cast<float>(AssociateUtils::GetChunksGap(
              chunks_record->chunk_widths, curr_col-1));
      float gap_ratio = expected_gap / actual_gap;
      // TODO(daria): find a good way to tune this heuristic estimate.
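      // Note: a space is counted as inconsistent when the expected and
      // actual gaps differ by more than a factor of two in either direction.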
      if (gap_ratio < 0.5 || gap_ratio > 2) {
        consistency_info->num_inconsistent_spaces++;
      }
      if (language_model_debug_level > 1) {
        tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                unicharset.id_to_unichar(parent_b->unichar_id()),
                parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                unichar_id, curr_col, expected_gap, actual_gap);
      }
    }
  }
}

float LanguageModel::ComputeAdjustedPathCost(
    float ratings_sum, int length, float dawg_score,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelNgramInfo *ngram_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const AssociateStats &associate_stats,
    ViterbiStateEntry *parent_vse) {
  float adjustment = 1.0f;
  if (dawg_info == NULL || dawg_info->permuter != FREQ_DAWG_PERM) {
    adjustment += language_model_penalty_non_freq_dict_word;
  }
  if (dawg_score == 0.0f) {
    adjustment += language_model_penalty_non_dict_word;
    if (length > language_model_min_compound_length) {
      adjustment += ((length - language_model_min_compound_length) *
                     language_model_penalty_increment);
    }
  } else if (dawg_score < 1.0f) {
    adjustment += (1.0f - dawg_score) * language_model_penalty_non_dict_word;
  }
  if (associate_stats.shape_cost > 0) {
    adjustment += associate_stats.shape_cost / static_cast<float>(length);
  }
  if (language_model_ngram_on) {
    ASSERT_HOST(ngram_info != NULL);
    return ngram_info->ngram_cost * adjustment;
  } else {
    adjustment += ComputeConsistencyAdjustment(dawg_info, consistency_info);
    return ratings_sum * adjustment;
  }
}

void LanguageModel::UpdateBestChoice(
    BLOB_CHOICE *b,
    ViterbiStateEntry *vse,
    HEAP *pain_points,
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  int i;
  BLOB_CHOICE_LIST_VECTOR temp_best_char_choices(vse->length);
  for (i = 0; i < vse->length; ++i) {
    temp_best_char_choices.push_back(NULL);
  }
  float *certainties = new float[vse->length];
  STATE temp_state;
  // The fraction of letters in the path that are "covered" by dawgs.
  // For space delimited languages this number will be 0.0 for nonwords
  // and 1.0 for dictionary words. For non-space delimited languages
  // this number will be in the [0.0, 1.0] range.
  float dawg_score;
  bool truth_path;
  WERD_CHOICE *word = ConstructWord(b, vse, chunks_record,
                                    &temp_best_char_choices, certainties,
                                    &dawg_score, &temp_state,
                                    blamer_bundle, &truth_path);
  ASSERT_HOST(word != NULL);
  bool not_blaming =
      (blamer_bundle == NULL ||
       !blamer_bundle->segsearch_is_looking_for_blame);

  // Log new segmentation (for dict_->LogNewChoice()).
  if (not_blaming) {
    PIECES_STATE pieces_widths;
    bin_to_pieces(&temp_state, chunks_record->ratings->dimension() - 1,
                  pieces_widths);
    dict_->LogNewSegmentation(pieces_widths);
  }

  if (language_model_debug_level > 0) {
    STRING word_str;
    word->string_and_lengths(&word_str, NULL);
    tprintf("UpdateBestChoice() constructed word %s\n", word_str.string());
    if (language_model_debug_level > 2) word->print();
  }

  // Update raw_choice if needed.
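  // Note: raw_choice tracks the best path by classifier rating alone
  // (permuter TOP_CHOICE_PERM), independent of the language model
  // adjustments applied to best_choice below, so it can only be displaced
  // by a path that carried kSmallestRatingFlag all the way through.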
  if ((vse->top_choice_flags & kSmallestRatingFlag) &&
      word->rating() < best_choice_bundle->raw_choice->rating() &&
      not_blaming) {
    dict_->LogNewChoice(1.0, certainties, true, word);
    *(best_choice_bundle->raw_choice) = *word;
    best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
    if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
  }

  // When working with non-space delimited languages we re-adjust the cost
  // taking into account the final dawg_score and a more precise shape cost.
  // While constructing the paths we assume that they are all dictionary words
  // (since a single character would be a valid dictionary word). At the end
  // we compute dawg_score which reflects how many characters on the path are
  // "covered" by dictionary words of length > 1.
  if (vse->associate_stats.full_wh_ratio_var != 0.0f ||
      (dict_->GetMaxFixedLengthDawgIndex() >= 0 && dawg_score < 1.0f)) {
    vse->cost = ComputeAdjustedPathCost(
        vse->ratings_sum, vse->length, dawg_score, vse->dawg_info,
        vse->ngram_info, vse->consistency_info, vse->associate_stats,
        vse->parent_vse);
    if (language_model_debug_level > 0) {
      tprintf("Updated vse cost to %g (dawg_score %g full_wh_ratio_var %g)\n",
              vse->cost, dawg_score, vse->associate_stats.full_wh_ratio_var);
    }
  }

  // Update best choice and best char choices if needed.
  // TODO(daria): re-write AcceptableChoice() and NoDangerousAmbig()
  // to fit better into the new segmentation search.
  word->set_rating(vse->cost);
  if (word->rating() < best_choice_bundle->best_choice->rating() &&
      not_blaming) {
    dict_->LogNewChoice(vse->cost / (language_model_ngram_on ?
                                     vse->ngram_info->ngram_cost :
                                     vse->ratings_sum),
                        certainties, false, word);
    // Since the rating of the word could have been modified by
    // Dict::LogNewChoice() - check again.
    if (word->rating() < best_choice_bundle->best_choice->rating()) {
      bool modified_blobs;  // not used
      DANGERR fixpt;
      if (dict_->AcceptableChoice(&temp_best_char_choices, word, &fixpt,
                                  ASSOCIATOR_CALLER, &modified_blobs) &&
          AcceptablePath(*vse)) {
        acceptable_choice_found_ = true;
      }
      // Update best_choice_bundle.
      *(best_choice_bundle->best_choice) = *word;
      best_choice_bundle->updated = true;
      best_choice_bundle->best_char_choices->delete_data_pointers();
      best_choice_bundle->best_char_choices->clear();
      for (i = 0; i < temp_best_char_choices.size(); ++i) {
        BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
        cc_list->deep_copy(temp_best_char_choices[i], &BLOB_CHOICE::deep_copy);
        best_choice_bundle->best_char_choices->push_back(cc_list);
      }
      best_choice_bundle->best_state->part2 = temp_state.part2;
      best_choice_bundle->best_state->part1 = temp_state.part1;
      if (language_model_debug_level > 0) {
        tprintf("Updated best choice\n");
        print_state("New state ", best_choice_bundle->best_state,
                    chunks_record->ratings->dimension()-1);
      }
      // Update hyphen state if we are dealing with a dictionary word.
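      // Note: if the word ends in a hyphen, the active dawg state is saved
      // so that the next word can resume the dawg traversal mid-word;
      // otherwise any stale hyphen state is cleared.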
      if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
        if (dict_->has_hyphen_end(*word)) {
          dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs),
                                 *(dawg_args_->constraints));
        } else {
          dict_->reset_hyphen_vars(true);
        }
      }
      best_choice_bundle->best_vse = vse;
      best_choice_bundle->best_b = b;
      best_choice_bundle->fixpt = fixpt;

      if (blamer_bundle != NULL) {
        blamer_bundle->best_choice_is_dict_and_top_choice =
            (vse->dawg_info != NULL &&
             dict_->GetMaxFixedLengthDawgIndex() < 0 &&
             (vse->top_choice_flags));
      }
    }
  }
  if (blamer_bundle != NULL) {
    // Record the current hypothesis in params_training_bundle.
    ParamsTrainingHypothesis &hyp =
        blamer_bundle->params_training_bundle.AddHypothesis();
    word->string_and_lengths(&(hyp.str), NULL);
    ExtractRawFeaturesFromPath(*vse, hyp.features);
    if (truth_path &&  // update best truth path rating
        word->rating() < blamer_bundle->best_correctly_segmented_rating) {
      blamer_bundle->best_correctly_segmented_rating = word->rating();
    }
  }

  // Clean up.
  delete[] certainties;
  delete word;
}

void LanguageModel::ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse,
                                               float *features) {
  for (int f = 0; f < PTRAIN_NUM_RAW_FEATURE_TYPES; ++f) features[f] = 0.0;
  // Dictionary-related features.
  if (vse.dawg_info != NULL) {
    features[PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE] = vse.dawg_info->permuter;

    // Mark as unambiguous if unambig_dawg is found among active dawgs.
    for (int d = 0; d < vse.dawg_info->active_dawgs->size(); ++d) {
      if (dict_->GetDawg(vse.dawg_info->active_dawgs->get(d).dawg_index) ==
          dict_->GetUnambigDawg()) {
        features[PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH] = 1.0;
        break;
      }
    }
  }
  if (vse.associate_stats.shape_cost > 0) {
    features[PTRAIN_RAW_FEATURE_SHAPE_COST] = vse.associate_stats.shape_cost;
  }
  if (language_model_ngram_on) {
    ASSERT_HOST(vse.ngram_info != NULL);
    features[PTRAIN_RAW_FEATURE_NGRAM_PROB] = vse.ngram_info->ngram_prob;
  }
  // Consistency-related features.
  features[PTRAIN_RAW_FEATURE_NUM_BAD_PUNC] =
      vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_CASE] =
      vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE] =
      vse.consistency_info.NumInconsistentChartype();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  features[PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT] =
      vse.consistency_info.inconsistent_script;
  features[PTRAIN_RAW_FEATURE_NUM_BAD_FONT] =
      vse.consistency_info.inconsistent_font;
  // Classifier-related features.
  features[PTRAIN_RAW_FEATURE_WORST_CERT] = vse.min_certainty;
  features[PTRAIN_RAW_FEATURE_RATING] = vse.ratings_sum;
  features[PTRAIN_RAW_FEATURE_ADAPTED] = vse.adapted;
  // Normalization-related features.
  features[PTRAIN_RAW_FEATURE_NUM_UNICHARS] = vse.length;
  features[PTRAIN_RAW_FEATURE_OUTLINE_LEN] = vse.outline_length;
}

WERD_CHOICE *LanguageModel::ConstructWord(
    BLOB_CHOICE *b,
    ViterbiStateEntry *vse,
    CHUNKS_RECORD *chunks_record,
    BLOB_CHOICE_LIST_VECTOR *best_char_choices,
    float certainties[],
    float *dawg_score,
    STATE *state,
    BlamerBundle *blamer_bundle,
    bool *truth_path) {
  uinT64 state_uint = 0x0;
  if (truth_path != NULL) {
    *truth_path =
        (blamer_bundle != NULL &&
         !blamer_bundle->correct_segmentation_cols.empty() &&
         vse->length == blamer_bundle->correct_segmentation_cols.length());
  }
  const uinT64 kHighestBitOn = 0x8000000000000000LL;
  BLOB_CHOICE *curr_b = b;
  LanguageModelState *curr_lms =
      reinterpret_cast<LanguageModelState *>(b->language_model_state());
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound
  bool dawg_score_done = true;
  if (dawg_score != NULL) {
    *dawg_score = 0.0f;
    // For space-delimited languages the presence of dawg_info in the last
    // ViterbiStateEntry on the path means that the whole path represents
    // a valid dictionary word.
    if (dict_->GetMaxFixedLengthDawgIndex() < 0) {
      if (vse->dawg_info != NULL) *dawg_score = 1.0f;
    } else if (vse->length == 1) {
      *dawg_score = 1.0f;       // each one-letter word is legal
      dawg_score_done = true;   // in non-space delimited languages
    } else {
      dawg_score_done = false;  // do more work to compute dawg_score
    }
  }
  // For non-space delimited languages we compute the fraction of letters
  // "covered" by fixed length dawgs (i.e. words of length > 1 on the path).
  int covered_by_fixed_length_dawgs = 0;
  // The number of unichars remaining that should be skipped because
  // they are covered by the previous word from fixed length dawgs.
  int fixed_length_num_unichars_to_skip = 0;

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }

  // Construct a WERD_CHOICE by tracing parent pointers.
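  // The path is linked only backwards (each ViterbiStateEntry points at its
  // parent), so the word is filled in from the last unichar (i == length-1)
  // down to the first (i == 0) while following parent_b/parent_vse pointers.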
  WERD_CHOICE *word = new WERD_CHOICE(chunks_record->word_res->uch_set,
                                      vse->length);
  word->set_length(vse->length);
  for (i = (vse->length-1); i >= 0; --i) {
    if (blamer_bundle != NULL && truth_path != NULL && *truth_path) {
      if ((blamer_bundle->correct_segmentation_cols[i] !=
           curr_lms->contained_in_col) ||
          (blamer_bundle->correct_segmentation_rows[i] !=
           curr_lms->contained_in_row)) {
        *truth_path = false;
      }
    }
    word->set_unichar_id(curr_b->unichar_id(), i);
    word->set_fragment_length(1, i);
    if (certainties != NULL) certainties[i] = curr_b->certainty();
    if (best_char_choices != NULL) {
      best_char_choices->set(chunks_record->ratings->get(
          curr_lms->contained_in_col, curr_lms->contained_in_row), i);
    }
    if (state != NULL) {
      // Record row minus col zeroes in the reverse state to mark the number
      // of joins done by using a blob from this cell in the ratings matrix.
      state_uint >>= (curr_lms->contained_in_row - curr_lms->contained_in_col);
      // Record a one in the reverse state to indicate the split before
      // the blob from the next cell in the ratings matrix (unless we are
      // at the first cell, in which case there is no next blob).
      if (i != 0) {
        state_uint >>= 1;
        state_uint |= kHighestBitOn;
      }
    }
    // For non-space delimited languages: find word endings recorded while
    // trying to separate the current path into words (for words found in
    // fixed length dawgs).
    if (!dawg_score_done && curr_vse->dawg_info != NULL) {
      UpdateCoveredByFixedLengthDawgs(*(curr_vse->dawg_info->active_dawgs),
                                      i, vse->length,
                                      &fixed_length_num_unichars_to_skip,
                                      &covered_by_fixed_length_dawgs,
                                      dawg_score, &dawg_score_done);
    }
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
      }
    }

    // Mark the word as compound if compound permuter was set for any of
    // the unichars on the path (usually this will happen for unichars
    // that are compounding operators, like "-" and "/").
    if (!compound && curr_vse->dawg_info &&
        curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;

    // Update curr_* pointers.
    if (curr_vse->parent_b == NULL) break;
    curr_b = curr_vse->parent_b;
    curr_lms =
        reinterpret_cast<LanguageModelState *>(curr_b->language_model_state());
    curr_vse = curr_vse->parent_vse;
  }
  ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids
  // Re-adjust shape cost to include the updated width-to-height variance.
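  // full_wh_ratio_var was subtracted from shape_cost before the loop and has
  // just been recomputed against full_wh_ratio_mean, the mean over the whole
  // word, so adding it back bases the shape cost on whole-word statistics.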
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }

  if (state != NULL) {
    state_uint >>= (64 - (chunks_record->ratings->dimension()-1));
    state->part2 = state_uint;
    state_uint >>= 32;
    state->part1 = state_uint;
  }
  word->set_rating(vse->ratings_sum);
  word->set_certainty(vse->min_certainty);
  if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
    word->set_permuter(NGRAM_PERM);
  } else if (vse->top_choice_flags) {
    word->set_permuter(TOP_CHOICE_PERM);
  } else {
    word->set_permuter(NO_PERM);
  }
  return word;
}

void LanguageModel::UpdateCoveredByFixedLengthDawgs(
    const DawgInfoVector &active_dawgs, int word_index, int word_length,
    int *skip, int *covered, float *dawg_score, bool *dawg_score_done) {
  if (language_model_debug_level > 3) {
    tprintf("UpdateCoveredByFixedLengthDawgs for index %d skip=%d"
            " word_length=%d\n", word_index, *skip, word_length);
  }

  if (*skip > 0) {
    --(*skip);
  } else {
    int best_index = -1;
    for (int d = 0; d < active_dawgs.size(); ++d) {
      int dawg_index = (active_dawgs[d]).dawg_index;
      if (dawg_index > dict_->GetMaxFixedLengthDawgIndex()) {
        // If active_dawgs of the last ViterbiStateEntry on the path
        // contain a non-fixed length dawg, this means that the whole
        // path represents a word from a non-fixed length word dawg.
        if (word_index == (word_length - 1)) {
          *dawg_score = 1.0f;
          *dawg_score_done = true;
          return;
        }
      } else if (dawg_index >= kMinFixedLengthDawgLength) {
        const Dawg *curr_dawg = dict_->GetDawg(dawg_index);
        ASSERT_HOST(curr_dawg != NULL);
        if ((active_dawgs[d]).ref != NO_EDGE &&
            curr_dawg->end_of_word((active_dawgs[d]).ref) &&
            dawg_index > best_index) {
          best_index = dawg_index;
        }

        if (language_model_debug_level > 3) {
          tprintf("dawg_index %d, ref %d, eow %d\n", dawg_index,
                  (active_dawgs[d]).ref,
                  ((active_dawgs[d]).ref != NO_EDGE &&
                   curr_dawg->end_of_word((active_dawgs[d]).ref)));
        }
      }
    }  // end for
    if (best_index != -1) {
      *skip = best_index - 1;
      *covered += best_index;
    }
  }  // end if/else skip

  if (word_index == 0) {
    ASSERT_HOST(*covered <= word_length);
    *dawg_score = (static_cast<float>(*covered) /
                   static_cast<float>(word_length));
    *dawg_score_done = true;
  }
}

void LanguageModel::GeneratePainPointsFromColumn(
    int col,
    const GenericVector<int> &non_empty_rows,
    float best_choice_cert,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record) {
  for (int i = 0; i < non_empty_rows.length(); ++i) {
    int row = non_empty_rows[i];
    if (language_model_debug_level > 0) {
      tprintf("\nLooking for pain points in col=%d row=%d\n", col, row);
    }
    if (language_model_ngram_on) {
      GenerateNgramModelPainPointsFromColumn(
          col, row, pain_points, chunks_record);
    } else {
      GenerateProblematicPathPainPointsFromColumn(
          col, row, best_choice_cert, pain_points,
          best_path_by_column, chunks_record);
    }
  }
}
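// Illustration (a sketch, not part of the original file): the dawg_score
// produced by ConstructWord() and UpdateCoveredByFixedLengthDawgs() above is
// simply the fraction of unichars on the path that are covered by dictionary
// words of length > 1. The hypothetical helper below repeats that arithmetic
// in isolation; e.g. covered = 4 of word_length = 6 gives 4/6 ~ 0.67f, while
// a fully covered path gives the 1.0f that short-circuits the computation.
static float DawgCoverageScoreSketch(int covered, int word_length) {
  // Mirrors the final step of UpdateCoveredByFixedLengthDawgs().
  return static_cast<float>(covered) / static_cast<float>(word_length);
}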
void LanguageModel::GenerateNgramModelPainPointsFromColumn(
    int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record) {
  // Find the first top choice path recorded for this cell.
  // If this path is pruned - generate a pain point.
  ASSERT_HOST(chunks_record->ratings->get(col, row) != NULL);
  BLOB_CHOICE_IT bit(chunks_record->ratings->get(col, row));
  bool fragmented = false;
  for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
    if (dict_->getUnicharset().get_fragment(bit.data()->unichar_id())) {
      fragmented = true;
      continue;
    }
    LanguageModelState *lms = reinterpret_cast<LanguageModelState *>(
        bit.data()->language_model_state());
    if (lms == NULL) continue;
    ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      const ViterbiStateEntry *vse = vit.data();
      if (!vse->top_choice_flags) continue;
      ASSERT_HOST(vse->ngram_info != NULL);
      if (vse->ngram_info->pruned && (vse->parent_vse == NULL ||
                                      !vse->parent_vse->ngram_info->pruned)) {
        if (vse->parent_vse != NULL) {
          LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
              vse->parent_b->language_model_state());
          GeneratePainPoint(pp_lms->contained_in_col, row, false,
                            kDefaultPainPointPriorityAdjustment,
                            -1.0f, fragmented, -1.0f,
                            max_char_wh_ratio_,
                            vse->parent_vse->parent_b,
                            vse->parent_vse->parent_vse,
                            chunks_record, pain_points);
        }
        if (vse->parent_vse != NULL &&
            vse->parent_vse->parent_vse != NULL &&
            dict_->getUnicharset().get_ispunctuation(
                vse->parent_b->unichar_id())) {
          // If the dip in the ngram probability is due to punctuation in the
          // middle of the word - go two unichars back to combine this
          // punctuation mark with the previous character on the path.
          LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
              vse->parent_vse->parent_b->language_model_state());
          GeneratePainPoint(pp_lms->contained_in_col, col-1, false,
                            kDefaultPainPointPriorityAdjustment,
                            -1.0f, fragmented, -1.0f,
                            max_char_wh_ratio_,
                            vse->parent_vse->parent_vse->parent_b,
                            vse->parent_vse->parent_vse->parent_vse,
                            chunks_record, pain_points);
        } else if (row+1 < chunks_record->ratings->dimension()) {
          GeneratePainPoint(col, row+1, true,
                            kDefaultPainPointPriorityAdjustment,
                            -1.0f, fragmented, -1.0f,
                            max_char_wh_ratio_,
                            vse->parent_b,
                            vse->parent_vse,
                            chunks_record, pain_points);
        }
      }
      return;  // examined the lowest cost top choice path
    }
  }
}

void LanguageModel::GenerateProblematicPathPainPointsFromColumn(
    int col, int row, float best_choice_cert,
    HEAP *pain_points, BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record) {
  MATRIX *ratings = chunks_record->ratings;

  // Get the best path from this matrix cell.
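  // A cell's viterbi_state_entries list is maintained in increasing order of
  // cost, so after skipping fragment choices the first entry examined below
  // is the lowest-cost path ending at this cell.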
  BLOB_CHOICE_LIST *blist = ratings->get(col, row);
  ASSERT_HOST(blist != NULL);
  if (blist->empty()) return;
  BLOB_CHOICE_IT bit(blist);
  bool fragment = false;
  while (dict_->getUnicharset().get_fragment(bit.data()->unichar_id()) &&
         !bit.at_last()) {  // skip fragments
    fragment = true;
    bit.forward();
  }
  if (bit.data()->language_model_state() == NULL) return;
  ViterbiStateEntry_IT vit(&(reinterpret_cast<LanguageModelState *>(
      bit.data()->language_model_state())->viterbi_state_entries));
  if (vit.empty()) return;
  ViterbiStateEntry *vse = vit.data();
  // Check whether this path is promising.
  bool path_is_promising = true;
  if (vse->parent_vse != NULL) {
    float potential_avg_cost =
        ((vse->parent_vse->cost + bit.data()->rating()*0.5f) /
         static_cast<float>(row+1));
    if (language_model_debug_level > 0) {
      tprintf("potential_avg_cost %g best cost %g\n",
              potential_avg_cost, (*best_path_by_column)[col].avg_cost);
    }
    if (potential_avg_cost >= (*best_path_by_column)[col].avg_cost) {
      path_is_promising = false;
    }
  }
  // Set best_parent_vse to the best parent recorded in best_path_by_column.
  ViterbiStateEntry *best_parent_vse = vse->parent_vse;
  BLOB_CHOICE *best_parent_b = vse->parent_b;
  if (col > 0 && (*best_path_by_column)[col-1].best_vse != NULL) {
    ASSERT_HOST((*best_path_by_column)[col-1].best_b != NULL);
    LanguageModelState *best_lms = reinterpret_cast<LanguageModelState *>(
        ((*best_path_by_column)[col-1].best_b)->language_model_state());
    if (best_lms->contained_in_row == col-1) {
      best_parent_vse = (*best_path_by_column)[col-1].best_vse;
      best_parent_b = (*best_path_by_column)[col-1].best_b;
      if (language_model_debug_level > 0) {
        tprintf("Setting best_parent_vse to %p\n", best_parent_vse);
      }
    }
  }
  // Check whether this entry terminates the best parent path
  // recorded in best_path_by_column.
  bool best_not_prolonged = (best_parent_vse != vse->parent_vse);

  // If this path is problematic because of the last unichar - generate
  // a pain point to combine it with its left and right neighbor.
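  // "Left" here means the cell (col-1, row), which extends the current blob
  // by one chunk to the left; "right" means (col, row+1), which extends it
  // by one chunk to the right, as the two GeneratePainPoint() calls below
  // show.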
  BLOB_CHOICE_IT tmp_bit;
  if (best_not_prolonged ||
      (path_is_promising &&
       ProblematicPath(*vse, bit.data()->unichar_id(),
                       row+1 == ratings->dimension()))) {
    float worst_piece_cert;
    bool fragmented;
    if (col-1 > 0) {
      GetWorstPieceCertainty(col-1, row, chunks_record->ratings,
                             &worst_piece_cert, &fragmented);
      GeneratePainPoint(col-1, row, false,
                        kDefaultPainPointPriorityAdjustment,
                        worst_piece_cert, fragmented, best_choice_cert,
                        max_char_wh_ratio_, best_parent_b, best_parent_vse,
                        chunks_record, pain_points);
    }
    if (row+1 < ratings->dimension()) {
      GetWorstPieceCertainty(col, row+1, chunks_record->ratings,
                             &worst_piece_cert, &fragmented);
      GeneratePainPoint(col, row+1, true, kDefaultPainPointPriorityAdjustment,
                        worst_piece_cert, fragmented, best_choice_cert,
                        max_char_wh_ratio_, best_parent_b, best_parent_vse,
                        chunks_record, pain_points);
    }
  }  // end if ProblematicPath
}

void LanguageModel::GeneratePainPointsFromBestChoice(
    HEAP *pain_points,
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle) {
  // Variables to backtrack the best_vse path.
  ViterbiStateEntry *curr_vse = best_choice_bundle->best_vse;
  BLOB_CHOICE *curr_b = best_choice_bundle->best_b;

  // The begin/end fields of the DANGERR vector entries record positions in
  // the blob choice list of the best choice. We need to translate these
  // endpoints into the beginning column and ending row for the pain points.
  // We maintain danger_begin and danger_end arrays indexed by the position
  // in best_choice_bundle->best_char_choices (which is equal to the position
  // on the best_choice_bundle->best_vse path).
  // danger_end[d] stores the DANGERR_INFO structs with end==d and is
  // initialized at the beginning of this function.
  // danger_begin[d] stores the DANGERR_INFO structs with begin==d, with
  // their end set to the row of the end of this ambiguity.
  // The translation from end in terms of the best choice index to the end
  // row is done while following the parents of best_choice_bundle->best_vse.
  assert(best_choice_bundle->best_char_choices->length() ==
         best_choice_bundle->best_vse->length);
  DANGERR *danger_begin = NULL;
  DANGERR *danger_end = NULL;
  int d;
  if (!best_choice_bundle->fixpt.empty()) {
    danger_begin = new DANGERR[best_choice_bundle->best_vse->length];
    danger_end = new DANGERR[best_choice_bundle->best_vse->length];
    for (d = 0; d < best_choice_bundle->fixpt.size(); ++d) {
      const DANGERR_INFO &danger = best_choice_bundle->fixpt[d];
      // Only use n->1 ambiguities.
      if (danger.end > danger.begin && !danger.correct_is_ngram &&
          (!language_model_ngram_on || danger.dangerous)) {
        danger_end[danger.end].push_back(danger);
      }
    }
  }

  // Variables to keep track of punctuation/number streaks.
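  // A streak is a maximal run of consecutive non-alphabetic unichars on the
  // best path. While backtracking we remember where the streak ends, how
  // long it is, and its worst certainty; once a letter (or the start of the
  // word) is reached, the whole streak becomes a pain point candidate.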
  int punc_streak_end_row = -1;
  int punc_streak_length = 0;
  float punc_streak_min_cert = 0.0f;

  if (language_model_debug_level > 0) {
    tprintf("\nGenerating pain points for best path=%p\n", curr_vse);
  }

  int word_index = best_choice_bundle->best_vse->length;
  while (curr_vse != NULL) {
    word_index--;
    ASSERT_HOST(word_index >= 0);
    ASSERT_HOST(curr_b != NULL);
    if (language_model_debug_level > 0) {
      tprintf("Looking at unichar %s\n",
              dict_->getUnicharset().id_to_unichar(curr_b->unichar_id()));
    }

    int pp_col = reinterpret_cast<LanguageModelState *>(
        curr_b->language_model_state())->contained_in_col;
    int pp_row = reinterpret_cast<LanguageModelState *>(
        curr_b->language_model_state())->contained_in_row;

    // Generate pain points for ambiguities found by NoDangerousAmbig().
    if (danger_end != NULL) {
      // Translate end index of an ambiguity to an end row.
      for (d = 0; d < danger_end[word_index].size(); ++d) {
        danger_end[word_index][d].end = pp_row;
        danger_begin[danger_end[word_index][d].begin].push_back(
            danger_end[word_index][d]);
      }
      // Generate a pain point to combine unichars in the "wrong" part
      // of the ambiguity.
      for (d = 0; d < danger_begin[word_index].size(); ++d) {
        if (language_model_debug_level > 0) {
          tprintf("Generating pain point from %sambiguity\n",
                  danger_begin[word_index][d].dangerous ? "dangerous " : "");
        }
        GeneratePainPoint(pp_col, danger_begin[word_index][d].end, false,
                          danger_begin[word_index][d].dangerous ?
                          kCriticalPainPointPriorityAdjustment :
                          kBestChoicePainPointPriorityAdjustment,
                          best_choice_bundle->best_choice->certainty(), true,
                          best_choice_bundle->best_choice->certainty(),
                          kLooseMaxCharWhRatio,
                          curr_vse->parent_b, curr_vse->parent_vse,
                          chunks_record, pain_points);
      }
    }

    if (!language_model_ngram_on) {  // no need to use further heuristics if
                                     // we are guided by the character ngram
                                     // model
      // Generate pain points for problematic sub-paths.
      if (ProblematicPath(*curr_vse, curr_b->unichar_id(),
                          pp_row+1 == chunks_record->ratings->dimension())) {
        if (language_model_debug_level > 0) {
          tprintf("Generating pain point from a problematic sub-path\n");
        }
        float worst_piece_cert;
        bool fragment;
        if (pp_col > 0) {
          GetWorstPieceCertainty(pp_col-1, pp_row, chunks_record->ratings,
                                 &worst_piece_cert, &fragment);
          GeneratePainPoint(pp_col-1, pp_row, false,
                            kBestChoicePainPointPriorityAdjustment,
                            worst_piece_cert, true,
                            best_choice_bundle->best_choice->certainty(),
                            max_char_wh_ratio_, NULL, NULL,
                            chunks_record, pain_points);
        }
        if (pp_row+1 < chunks_record->ratings->dimension()) {
          GetWorstPieceCertainty(pp_col, pp_row+1, chunks_record->ratings,
                                 &worst_piece_cert, &fragment);
          GeneratePainPoint(pp_col, pp_row+1, true,
                            kBestChoicePainPointPriorityAdjustment,
                            worst_piece_cert, true,
                            best_choice_bundle->best_choice->certainty(),
                            max_char_wh_ratio_, NULL, NULL,
                            chunks_record, pain_points);
        }
      }

      // Generate a pain point if we encountered a punctuation/number streak
      // to combine all punctuation marks into one blob and try to classify
      // it.
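      // The streak spans the chunks from column pp_row+1 to row
      // punc_streak_end_row of the ratings matrix, i.e. the single cell
      // that the pain point below asks the classifier to evaluate.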
      bool is_alpha = dict_->getUnicharset().get_isalpha(curr_b->unichar_id());
      if (!is_alpha) {
        if (punc_streak_end_row == -1) punc_streak_end_row = pp_row;
        punc_streak_length++;
        if (curr_b->certainty() < punc_streak_min_cert)
          punc_streak_min_cert = curr_b->certainty();
      }
      if (is_alpha || curr_vse->parent_vse == NULL) {
        if (punc_streak_end_row != -1 && punc_streak_length > 1) {
          if (language_model_debug_level > 0) {
            tprintf("Generating pain point from a punctuation streak\n");
          }
          if (is_alpha ||
              (curr_vse->parent_vse == NULL && punc_streak_length > 2)) {
            GeneratePainPoint(pp_row+1, punc_streak_end_row, false,
                              kBestChoicePainPointPriorityAdjustment,
                              punc_streak_min_cert, true,
                              best_choice_bundle->best_choice->certainty(),
                              max_char_wh_ratio_, curr_b, curr_vse,
                              chunks_record, pain_points);
          }
          // Handle punctuation/number streaks starting from the first
          // unichar.
          if (curr_vse->parent_vse == NULL) {
            GeneratePainPoint(0, punc_streak_end_row, false,
                              kBestChoicePainPointPriorityAdjustment,
                              punc_streak_min_cert, true,
                              best_choice_bundle->best_choice->certainty(),
                              max_char_wh_ratio_, NULL, NULL,
                              chunks_record, pain_points);
          }
        }
        punc_streak_end_row = -1;
        punc_streak_length = 0;
        punc_streak_min_cert = 0.0f;
      }  // end handling punctuation streaks
    }

    curr_b = curr_vse->parent_b;
    curr_vse = curr_vse->parent_vse;
  }  // end looking at best choice subpaths

  // Clean up.
  if (danger_end != NULL) {
    delete[] danger_begin;
    delete[] danger_end;
  }
}

bool LanguageModel::GeneratePainPoint(
    int col, int row, bool ok_to_extend, float priority,
    float worst_piece_cert, bool fragmented, float best_choice_cert,
    float max_char_wh_ratio,
    BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse,
    CHUNKS_RECORD *chunks_record, HEAP *pain_points) {
  if (col < 0 || row >= chunks_record->ratings->dimension() ||
      chunks_record->ratings->get(col, row) != NOT_CLASSIFIED) {
    return false;
  }
  if (language_model_debug_level > 3) {
    tprintf("\nGenerating pain point for col=%d row=%d priority=%g parent=",
            col, row, priority);
    if (parent_vse != NULL) {
      PrintViterbiStateEntry("", parent_vse, parent_b, chunks_record);
    } else {
      tprintf("NULL");
    }
    tprintf("\n");
  }

  AssociateStats associate_stats;
  ComputeAssociateStats(col, row, max_char_wh_ratio, parent_vse,
                        chunks_record, &associate_stats);
  // For fixed-pitch fonts/languages: if the current combined blob overlaps
  // the next blob on the right and it is ok to extend the blob, try expanding
  // the blob until there is no overlap with the next blob on the right or
  // until the width-to-height ratio becomes too large.
  if (ok_to_extend) {
    while (associate_stats.bad_fixed_pitch_right_gap &&
           row+1 < chunks_record->ratings->dimension() &&
           !associate_stats.bad_fixed_pitch_wh_ratio) {
      ComputeAssociateStats(col, ++row, max_char_wh_ratio, parent_vse,
                            chunks_record, &associate_stats);
    }
  }

  if (associate_stats.bad_shape) {
    if (language_model_debug_level > 3) {
      tprintf("Discarded pain point with a bad shape\n");
    }
    return false;
  }

  // Compute pain point priority.
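  // In outline: the incoming priority is scaled by the shape cost (when
  // non-zero) and by CertaintyScore() of the better (larger) of
  // worst_piece_cert and best_choice_cert, halved for fragments, and further
  // reduced when the parent path is cheap or is already a dictionary word;
  // smaller priority values are retrieved from the pain_points heap first.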
  if (associate_stats.shape_cost > 0) {
    priority *= associate_stats.shape_cost;
  }
  if (worst_piece_cert < best_choice_cert) {
    worst_piece_cert = best_choice_cert;
  }
  priority *= CertaintyScore(worst_piece_cert);
  if (fragmented) priority /= kDefaultPainPointPriorityAdjustment;

  if (language_model_debug_level > 3) {
    tprintf("worst_piece_cert=%g fragmented=%d\n",
            worst_piece_cert, fragmented);
  }

  if (parent_vse != NULL) {
    priority *= sqrt(parent_vse->cost / static_cast<float>(col));
    if (parent_vse->dawg_info != NULL) {
      priority /= kDefaultPainPointPriorityAdjustment;
      if (parent_vse->length > language_model_min_compound_length) {
        priority /= sqrt(static_cast<double>(parent_vse->length));
      }
    }
  }

  MATRIX_COORD *pain_point = new MATRIX_COORD(col, row);
  if (HeapPushCheckSize(pain_points, priority, pain_point)) {
    if (language_model_debug_level) {
      tprintf("Added pain point with priority %g\n", priority);
    }
    return true;
  } else {
    delete pain_point;
    if (language_model_debug_level) tprintf("Pain points heap is full\n");
    return false;
  }
}

}  // namespace tesseract
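// Worked example (a sketch, not part of language_model.cpp): how
// ConstructWord() above packs a segmentation into the 64-bit reverse state
// that ends up in STATE::part1/part2. Walking the path backwards, each
// unichar contributes (row - col) zero bits, one per join inside its
// combined blob, and every unichar except the first contributes a single
// one bit marking the split that precedes it; the accumulated bits are then
// aligned to the matrix dimension and split into two 32-bit halves. The
// hypothetical helper below repeats that arithmetic for a path given as
// (col, row) pairs, assuming the uinT32/uinT64 typedefs from Tesseract's
// host.h.
static void PackReverseStateSketch(const int *cols, const int *rows,
                                   int length, int matrix_dimension,
                                   uinT32 *part1, uinT32 *part2) {
  const uinT64 kHighestBitOn = 0x8000000000000000LL;
  uinT64 state_uint = 0;
  for (int i = length - 1; i >= 0; --i) {
    state_uint >>= (rows[i] - cols[i]);  // zeroes mark joins inside the blob
    if (i != 0) {                        // a one marks the preceding split
      state_uint >>= 1;
      state_uint |= kHighestBitOn;
    }
  }
  state_uint >>= (64 - (matrix_dimension - 1));
  *part2 = static_cast<uinT32>(state_uint);
  *part1 = static_cast<uinT32>(state_uint >> 32);
}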