Tesseract 3.02
tesseract-ocr/wordrec/language_model.cpp

// File:        language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help recognition.
// Author:      Daria Antonova
// Created:     Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include <math.h>

#include "language_model.h"

#include "dawg.h"
#include "intproto.h"
#include "matrix.h"
#include "params.h"
#include "params_training_featdef.h"

namespace tesseract {

ELISTIZE(ViterbiStateEntry);

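// Tuning constants for the segmentation search (my reading of their roles,
// from their uses in this file): the *PainPointPriorityAdjustment values
// scale the priority of pain points generated from different sources, and
// kMaxAvgNgramCost caps the average per-unichar ngram cost of paths the
// search is willing to accept.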
const float LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f;
const float LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f;
const float LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f;
const float LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f;
const float LanguageModel::kMaxAvgNgramCost = 25.0f;
const int LanguageModel::kMinFixedLengthDawgLength = 2;
const float LanguageModel::kLooseMaxCharWhRatio = 2.5f;

LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
                             Dict *dict)
  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
               dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_ngram_on, false,
                     "Turn on/off the use of character ngram model",
                     dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_ngram_order, 8,
               "Maximum order of the character ngram model",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
               "Maximum number of prunable (those for which"
               " PrunablePath() is true) entries in each viterbi list"
               " recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_size, 500,
               "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_small_prob, 0.000001,
                  "To avoid overly small denominators use this as the "
                  "floor of the probability returned by the ngram model.",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                  "Average classifier score of a non-matching unichar.",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                "Use only the first UTF8 step of the given string"
                " when computing log probabilities.",
                dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_scale_factor, 0.03,
                  "Strength of the character ngram model relative to the"
                  " character classifier ",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                "Words are delimited by space",
                dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_min_compound_length, 3,
               "Minimum length of compound words",
               dict->getImage()->getCCUtil()->params()),
    INT_MEMBER(language_model_fixed_length_choices_depth, 3,
               "Depth of blob choice lists to explore"
               " when fixed length dawgs are on",
               dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                  "Penalty for words not in the frequent word dictionary",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                  "Penalty for non-dictionary words",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_punc, 0.2,
                  "Penalty for inconsistent punctuation",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_case, 0.1,
                  "Penalty for inconsistent case",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_script, 0.5,
                  "Penalty for inconsistent script",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_chartype, 0.3,
                  "Penalty for inconsistent character type",
                  dict->getImage()->getCCUtil()->params()),
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    double_MEMBER(language_model_penalty_font, 0.00,
                  "Penalty for inconsistent font",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_spacing, 0.05,
                  "Penalty for inconsistent spacing",
                  dict->getImage()->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_increment, 0.01,
                  "Penalty increment",
                  dict->getImage()->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                     "Use sigmoidal score for certainty",
                     dict->getImage()->getCCUtil()->params()),
  fontinfo_table_(fontinfo_table), dict_(dict),
  fixed_pitch_(false), max_char_wh_ratio_(0.0),
  acceptable_choice_found_(false) {
  ASSERT_HOST(dict_ != NULL);
  dawg_args_ = new DawgArgs(NULL, NULL, new DawgInfoVector(),
                            new DawgInfoVector(),
                            0.0, NO_PERM, kAnyWordLength, -1);
  beginning_active_dawgs_ = new DawgInfoVector();
  beginning_constraints_ = new DawgInfoVector();
  fixed_length_beginning_active_dawgs_ = new DawgInfoVector();
  empty_dawg_info_vec_ = new DawgInfoVector();
}

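// Note on ownership: the destructor below deletes only the vectors allocated
// in the constructor above. dawg_args_->active_dawgs and
// dawg_args_->constraints are not deleted here because they alias the
// beginning_* vectors or vectors owned by LanguageModelDawgInfo objects
// (see GenerateDawgInfo()).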
LanguageModel::~LanguageModel() {
  delete beginning_active_dawgs_;
  delete beginning_constraints_;
  delete fixed_length_beginning_active_dawgs_;
  delete empty_dawg_info_vec_;
  delete dawg_args_->updated_active_dawgs;
  delete dawg_args_->updated_constraints;
  delete dawg_args_;
}

void LanguageModel::InitForWord(
    const WERD_CHOICE *prev_word,
    bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio,
    float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record,
    BlamerBundle *blamer_bundle, bool debug_blamer) {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // For each cell, generate a "pain point" if the cell is not classified
  // and has a left or right neighbor that was classified.
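  // (The ratings MATRIX is upper-triangular: cell (col, row) holds the
  // classification of the blob made of chunks col..row, so the neighbors
  // checked below differ from this cell by exactly one chunk.)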
  MATRIX *ratings = chunks_record->ratings;
  for (int col = 0; col < ratings->dimension(); ++col) {
    for (int row = col+1; row < ratings->dimension(); ++row) {
      if ((row > 0 && ratings->get(col, row-1) != NOT_CLASSIFIED) ||
          (col+1 < ratings->dimension() &&
           ratings->get(col+1, row) != NOT_CLASSIFIED)) {
        float worst_piece_cert;
        bool fragmented;
        GetWorstPieceCertainty(col, row, chunks_record->ratings,
                               &worst_piece_cert, &fragmented);
        GeneratePainPoint(col, row, true, kInitialPainPointPriorityAdjustment,
                          worst_piece_cert, fragmented, best_choice_cert,
                          max_char_wh_ratio_, NULL, NULL,
                          chunks_record, pain_points);
      }
    }
  }

  // Initialize vectors with beginning DawgInfos.
  beginning_active_dawgs_->clear();
  dict_->init_active_dawgs(kAnyWordLength, beginning_active_dawgs_, false);
  beginning_constraints_->clear();
  dict_->init_constraints(beginning_constraints_);
  if (dict_->GetMaxFixedLengthDawgIndex() >= 0) {
    fixed_length_beginning_active_dawgs_->clear();
    for (int i = 0; i < beginning_active_dawgs_->size(); ++i) {
      int dawg_index = (*beginning_active_dawgs_)[i].dawg_index;
      if (dawg_index <= dict_->GetMaxFixedLengthDawgIndex() &&
          dawg_index >= kMinFixedLengthDawgLength) {
        *fixed_length_beginning_active_dawgs_ += (*beginning_active_dawgs_)[i];
      }
    }
  }

  max_penalty_adjust_ = (dict_->segment_penalty_dict_nonword -
                         dict_->segment_penalty_dict_case_ok);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ += ' ';
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
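    // Count the unichars in prev_word_str_ by stepping through it one UTF-8
    // character at a time. UNICHAR::utf8_step() returns 0 on invalid UTF-8,
    // which would stop the loop early and trip the ASSERT_HOST below.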
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL &&
      blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
      blamer_bundle->truth_has_char_boxes) {
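    // Walk the chunk list and the truth boxes in parallel: a truth box is
    // considered matched when some chunk's right edge falls within
    // norm_box_tolerance of the box's right edge and the next chunk lies
    // beyond it; each match records a (col, row) pair of the correct
    // segmentation.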
    STRING blamer_debug;
    blamer_debug += "Blamer computing correct_segmentation_cols\n";
    int curr_box_col = 0;
    int next_box_col = 0;
    TBLOB *blob = chunks_record->chunks;
    inT16 next_box_x = (blob != NULL) ? blob->bounding_box().right() : 0;
    for (int truth_idx = 0;
        blob != NULL && truth_idx < blamer_bundle->norm_truth_word.length();
        blob = blob->next) {
      ++next_box_col;
      inT16 curr_box_x = next_box_x;
      if (blob->next != NULL) next_box_x = blob->next->bounding_box().right();
      inT16 truth_x = blamer_bundle->norm_truth_word.BlobBox(truth_idx).right();
      blamer_debug.add_str_int("Box x coord vs. truth: ", curr_box_x);
      blamer_debug.add_str_int(" ", truth_x);
      blamer_debug += "\n";
      if (curr_box_x > (truth_x + blamer_bundle->norm_box_tolerance)) {
        break;  // failed to find a matching box
      } else if (curr_box_x >=
                  (truth_x - blamer_bundle->norm_box_tolerance) &&  // matched
                  (blob->next == NULL ||  // next box can't be included
                   next_box_x > truth_x + blamer_bundle->norm_box_tolerance)) {
        blamer_bundle->correct_segmentation_cols.push_back(curr_box_col);
        blamer_bundle->correct_segmentation_rows.push_back(next_box_col-1);
        ++truth_idx;
        blamer_debug.add_str_int("col=", curr_box_col);
        blamer_debug.add_str_int(" row=", next_box_col-1);
        blamer_debug += "\n";
        curr_box_col = next_box_col;
      }
    }
    if (blob != NULL ||  // trailing blobs
        blamer_bundle->correct_segmentation_cols.length() !=
        blamer_bundle->norm_truth_word.length()) {
      blamer_debug.add_str_int("Blamer failed to find correct segmentation"
          " (tolerance=", blamer_bundle->norm_box_tolerance);
      if (blob == NULL) blamer_debug += " blob == NULL";
      blamer_debug += ")\n";
      blamer_debug.add_str_int(
          " path length ", blamer_bundle->correct_segmentation_cols.length());
      blamer_debug.add_str_int(" vs. truth ",
                               blamer_bundle->norm_truth_word.length());
      blamer_debug += "\n";
      blamer_bundle->SetBlame(IRR_UNKNOWN, blamer_debug, NULL, debug_blamer);
      blamer_bundle->correct_segmentation_cols.clear();
      blamer_bundle->correct_segmentation_rows.clear();
    }
  } // end if (blamer_bundle != NULL)

  // Start a new hypothesis list for this run of segmentation search.
  if (blamer_bundle != NULL) {
    blamer_bundle->params_training_bundle.StartHypothesisList();
  }
}

void LanguageModel::CleanUp() {
  for (int i = 0; i < updated_flags_.size(); ++i) {
    *(updated_flags_[i]) = false;
  }
  updated_flags_.clear();
}

void LanguageModel::DeleteState(BLOB_CHOICE_LIST *choices) {
  BLOB_CHOICE_IT b_it(choices);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    if (b_it.data()->language_model_state() != NULL) {
      LanguageModelState *state = reinterpret_cast<LanguageModelState *>(
          b_it.data()->language_model_state());
      delete state;
      b_it.data()->set_language_model_state(NULL);
    }
  }
}

LanguageModelFlagsType LanguageModel::UpdateState(
    LanguageModelFlagsType changed,
    int curr_col, int curr_row,
    BLOB_CHOICE_LIST *curr_list,
    BLOB_CHOICE_LIST *parent_list,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d (changed=0x%x parent=%p)\n",
            curr_col, curr_row, changed, parent_list);
  }
  // Initialize helper variables.
  bool word_end = (curr_row+1 >= chunks_record->ratings->dimension());
  bool just_classified = (changed & kJustClassifiedFlag);
  LanguageModelFlagsType new_changed = 0x0;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
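  // denom is only used by the ngram model: ComputeDenom() approximates the
  // total certainty mass of this ratings cell, so CertaintyScore()/denom in
  // ComputeNgramCost() behaves like a normalized probability.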

  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  int c_it_counter = 0;
  bool first_iteration = true;
  BLOB_CHOICE *first_lower = NULL;
  BLOB_CHOICE *first_upper = NULL;
  GetTopChoiceLowerUpper(changed, curr_list, &first_lower, &first_upper);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    if (dict_->GetMaxFixedLengthDawgIndex() >= 0 &&
        c_it_counter++ >= language_model_fixed_length_choices_depth) {
      break;
    }
    // Skip NULL unichars unless it is the only choice.
    if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    if (dict_->getUnicharset().get_fragment(c_it.data()->unichar_id())) {
      continue;  // skip fragments
    }
    // Set top choice flags.
    LanguageModelFlagsType top_choice_flags = 0x0;
    if (first_iteration && (changed & kSmallestRatingFlag)) {
      top_choice_flags |= kSmallestRatingFlag;
    }
    if (first_lower == c_it.data()) top_choice_flags |= kLowerCaseFlag;
    if (first_upper == c_it.data()) top_choice_flags |= kUpperCaseFlag;

    if (parent_list == NULL) {  // process the beginning of a word
      new_changed |= AddViterbiStateEntry(
          top_choice_flags, denom, word_end, curr_col, curr_row,
          c_it.data(), NULL, NULL, pain_points, best_path_by_column,
          chunks_record, best_choice_bundle, blamer_bundle);
    } else {  // get viterbi entries from each of the parent BLOB_CHOICEs
      BLOB_CHOICE_IT p_it(parent_list);
      for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
        LanguageModelState *parent_lms =
          reinterpret_cast<LanguageModelState *>(
              p_it.data()->language_model_state());
        if (parent_lms == NULL || parent_lms->viterbi_state_entries.empty()) {
          continue;
        }
        vit.set_to_list(&(parent_lms->viterbi_state_entries));
        int vit_counter = 0;
        for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
          // Skip pruned entries and do not look at prunable entries if already
          // examined language_model_viterbi_list_max_num_prunable of those.
          if (PrunablePath(vit.data()->top_choice_flags,
                           vit.data()->dawg_info) &&
              (++vit_counter > language_model_viterbi_list_max_num_prunable ||
               (language_model_ngram_on && vit.data()->ngram_info->pruned))) {
            continue;
          }
          // Only consider the parent if it has been updated or
          // if the current ratings cell has just been classified.
          if (!just_classified && !vit.data()->updated) continue;
          // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
          // looks good according to the Dawgs or character ngram model.
          new_changed |= AddViterbiStateEntry(
              top_choice_flags, denom, word_end, curr_col, curr_row,
              c_it.data(), p_it.data(), vit.data(), pain_points,
              best_path_by_column, chunks_record,
              best_choice_bundle, blamer_bundle);
        }
      }  // done looking at parents for this c_it.data()
    }
    first_iteration = false;
  }
  return new_changed;
}

bool LanguageModel::ProblematicPath(const ViterbiStateEntry &vse,
                                    UNICHAR_ID unichar_id, bool word_end) {
  // The path is problematic if it is inconsistent and has a parent that
  // is consistent (or a NULL parent).
  if (!vse.Consistent() && (vse.parent_vse == NULL ||
                            vse.parent_vse->Consistent())) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: inconsistent\n");
    }
    return true;
  }
  // The path is problematic if it does not represent a dictionary word,
  // while its parent does.
  if (vse.dawg_info == NULL &&
      (vse.parent_vse == NULL || vse.parent_vse->dawg_info != NULL)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: dict word terminated\n");
    }
    return true;
  }
  // The path is problematic if ngram info indicates that this path is
  // an unlikely sequence of characters, while its parent does not have
  // extreme dips in ngram probabilities.
  if (vse.ngram_info != NULL && vse.ngram_info->pruned &&
      (vse.parent_vse == NULL || !vse.parent_vse->ngram_info->pruned)) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: small ngram prob\n");
    }
    return true;
  }
  // The path is problematic if there is a non-alpha character in the
  // middle of the path (unless it is a digit in the middle of a path
  // that represents a number).
  if ((vse.parent_vse != NULL) && !word_end &&              // is middle
      !(dict_->getUnicharset().get_isalpha(unichar_id) ||   // alpha
        (dict_->getUnicharset().get_isdigit(unichar_id) &&  // ok digit
         vse.dawg_info != NULL && vse.dawg_info->permuter == NUMBER_PERM))) {
    if (language_model_debug_level > 0) {
      tprintf("ProblematicPath: non-alpha middle\n");
    }
    return true;
  }
  return false;
}

void LanguageModel::GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
                                           BLOB_CHOICE_LIST *curr_list,
                                           BLOB_CHOICE **first_lower,
                                           BLOB_CHOICE **first_upper) {
  if (!(changed & kLowerCaseFlag || changed & kUpperCaseFlag)) return;
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = NULL;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == NULL) first_unichar = c_it.data();
    if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == NULL && unicharset.get_isupper(unichar_id)) {
      *first_upper = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != NULL);
  if (*first_lower == NULL) *first_lower = first_unichar;
  if (*first_upper == NULL) *first_upper = first_unichar;
}

LanguageModelFlagsType LanguageModel::AddViterbiStateEntry(
    LanguageModelFlagsType top_choice_flags,
    float denom,
    bool word_end,
    int curr_col, int curr_row,
    BLOB_CHOICE *b,
    BLOB_CHOICE *parent_b,
    ViterbiStateEntry *parent_vse,
    HEAP *pain_points,
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 0) {
    tprintf("\nAddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x parent_vse=%p\n",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags, parent_vse);
    if (language_model_debug_level > 3 && b->language_model_state() != NULL) {
      tprintf("Existing viterbi list:\n");
      vit.set_to_list(&(reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }
  // Check whether the list is full.
  if (b->language_model_state() != NULL &&
      (reinterpret_cast<LanguageModelState *>(
          b->language_model_state())->viterbi_state_entries_length >=
          language_model_viterbi_list_max_size)) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return 0x0;
  }

  LanguageModelFlagsType changed = 0x0;
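  // optimistic_cost is a lower-bound estimate of the final path cost (the
  // adjustments applied in ComputeAdjustedPathCost() add non-negative
  // penalties), so an entry whose optimistic cost already matches or exceeds
  // the best choice rating can be discarded early.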
  float optimistic_cost = 0.0f;
  if (!language_model_ngram_on) optimistic_cost += b->rating();
  if (parent_vse != NULL) optimistic_cost += parent_vse->cost;
  // Discard this entry if it will not beat best choice.
  if (optimistic_cost >= best_choice_bundle->best_choice->rating()) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %.4f"
              " best_choice->rating()=%.4f\n", optimistic_cost,
              best_choice_bundle->best_choice->rating());
    }
    return 0x0;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LanguageModelConsistencyInfo consistency_info;
  FillConsistencyInfo(curr_col, word_end, b, parent_vse, parent_b,
                      chunks_record, &consistency_info);

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
    GenerateDawgInfo(word_end, consistency_info.script_id,
                     curr_col, curr_row, *b, parent_vse, &changed);

  // Invoke TopChoice language model component
  float ratings_sum = b->rating();
  if (parent_vse != NULL) ratings_sum += parent_vse->ratings_sum;
  GenerateTopChoiceInfo(ratings_sum, dawg_info, consistency_info,
                        parent_vse, b, &top_choice_flags, &changed);

  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = NULL;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, parent_vse, parent_b, &changed);
    ASSERT_HOST(ngram_info != NULL);
  }

  // Prune non-top choice paths with inconsistent scripts.
  if (consistency_info.inconsistent_script) {
    if (!(top_choice_flags & kSmallestRatingFlag)) changed = 0x0;
    if (ngram_info != NULL) ngram_info->pruned = true;
  }

  // If language model components did not like this unichar - return
  if (!changed) {
    if (language_model_debug_level > 0) {
      tprintf("Language model components did not like this entry\n");
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, chunks_record, &associate_stats);
  if (parent_vse != NULL) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Compute the aggregate cost (adjusted ratings sum).
  float cost = ComputeAdjustedPathCost(
      ratings_sum,
      (parent_vse == NULL) ? 1 : (parent_vse->length+1),
      (dawg_info == NULL) ? 0.0f : 1.0f,
      dawg_info, ngram_info, consistency_info, associate_stats, parent_vse);

  if (b->language_model_state() == NULL) {
    b->set_language_model_state(new LanguageModelState(curr_col, curr_row));
  }
  LanguageModelState *lms =
    reinterpret_cast<LanguageModelState *>(b->language_model_state());

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(top_choice_flags, dawg_info) &&
      (lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      cost >= lms->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              cost, lms->viterbi_state_entries_prunable_max_cost);
    }
    delete dawg_info;
    delete ngram_info;
    return 0x0;
  }

  // Create the new ViterbiStateEntry and add it to lms->viterbi_state_entries
  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
      parent_b, parent_vse, b, cost, ComputeOutlineLength(b), consistency_info,
      associate_stats, top_choice_flags, dawg_info, ngram_info);
  updated_flags_.push_back(&(new_vse->updated));
  lms->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                        false, new_vse);
  lms->viterbi_state_entries_length++;
  if (PrunablePath(top_choice_flags, dawg_info)) {
    lms->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear the
  // top_choice_flags of entries whose ratings_sum is higher than
  // new_vse->ratings_sum.
  if ((lms->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) || top_choice_flags) {
    ASSERT_HOST(!lms->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have ratings_sum higher than new_vse->ratings_sum
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          ComputeConsistencyAdjustedRatingsSum(
              curr_vse->ratings_sum, curr_vse->dawg_info,
              curr_vse->consistency_info) >
          ComputeConsistencyAdjustedRatingsSum(
              new_vse->ratings_sum, new_vse->dawg_info,
              new_vse->consistency_info)) {
        curr_vse->top_choice_flags &= ~(top_choice_flags);
      }
      if (prunable_counter > 0 &&
          PrunablePath(curr_vse->top_choice_flags, curr_vse->dawg_info)) {
        --prunable_counter;
      }
      // Update lms->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        lms->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %.4f\n",
                  lms->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    PrintViterbiStateEntry("New", new_vse, b, chunks_record);
    if (language_model_debug_level > 3) {
      tprintf("Updated viterbi list (max cost %g):\n",
              lms->viterbi_state_entries_prunable_max_cost);
      vit.set_to_list(&(lms->viterbi_state_entries));
      for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
        PrintViterbiStateEntry("", vit.data(), b, chunks_record);
      }
    }
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(b, new_vse, pain_points, chunks_record,
                     best_choice_bundle, blamer_bundle);
  }

  // Update stats in best_path_by_column.
  if (new_vse->Consistent() || new_vse->dawg_info != NULL ||
      (new_vse->ngram_info != NULL && !new_vse->ngram_info->pruned)) {
    float avg_cost = new_vse->cost / static_cast<float>(curr_row+1);
    for (int c = curr_col; c <= curr_row; ++c) {
      if (avg_cost < (*best_path_by_column)[c].avg_cost) {
        (*best_path_by_column)[c].avg_cost = avg_cost;
        (*best_path_by_column)[c].best_vse = new_vse;
        (*best_path_by_column)[c].best_b = b;
        if (language_model_debug_level > 0) {
          tprintf("Set best_path_by_column[%d]=(%g %p)\n",
                  c, avg_cost, new_vse);
        }
      }
    }
  }
  return changed;
}

void LanguageModel::PrintViterbiStateEntry(
    const char *msg, ViterbiStateEntry *vse,
    BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record) {
  tprintf("%s ViterbiStateEntry %p with ratings_sum=%.4f length=%d cost=%.4f",
          msg, vse, vse->ratings_sum, vse->length, vse->cost);
  if (vse->top_choice_flags) {
    tprintf(" top_choice_flags=0x%x", vse->top_choice_flags);
  }
  if (!vse->Consistent()) {
    tprintf(" inconsistent=(punc %d case %d chartype %d script %d)\n",
            vse->consistency_info.NumInconsistentPunc(),
            vse->consistency_info.NumInconsistentCase(),
            vse->consistency_info.NumInconsistentChartype(),
            vse->consistency_info.inconsistent_script);
  }
  if (vse->dawg_info) tprintf(" permuter=%d", vse->dawg_info->permuter);
  if (vse->ngram_info) {
    tprintf(" ngram_cost=%g context=%s ngram pruned=%d",
            vse->ngram_info->ngram_cost,
            vse->ngram_info->context.string(),
            vse->ngram_info->pruned);
  }
  if (vse->associate_stats.shape_cost > 0.0f) {
    tprintf(" shape_cost=%g", vse->associate_stats.shape_cost);
  }
  if (language_model_debug_level > 3) {
    STRING wd_str;
    WERD_CHOICE *wd = ConstructWord(b, vse, chunks_record,
                                    NULL, NULL, NULL, NULL, NULL, NULL);
    wd->string_and_lengths(&wd_str, NULL);
    delete wd;
    tprintf(" str=%s", wd_str.string());
  }
  tprintf("\n");
}

void LanguageModel::GenerateTopChoiceInfo(
    float ratings_sum,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *b,
    LanguageModelFlagsType *top_choice_flags,
    LanguageModelFlagsType *changed) {
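  // A top choice flag survives only if it held for every entry along the
  // path: flags are ANDed with the parent's flags and cleared when a
  // cheaper (consistency-adjusted) entry in this blob's viterbi list
  // already holds them.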
  ratings_sum = ComputeConsistencyAdjustedRatingsSum(
      ratings_sum, dawg_info, consistency_info);
  // Clear flags that do not agree with parent_vse->top_choice_flags.
  if (parent_vse != NULL) *top_choice_flags &= parent_vse->top_choice_flags;
  if (consistency_info.Consistent()) *top_choice_flags |= kConsistentFlag;
  if (*top_choice_flags == 0x0) return;
  LanguageModelState *lms =
    reinterpret_cast<LanguageModelState *>(b->language_model_state());
  if (lms != NULL && !lms->viterbi_state_entries.empty()) {
    ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      if (ratings_sum >= ComputeConsistencyAdjustedRatingsSum(
          vit.data()->ratings_sum, vit.data()->dawg_info,
          vit.data()->consistency_info)) {
        // Clear the appropriate flags if the list already contains
        // a top choice entry with a lower cost.
        *top_choice_flags &= ~(vit.data()->top_choice_flags);
      }
    }
  }
  if (language_model_debug_level > 0) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            *top_choice_flags);
  }
  *changed |= *top_choice_flags;
}

LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    bool word_end, int script_id,
    int curr_col, int curr_row,
    const BLOB_CHOICE &b,
    const ViterbiStateEntry *parent_vse,
    LanguageModelFlagsType *changed) {
  bool use_fixed_length_dawgs = !fixed_length_beginning_active_dawgs_->empty();

  // Initialize active_dawgs and constraints from parent_vse if it is not NULL,
  // otherwise use beginning_active_dawgs_ and beginning_constraints_.
  if (parent_vse == NULL ||
      (use_fixed_length_dawgs && parent_vse->dawg_info == NULL)) {
    dawg_args_->active_dawgs = beginning_active_dawgs_;
    dawg_args_->constraints = beginning_constraints_;
    dawg_args_->permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
    dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
    dawg_args_->constraints = parent_vse->dawg_info->constraints;
    dawg_args_->permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (!use_fixed_length_dawgs && word_end &&
      dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }

  // Deal with compound words.
  if (!use_fixed_length_dawgs && dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length
    if (parent_vse == NULL || word_end ||
        dawg_args_->permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length) return NULL;

    int i;
    // Check that the path terminating before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
      const DawgInfo &info = (*parent_vse->dawg_info->active_dawgs)[i];
      const Dawg *pdawg = dict_->GetDawg(info.dawg_index);
      assert(pdawg != NULL);
      if (pdawg->type() == DAWG_TYPE_WORD && info.ref != NO_EDGE &&
          pdawg->end_of_word(info.ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return NULL;

    // Return LanguageModelDawgInfo with active_dawgs set to
    // the beginning dawgs of type DAWG_TYPE_WORD.
    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    DawgInfoVector beginning_word_dawgs;
    for (i = 0; i < beginning_active_dawgs_->size(); ++i) {
      const Dawg *bdawg =
        dict_->GetDawg((*beginning_active_dawgs_)[i].dawg_index);
      if (bdawg->type() == DAWG_TYPE_WORD) {
        beginning_word_dawgs += (*beginning_active_dawgs_)[i];
      }
    }
    *changed |= kDawgFlag;
    return new LanguageModelDawgInfo(&(beginning_word_dawgs),
                                     dawg_args_->constraints,
                                     COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = NULL;

  // Call LetterIsOkay().
  dict_->LetterIsOkay(dawg_args_, b.unichar_id(), word_end);
  if (dawg_args_->permuter != NO_PERM) {
    *changed |= kDawgFlag;
    dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_active_dawgs,
                                          dawg_args_->updated_constraints,
                                          dawg_args_->permuter);
  }

  // For non-space delimited languages: since every letter could be
  // a valid word, a new word could start at every unichar. Thus append
  // fixed_length_beginning_active_dawgs_ to dawg_info->active_dawgs.
  if (use_fixed_length_dawgs) {
    if (dawg_info == NULL) {
      *changed |= kDawgFlag;
      dawg_info = new LanguageModelDawgInfo(
          fixed_length_beginning_active_dawgs_,
          empty_dawg_info_vec_, SYSTEM_DAWG_PERM);
    } else {
      *(dawg_info->active_dawgs) += *(fixed_length_beginning_active_dawgs_);
    }
  }  // done dealing with fixed-length dawgs

  return dawg_info;
}

LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row,
    const ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    LanguageModelFlagsType *changed) {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == NULL) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
      parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_prob;
  float ngram_cost = ComputeNgramCost(unichar, certainty, denom,
                                      pcontext_ptr, &unichar_step_len,
                                      &pruned, &ngram_prob);
  // First attempt to normalize ngram_cost for strings of different
  // lengths: scale ngram_cost by the number of chunks occupied by the char
  // (in probability space this multiplies in P(char | context) once per
  // chunk), which makes the ngram costs of all the paths ending at the
  // current BLOB_CHOICE comparable.
  // TODO(daria): it would be worth looking at different ways of normalization.
  if (curr_row > curr_col) {
    ngram_cost += (curr_row - curr_col) * ngram_cost;
    ngram_prob += (curr_row - curr_col) * ngram_prob;
  }
  // Add the ngram_cost of the parent.
  if (parent_vse != NULL) {
    ngram_cost += parent_vse->ngram_info->ngram_cost;
    ngram_prob += parent_vse->ngram_info->ngram_prob;
  }

  // Shorten parent context string by unichar_step_len unichars.
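  // (For example, with language_model_ngram_order == 8, an 8-unichar parent
  // context plus a 1-step current unichar yields num_remove == 1 below,
  // dropping the oldest unichar from the context.)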
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
  if (!pruned) *changed |= kNgramFlag;

  // Construct and return the new LanguageModelNgramInfo.
  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_prob, ngram_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

float LanguageModel::ComputeNgramCost(const char *unichar,
                                      float certainty,
                                      float denom,
                                      const char *context,
                                      int *unichar_step_len,
                                      bool *found_small_prob,
                                      float *ngram_prob) {
  const char *context_ptr = context;
  char *modified_context = NULL;
  char *modified_context_end = NULL;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from unichar,
    // unless language_model_ngram_use_only_first_uft8_step is true.
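    // (For example, if unichar is the two-character string "fi" coming from
    // a ligature, p('f') is computed in the original context and p('i') in
    // context + "f".)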
    if (unichar_ptr < unichar_end) {
      if (modified_context == NULL) {
        int context_len = strlen(context);
        modified_context =
          new char[context_len + strlen(unichar_ptr) + step + 1];
        strncpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
  }
  *ngram_prob = -1.0*log(prob);
  float cost = -1.0*log(CertaintyScore(certainty)/denom) +
      *ngram_prob * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty)/denom, prob, cost);
  }
  if (modified_context != NULL) delete[] modified_context;
  return cost;
}

float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
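  // denom is the sum of CertaintyScore() over all the choices in the list,
  // plus a crude estimate (added below) for the unicharset entries that are
  // missing from the list.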
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != NULL);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
    CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}

void LanguageModel::FillConsistencyInfo(
    int curr_col,
    bool word_end,
    BLOB_CHOICE *b,
    ViterbiStateEntry *parent_vse,
    BLOB_CHOICE *parent_b,
    CHUNKS_RECORD *chunks_record,
    LanguageModelConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  if (parent_vse != NULL) *consistency_info = parent_vse->consistency_info;

  // Check punctuation validity.
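  // Non-punctuation characters (letters and digits) are collapsed to
  // Dawg::kPatternUnicharID below, so the punctuation DAWG only has to
  // encode punctuation patterns around a generic "word" placeholder rather
  // than every possible word.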
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      UNICHAR_ID pattern_unichar_id =
        (unicharset.get_isalpha(unichar_id) ||
         unicharset.get_isdigit(unichar_id)) ?
        Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
          (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
              node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  } else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != NULL &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (parent_b != NULL) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
                parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont2 %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id2()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    bool expected_gap_found = false;
    float expected_gap;
    int temp_gap;
    if (fontinfo_id >= 0) {  // found a common font
      if (fontinfo_table_->get(fontinfo_id).get_spacing(
          parent_b->unichar_id(), unichar_id, &temp_gap)) {
        expected_gap = temp_gap;
        expected_gap_found = true;
      }
    } else {
      consistency_info->inconsistent_font = true;
      // Get an average of the expected gaps in each font
      int num_addends = 0;
      expected_gap = 0;
      int temp_fid;
      for (int i = 0; i < 4; ++i) {
        if (i == 0) {
          temp_fid = parent_b->fontinfo_id();
        } else if (i == 1) {
          temp_fid = parent_b->fontinfo_id2();
        } else if (i == 2) {
          temp_fid = b->fontinfo_id();
        } else {
          temp_fid = b->fontinfo_id2();
        }
        if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap += temp_gap;
          num_addends++;
        }
      }
      expected_gap_found = (num_addends > 0);
      if (num_addends > 0) expected_gap /= static_cast<float>(num_addends);
    }
    if (expected_gap_found) {
      float actual_gap =
          static_cast<float>(AssociateUtils::GetChunksGap(
              chunks_record->chunk_widths, curr_col-1));
      float gap_ratio = expected_gap / actual_gap;
      // TODO(daria): find a good way to tune this heuristic estimate.
      if (gap_ratio < 0.5f || gap_ratio > 2.0f) {
        consistency_info->num_inconsistent_spaces++;
      }
      if (language_model_debug_level > 1) {
        tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                unicharset.id_to_unichar(parent_b->unichar_id()),
                parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                unichar_id, curr_col, expected_gap, actual_gap);
      }
    }
  }
}

float LanguageModel::ComputeAdjustedPathCost(
    float ratings_sum, int length, float dawg_score,
    const LanguageModelDawgInfo *dawg_info,
    const LanguageModelNgramInfo *ngram_info,
    const LanguageModelConsistencyInfo &consistency_info,
    const AssociateStats &associate_stats,
    ViterbiStateEntry *parent_vse) {
01159   float adjustment = 1.0f;
01160   if (dawg_info == NULL || dawg_info->permuter != FREQ_DAWG_PERM) {
01161     adjustment += language_model_penalty_non_freq_dict_word;
01162   }
01163   if (dawg_score == 0.0f) {
01164     adjustment += language_model_penalty_non_dict_word;
01165     if (length > language_model_min_compound_length) {
01166       adjustment += ((length - language_model_min_compound_length) *
01167                      language_model_penalty_increment);
01168     }
01169   } else if (dawg_score < 1.0f) {
01170     adjustment += (1.0f - dawg_score) * language_model_penalty_non_dict_word;
01171   }
01172   if (associate_stats.shape_cost > 0) {
01173     adjustment += associate_stats.shape_cost / static_cast<float>(length);
01174   }
01175   if (language_model_ngram_on) {
01176     ASSERT_HOST(ngram_info != NULL);
01177     return ngram_info->ngram_cost * adjustment;
01178   } else {
01179     adjustment += ComputeConsistencyAdjustment(dawg_info, consistency_info);
01180     return ratings_sum * adjustment;
01181   }
01182 }
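// Editor's note: a worked example of the adjustment above, using
// hypothetical parameter values. Suppose ratings_sum = 10.0, length = 6,
// dawg_score = 0.0 (no dictionary match), the ngram model off,
// language_model_penalty_non_freq_dict_word = 0.1,
// language_model_penalty_non_dict_word = 0.15,
// language_model_min_compound_length = 3 and
// language_model_penalty_increment = 0.01. Then
//   adjustment = 1.0 + 0.1 + 0.15 + (6 - 3) * 0.01 = 1.28
// and, with no shape cost and a fully consistent path, the returned cost is
// ratings_sum * adjustment = 12.8.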
01183 
01184 void LanguageModel::UpdateBestChoice(
01185     BLOB_CHOICE *b,
01186     ViterbiStateEntry *vse,
01187     HEAP *pain_points,
01188     CHUNKS_RECORD *chunks_record,
01189     BestChoiceBundle *best_choice_bundle,
01190     BlamerBundle *blamer_bundle) {
01191   int i;
01192   BLOB_CHOICE_LIST_VECTOR temp_best_char_choices(vse->length);
01193   for (i = 0; i < vse->length; ++i) {
01194     temp_best_char_choices.push_back(NULL);
01195   }
01196   float *certainties = new float[vse->length];
01197   STATE temp_state;
01198   // The fraction of letters in the path that are "covered" by dawgs.
01199   // For space delimited languages this number will be 0.0 for nonwords
01200   // and 1.0 for dictionary words. For non-space delimited languages
01201   // this number will be in the [0.0, 1.0] range.
01202   float dawg_score;
01203   bool truth_path;
01204   WERD_CHOICE *word = ConstructWord(b, vse, chunks_record,
01205                                     &temp_best_char_choices, certainties,
01206                                     &dawg_score, &temp_state,
01207                                     blamer_bundle, &truth_path);
01208   ASSERT_HOST(word != NULL);
01209   bool not_blaming =
01210       (blamer_bundle == NULL || !blamer_bundle->segsearch_is_looking_for_blame);
01211 
01212   // Log new segmentation (for dict_->LogNewChoice()).
01213   if (not_blaming) {
01214     PIECES_STATE pieces_widths;
01215     bin_to_pieces(&temp_state, chunks_record->ratings->dimension() - 1,
01216                   pieces_widths);
01217     dict_->LogNewSegmentation(pieces_widths);
01218   }
01219 
01220   if (language_model_debug_level > 0) {
01221     STRING word_str;
01222     word->string_and_lengths(&word_str, NULL);
01223     tprintf("UpdateBestChoice() constructed word %s\n", word_str.string());
01224     if (language_model_debug_level > 2) word->print();
01225   }
01226 
01227   // Update raw_choice if needed.
01228   if ((vse->top_choice_flags & kSmallestRatingFlag) &&
01229       word->rating() < best_choice_bundle->raw_choice->rating() &&
01230       not_blaming) {
01231     dict_->LogNewChoice(1.0, certainties, true, word);
01232     *(best_choice_bundle->raw_choice) = *word;
01233     best_choice_bundle->raw_choice->set_permuter(TOP_CHOICE_PERM);
01234     if (language_model_debug_level > 0) tprintf("Updated raw choice\n");
01235   }
01236 
01237   // When working with non-space delimited languages we re-adjust the cost
01238   // taking into account the final dawg_score and a more precise shape cost.
01239   // While constructing the paths we assume that they are all dictionary words
01240   // (since a single character would be a valid dictionary word). At the end
01241   // we compute dawg_score which reflects how many characters on the path are
01242   // "covered" by dictionary words of length > 1.
01243   if (vse->associate_stats.full_wh_ratio_var != 0.0f ||
01244       (dict_->GetMaxFixedLengthDawgIndex() >= 0 && dawg_score < 1.0f)) {
01245     vse->cost = ComputeAdjustedPathCost(
01246         vse->ratings_sum, vse->length, dawg_score, vse->dawg_info,
01247         vse->ngram_info, vse->consistency_info, vse->associate_stats,
01248         vse->parent_vse);
01249     if (language_model_debug_level > 0) {
01250       tprintf("Updated vse cost to %g (dawg_score %g full_wh_ratio_var %g)\n",
01251               vse->cost, dawg_score, vse->associate_stats.full_wh_ratio_var);
01252     }
01253   }
01254 
01255   // Update best choice and best char choices if needed.
01256   // TODO(daria): re-write AcceptableChoice() and NoDangerousAmbig()
01257   // to fit better into the new segmentation search.
01258   word->set_rating(vse->cost);
01259   if (word->rating() < best_choice_bundle->best_choice->rating() &&
01260       not_blaming) {
01261     dict_->LogNewChoice(vse->cost / (language_model_ngram_on ?
01262                                      vse->ngram_info->ngram_cost :
01263                                      vse->ratings_sum),
01264                         certainties, false, word);
01265     // Since the rating of the word could have been modified by
01266     // Dict::LogNewChoice() - check again.
01267     if (word->rating() < best_choice_bundle->best_choice->rating()) {
01268       bool modified_blobs;  // not used
01269       DANGERR fixpt;
01270       if (dict_->AcceptableChoice(&temp_best_char_choices, word, &fixpt,
01271                                   ASSOCIATOR_CALLER, &modified_blobs) &&
01272           AcceptablePath(*vse)) {
01273         acceptable_choice_found_ = true;
01274       }
01275       // Update best_choice_bundle.
01276       *(best_choice_bundle->best_choice) = *word;
01277       best_choice_bundle->updated = true;
01278       best_choice_bundle->best_char_choices->delete_data_pointers();
01279       best_choice_bundle->best_char_choices->clear();
01280       for (i = 0; i < temp_best_char_choices.size(); ++i) {
01281         BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
01282         cc_list->deep_copy(temp_best_char_choices[i], &BLOB_CHOICE::deep_copy);
01283         best_choice_bundle->best_char_choices->push_back(cc_list);
01284       }
01285       best_choice_bundle->best_state->part2 = temp_state.part2;
01286       best_choice_bundle->best_state->part1 = temp_state.part1;
01287       if (language_model_debug_level > 0) {
01288         tprintf("Updated best choice\n");
01289         print_state("New state ", best_choice_bundle->best_state,
01290                     chunks_record->ratings->dimension()-1);
01291       }
01292       // Update hyphen state if we are dealing with a dictionary word.
01293       if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
01294         if (dict_->has_hyphen_end(*word)) {
01295           dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs),
01296                                  *(dawg_args_->constraints));
01297         } else {
01298           dict_->reset_hyphen_vars(true);
01299         }
01300       }
01301       best_choice_bundle->best_vse = vse;
01302       best_choice_bundle->best_b = b;
01303       best_choice_bundle->fixpt = fixpt;
01304 
01305       if (blamer_bundle != NULL) {
01306         blamer_bundle->best_choice_is_dict_and_top_choice =
01307             (vse->dawg_info != NULL &&
01308              dict_->GetMaxFixedLengthDawgIndex() < 0 &&
01309              (vse->top_choice_flags));
01310       }
01311     }
01312   }
01313   if (blamer_bundle != NULL) {
01314     // Record the current hypothesis in params_training_bundle.
01315     ParamsTrainingHypothesis &hyp =
01316         blamer_bundle->params_training_bundle.AddHypothesis();
01317     word->string_and_lengths(&(hyp.str), NULL);
01318     ExtractRawFeaturesFromPath(*vse, hyp.features);
01319     if (truth_path &&  // update best truth path rating
01320         word->rating() < blamer_bundle->best_correctly_segmented_rating) {
01321       blamer_bundle->best_correctly_segmented_rating = word->rating();
01322     }
01323   }
01324 
01325   // Clean up.
01326   delete[] certainties;
01327   delete word;
01328 }
01329 
01330 void LanguageModel::ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse,
01331                                                float *features) {
01332   for (int f = 0; f < PTRAIN_NUM_RAW_FEATURE_TYPES; ++f) features[f] = 0.0;
01333   // Dictionary-related features.
01334   if (vse.dawg_info != NULL) {
01335     features[PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE] = vse.dawg_info->permuter;
01336 
01337     // Mark as unambiguous if unambig_dawg is found among active dawgs.
01338     for (int d = 0; d < vse.dawg_info->active_dawgs->size(); ++d) {
01339       if (dict_->GetDawg(vse.dawg_info->active_dawgs->get(d).dawg_index) ==
01340           dict_->GetUnambigDawg()) {
01341         features[PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH] = 1.0;
01342         break;
01343       }
01344     }
01345   }
01346   if (vse.associate_stats.shape_cost > 0) {
01347     features[PTRAIN_RAW_FEATURE_SHAPE_COST] = vse.associate_stats.shape_cost;
01348   }
01349   if (language_model_ngram_on) {
01350     ASSERT_HOST(vse.ngram_info != NULL);
01351     features[PTRAIN_RAW_FEATURE_NGRAM_PROB] = vse.ngram_info->ngram_prob;
01352   }
01353   // Consistency-related features.
01354   features[PTRAIN_RAW_FEATURE_NUM_BAD_PUNC] =
01355       vse.consistency_info.NumInconsistentPunc();
01356   features[PTRAIN_RAW_FEATURE_NUM_BAD_CASE] =
01357       vse.consistency_info.NumInconsistentCase();
01358   features[PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE] =
01359       vse.consistency_info.NumInconsistentChartype();
01360   features[PTRAIN_RAW_FEATURE_NUM_BAD_SPACING] =
01361       vse.consistency_info.NumInconsistentSpaces();
01362   features[PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT] =
01363       vse.consistency_info.inconsistent_script;
01364   features[PTRAIN_RAW_FEATURE_NUM_BAD_FONT] =
01365       vse.consistency_info.inconsistent_font;
01366   // Classifier-related features.
01367   features[PTRAIN_RAW_FEATURE_WORST_CERT] = vse.min_certainty;
01368   features[PTRAIN_RAW_FEATURE_RATING] = vse.ratings_sum;
01369   features[PTRAIN_RAW_FEATURE_ADAPTED] = vse.adapted;
01370   // Normalization-related features.
01371   features[PTRAIN_RAW_FEATURE_NUM_UNICHARS] = vse.length;
01372   features[PTRAIN_RAW_FEATURE_OUTLINE_LEN] = vse.outline_length;
01373 }
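// Editor's note: a minimal usage sketch, mirroring the call made from
// UpdateBestChoice() above. The caller supplies a raw feature array of
// size PTRAIN_NUM_RAW_FEATURE_TYPES, which this function zeroes and fills:
//
//   float features[PTRAIN_NUM_RAW_FEATURE_TYPES];
//   ExtractRawFeaturesFromPath(*vse, features);
//   // e.g. features[PTRAIN_RAW_FEATURE_RATING] now holds vse->ratings_sum.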
01374 
01375 WERD_CHOICE *LanguageModel::ConstructWord(
01376     BLOB_CHOICE *b,
01377     ViterbiStateEntry *vse,
01378     CHUNKS_RECORD *chunks_record,
01379     BLOB_CHOICE_LIST_VECTOR *best_char_choices,
01380     float certainties[],
01381     float *dawg_score,
01382     STATE *state,
01383     BlamerBundle *blamer_bundle,
01384     bool *truth_path) {
01385   uinT64 state_uint = 0x0;
01386   if (truth_path != NULL) {
01387     *truth_path =
01388         (blamer_bundle != NULL &&
01389          !blamer_bundle->correct_segmentation_cols.empty() &&
01390          vse->length == blamer_bundle->correct_segmentation_cols.length());
01391   }
01392   const uinT64 kHighestBitOn = 0x8000000000000000ULL;
01393   BLOB_CHOICE *curr_b = b;
01394   LanguageModelState *curr_lms =
01395     reinterpret_cast<LanguageModelState *>(b->language_model_state());
01396   ViterbiStateEntry *curr_vse = vse;
01397 
01398   int i;
01399   bool compound = dict_->hyphenated();  // treat hyphenated words as compound
01400   bool dawg_score_done = true;
01401   if (dawg_score != NULL) {
01402     *dawg_score = 0.0f;
01403     // For space-delimited languages the presence of dawg_info in the last
01404     // ViterbiStateEntry on the path means that the whole path represents
01405     // a valid dictionary word.
01406     if (dict_->GetMaxFixedLengthDawgIndex() < 0) {
01407       if (vse->dawg_info != NULL) *dawg_score = 1.0f;
01408     } else if (vse->length == 1) {
01409       *dawg_score = 1.0f;       // each one-letter word is legal
01410       dawg_score_done = true;   // in non-space delimited languages
01411     } else {
01412       dawg_score_done = false;  // do more work to compute dawg_score
01413     }
01414   }
01415   // For non-space delimited languages we compute the fraction of letters
01416   // "covered" by fixed length dawgs (i.e. words of length > 1 on the path).
01417   int covered_by_fixed_length_dawgs = 0;
01418   // The number of unichars remaining that should be skipped because
01419   // they are covered by the previous word from fixed length dawgs.
01420   int fixed_length_num_unichars_to_skip = 0;
01421 
01422   // Re-compute the variance of the width-to-height ratios (since we now
01423   // can compute the mean over the whole word).
01424   float full_wh_ratio_mean = 0.0f;
01425   if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
01426     vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
01427     full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
01428                           static_cast<float>(vse->length));
01429     vse->associate_stats.full_wh_ratio_var = 0.0f;
01430   }
01431 
01432   // Construct a WERD_CHOICE by tracing parent pointers.
01433   WERD_CHOICE *word = new WERD_CHOICE(chunks_record->word_res->uch_set,
01434                                       vse->length);
01435   word->set_length(vse->length);
01436   for (i = (vse->length-1); i >= 0; --i) {
01437     if (blamer_bundle != NULL && truth_path != NULL && *truth_path) {
01438       if ((blamer_bundle->correct_segmentation_cols[i] !=
01439            curr_lms->contained_in_col) ||
01440            (blamer_bundle->correct_segmentation_rows[i] !=
01441             curr_lms->contained_in_row)) {
01442         *truth_path = false;
01443       }
01444     }
01445     word->set_unichar_id(curr_b->unichar_id(), i);
01446     word->set_fragment_length(1, i);
01447     if (certainties != NULL) certainties[i] = curr_b->certainty();
01448     if (best_char_choices != NULL) {
01449       best_char_choices->set(chunks_record->ratings->get(
01450           curr_lms->contained_in_col, curr_lms->contained_in_row), i);
01451     }
01452     if (state != NULL) {
01453       // Record row minus col zeroes in the reverse state to mark the number
01454       // of joins done by using a blob from this cell in the ratings matrix.
01455       state_uint >>= (curr_lms->contained_in_row - curr_lms->contained_in_col);
01456       // Record a one in the reverse state to indicate the split before
01457       // the blob from the next cell in the ratings matrix (unless we are
01458       // at the first cell, in which case there is no next blob).
01459       if (i != 0) {
01460         state_uint >>= 1;
01461         state_uint |= kHighestBitOn;
01462       }
01463     }
01464     // For non-space delimited languages: find word endings recorded while
01465     // trying to separate the current path into words (for words found in
01466     // fixed length dawgs).
01467     if (!dawg_score_done && curr_vse->dawg_info != NULL) {
01468       UpdateCoveredByFixedLengthDawgs(*(curr_vse->dawg_info->active_dawgs),
01469                                       i, vse->length,
01470                                       &fixed_length_num_unichars_to_skip,
01471                                       &covered_by_fixed_length_dawgs,
01472                                       dawg_score, &dawg_score_done);
01473     }
01474     // Update the width-to-height ratio variance. Useful for non-space
01475     // delimited languages to ensure that the blobs are of uniform width.
01476     // Skip leading and trailing punctuation when computing the variance.
01477     if ((full_wh_ratio_mean != 0.0f &&
01478          ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
01479           !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
01480       vse->associate_stats.full_wh_ratio_var +=
01481         pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
01482       if (language_model_debug_level > 2) {
01483         tprintf("full_wh_ratio_var += (%g-%g)^2\n",
01484                 full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
01485       }
01486     }
01487 
01488     // Mark the word as compound if compound permuter was set for any of
01489     // the unichars on the path (usually this will happen for unichars
01490     // that are compounding operators, like "-" and "/").
01491     if (!compound && curr_vse->dawg_info &&
01492         curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
01493 
01494     // Update curr_* pointers.
01495     if (curr_vse->parent_b == NULL) break;
01496     curr_b = curr_vse->parent_b;
01497     curr_lms =
01498       reinterpret_cast<LanguageModelState *>(curr_b->language_model_state());
01499     curr_vse = curr_vse->parent_vse;
01500   }
01501   ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids
01502   // Re-adjust shape cost to include the updated width-to-height variance.
01503   if (full_wh_ratio_mean != 0.0f) {
01504     vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
01505   }
01506 
01507   if (state != NULL) {
01508     state_uint >>= (64 - (chunks_record->ratings->dimension()-1));
01509     state->part2 = state_uint;
01510     state_uint >>= 32;
01511     state->part1 = state_uint;
01512   }
01513   word->set_rating(vse->ratings_sum);
01514   word->set_certainty(vse->min_certainty);
01515   if (vse->dawg_info != NULL && dict_->GetMaxFixedLengthDawgIndex() < 0) {
01516     word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
01517   } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
01518     word->set_permuter(NGRAM_PERM);
01519   } else if (vse->top_choice_flags) {
01520     word->set_permuter(TOP_CHOICE_PERM);
01521   } else {
01522     word->set_permuter(NO_PERM);
01523   }
01524   return word;
01525 }
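// Editor's note: an illustrative, standalone restatement of the
// reverse-state packing performed in the loop above (a hypothetical helper;
// cols[i]/rows[i] stand for the ratings-matrix cell of the i-th unichar).
// Zero bits record joins of chunks within one blob, one bits record splits
// between blobs, and the final shift right-aligns the num_chunks-1 used bits.
static uinT64 PackReverseState(const int *cols, const int *rows,
                               int length, int num_chunks) {
  const uinT64 kHighestBitOn = 0x8000000000000000ULL;
  uinT64 state_uint = 0;
  for (int i = length - 1; i >= 0; --i) {
    state_uint >>= (rows[i] - cols[i]);  // zeroes: joins within this blob
    if (i != 0) {                        // a one: split before the next blob
      state_uint >>= 1;
      state_uint |= kHighestBitOn;
    }
  }
  return state_uint >> (64 - (num_chunks - 1));
}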
01526 
01527 void LanguageModel::UpdateCoveredByFixedLengthDawgs(
01528     const DawgInfoVector &active_dawgs, int word_index, int word_length,
01529     int *skip, int *covered, float *dawg_score, bool *dawg_score_done) {
01530   if (language_model_debug_level > 3) {
01531     tprintf("UpdateCoveredByFixedLengthDawgs for index %d of %d skip=%d\n",
01532             word_index, word_length, *skip);
01533   }
01534 
01535   if (*skip > 0) {
01536     --(*skip);
01537   } else {
01538     int best_index = -1;
01539     for (int d = 0; d < active_dawgs.size(); ++d) {
01540       int dawg_index = (active_dawgs[d]).dawg_index;
01541       if (dawg_index > dict_->GetMaxFixedLengthDawgIndex()) {
01542         // If active_dawgs of the last ViterbiStateEntry on the path
01543         // contain a non-fixed length dawg, this means that the whole
01544         // path represents a word from a non-fixed length word dawg.
01545         if (word_index == (word_length - 1)) {
01546           *dawg_score = 1.0f;
01547           *dawg_score_done = true;
01548           return;
01549         }
01550       } else if (dawg_index >= kMinFixedLengthDawgLength) {
01551         const Dawg *curr_dawg = dict_->GetDawg(dawg_index);
01552         ASSERT_HOST(curr_dawg != NULL);
01553         if ((active_dawgs[d]).ref != NO_EDGE &&
01554             curr_dawg->end_of_word((active_dawgs[d]).ref) &&
01555             dawg_index > best_index) {
01556           best_index = dawg_index;
01557         }
01558 
01559         if (language_model_debug_level > 3) {
01560           tprintf("dawg_index %d, ref %d, eow %d\n", dawg_index,
01561                   (active_dawgs[d]).ref,
01562                   ((active_dawgs[d]).ref != NO_EDGE &&
01563                    curr_dawg->end_of_word((active_dawgs[d]).ref)));
01564         }
01565       }
01566     }  // end for
01567     if (best_index != -1) {
01568       *skip = best_index - 1;
01569       *covered += best_index;
01570     }
01571   }  // end if/else skip
01572 
01573   if (word_index == 0) {
01574     ASSERT_HOST(*covered <= word_length);
01575     *dawg_score = (static_cast<float>(*covered) /
01576                    static_cast<float>(word_length));
01577     *dawg_score_done = true;
01578   }
01579 }
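// Editor's note: a worked example with hypothetical values. Consider a
// 5-unichar path (word_length == 5) walked from word_index 4 down to 0.
// If at word_index 4 a word of length 3 from a fixed length dawg ends
// (best_index == 3), then *covered becomes 3 and *skip is set to 2, so the
// calls for word_index 3 and 2 only decrement *skip. If nothing ends at
// word_index 1 or 0, the final dawg_score is 3 / 5 = 0.6.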
01580 
01581 void LanguageModel::GeneratePainPointsFromColumn(
01582     int col,
01583     const GenericVector<int> &non_empty_rows,
01584     float best_choice_cert,
01585     HEAP *pain_points,
01586     BestPathByColumn *best_path_by_column[],
01587     CHUNKS_RECORD *chunks_record) {
01588   for (int i = 0; i < non_empty_rows.length(); ++i) {
01589     int row = non_empty_rows[i];
01590     if (language_model_debug_level > 0) {
01591       tprintf("\nLooking for pain points in col=%d row=%d\n", col, row);
01592     }
01593     if (language_model_ngram_on) {
01594       GenerateNgramModelPainPointsFromColumn(
01595           col, row, pain_points, chunks_record);
01596     } else {
01597       GenerateProblematicPathPainPointsFromColumn(
01598           col, row, best_choice_cert, pain_points,
01599           best_path_by_column, chunks_record);
01600     }
01601   }
01602 }
01603 
01604 void LanguageModel::GenerateNgramModelPainPointsFromColumn(
01605     int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record) {
01606   // Find the first top choice path recorded for this cell.
01607   // If this path is pruned - generate a pain point.
01608   ASSERT_HOST(chunks_record->ratings->get(col, row) != NULL);
01609   BLOB_CHOICE_IT bit(chunks_record->ratings->get(col, row));
01610   bool fragmented = false;
01611   for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
01612     if (dict_->getUnicharset().get_fragment(bit.data()->unichar_id())) {
01613       fragmented = true;
01614       continue;
01615     }
01616     LanguageModelState *lms = reinterpret_cast<LanguageModelState *>(
01617         bit.data()->language_model_state());
01618     if (lms == NULL) continue;
01619     ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
01620     for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
01621       const ViterbiStateEntry *vse = vit.data();
01622       if (!vse->top_choice_flags) continue;
01623       ASSERT_HOST(vse->ngram_info != NULL);
01624       if (vse->ngram_info->pruned && (vse->parent_vse == NULL ||
01625                                       !vse->parent_vse->ngram_info->pruned)) {
01626         if (vse->parent_vse != NULL) {
01627           LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
01628               vse->parent_b->language_model_state());
01629           GeneratePainPoint(pp_lms->contained_in_col, row, false,
01630                             kDefaultPainPointPriorityAdjustment,
01631                             -1.0f, fragmented, -1.0f,
01632                             max_char_wh_ratio_,
01633                             vse->parent_vse->parent_b,
01634                             vse->parent_vse->parent_vse,
01635                             chunks_record, pain_points);
01636         }
01637         if (vse->parent_vse != NULL &&
01638             vse->parent_vse->parent_vse != NULL &&
01639             dict_->getUnicharset().get_ispunctuation(
01640                 vse->parent_b->unichar_id())) {
01641           // If the dip in the ngram probability is due to punctuation in the
01642           // middle of the word - go two unichars back to combine this
01643           // punctuation mark with the previous character on the path.
01644           LanguageModelState *pp_lms = reinterpret_cast<LanguageModelState *>(
01645               vse->parent_vse->parent_b->language_model_state());
01646           GeneratePainPoint(pp_lms->contained_in_col, col-1, false,
01647                             kDefaultPainPointPriorityAdjustment,
01648                             -1.0f, fragmented, -1.0f,
01649                             max_char_wh_ratio_,
01650                             vse->parent_vse->parent_vse->parent_b,
01651                             vse->parent_vse->parent_vse->parent_vse,
01652                             chunks_record, pain_points);
01653         } else if (row+1 < chunks_record->ratings->dimension()) {
01654           GeneratePainPoint(col, row+1, true,
01655                             kDefaultPainPointPriorityAdjustment,
01656                             -1.0f, fragmented, -1.0f,
01657                             max_char_wh_ratio_,
01658                             vse->parent_b,
01659                             vse->parent_vse,
01660                             chunks_record, pain_points);
01661         }
01662       }
01663       return;  // examined the lowest cost top choice path
01664     }
01665   }
01666 }
01667 
01668 void LanguageModel::GenerateProblematicPathPainPointsFromColumn(
01669     int col, int row, float best_choice_cert,
01670     HEAP *pain_points, BestPathByColumn *best_path_by_column[],
01671     CHUNKS_RECORD *chunks_record) {
01672   MATRIX *ratings = chunks_record->ratings;
01673 
01674   // Get the best path from this matrix cell.
01675   BLOB_CHOICE_LIST *blist = ratings->get(col, row);
01676   ASSERT_HOST(blist != NULL);
01677   if (blist->empty()) return;
01678   BLOB_CHOICE_IT bit(blist);
01679   bool fragment = false;
01680   while (dict_->getUnicharset().get_fragment(bit.data()->unichar_id()) &&
01681          !bit.at_last()) {  // skip fragments
01682     fragment = true;
01683     bit.forward();
01684   }
01685   if (bit.data()->language_model_state() == NULL) return;
01686   ViterbiStateEntry_IT vit(&(reinterpret_cast<LanguageModelState *>(
01687       bit.data()->language_model_state())->viterbi_state_entries));
01688   if (vit.empty()) return;
01689   ViterbiStateEntry *vse = vit.data();
01690   // Check whether this path is promising.
01691   bool path_is_promising = true;
01692   if (vse->parent_vse != NULL) {
01693     float potential_avg_cost =
01694       ((vse->parent_vse->cost + bit.data()->rating()*0.5f) /
01695        static_cast<float>(row+1));
01696     if (language_model_debug_level > 0) {
01697       tprintf("potential_avg_cost %g best cost %g\n",
01698               potential_avg_cost, (*best_path_by_column)[col].avg_cost);
01699     }
01700     if (potential_avg_cost >= (*best_path_by_column)[col].avg_cost) {
01701       path_is_promising = false;
01702     }
01703   }
01704   // Set best_parent_vse to the best parent recorded in best_path_by_column.
01705   ViterbiStateEntry *best_parent_vse = vse->parent_vse;
01706   BLOB_CHOICE *best_parent_b = vse->parent_b;
01707   if (col > 0 && (*best_path_by_column)[col-1].best_vse != NULL) {
01708     ASSERT_HOST((*best_path_by_column)[col-1].best_b != NULL);
01709     LanguageModelState *best_lms = reinterpret_cast<LanguageModelState *>(
01710         ((*best_path_by_column)[col-1].best_b)->language_model_state());
01711     if (best_lms->contained_in_row == col-1) {
01712       best_parent_vse = (*best_path_by_column)[col-1].best_vse;
01713       best_parent_b = (*best_path_by_column)[col-1].best_b;
01714       if (language_model_debug_level > 0) {
01715         tprintf("Setting best_parent_vse to %p\n", best_parent_vse);
01716       }
01717     }
01718   }
01719   // Check whether this entry terminates the best parent path
01720   // recorded in best_path_by_column.
01721   bool best_not_prolonged = (best_parent_vse != vse->parent_vse);
01722 
01723   // If this path is problematic because of the last unichar - generate
01724   // a pain point to combine it with its left and right neighbor.
01726   if (best_not_prolonged ||
01727       (path_is_promising &&
01728        ProblematicPath(*vse, bit.data()->unichar_id(),
01729                          row+1 == ratings->dimension()))) {
01730     float worst_piece_cert;
01731     bool fragmented;
01732     if (col > 0) {
01733       GetWorstPieceCertainty(col-1, row, chunks_record->ratings,
01734                              &worst_piece_cert, &fragmented);
01735       GeneratePainPoint(col-1, row, false,
01736                         kDefaultPainPointPriorityAdjustment,
01737                         worst_piece_cert, fragmented, best_choice_cert,
01738                         max_char_wh_ratio_, best_parent_b, best_parent_vse,
01739                         chunks_record, pain_points);
01740     }
01741     if (row+1 < ratings->dimension()) {
01742       GetWorstPieceCertainty(col, row+1, chunks_record->ratings,
01743                              &worst_piece_cert, &fragmented);
01744       GeneratePainPoint(col, row+1, true, kDefaultPainPointPriorityAdjustment,
01745                         worst_piece_cert, fragmented, best_choice_cert,
01746                         max_char_wh_ratio_, best_parent_b, best_parent_vse,
01747                         chunks_record, pain_points);
01748     }
01749   }  // end if ProblematicPath
01750 }
01751 
01752 void LanguageModel::GeneratePainPointsFromBestChoice(
01753     HEAP *pain_points,
01754     CHUNKS_RECORD *chunks_record,
01755     BestChoiceBundle *best_choice_bundle) {
01756   // Variables used to backtrack the best_vse path.
01757   ViterbiStateEntry *curr_vse = best_choice_bundle->best_vse;
01758   BLOB_CHOICE *curr_b = best_choice_bundle->best_b;
01759 
01760   // The begin and end fields in the DANGERR vector record positions in the
01761   // blob choice list of the best choice. We need to translate these endpoints
01762   // into the beginning column and ending row for the pain points. We maintain
01763   // danger_begin and danger_end arrays indexed by the position in
01764   // best_choice_bundle->best_char_choices (which is equal to the position
01765   // on the best_choice_bundle->best_vse path).
01766   // danger_end[d] stores the DANGERR_INFO structs with end==d and is
01767   // initialized at the beginning of this function.
01768   // danger_begin[d] stores the DANGERR_INFO struct with begin==d and
01769   // has end set to the row of the end of this ambiguity.
01770   // The translation from end in terms of the best choice index to the end row
01771   // is done while following the parents of best_choice_bundle->best_vse.
01772   ASSERT_HOST(best_choice_bundle->best_char_choices->length() ==
01773               best_choice_bundle->best_vse->length);
01774   DANGERR *danger_begin = NULL;
01775   DANGERR *danger_end = NULL;
01776   int d;
01777   if (!best_choice_bundle->fixpt.empty()) {
01778     danger_begin = new DANGERR[best_choice_bundle->best_vse->length];
01779     danger_end = new DANGERR[best_choice_bundle->best_vse->length];
01780     for (d = 0; d < best_choice_bundle->fixpt.size(); ++d) {
01781       const DANGERR_INFO &danger = best_choice_bundle->fixpt[d];
01782       // Only use n->1 ambiguities.
01783       if (danger.end > danger.begin && !danger.correct_is_ngram &&
01784           (!language_model_ngram_on || danger.dangerous)) {
01785         danger_end[danger.end].push_back(danger);
01786       }
01787     }
01788   }
01789 
01790   // Variables to keep track of punctuation/number streaks.
01791   int punc_streak_end_row = -1;
01792   int punc_streak_length = 0;
01793   float punc_streak_min_cert = 0.0f;
01794 
01795   if (language_model_debug_level > 0) {
01796     tprintf("\nGenerating pain points for best path=%p\n", curr_vse);
01797   }
01798 
01799   int word_index = best_choice_bundle->best_vse->length;
01800   while (curr_vse != NULL) {
01801     word_index--;
01802     ASSERT_HOST(word_index >= 0);
01803     ASSERT_HOST(curr_b != NULL);
01804     if (language_model_debug_level > 0) {
01805       tprintf("Looking at unichar %s\n",
01806               dict_->getUnicharset().id_to_unichar(curr_b->unichar_id()));
01807     }
01808 
01809     int pp_col = reinterpret_cast<LanguageModelState *>(
01810         curr_b->language_model_state())->contained_in_col;
01811     int pp_row = reinterpret_cast<LanguageModelState *>(
01812         curr_b->language_model_state())->contained_in_row;
01813 
01814     // Generate pain points for ambiguities found by NoDangerousAmbig().
01815     if (danger_end != NULL) {
01816       // Translate end index of an ambiguity to an end row.
01817       for (d = 0; d < danger_end[word_index].size(); ++d) {
01818         danger_end[word_index][d].end = pp_row;
01819         danger_begin[danger_end[word_index][d].begin].push_back(
01820             danger_end[word_index][d]);
01821       }
01822       // Generate a pain point to combine unichars in the "wrong" part
01823       // of the ambiguity.
01824       for (d = 0; d < danger_begin[word_index].size(); ++d) {
01825         if (language_model_debug_level > 0) {
01826           tprintf("Generating pain point from %sambiguity\n",
01827                   danger_begin[word_index][d].dangerous ? "dangerous " : "");
01828         }
01829         GeneratePainPoint(pp_col, danger_begin[word_index][d].end, false,
01830                           danger_begin[word_index][d].dangerous ?
01831                           kCriticalPainPointPriorityAdjustment :
01832                           kBestChoicePainPointPriorityAdjustment,
01833                           best_choice_bundle->best_choice->certainty(), true,
01834                           best_choice_bundle->best_choice->certainty(),
01835                           kLooseMaxCharWhRatio,
01836                           curr_vse->parent_b, curr_vse->parent_vse,
01837                           chunks_record, pain_points);
01838       }
01839     }
01840 
01841     if (!language_model_ngram_on) {  // no need to use further heuristics if we
01842                                      // are guided by the character ngram model
01843       // Generate pain points for problematic sub-paths.
01844       if (ProblematicPath(*curr_vse, curr_b->unichar_id(),
01845                           pp_row+1 == chunks_record->ratings->dimension())) {
01846         if (language_model_debug_level > 0) {
01847           tprintf("Generating pain point from a problematic sub-path\n");
01848         }
01849         float worst_piece_cert;
01850         bool fragment;
01851         if (pp_col > 0) {
01852           GetWorstPieceCertainty(pp_col-1, pp_row, chunks_record->ratings,
01853                                  &worst_piece_cert, &fragment);
01854           GeneratePainPoint(pp_col-1, pp_row, false,
01855                             kBestChoicePainPointPriorityAdjustment,
01856                             worst_piece_cert, true,
01857                             best_choice_bundle->best_choice->certainty(),
01858                             max_char_wh_ratio_, NULL, NULL,
01859                             chunks_record, pain_points);
01860         }
01861         if (pp_row+1 < chunks_record->ratings->dimension()) {
01862           GetWorstPieceCertainty(pp_col, pp_row+1, chunks_record->ratings,
01863                                  &worst_piece_cert, &fragment);
01864           GeneratePainPoint(pp_col, pp_row+1, true,
01865                             kBestChoicePainPointPriorityAdjustment,
01866                             worst_piece_cert, true,
01867                             best_choice_bundle->best_choice->certainty(),
01868                             max_char_wh_ratio_, NULL, NULL,
01869                             chunks_record, pain_points);
01870         }
01871       }
01872 
01873       // Generate a pain point if we encountered a punctuation/number streak to
01874       // combine all punctuation marks into one blob and try to classify it.
01875       bool is_alpha = dict_->getUnicharset().get_isalpha(curr_b->unichar_id());
01876       if (!is_alpha) {
01877         if (punc_streak_end_row == -1) punc_streak_end_row = pp_row;
01878         punc_streak_length++;
01879         if (curr_b->certainty() < punc_streak_min_cert)
01880           punc_streak_min_cert = curr_b->certainty();
01881       }
01882       if (is_alpha || curr_vse->parent_vse == NULL) {
01883         if (punc_streak_end_row != -1 && punc_streak_length > 1) {
01884           if (language_model_debug_level > 0) {
01885             tprintf("Generating pain point from a punctuation streak\n");
01886           }
01887           if (is_alpha ||
01888               (curr_vse->parent_vse == NULL && punc_streak_length > 2)) {
01889             GeneratePainPoint(pp_row+1, punc_streak_end_row, false,
01890                               kBestChoicePainPointPriorityAdjustment,
01891                               punc_streak_min_cert, true,
01892                               best_choice_bundle->best_choice->certainty(),
01893                               max_char_wh_ratio_, curr_b, curr_vse,
01894                               chunks_record, pain_points);
01895           }
01896           // Handle punctuation/number streaks starting from the first unichar.
01897           if (curr_vse->parent_vse == NULL) {
01898             GeneratePainPoint(0, punc_streak_end_row, false,
01899                               kBestChoicePainPointPriorityAdjustment,
01900                               punc_streak_min_cert, true,
01901                               best_choice_bundle->best_choice->certainty(),
01902                               max_char_wh_ratio_, NULL, NULL,
01903                               chunks_record, pain_points);
01904           }
01905         }
01906         punc_streak_end_row = -1;
01907         punc_streak_length = 0;
01908         punc_streak_min_cert = 0.0f;
01909       }  // end handling punctuation streaks
01910     }
01911 
01912     curr_b = curr_vse->parent_b;
01913     curr_vse = curr_vse->parent_vse;
01914   }  // end looking at best choice subpaths
01915 
01916   // Clean up.
01917   if (danger_end != NULL) {
01918     delete[] danger_begin;
01919     delete[] danger_end;
01920   }
01921 }
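// Editor's note: an illustrative trace of the punctuation streak handling
// above (hypothetical input). Walking the best path backwards through
// "x...", the three '.' unichars accumulate punc_streak_length == 3, with
// punc_streak_end_row fixed at the last '.'. When the alphabetic 'x' is
// reached, a pain point is generated spanning rows pp_row+1 through
// punc_streak_end_row, so the classifier gets to try the whole streak as
// a single combined blob.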
01922 
01923 bool LanguageModel::GeneratePainPoint(
01924     int col, int row, bool ok_to_extend, float priority,
01925     float worst_piece_cert, bool fragmented, float best_choice_cert,
01926     float max_char_wh_ratio,
01927     BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse,
01928     CHUNKS_RECORD *chunks_record, HEAP *pain_points) {
01929   if (col < 0 || row >= chunks_record->ratings->dimension() ||
01930       chunks_record->ratings->get(col, row) != NOT_CLASSIFIED) {
01931     return false;
01932   }
01933   if (language_model_debug_level > 3) {
01934     tprintf("\nGenerating pain point for col=%d row=%d priority=%g parent=",
01935             col, row, priority);
01936     if (parent_vse != NULL) {
01937       PrintViterbiStateEntry("", parent_vse, parent_b, chunks_record);
01938     } else {
01939       tprintf("NULL");
01940     }
01941     tprintf("\n");
01942   }
01943 
01944   AssociateStats associate_stats;
01945   ComputeAssociateStats(col, row, max_char_wh_ratio, parent_vse,
01946                         chunks_record, &associate_stats);
01947   // For fixed-pitch fonts/languages: if the current combined blob overlaps
01948     // the next blob on the right and it is ok to extend the blob, try expanding
01949   // the blob until there is no overlap with the next blob on the right or
01950   // until the width-to-height ratio becomes too large.
01951   if (ok_to_extend) {
01952     while (associate_stats.bad_fixed_pitch_right_gap &&
01953            row+1 < chunks_record->ratings->dimension() &&
01954            !associate_stats.bad_fixed_pitch_wh_ratio) {
01955       ComputeAssociateStats(col, ++row, max_char_wh_ratio, parent_vse,
01956                             chunks_record, &associate_stats);
01957     }
01958   }
01959 
01960   if (associate_stats.bad_shape) {
01961     if (language_model_debug_level > 3) {
01962       tprintf("Discarded pain point with a bad shape\n");
01963     }
01964     return false;
01965   }
01966 
01967   // Compute pain point priority.
01968   if (associate_stats.shape_cost > 0) {
01969     priority *= associate_stats.shape_cost;
01970   }
01971   if (worst_piece_cert < best_choice_cert) {
01972     worst_piece_cert = best_choice_cert;
01973   }
01974   priority *= CertaintyScore(worst_piece_cert);
01975   if (fragmented) priority /= kDefaultPainPointPriorityAdjustment;
01976 
01977   if (language_model_debug_level > 3) {
01978     tprintf("worst_piece_cert=%g fragmented=%d\n",
01979             worst_piece_cert, fragmented);
01980   }
01981 
01982   if (parent_vse != NULL) {
01983     priority *= sqrt(parent_vse->cost / static_cast<float>(col));
01984     if (parent_vse->dawg_info != NULL) {
01985       priority /= kDefaultPainPointPriorityAdjustment;
01986       if (parent_vse->length > language_model_min_compound_length) {
01987         priority /= sqrt(static_cast<double>(parent_vse->length));
01988       }
01989     }
01990   }
01991 
01992   MATRIX_COORD *pain_point = new MATRIX_COORD(col, row);
01993   if (HeapPushCheckSize(pain_points, priority, pain_point)) {
01994     if (language_model_debug_level) {
01995       tprintf("Added pain point with priority %g\n", priority);
01996     }
01997     return true;
01998   } else {
01999     delete pain_point;
02000     if (language_model_debug_level) tprintf("Pain points heap is full\n");
02001     return false;
02002   }
02003 }
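// Editor's note: a worked example of the priority computation above, with
// hypothetical values. Start from priority = 2.0
// (kDefaultPainPointPriorityAdjustment), shape_cost = 1.5,
// CertaintyScore(worst_piece_cert) = 1.2, a fragmented piece, and a parent
// path with dawg_info set, cost 4.0 at col = 4 and length below
// language_model_min_compound_length:
//   priority = 2.0 * 1.5 * 1.2 / 2.0 * sqrt(4.0 / 4.0) / 2.0 = 0.9
// Smaller values appear to be handled first (kCriticalPainPointPriority-
// Adjustment, the most urgent of the adjustment constants, is also the
// smallest).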
02004 
02005 }  // namespace tesseract