Tesseract 3.02 — wordrec/language_model.h
00001 00002 // File: language_model.h 00003 // Description: Functions that utilize the knowledge about the properties, 00004 // structure and statistics of the language to help recognition. 00005 // Author: Daria Antonova 00006 // Created: Mon Nov 11 11:26:43 PST 2009 00007 // 00008 // (C) Copyright 2009, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00022 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00023 00024 #include "associate.h" 00025 #include "dawg.h" 00026 #include "dict.h" 00027 #include "fontinfo.h" 00028 #include "intproto.h" 00029 #include "matrix.h" 00030 #include "oldheap.h" 00031 #include "params.h" 00032 #include "pageres.h" 00033 00034 namespace tesseract { 00035 00036 // Used for expressing various language model flags. 00037 typedef unsigned char LanguageModelFlagsType; 00038 00039 // Struct for keeping track of the consistency of the path. 00040 struct LanguageModelConsistencyInfo { 00041 LanguageModelConsistencyInfo() 00042 : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false), 00043 num_non_first_upper(0), num_lower(0), 00044 script_id(0), inconsistent_script(false), 00045 num_alphas(0), num_digits(0), num_other(0), 00046 num_inconsistent_spaces(0), inconsistent_font(false) {} 00047 inline int NumInconsistentPunc() const { 00048 return invalid_punc ? 
num_punc : 0; 00049 } 00050 inline int NumInconsistentCase() const { 00051 return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper; 00052 } 00053 inline int NumInconsistentChartype() const { 00054 return (NumInconsistentPunc() + num_other + 00055 ((num_alphas > num_digits) ? num_digits : num_alphas)); 00056 } 00057 inline bool Consistent() const { 00058 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 && 00059 NumInconsistentChartype() == 0 && !inconsistent_script); 00060 } 00061 inline int NumInconsistentSpaces() const { 00062 return num_inconsistent_spaces; 00063 } 00064 00065 EDGE_REF punc_ref; 00066 int num_punc; 00067 bool invalid_punc; 00068 int num_non_first_upper; 00069 int num_lower; 00070 int script_id; 00071 bool inconsistent_script; 00072 int num_alphas; 00073 int num_digits; 00074 int num_other; 00075 int num_inconsistent_spaces; 00076 bool inconsistent_font; 00077 }; 00078 00079 00080 // The following structs are used for storing the state of the language model 00081 // in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs 00082 // and the links are the replationships between the underlying blobs (see 00083 // segsearch.h for a more detailed description). 00084 // Each of the BLOB_CHOICEs contains LanguageModelState struct, which has 00085 // a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi 00086 // search leading up to and including this BLOB_CHOICE. 00087 // Each ViterbiStateEntry contains information from various components of the 00088 // language model: dawgs in which the path is found, character ngram model 00089 // probability of the path, script/chartype/font consistency info, state for 00090 // language-specific heuristics (e.g. hyphenated and compund words, lower/upper 00091 // case preferences, etc). 
00092 // Each ViterbiStateEntry also contains the parent pointer, so that the path 00093 // that it represents (WERD_CHOICE) can be constructed by following these 00094 // parent pointers. 00095 00096 // Struct for storing additional information used by Dawg language model 00097 // component. It stores the set of active dawgs in which the sequence of 00098 // letters on a path can be found and the constraints that have to be 00099 // satisfied at the end of the word (e.g. beginning/ending punctuation). 00100 struct LanguageModelDawgInfo { 00101 LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c, 00102 PermuterType pt) : permuter(pt) { 00103 active_dawgs = new DawgInfoVector(*a); 00104 constraints = new DawgInfoVector(*c); 00105 } 00106 ~LanguageModelDawgInfo() { 00107 delete active_dawgs; 00108 delete constraints; 00109 } 00110 DawgInfoVector *active_dawgs; 00111 DawgInfoVector *constraints; 00112 PermuterType permuter; 00113 }; 00114 00115 // Struct for storing additional information used by Ngram language model 00116 // component. 00117 struct LanguageModelNgramInfo { 00118 LanguageModelNgramInfo(const char *c, int l, bool p, float np, float nc) 00119 : context(c), context_unichar_step_len(l), pruned(p), ngram_prob(np), 00120 ngram_cost(nc) {} 00121 STRING context; // context string 00122 // Length of the context measured by advancing using UNICHAR::utf8_step() 00123 // (should be at most the order of the character ngram model used). 00124 int context_unichar_step_len; 00125 // The paths with pruned set are pruned out from the perspective of the 00126 // character ngram model. They are explored further because they represent 00127 // a dictionary match or a top choice. Thus ngram_info is still computed 00128 // for them in order to calculate the combined cost. 
00129 bool pruned; 00130 // -ln(P_ngram_model(path)) 00131 float ngram_prob; 00132 // -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ] 00133 float ngram_cost; 00134 }; 00135 00136 // Struct for storing the information about a path in the segmentation graph 00137 // explored by Viterbi search. 00138 struct ViterbiStateEntry : public ELIST_LINK { 00139 ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe, 00140 BLOB_CHOICE *b, float c, float ol, 00141 const LanguageModelConsistencyInfo &ci, 00142 const AssociateStats &as, 00143 LanguageModelFlagsType tcf, 00144 LanguageModelDawgInfo *d, LanguageModelNgramInfo *n) 00145 : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()), 00146 min_certainty(b->certainty()), adapted(b->adapted()), length(1), 00147 outline_length(ol), consistency_info(ci), associate_stats(as), 00148 top_choice_flags(tcf), dawg_info(d), ngram_info(n), updated(true) { 00149 if (pe != NULL) { 00150 ratings_sum += pe->ratings_sum; 00151 if (pe->min_certainty < min_certainty) { 00152 min_certainty = pe->min_certainty; 00153 } 00154 adapted += pe->adapted; 00155 length += pe->length; 00156 outline_length += pe->outline_length; 00157 } 00158 } 00159 ~ViterbiStateEntry() { 00160 delete dawg_info; 00161 delete ngram_info; 00162 } 00163 // Comparator function for sorting ViterbiStateEntry_LISTs in 00164 // non-increasing order of costs. 00165 static int Compare(const void *e1, const void *e2) { 00166 const ViterbiStateEntry *ve1 = 00167 *reinterpret_cast<const ViterbiStateEntry * const *>(e1); 00168 const ViterbiStateEntry *ve2 = 00169 *reinterpret_cast<const ViterbiStateEntry * const *>(e2); 00170 return (ve1->cost < ve2->cost) ? 
-1 : 1; 00171 } 00172 inline bool Consistent() const { 00173 if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) { 00174 return true; 00175 } 00176 return consistency_info.Consistent(); 00177 } 00178 00179 // The cost is an adjusted ratings sum, that is adjusted by all the language 00180 // model components that use Viterbi search. 00181 float cost; 00182 00183 // Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this). 00184 BLOB_CHOICE *parent_b; 00185 ViterbiStateEntry *parent_vse; 00186 00187 // Various information about the characters on the path represented 00188 // by this ViterbiStateEntry. 00189 float ratings_sum; // sum of ratings of character on the path 00190 float min_certainty; // minimum certainty on the path 00191 int adapted; // number of BLOB_CHOICES from adapted templates 00192 int length; // number of characters on the path 00193 float outline_length; // length of the outline so far 00194 LanguageModelConsistencyInfo consistency_info; // path consistency info 00195 AssociateStats associate_stats; // character widths/gaps/seams 00196 00197 // Flags for marking the entry as a top choice path with 00198 // the smallest rating or lower/upper case letters). 00199 LanguageModelFlagsType top_choice_flags; 00200 00201 // Extra information maintained by Dawg laguage model component 00202 // (owned by ViterbiStateEntry). 00203 LanguageModelDawgInfo *dawg_info; 00204 00205 // Extra information maintained by Ngram laguage model component 00206 // (owned by ViterbiStateEntry). 00207 LanguageModelNgramInfo *ngram_info; 00208 00209 bool updated; // set to true if the entry has just been created/updated 00210 }; 00211 00212 ELISTIZEH(ViterbiStateEntry); 00213 00214 // Struct to store information maintained by various language model components. 
00215 struct LanguageModelState { 00216 LanguageModelState(int col, int row) : contained_in_col(col), 00217 contained_in_row(row), viterbi_state_entries_prunable_length(0), 00218 viterbi_state_entries_length(0), 00219 viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {} 00220 ~LanguageModelState() {} 00221 00222 // Ratings matrix cell that holds this LanguageModelState 00223 // (needed to construct best STATE for rebuild_current_state() 00224 // and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()). 00225 int contained_in_col; 00226 int contained_in_row; 00227 00228 // Storage for the Viterbi state. 00229 ViterbiStateEntry_LIST viterbi_state_entries; 00230 // Number and max cost of prunable paths in viterbi_state_entries. 00231 int viterbi_state_entries_prunable_length; 00232 // Total number of entries in viterbi_state_entries. 00233 int viterbi_state_entries_length; 00234 float viterbi_state_entries_prunable_max_cost; 00235 00236 // TODO(daria): add font consistency checking. 00237 }; 00238 00239 // Bundle together all the things pertaining to the best choice/state. 00240 struct BestChoiceBundle { 00241 BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc, 00242 BLOB_CHOICE_LIST_VECTOR *bcc) 00243 : best_state(s), best_choice(bc), raw_choice(rc), 00244 best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {} 00245 00246 STATE *best_state; 00247 WERD_CHOICE *best_choice; 00248 WERD_CHOICE *raw_choice; 00249 BLOB_CHOICE_LIST_VECTOR *best_char_choices; 00250 bool updated; 00251 DANGERR fixpt; 00252 ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE 00253 BLOB_CHOICE *best_b; // at the end of the best choice path 00254 }; 00255 00256 struct BestPathByColumn { 00257 float avg_cost; 00258 ViterbiStateEntry *best_vse; 00259 BLOB_CHOICE *best_b; 00260 }; 00261 00262 // This class that contains the data structures and functions necessary 00263 // to represent and use the knowledge about the language. 
00264 class LanguageModel { 00265 public: 00266 // Adjustments to pain point priority. 00267 static const float kInitialPainPointPriorityAdjustment; 00268 static const float kDefaultPainPointPriorityAdjustment; 00269 static const float kBestChoicePainPointPriorityAdjustment; 00270 static const float kCriticalPainPointPriorityAdjustment; 00271 00272 // Denominator for normalizing per-letter ngram cost when deriving 00273 // penalty adjustments. 00274 static const float kMaxAvgNgramCost; 00275 // Minimum word length for fixed length dawgs. 00276 // TODO(daria): check in the new chi/jpn.traineddata without the 00277 // fixed length dawg of length 1 and delete this variable. 00278 static const int kMinFixedLengthDawgLength; 00279 // If there is a significant drop in character ngram probability or a 00280 // dangerous ambiguity make the thresholds on what blob combinations 00281 // can be classified looser. 00282 static const float kLooseMaxCharWhRatio; 00283 00284 // Masks for interpreting which language model components 00285 // were changed by the call to UpdateState(). 
00286 static const LanguageModelFlagsType kSmallestRatingFlag = 0x1; 00287 static const LanguageModelFlagsType kLowerCaseFlag = 0x2; 00288 static const LanguageModelFlagsType kUpperCaseFlag = 0x4; 00289 static const LanguageModelFlagsType kConsistentFlag = 0x8; 00290 static const LanguageModelFlagsType kDawgFlag = 0x10; 00291 static const LanguageModelFlagsType kNgramFlag = 0x20; 00292 static const LanguageModelFlagsType kJustClassifiedFlag = 0x80; 00293 static const LanguageModelFlagsType kAllChangedFlag = 0xff; 00294 00295 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict); 00296 ~LanguageModel(); 00297 00298 // Updates data structures that are used for the duration of the segmentation 00299 // search on the current word; 00300 void InitForWord(const WERD_CHOICE *prev_word, 00301 bool fixed_pitch, float best_choice_cert, 00302 float max_char_wh_ratio, float rating_cert_scale, 00303 HEAP *pain_points, CHUNKS_RECORD *chunks_record, 00304 BlamerBundle *blamer_bundle, bool debug_blamer); 00305 // Resets all the "updated" flags used by the Viterbi search that were 00306 // "registered" during the update of the ratings matrix. 00307 void CleanUp(); 00308 // Deletes and sets to NULL language model states of each of the 00309 // BLOB_CHOICEs in the given BLOB_CHOICE_LIST. 00310 void DeleteState(BLOB_CHOICE_LIST *choices); 00311 00312 // Updates language model state of the given BLOB_CHOICE_LIST (from 00313 // the ratings matrix) a its parent. Updates pain_points if new 00314 // problematic points are found in the segmentation graph. 00315 // 00316 // At most language_model_viterbi_list_size are kept in each 00317 // LanguageModelState.viterbi_state_entries list. 00318 // At most language_model_viterbi_list_max_num_prunable of those are prunable 00319 // (non-dictionary) paths. 00320 // The entries that represent dictionary word paths are kept at the front 00321 // of the list. 
00322 // The list ordered by cost that is computed collectively by several 00323 // language model components (currently dawg and ngram components). 00324 // 00325 // best_path_by_column records the lowest cost path found so far for each 00326 // column of the chunks_record->ratings matrix over all the rows. This 00327 // array is updated if a lower cost ViterbiStateEntry is created in curr_col. 00328 LanguageModelFlagsType UpdateState( 00329 LanguageModelFlagsType changed, 00330 int curr_col, int curr_row, 00331 BLOB_CHOICE_LIST *curr_list, 00332 BLOB_CHOICE_LIST *parent_list, 00333 HEAP *pain_points, 00334 BestPathByColumn *best_path_by_column[], 00335 CHUNKS_RECORD *chunks_record, 00336 BestChoiceBundle *best_choice_bundle, 00337 BlamerBundle *blamer_bundle); 00338 00339 // Generates pain points from the problematic top choice paths when the 00340 // segmentation search is guided by the character ngram model. 00341 // It is necessary to consider problematic the top choice paths instead of 00342 // the problematic lowest cost paths because the character ngram model 00343 // might assign a very high cost to very improbably paths. For example, 00344 // "liot" might have a much lower cost than "llot", and the character ngram 00345 // model might detect a dip in probability for p(t|lio) at the end of the 00346 // word, but not at the beginning (p(i|l) would be ok). However, looking at 00347 // the dips in character ngram probability of the top choices we would be 00348 // able to stop the problematic points (p(l| l) would be low). 00349 void GenerateNgramModelPainPointsFromColumn(int col, int row, 00350 HEAP *pain_points, 00351 CHUNKS_RECORD *chunks_record); 00352 00353 // Generates pain points from the problematic lowest cost paths that are 00354 // "promising" (i.e. 
would have the cost lower than the one recorded in 00355 // best_path_by_column if the problematic ending of the path is removed 00356 // and after being combined with another blob the certainty of the last 00357 // blob is improved). 00358 void GenerateProblematicPathPainPointsFromColumn( 00359 int col, int row, float best_choice_cert, 00360 HEAP *pain_points, BestPathByColumn *best_path_by_column[], 00361 CHUNKS_RECORD *chunks_record); 00362 00363 // This function can be called after processing column col of the 00364 // chunks_record->ratings matrix in order to find the promising paths 00365 // that were terminated or made inconsistent by the character choices 00366 // in column col. If such paths are identified, this function generates 00367 // pain points to combine the problematic cells of the matrix. 00368 void GeneratePainPointsFromColumn( 00369 int col, 00370 const GenericVector<int> &non_empty_rows, 00371 float best_choice_cert, 00372 HEAP *pain_points, 00373 BestPathByColumn *best_path_by_column[], 00374 CHUNKS_RECORD *chunks_record); 00375 00376 // Generates a pain point for each problematic point on the best choice 00377 // path. Such problematic points could be a termination of a dicionary 00378 // word, dip in ngram probability, invalid punctuation, inconsistent 00379 // case/chartype/script or punctuation in the middle of a word. 00380 void GeneratePainPointsFromBestChoice( 00381 HEAP *pain_points, 00382 CHUNKS_RECORD *chunks_record, 00383 BestChoiceBundle *best_choice_bundle); 00384 00385 // Adds a pain point to the given pain_points queue that will cause 00386 // the entry at chunks_record->ratings(col, row) to be classified. 00387 // The priority of the pain point is set to be: 00388 // 00389 // priority_adjustment * sqrt(avg_parent_cost) 00390 // ---------------------------------------------------- 00391 // sqrt(dict_parent_path_length) * |worst_piece_cert| 00392 // 00393 // The priority is further lowered if fragmented is true. 
00394 // Reurns true if a new pain point was added to pain_points. 00395 bool GeneratePainPoint(int col, int row, bool ok_to_extend, 00396 float priority_adjustment, 00397 float worst_piece_cert, 00398 bool fragmented, 00399 float best_choice_cert, 00400 float max_char_wh_ratio, 00401 BLOB_CHOICE *parent_b, 00402 ViterbiStateEntry *parent_vse, 00403 CHUNKS_RECORD *chunks_record, 00404 HEAP *pain_points); 00405 00406 // Returns true if an acceptable best choice was discovered. 00407 inline bool AcceptableChoiceFound() { return acceptable_choice_found_; } 00408 00409 // Fills cert with the worst certainty of the top non-fragmented choice 00410 // of the left and right neighbor of the given col,row. 00411 // Sets fragmented if any of the neighbors have a fragmented character 00412 // as the top choice. 00413 inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, 00414 float *cert, bool *fragmented) { 00415 *cert = 0.0f; 00416 *fragmented = false; 00417 if (row > 0) { 00418 GetPieceCertainty(ratings->get(col, row-1), cert, fragmented); 00419 } 00420 if (col+1 < ratings->dimension()) { 00421 GetPieceCertainty(ratings->get(col+1, row), cert, fragmented); 00422 } 00423 ASSERT_HOST(*cert < 0.0f); 00424 } 00425 00426 // Returns outline length of the given blob is computed as: 00427 // rating_cert_scale * rating / certainty 00428 // Since from Wordrec::SegSearch() in segsearch.cpp 00429 // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale 00430 // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp 00431 // Rating = Certainty = next.rating 00432 // Rating *= rating_scale * Results->BlobLength 00433 // Certainty *= -(getDict().certainty_scale) 00434 inline float ComputeOutlineLength(BLOB_CHOICE *b) { 00435 return rating_cert_scale_ * b->rating() / b->certainty(); 00436 } 00437 00438 protected: 00439 00440 inline float CertaintyScore(float cert) { 00441 if (language_model_use_sigmoidal_certainty) { 00442 // cert is assumed to be 
between 0 and -dict_->certainty_scale. 00443 // If you enable language_model_use_sigmoidal_certainty, you 00444 // need to adjust language_model_ngram_nonmatch_score as well. 00445 cert = -cert / dict_->certainty_scale; 00446 return 1.0f / (1.0f + exp(10.0f * cert)); 00447 } else { 00448 return (-1.0f / cert); 00449 } 00450 } 00451 00452 inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension, 00453 UNICHAR_ID unichar_id) { 00454 return (!dict_->getUnicharset().get_isalpha(unichar_id) && 00455 !dict_->getUnicharset().get_isdigit(unichar_id) && 00456 col > 0 && row+1 < dimension); 00457 } 00458 00459 inline bool IsFragment(BLOB_CHOICE *b) { 00460 return dict_->getUnicharset().get_fragment(b->unichar_id()); 00461 } 00462 00463 inline bool IsHan(int script_id) { 00464 return ((dict_->getUnicharset().han_sid() != 00465 dict_->getUnicharset().null_sid()) && 00466 (script_id == dict_->getUnicharset().han_sid())); 00467 } 00468 00469 // Finds the first non-fragmented character in the given BLOB_CHOICE_LIST 00470 // and updates cert if its certainty is less than the one recorded in cert. 00471 // Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment. 00472 inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist, 00473 float *cert, bool *fragmented) { 00474 if (blist == NOT_CLASSIFIED || blist->empty()) return; 00475 BLOB_CHOICE_IT bit(blist); 00476 while (!bit.at_last() && IsFragment(bit.data())) { 00477 *fragmented = true; 00478 bit.forward(); // skip fragments 00479 } 00480 // Each classification must have at least one non-fragmented choice. 
00481 ASSERT_HOST(!IsFragment(bit.data())); 00482 if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty(); 00483 } 00484 00485 inline float ComputeAdjustment(int num_problems, float penalty) { 00486 if (num_problems == 0) return 0.0f; 00487 if (num_problems == 1) return penalty; 00488 return (penalty + (language_model_penalty_increment * 00489 static_cast<float>(num_problems-1))); 00490 } 00491 00492 // Computes the adjustment to the ratings sum based on the given 00493 // consistency_info. The paths with invalid punctuation, inconsistent 00494 // case and character type are penalized proportionally to the number 00495 // of inconsistencies on the path. 00496 inline float ComputeConsistencyAdjustment( 00497 const LanguageModelDawgInfo *dawg_info, 00498 const LanguageModelConsistencyInfo &consistency_info) { 00499 if (dawg_info != NULL) { 00500 return ComputeAdjustment(consistency_info.NumInconsistentCase(), 00501 language_model_penalty_case); 00502 } 00503 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), 00504 language_model_penalty_punc) + 00505 ComputeAdjustment(consistency_info.NumInconsistentCase(), 00506 language_model_penalty_case) + 00507 ComputeAdjustment(consistency_info.NumInconsistentChartype(), 00508 language_model_penalty_chartype) + 00509 ComputeAdjustment(consistency_info.NumInconsistentSpaces(), 00510 language_model_penalty_spacing) + 00511 (consistency_info.inconsistent_script ? 00512 language_model_penalty_script : 0.0f) + 00513 (consistency_info.inconsistent_font ? 00514 language_model_penalty_font : 0.0f)); 00515 } 00516 00517 // Returns an adjusted ratings sum that includes inconsistency penalties. 
00518 inline float ComputeConsistencyAdjustedRatingsSum( 00519 float ratings_sum, 00520 const LanguageModelDawgInfo *dawg_info, 00521 const LanguageModelConsistencyInfo &consistency_info) { 00522 return (ratings_sum * (1.0f + ComputeConsistencyAdjustment( 00523 dawg_info, consistency_info))); 00524 } 00525 00526 // Returns an adjusted ratings sum that includes inconsistency penalties, 00527 // penalties for non-dictionary paths and paths with dips in ngram 00528 // probability. 00529 float ComputeAdjustedPathCost( 00530 float ratings_sum, int length, float dawg_score, 00531 const LanguageModelDawgInfo *dawg_info, 00532 const LanguageModelNgramInfo *ngram_info, 00533 const LanguageModelConsistencyInfo &consistency_info, 00534 const AssociateStats &associate_stats, 00535 ViterbiStateEntry *parent_vse); 00536 00537 // Returns true if the given ViterbiStateEntry represents a problematic 00538 // path. A path is considered problematic if the last unichar makes it 00539 // inconsistent, introduces a dip in ngram probability or transforms a 00540 // dictionary path into a non-dictionary one. 00541 bool ProblematicPath(const ViterbiStateEntry &vse, 00542 UNICHAR_ID unichar_id, bool word_end); 00543 00544 // Finds the first lower and upper case character in curr_list. 00545 // If none found, chooses the first character in the list. 00546 void GetTopChoiceLowerUpper(LanguageModelFlagsType changed, 00547 BLOB_CHOICE_LIST *curr_list, 00548 BLOB_CHOICE **first_lower, 00549 BLOB_CHOICE **first_upper); 00550 00551 // Helper function that computes the cost of the path composed of the 00552 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE. 00553 // Adds a new ViterbiStateEntry to the list of viterbi entries 00554 // in the given BLOB_CHOICE if the new path looks good enough. 00555 // Returns LanguageModelFlagsType that indicates which language 00556 // model components were involved in creating the new entry. 
00557 LanguageModelFlagsType AddViterbiStateEntry( 00558 LanguageModelFlagsType top_choice_flags, 00559 float denom, 00560 bool word_end, 00561 int curr_col, int curr_row, 00562 BLOB_CHOICE *b, 00563 BLOB_CHOICE *parent_b, 00564 ViterbiStateEntry *parent_vse, 00565 HEAP *pain_points, 00566 BestPathByColumn *best_path_by_column[], 00567 CHUNKS_RECORD *chunks_record, 00568 BestChoiceBundle *best_choice_bundle, 00569 BlamerBundle *blamer_bundle); 00570 00571 // Pretty print information in the given ViterbiStateEntry. 00572 void PrintViterbiStateEntry(const char *msg, 00573 ViterbiStateEntry *vse, 00574 BLOB_CHOICE *b, 00575 CHUNKS_RECORD *chunks_record); 00576 00577 // Determines whether a potential entry is a true top choice and 00578 // updates changed accordingly. 00579 // 00580 // Note: The function assumes that b, top_choice_flags and changed 00581 // are not NULL. 00582 void GenerateTopChoiceInfo( 00583 float ratings_sum, 00584 const LanguageModelDawgInfo *dawg_info, 00585 const LanguageModelConsistencyInfo &consistency_info, 00586 const ViterbiStateEntry *parent_vse, 00587 BLOB_CHOICE *b, 00588 LanguageModelFlagsType *top_choice_flags, 00589 LanguageModelFlagsType *changed); 00590 00591 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and 00592 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo 00593 // with updated active dawgs, constraints and permuter. 00594 // 00595 // Note: the caller is responsible for deleting the returned pointer. 00596 LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id, 00597 int curr_col, int curr_row, 00598 const BLOB_CHOICE &b, 00599 const ViterbiStateEntry *parent_vse, 00600 LanguageModelFlagsType *changed); 00601 00602 // Computes p(unichar | parent context) and records it in ngram_cost. 00603 // If b.unichar_id() is an unlikely continuation of the parent context 00604 // sets found_small_prob to true and returns NULL. 
00605 // Otherwise creates a new LanguageModelNgramInfo entry containing the 00606 // updated context (that includes b.unichar_id() at the end) and returns it. 00607 // 00608 // Note: the caller is responsible for deleting the returned pointer. 00609 LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, 00610 float certainty, float denom, 00611 int curr_col, int curr_row, 00612 const ViterbiStateEntry *parent_vse, 00613 BLOB_CHOICE *parent_b, 00614 LanguageModelFlagsType *changed); 00615 00616 // Computes -(log(prob(classifier)) + log(prob(ngram model))) 00617 // for the given unichar in the given context. If there are multiple 00618 // unichars at one position - takes the average of their probabilities. 00619 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters, 00620 // since probability_in_context() can only handle one at a time (while 00621 // unicharset might contain ngrams and glyphs composed from multiple UTF8 00622 // characters). 00623 float ComputeNgramCost(const char *unichar, float certainty, float denom, 00624 const char *context, int *unichar_step_len, 00625 bool *found_small_prob, float *ngram_prob); 00626 00627 // Computes the normalization factors for the classifier confidences 00628 // (used by ComputeNgramCost()). 00629 float ComputeDenom(BLOB_CHOICE_LIST *curr_list); 00630 00631 // Fills the given consistenty_info based on parent_vse.consistency_info 00632 // and on the consistency of the given unichar_id with parent_vse. 00633 void FillConsistencyInfo( 00634 int curr_col, bool word_end, BLOB_CHOICE *b, 00635 ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, 00636 CHUNKS_RECORD *chunks_record, 00637 LanguageModelConsistencyInfo *consistency_info); 00638 00639 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs 00640 // on the path represented by the given BLOB_CHOICE and language model 00641 // state entries (lmse, dse). 
The path is re-constructed by following 00642 // the parent pointers in the the lang model state entries). If the 00643 // constructed WERD_CHOICE is better than the best/raw choice recorded 00644 // in the best_choice_bundle, this function updates the corresponding 00645 // fields and sets best_choice_bunldle->updated to true. 00646 void UpdateBestChoice(BLOB_CHOICE *b, 00647 ViterbiStateEntry *vse, 00648 HEAP *pain_points, 00649 CHUNKS_RECORD *chunks_record, 00650 BestChoiceBundle *best_choice_bundle, 00651 BlamerBundle *blamer_bundle); 00652 00653 // Fills the given floats array with raw features extracted from the 00654 // path represented by the given ViterbiStateEntry. 00655 // See ccstruct/params_training_featdef.h for feature information. 00656 void ExtractRawFeaturesFromPath(const ViterbiStateEntry &vse, 00657 float *features); 00658 00659 // Constructs a WERD_CHOICE by tracing parent pointers starting with 00660 // the given LanguageModelStateEntry. Returns the constructed word. 00661 // Updates best_char_choices, certainties and state if they are not 00662 // NULL (best_char_choices and certainties are assumed to have the 00663 // length equal to lmse->length). 00664 // The caller is resposible for freeing memory associated with the 00665 // returned WERD_CHOICE. 00666 WERD_CHOICE *ConstructWord(BLOB_CHOICE *b, 00667 ViterbiStateEntry *vse, 00668 CHUNKS_RECORD *chunks_record, 00669 BLOB_CHOICE_LIST_VECTOR *best_char_choices, 00670 float certainties[], 00671 float *dawg_score, 00672 STATE *state, 00673 BlamerBundle *blamer_bundle, 00674 bool *truth_path); 00675 00676 // This function is used for non-space delimited languages when looking 00677 // for word endings recorded while trying to separate the path into words. 
00678 // 00679 // The function increments covered if a valid word ending is found in 00680 // active_dawgs (if covered is incremented, skip is set to the number 00681 // of unichars that should be skipped because they are covered by the 00682 // word whose ending was just discovered). 00683 // 00684 // dawg_score and dawg_score_done are updated if: 00685 // -- at the end of the path we discover a valid word ending from a 00686 // non-fixed length dawg (this means that the whole word is a 00687 // valid word, so dawg_score is set to 1.0f 00688 // -- word_start is true (dawg_score is set to covered / word length) 00689 // 00690 // Note: this function assumes that skip, covered, dawg_score and 00691 // dawg_score_done are not NULL. 00692 void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs, 00693 int word_index, int word_length, 00694 int *skip, int *covered, 00695 float *dawg_score, 00696 bool *dawg_score_done); 00697 00698 // Wrapper around AssociateUtils::ComputeStats(). 00699 inline void ComputeAssociateStats(int col, int row, 00700 float max_char_wh_ratio, 00701 ViterbiStateEntry *parent_vse, 00702 CHUNKS_RECORD *chunks_record, 00703 AssociateStats *associate_stats) { 00704 AssociateUtils::ComputeStats( 00705 col, row, 00706 (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL, 00707 (parent_vse != NULL) ? parent_vse->length : 0, 00708 fixed_pitch_, max_char_wh_ratio, 00709 chunks_record->word_res != NULL ? &chunks_record->word_res->denorm : NULL, 00710 chunks_record, language_model_debug_level, associate_stats); 00711 } 00712 00713 // Returns true if the path with such top_choice_flags and dawg_info 00714 // could be pruned out (i.e. is neither a system/user/frequent dictionary 00715 // nor a top choice path). 00716 // In non-space delimited languages all paths can be "somewhat" dictionary 00717 // words. In such languages we can not do dictionary-driven path prunning, 00718 // so paths with non-empty dawg_info are considered prunable. 
00719 inline bool PrunablePath(LanguageModelFlagsType top_choice_flags, 00720 const LanguageModelDawgInfo *dawg_info) { 00721 if (top_choice_flags) return false; 00722 if (dawg_info != NULL && 00723 (dawg_info->permuter == SYSTEM_DAWG_PERM || 00724 dawg_info->permuter == USER_DAWG_PERM || 00725 dawg_info->permuter == FREQ_DAWG_PERM) && 00726 dict_->GetMaxFixedLengthDawgIndex() < 0) return false; 00727 return true; 00728 } 00729 00730 // Returns true if the given ViterbiStateEntry represents an acceptable path. 00731 inline bool AcceptablePath(const ViterbiStateEntry &vse) { 00732 return (vse.dawg_info != NULL || vse.Consistent() || 00733 (vse.ngram_info != NULL && !vse.ngram_info->pruned)); 00734 } 00735 00736 public: 00737 // Parameters. 00738 INT_VAR_H(language_model_debug_level, 0, "Language model debug level"); 00739 BOOL_VAR_H(language_model_ngram_on, false, 00740 "Turn on/off the use of character ngram model"); 00741 INT_VAR_H(language_model_ngram_order, 8, 00742 "Maximum order of the character ngram model"); 00743 INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10, 00744 "Maximum number of prunable (those for which PrunablePath() is true)" 00745 "entries in each viterbi list recorded in BLOB_CHOICEs"); 00746 INT_VAR_H(language_model_viterbi_list_max_size, 500, 00747 "Maximum size of viterbi lists recorded in BLOB_CHOICEs"); 00748 double_VAR_H(language_model_ngram_small_prob, 0.000001, 00749 "To avoid overly small denominators use this as the floor" 00750 " of the probability returned by the ngram model"); 00751 double_VAR_H(language_model_ngram_nonmatch_score, -40.0, 00752 "Average classifier score of a non-matching unichar"); 00753 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false, 00754 "Use only the first UTF8 step of the given string" 00755 " when computing log probabilities"); 00756 double_VAR_H(language_model_ngram_scale_factor, 0.03, 00757 "Strength of the character ngram model relative to the" 00758 " character classifier "); 
  BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
             "Words are delimited by space");

  INT_VAR_H(language_model_min_compound_length, 3,
            "Minimum length of compound words");
  INT_VAR_H(language_model_fixed_length_choices_depth, 3,
            "Depth of blob choice lists to explore"
            " when fixed length dawgs are on");
  // Penalties used for adjusting path costs and final word rating.
  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
               "Penalty for words not in the frequent word dictionary");
  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
               "Penalty for non-dictionary words");
  double_VAR_H(language_model_penalty_punc, 0.2,
               "Penalty for inconsistent punctuation");
  double_VAR_H(language_model_penalty_case, 0.1,
               "Penalty for inconsistent case");
  double_VAR_H(language_model_penalty_script, 0.5,
               "Penalty for inconsistent script");
  double_VAR_H(language_model_penalty_chartype, 0.3,
               "Penalty for inconsistent character type");
  double_VAR_H(language_model_penalty_font, 0.00,
               "Penalty for inconsistent font");
  double_VAR_H(language_model_penalty_spacing, 0.05,
               "Penalty for inconsistent spacing");
  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
  BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
             "Use sigmoidal score for certainty");

 protected:
  // Member Variables.

  // Temporary DawgArgs struct that is re-used across different words to
  // avoid dynamic memory re-allocation (should be cleared before each use).
  DawgArgs *dawg_args_;
  // List of pointers to updated flags used by Viterbi search to mark
  // recently updated ViterbiStateEntries.
  GenericVector<bool *> updated_flags_;
  // Scaling for recovering blob outline length from rating and certainty.
  float rating_cert_scale_;

  // The following variables are set at construction time.

  // Pointer to fontinfo table (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Pointer to Dict class, that is used for querying the dictionaries
  // (the pointer is not owned by LanguageModel).
  Dict *dict_;

  // TODO(daria): the following variables should become LanguageModel params
  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
  //
  // Set to true if we are dealing with fixed pitch text
  // (set to assume_fixed_pitch_char_segment).
  bool fixed_pitch_;
  // Max char width-to-height ratio allowed
  // (set to segsearch_max_char_wh_ratio).
  float max_char_wh_ratio_;

  // The following variables are initialized with InitForWord().

  // String representation of the classification of the previous word
  // (since this is only used by the character ngram model component,
  // only the last language_model_ngram_order of the word are stored).
  STRING prev_word_str_;
  // NOTE(review): presumably the number of UTF-8 unichar steps stored in
  // prev_word_str_ — confirm against InitForWord() in the .cpp.
  int prev_word_unichar_step_len_;
  // Active dawg and constraints vector.
  DawgInfoVector *beginning_active_dawgs_;
  DawgInfoVector *beginning_constraints_;
  DawgInfoVector *fixed_length_beginning_active_dawgs_;
  DawgInfoVector *empty_dawg_info_vec_;
  // Maximum adjustment factor for character ngram choices.
  float max_penalty_adjust_;
  // Set to true if acceptable choice was discovered.
  // Note: it would be nice to use this to terminate the search once an
  // acceptable choice is found. However we do not do that and once an
  // acceptable choice is found we finish looking for alternative choices
  // in the current segmentation graph and then exit the search (no more
  // classifications are done after an acceptable choice is found).
  // This is needed in order to let the search find the words very close to
  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
  // choices. This way the stopper will know that the best choice is not
  // ambiguous (i.e. there are best choices in the best choice list that have
  // ratings close to the very best one) and will be less likely to mis-adapt.
  bool acceptable_choice_found_;
  // Set to true if a choice representing correct segmentation was explored.
  bool correct_segmentation_explored_;

};

}  // namespace tesseract

#endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_