Tesseract  3.02
tesseract-ocr/dict/dict.h
Go to the documentation of this file.
00001 
00002 // File:        dict.h
00003 // Description: dict class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_DICT_DICT_H_
00020 #define TESSERACT_DICT_DICT_H_
00021 
00022 #include "ambigs.h"
00023 #include "dawg.h"
00024 #include "host.h"
00025 #include "image.h"
00026 #include "oldlist.h"
00027 #include "ratngs.h"
00028 #include "stopper.h"
00029 #include "trie.h"
00030 #include "unicharset.h"
00031 #include "permute.h"
00032 
00033 #define MAX_WERD_LENGTH        (inT64) 128  // size of DawgArgs::rating_array
00034 #define NO_RATING               -1  // value DawgArgs() stores in every rating_array slot
      // NOTE(review): neither expansion is parenthesized. They are left as-is
      // because these macros are not #ifndef-guarded and any other definition
      // of the same name must be token-identical - confirm before changing.
00035 
00037 struct CHAR_FRAGMENT_INFO {
        // Record describing a (possibly fragmented) character. The permuter
        // entry points declared in Dict receive one as prev_char_frag_info, and
        // Dict::fragment_state_okay() takes a non-const pointer to one.
00038   UNICHAR_ID unichar_id;
00039   const CHAR_FRAGMENT *fragment;  // not owned as far as this header shows - TODO confirm
00040   int num_fragments;
00041   float rating;
00042   float certainty;
00043 };
00044 
00045 namespace tesseract {
00046 
00047 typedef GenericVector<Dawg *> DawgVector;  // type of Dict::dawgs_
00048 
00049 //
00050 // Constants
00051 //
      // Passed as sought_word_length by the two-argument
      // Dict::dawg_permute_and_select() overload.
00052 static const int kAnyWordLength = -1;
00053 static const int kRatingPad = 4;
00054 // TODO(daria): If hyphens are different in different languages and can be
00055 // inferred from training data we should load their values dynamically.
00056 static const char kHyphenSymbol[] = "-";
      // NOTE(review): "Edgees" looks like a misspelling of "Edges"; the name is
      // left unchanged because code outside this header may refer to it.
00057 static const int kMaxNumDawgEdgees = 2000000;
00058 static const int kMaxDocDawgEdges = 250000;
00059 static const int kMaxUserDawgEdges = 50000;
00060 static const float kSimCertaintyScale = -10.0;   // similarity matcher scaling
00061 static const float kSimCertaintyOffset = -10.0;  // similarity matcher offset
00062 static const float kSimilarityFloor = 100.0;  // worst E*L product to stop on
00063 static const int kDocDictMaxRepChars = 4;
00064 
00064 
00065 struct DawgArgs {
        // Argument bundle for the dawg-based letter check; Dict passes it around
        // as void* (see Dict::def_letter_is_okay / Dict::LetterIsOkay).
        //
        // The constructor stores the four DawgInfoVector pointers and the rating
        // margin as given (no copies are made), fills every rating_array slot
        // with NO_RATING, and then records the permuter, the sought word length
        // and the end character-choice index.
00066   DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud,
00067            DawgInfoVector *uc, float r, PermuterType p, int len, int e) :
00068     active_dawgs(d), constraints(c), updated_active_dawgs(ud),
00069     updated_constraints(uc), rating_margin(r) {
00070     for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
00071       rating_array[i] = NO_RATING;
00072     }
00073     permuter = p;
00074     sought_word_length = len;
00075     end_char_choice_index = e;
00076   }
00077   DawgInfoVector *active_dawgs;          // constructor argument d
00078   DawgInfoVector *constraints;           // constructor argument c
00079   DawgInfoVector *updated_active_dawgs;  // constructor argument ud
00080   DawgInfoVector *updated_constraints;   // constructor argument uc
00081   PermuterType permuter;
00082   int sought_word_length;
00083 
00084   // TODO(daria): remove these fields when permdawg is deprecated.
00085   float rating_margin;  
00086   float rating_array[MAX_WERD_LENGTH];  // every slot starts as NO_RATING
00087   int end_char_choice_index;
00088 };
00089 
00090 class Dict {
       // Dict bundles the dawgs, the hyphenation state, the stopper state and the
       // tunable parameters declared below. Declarations are grouped by the .cpp
       // file named in each section banner; members without an inline body are
       // only declared in this header.
00091  public:
00092   Dict(Image* image_ptr);
00093   ~Dict();
        // Accessors for the Image held in image_ptr_ and for the unicharset and
        // ambiguity table reached through that image's CCUtil.
00094   const Image* getImage() const {
00095     return image_ptr_;
00096   }
00097   Image* getImage() {
00098     return image_ptr_;
00099   }
00100   const UNICHARSET& getUnicharset() const {
00101     return getImage()->getCCUtil()->unicharset;
00102   }
00103   UNICHARSET& getUnicharset() {
00104     return getImage()->getCCUtil()->unicharset;
00105   }
00106   const UnicharAmbigs &getUnicharAmbigs() {
00107     return getImage()->getCCUtil()->unichar_ambigs;
00108   }
00109 
        // True if unichar_id is the unicharset id of "-" or of "/".
00110   inline bool compound_marker(UNICHAR_ID unichar_id) {
00111     return (unichar_id == getUnicharset().unichar_to_id("-") ||
00112             unichar_id == getUnicharset().unichar_to_id("/"));
00113   }
00114 
00115   /* hyphen.cpp ************************************************************/
00116 
        // True when the current word is not the last on its line, a hyphen word
        // is stored, and GetMaxFixedLengthDawgIndex() is negative.
00118   inline bool hyphenated() const { return
00119     !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
00120   }
        // Length of hyphen_word_ when hyphenated(), otherwise 0.
00122   inline int hyphen_base_size() const {
00123     return this->hyphenated() ? hyphen_word_->length() : 0;
00124   }
        // When hyphenated(), overwrites *word with *hyphen_word_ and prints it
        // if hyphen_debug_level is non-zero; otherwise leaves *word untouched.
00128   inline void copy_hyphen_info(WERD_CHOICE *word) const {
00129     if (this->hyphenated()) {
00130       *word = *hyphen_word_;
00131       if (hyphen_debug_level) word->print("copy_hyphen_info: ");
00132     }
00133   }
        // When hyphenated(), removes the first hyphen_word_->length() unichar
        // ids from *word. The debug print shows hyphen_word_, not *word.
00137   inline void remove_hyphen_head(WERD_CHOICE *word) const {
00138     if (this->hyphenated()) {
00139       word->remove_unichar_ids(0, hyphen_word_->length());
00140       if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
00141     }
00142   }
        // True if this is the last word on the line, the position is not the
        // first one, and unichar_id equals hyphen_unichar_id_.
00144   inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
00145     return (last_word_on_line_ && !first_pos &&
00146             unichar_id == hyphen_unichar_id_);
00147   }
        // Applies the test above to the last unichar of word.
00149   inline bool has_hyphen_end(const WERD_CHOICE &word) const {
00150     int word_index = word.length() - 1;
          // An empty word has no last unichar: report "no hyphen end" rather
          // than passing index -1 to unichar_id().
          if (word_index < 0) return false;
00151     return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
00152   }
00156   void reset_hyphen_vars(bool last_word_on_line);
00159   void set_hyphen_word(const WERD_CHOICE &word,
00160                        const DawgInfoVector &active_dawgs,
00161                        const DawgInfoVector &constraints);
00162 
00163   /* permdawg.cpp ************************************************************/
        // Replaces *best_choice with word when word has a strictly lower rating.
00166   inline void update_best_choice(const WERD_CHOICE &word,
00167                                  WERD_CHOICE *best_choice) {
00168     if (word.rating() < best_choice->rating()) *best_choice = word;
00169   }
00173   void init_active_dawgs(int sought_word_length,
00174                          DawgInfoVector *active_dawgs,
00175                          bool ambigs_mode) const;
00178   void init_constraints(DawgInfoVector *constraints) const;
        // True when rating_limit is zero or negative.
00180   inline bool ambigs_mode(float rating_limit) {
00181     return rating_limit <= 0.0;
00182   }
00188   WERD_CHOICE *dawg_permute_and_select(
00189       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
00190       int sought_word_length, int end_char_choice_index);
        // Convenience overload: forwards with kAnyWordLength as the sought word
        // length and 0 as end_char_choice_index.
00191   WERD_CHOICE *dawg_permute_and_select(
00192       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
00193     return dawg_permute_and_select(char_choices, rating_limit,
00194                                    kAnyWordLength, 0);
00195   }
00203   void go_deeper_dawg_fxn(
00204       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00205       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00206       bool word_ending, WERD_CHOICE *word, float certainties[],
00207       float *limit, WERD_CHOICE *best_choice, int *attempts_left,
00208       void *void_more_args);
00209 
00210   /* permute.cpp *************************************************************/
00211   WERD_CHOICE *get_top_choice_word(
00212       const BLOB_CHOICE_LIST_VECTOR &char_choices);
00213   WERD_CHOICE *permute_top_choice(
00214     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00215     float* rating_limit,
00216     WERD_CHOICE *raw_choice,
00217     BOOL8 *any_alpha);
00218   const char* choose_il1(const char *first_char,       //first choice
00219                          const char *second_char,      //second choice
00220                          const char *third_char,       //third choice
00221                          const char *prev_char,        //prev in word
00222                          const char *next_char,        //next in word
00223                          const char *next_next_char);  //after next next in word
00224   WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00225                            const WERD_CHOICE *best_choice,
00226                            WERD_CHOICE *raw_choice);
00227   void end_permute();
00228   void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00229                        float rating_limit,
00230                        int start,
00231                        int end,
00232                        WERD_CHOICE *current_word);
00233   bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00234                           WERD_CHOICE *best_choice,
00235                           WERD_CHOICE *raw_choice);
00236   WERD_CHOICE *permute_compound_words(
00237       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00238       float rating_limit);
00242   WERD_CHOICE *permute_fixed_length_words(
00243       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00244       PermuterState *permuter_state);
00246   void incorporate_segcost(WERD_CHOICE* word);
00250   WERD_CHOICE *permute_script_words(
00251       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00252       PermuterState *permuter_state);
00254   WERD_CHOICE *permute_chartype_words(
00255       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00256       PermuterState *permuter_state);
00257 
00261   char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00262                          char* pos_chartypes);
00263 
00264   WERD_CHOICE *top_fragments_permute_and_select(
00265       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00266       float rating_limit);
00271   void go_deeper_top_fragments_fxn(
00272       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00273       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00274       bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
00275       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
00276 
00278   bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
00279                            float curr_rating, float curr_certainty,
00280                            const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00281                            const char *debug, int word_ending,
00282                            CHAR_FRAGMENT_INFO *char_frag_info);
00283   void permute_choices(
00284       const char *debug,
00285       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00286       int char_choice_index,
00287       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00288       WERD_CHOICE *word,
00289       float certainties[],
00290       float *limit,
00291       WERD_CHOICE *best_choice,
00292       int *attempts_left,
00293       void *more_args);
00294 
00295   void append_choices(
00296       const char *debug,
00297       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00298       const BLOB_CHOICE &blob_choice,
00299       int char_choice_index,
00300       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00301       WERD_CHOICE *word,
00302       float certainties[],
00303       float *limit,
00304       WERD_CHOICE *best_choice,
00305       int *attempts_left,
00306       void *more_args);
        // Member-function pointer with the same parameter list as
        // go_deeper_dawg_fxn() and go_deeper_top_fragments_fxn() above.
00308   void (Dict::*go_deeper_fxn_)(const char *debug,
00309                                const BLOB_CHOICE_LIST_VECTOR &char_choices,
00310                                int char_choice_index,
00311                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00312                                bool word_ending, WERD_CHOICE *word,
00313                                float certainties[], float *limit,
00314                                WERD_CHOICE *best_choice, int *attempts_left,
00315                                void *void_more_args);
00316   /* stopper.cpp *************************************************************/
00317   bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
00318                         DANGERR *fixpt,
00319                         bool fix_replaceable,
00320                         BLOB_CHOICE_LIST_VECTOR *Choices,
00321                         bool *modified_blobs);
        // Returns (f2 - f1) * stopper_ambiguity_threshold_gain minus
        // stopper_ambiguity_threshold_offset.
00322   double StopperAmbigThreshold(double f1, double f2) {
00323     return (f2 - f1) * stopper_ambiguity_threshold_gain -
00324         stopper_ambiguity_threshold_offset;
00325   }
00326   // If the certainty of any chunk in Choice (item1) is not ambiguous with the
00327   // corresponding chunk in the best choice (item2), frees Choice and
00328   // returns true.
00329   int FreeBadChoice(void *item1,   // VIABLE_CHOICE Choice
00330                     void *item2);  // EXPANDED_CHOICE *BestChoice
00339   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00340                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00341                     BLOB_CHOICE_LIST_VECTOR *blob_choices,
00342                     bool *modified_blobs);
00343 
        // Setters and getter for the keep_word_choices_ flag.
00344   inline void DisableChoiceAccum() { keep_word_choices_ = false; }
00345   inline void EnableChoiceAccum() { keep_word_choices_ = true; }
00346   inline bool ChoiceAccumEnabled() { return keep_word_choices_; }
00347 
00349   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
00356   VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
00357                                 FLOAT32 AdjustFactor,
00358                                 const float Certainties[]);
00360   void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00363   bool StringSameAs(const WERD_CHOICE &WordChoice,
00364                     VIABLE_CHOICE ViableChoice);
00366   bool StringSameAs(const char *String,
00367                     const char *String_lengths,
00368                     VIABLE_CHOICE ViableChoice);
00376   int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
00377                          const WERD_CHOICE &BestChoice);
00379   bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
00380                         WERD_CHOICE *BestChoice,
00381                         DANGERR *fixpt,
00382                         ACCEPTABLE_CHOICE_CALLER caller,
00383                         bool *modified_blobs);
00387   bool AcceptableResult(const WERD_CHOICE &BestChoice);
00390   int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
00398   void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[],
00399                     bool raw_choice, WERD_CHOICE *WordChoice);
00400   void EndDangerousAmbigs();
00402   bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
00404   FLOAT32 CurrentBestChoiceAdjustFactor();
00406   bool CurrentWordAmbig();
00408   void DebugWordChoices();
00410   void PrintAmbigAlternatives(FILE *file, const char *label,
00411                               int label_num_unichars);
00414   void FillViableChoice(const WERD_CHOICE &WordChoice,
00415                         FLOAT32 AdjustFactor, const float Certainties[],
00416                         VIABLE_CHOICE ViableChoice);
00419   bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
00422   void FilterWordChoices();
00437   void FindClassifierErrors(FLOAT32 MinRating,
00438                             FLOAT32 MaxRating,
00439                             FLOAT32 RatingMargin,
00440                             FLOAT32 Thresholds[]);
00443   void InitChoiceAccum();
00445   void ClearBestChoiceAccum();
00449   void LogNewSegmentation(PIECES_STATE BlobWidth);
00452   void LogNewSplit(int Blob);
00455   void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
00457   void SettupStopperPass1();
00459   void SettupStopperPass2();
00460   /* context.cpp *************************************************************/
00462   int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00465   bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00466 
00467   /* dict.cpp ****************************************************************/
00468 
00471   void Load();
00472   void End();
00473 
00474   // Resets the document dictionary analogous to ResetAdaptiveClassifier.
00475   void ResetDocumentDictionary() {
00476     if (pending_words_ != NULL)
00477       pending_words_->clear();
00478     if (document_words_ != NULL)
00479       document_words_->clear();
00480   }
00481 
00482   // Create unicharset adaptations of known, short lists of UTF-8 equivalent
00483   // characters (think all hyphen-like symbols).  The first version of the
00484   // list is taken as equivalent for matching against the dictionary.
00485   void LoadEquivalenceList(const char *unichar_strings[]);
00486 
00487   // Normalize all hyphen and apostrophes to the canonicalized one for
00488   // matching; pass everything else through as is.  See LoadEquivalenceList().
00489   UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
00490 
00553   //
00554   int def_letter_is_okay(void* void_dawg_args,
00555                          UNICHAR_ID unichar_id, bool word_end) const;
00556 
00557   int (Dict::*letter_is_okay_)(void* void_dawg_args,
00558                                UNICHAR_ID unichar_id, bool word_end) const;
        // Dispatches through the letter_is_okay_ member-function pointer.
00560   int LetterIsOkay(void* void_dawg_args,
00561                    UNICHAR_ID unichar_id, bool word_end) const {
00562     return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
00563   }
00564 
00565 
00567   double (Dict::*probability_in_context_)(const char* lang,
00568                                           const char* context,
00569                                           int context_bytes,
00570                                           const char* character,
00571                                           int character_bytes);
        // Dispatches through probability_in_context_, supplying the language
        // string taken from the image's CCUtil as the first argument.
00573   double ProbabilityInContext(const char* context,
00574                               int context_bytes,
00575                               const char* character,
00576                               int character_bytes) {
00577     return (this->*probability_in_context_)(
00578         getImage()->getCCUtil()->lang.string(),
00579         context, context_bytes,
00580         character, character_bytes);
00581   }
00582 
        // Default implementation: ignores every argument and returns 0.0.
00584   double def_probability_in_context(
00585       const char* lang, const char* context, int context_bytes,
00586       const char* character, int character_bytes) {
          (void) lang;  // silenced like the other unused parameters below
00587     (void) context;
00588     (void) context_bytes;
00589     (void) character;
00590     (void) character_bytes;
00591     return 0.0;
00592   }
00593   double ngram_probability_in_context(const char* lang,
00594                                       const char* context,
00595                                       int context_bytes,
00596                                       const char* character,
00597                                       int character_bytes);
00598 
        // Read-only accessors for the dawg members.
00600   inline const int NumDawgs() const { return dawgs_.size(); }
00602   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
00604   inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
00606   inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
        // Returns NULL when word_length exceeds max_fixed_length_dawgs_wdlen_;
        // otherwise returns dawgs_[word_length], asserting that the index is
        // below dawgs_.size().
00608   inline const Dawg *GetFixedLengthDawg(int word_length) const {
00609     if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
00610     assert(dawgs_.size() > word_length);
00611     return dawgs_[word_length];
00612   }
00613   inline const int GetMaxFixedLengthDawgIndex() const {
00614     return max_fixed_length_dawgs_wdlen_;
00615   }
        // Maps an edge reference to the node to continue from: NO_EDGE gives
        // node 0; otherwise dawg->next_node(edge_ref), with a result of 0
        // reported as NO_EDGE.
00617   static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
00618     if (edge_ref == NO_EDGE) return 0;  // beginning to explore the dawg
00619     NODE_REF node = dawg->next_node(edge_ref);
00620     if (node == 0) node = NO_EDGE;  // end of word
00621     return node;
00622   }
        // Returns true unless word_end is set for a non-punctuation dawg and
        // some constraint's edge is not an end of word in its own dawg; the
        // first failing constraint is logged when dawg_debug_level >= 3.
00628   inline bool ConstraintsOk(const DawgInfoVector &constraints,
00629                             int word_end, DawgType current_dawg_type) const {
00630     if (!word_end) return true;
00631     if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
00632     for (int c = 0; c < constraints.length(); ++c) {
00633       const DawgInfo &cinfo = constraints[c];
00634       Dawg *cdawg = dawgs_[cinfo.dawg_index];
00635       if (!cdawg->end_of_word(cinfo.ref)) {
00636         if (dawg_debug_level >= 3) {
00637           tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
00638                   cinfo.dawg_index, cinfo.ref);
00639         }
00640         return false;
00641       }
00642     }
00643     return true;
00644   }
00645 
00651   void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
00652                            UNICHAR_ID unichar_id, bool word_end,
00653                            DawgArgs *dawg_args,
00654                            PermuterType *current_permuter) const;
00655 
00659 
00665   static void ReadFixedLengthDawgs(DawgType type, const STRING &lang,
00666                                    PermuterType perm, int debug_level,
00667                                    FILE *file, DawgVector *dawg_vec,
00668                                    int *max_wdlen);
00671   static void WriteFixedLengthDawgs(
00672       const GenericVector<SquishedDawg *> &dawg_vec,
00673       int num_dawgs, int debug_level, FILE *output_file);
00674 
        // True for the system, frequent, document, user and user-pattern
        // permuters, and for NUMBER_PERM only when numbers_ok is set.
00676   inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
00677     return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
00678             perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
00679             perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
00680   }
00681   int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
00682   int valid_word(const WERD_CHOICE &word) const {
00683     return valid_word(word, false);  // return NO_PERM for words with digits
00684   }
00685   int valid_word_or_number(const WERD_CHOICE &word) const {
00686     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
00687   }
        // Builds a WERD_CHOICE from string using this Dict's unicharset and
        // checks it with numbers_ok == false.
00689   int valid_word(const char *string) const {
00690     WERD_CHOICE word(string, getUnicharset());
00691     return valid_word(word);
00692   }
00693   // Do the two WERD_CHOICEs form a meaningful bigram?
00694   bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
00699   bool valid_punctuation(const WERD_CHOICE &word);
00701   int good_choice(const WERD_CHOICE &choice);
00703   void add_document_word(const WERD_CHOICE &best_choice);
00704   int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00705                           const UNICHARSET &unicharset);
00707   void adjust_word(WERD_CHOICE *word, float *certainty_array,
00708                    const BLOB_CHOICE_LIST_VECTOR *char_choices,
00709                    bool nonword, float additional_adjust, bool debug);
        // Shorthands for the full adjust_word(): no char_choices, no additional
        // adjustment, with nonword false and true respectively.
00710   void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00711     adjust_word(word, certainty_array, NULL, false, 0.0f, debug);
00712   }
00713   void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00714     adjust_word(word, certainty_array, NULL, true, 0.0f, debug);
00715   }
00717   inline void SetWordsegRatingAdjustFactor(float f) {
00718     wordseg_rating_adjust_factor_ = f;
00719   }
00720   // Accessor for best_choices_.
00721   const LIST &getBestChoices() { return best_choices_; }
00722 
00723  private:
00725   Image* image_ptr_;
00732   UnicharAmbigs *dang_ambigs_table_;
00734   UnicharAmbigs *replace_ambigs_table_;
00739   bool keep_word_choices_;
00741   FLOAT32 reject_offset_;
00743   PIECES_STATE current_segmentation_;
00745   VIABLE_CHOICE best_raw_choice_;
00746   LIST raw_choices_;
00747   LIST best_choices_;
00748   // Hyphen-related variables.
00749   UNICHAR_ID hyphen_unichar_id_;
00750   WERD_CHOICE *hyphen_word_;
00751   DawgInfoVector hyphen_active_dawgs_;
00752   DawgInfoVector hyphen_constraints_;
00753   bool last_word_on_line_;
00754   // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
00755   // matching.  The first member of each list is taken as canonical.  For
00756   // example, the first list contains hyphens and dashes with the first symbol
00757   // being the ASCII hyphen minus.
00758   GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
00759   // Dawgs.
00760   DawgVector dawgs_;
00761   SuccessorListsVector successors_;
00762   Trie *pending_words_;
00763   // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
00764   // any of them are present on the best choices list for a word pair.
00765   // the bigrams are stored as space-separated words where:
00766   // (1) leading and trailing punctuation has been removed from each word and
00767   // (2) any digits have been replaced with '?' marks.
00768   Dawg *bigram_dawg_;
00771   // TODO(daria): need to support multiple languages in the future,
00772   // so maybe will need to maintain a list of dawgs of each kind.
00773   Dawg *freq_dawg_;
00774   Dawg *unambig_dawg_;
00775   Dawg *punc_dawg_;
00776   Trie *document_words_;
00779   int max_fixed_length_dawgs_wdlen_;
00782   float wordseg_rating_adjust_factor_;
00783   // File for recording ambiguities discovered during dictionary search.
00784   FILE *output_ambig_words_file_;
00785 
00786  public:
        // Tunable parameters. Each *_VAR_H line appears to give a name, a
        // default value and a description - TODO confirm against the macro
        // definitions, which are not part of this header.
00790   STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
00791   STRING_VAR_H(user_patterns_suffix, "",
00792                "A list of user-provided patterns.");
00793   BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
00794   BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
00795   BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
00796   BOOL_VAR_H(load_punc_dawg, true,
00797              "Load dawg with punctuation patterns.");
00798   BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
00799   BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
00800              " dawgs (e.g. for non-space delimited languages)");
00801   BOOL_VAR_H(load_bigram_dawg, false,
00802              "Load dawg with special word bigrams.");
00803   double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
00804                "Score multiplier for word matches which have good case and "
00805                "are frequent in the given language (lower is better).");
00806 
00807   double_VAR_H(segment_penalty_dict_case_ok, 1.1,
00808                "Score multiplier for word matches that have good case "
00809                "(lower is better).");
00810 
00811   double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
00812                "Default score multiplier for word matches, which may have "
00813                "case issues (lower is better).");
00814 
00815   // TODO(daria): remove this param when ngram permuter is deprecated.
00816   double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
00817                "Multipler to for the best choice from the ngram model.");
00818 
00819   double_VAR_H(segment_penalty_dict_nonword, 1.25,
00820                "Score multiplier for glyph fragment segmentations which "
00821                "do not match a dictionary word (lower is better).");
00822 
00823   double_VAR_H(segment_penalty_garbage, 1.50,
00824                "Score multiplier for poorly cased strings that are not in"
00825                " the dictionary and generally look like garbage (lower is"
00826                " better).");
00827   STRING_VAR_H(output_ambig_words_file, "",
00828                "Output file for ambiguities found in the dictionary");
00829   INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
00830             ", to 2 for more details, to 3 to see all the debug messages");
00831   INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
00832   INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
00833   BOOL_VAR_H(use_only_first_uft8_step, false,
00834              "Use only the first UTF8 step of the given string"
00835              " when computing log probabilities.");
00836   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
00837   double_VAR_H(stopper_nondict_certainty_base, -2.50,
00838                "Certainty threshold for non-dict words");
00839   double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
00840                "Reject certainty offset");
00841   INT_VAR_H(stopper_smallword_size, 2,
00842             "Size of dict word to be treated as non-dict word");
00843   double_VAR_H(stopper_certainty_per_char, -0.50,
00844                "Certainty to add for each dict char above small word size.");
00845   double_VAR_H(stopper_allowable_character_badness, 3.0,
00846                "Max certaintly variation allowed in a word (in sigma)");
00847   INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
00848   BOOL_VAR_H(stopper_no_acceptable_choices, false,
00849              "Make AcceptableChoice() always return false. Useful"
00850              " when there is a need to explore all segmentations");
00851   double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
00852                "Gain factor for ambiguity threshold.");
00853   double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
00854                "Certainty offset for ambiguity threshold.");
00855   BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
00856   INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
00857   STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
00858                " should be printed to stdout");
00859   STRING_VAR_H(word_to_debug_lengths, "",
00860                "Lengths of unichars in word_to_debug");
00861   INT_VAR_H(fragments_debug, 0, "Debug character fragments");
00862   INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
00863   BOOL_VAR_H(permute_debug, 0, "Debug char permutation process");
00864   double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
00865                " current best rate to prune other hypotheses");
00866   BOOL_VAR_H(permute_script_word, 0,
00867              "Turn on word script consistency permuter");
00868   BOOL_VAR_H(segment_segcost_rating, 0,
00869              "incorporate segmentation cost in word rating?");
00870   BOOL_VAR_H(segment_nonalphabetic_script, false,
00871              "Don't use any alphabetic-specific tricks. "
00872              "Set to true in the traineddata config file for"
00873              " scripts that are cursive or inherently fixed-pitch");
00874   double_VAR_H(segment_reward_script, 0.95,
00875                "Score multipler for script consistency within a word. "
00876                "Being a 'reward' factor, it should be <= 1. "
00877                "Smaller value implies bigger reward.");
00878   BOOL_VAR_H(permute_fixed_length_dawg, 0,
00879              "Turn on fixed-length phrasebook search permuter");
00880   BOOL_VAR_H(permute_chartype_word, 0,
00881              "Turn on character type (property) consistency permuter");
00882   double_VAR_H(segment_reward_chartype, 0.97,
00883                "Score multipler for char type consistency within a word. ");
00884   // TODO(daria): remove this param when ngram permuter is deprecated.
00885   double_VAR_H(segment_reward_ngram_best_choice, 0.99,
00886                "Score multipler for ngram permuter's best choice"
00887                " (only used in the Han script path).");
00888   BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
00889   BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary ");
00890   double_VAR_H(doc_dict_pending_threshold, 0.0,
00891                "Worst certainty for using pending dictionary");
00892   double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
00893                " for words that can be inserted into the document dictionary");
00894   BOOL_VAR_H(ngram_permuter_activated, false,
00895              "Activate character-level n-gram-based permuter");
00896   INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
00897             " character choices to consider during permutation."
00898             " This limit is especially useful when user patterns"
00899             " are specified, since overly generic patterns can result in"
00900             " dawg search exploring an overly large number of options.");
00901   BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
00902 };
00903 }  // namespace tesseract
00904 
00905 #endif  // TESSERACT_DICT_DICT_H_