Tesseract
3.02
|
00001 00002 // File: dict.h 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_DICT_DICT_H_ 00020 #define TESSERACT_DICT_DICT_H_ 00021 00022 #include "ambigs.h" 00023 #include "dawg.h" 00024 #include "host.h" 00025 #include "image.h" 00026 #include "oldlist.h" 00027 #include "ratngs.h" 00028 #include "stopper.h" 00029 #include "trie.h" 00030 #include "unicharset.h" 00031 #include "permute.h" 00032 00033 #define MAX_WERD_LENGTH (inT64) 128 00034 #define NO_RATING -1 00035 00037 struct CHAR_FRAGMENT_INFO { 00038 UNICHAR_ID unichar_id; 00039 const CHAR_FRAGMENT *fragment; 00040 int num_fragments; 00041 float rating; 00042 float certainty; 00043 }; 00044 00045 namespace tesseract { 00046 00047 typedef GenericVector<Dawg *> DawgVector; 00048 00049 // 00050 // Constants 00051 // 00052 static const int kAnyWordLength = -1; 00053 static const int kRatingPad = 4; 00054 // TODO(daria): If hyphens are different in different languages and can be 00055 // inferred from training data we should load their values dynamically. 00056 static const char kHyphenSymbol[] = "-"; 00057 static const int kMaxNumDawgEdgees = 2000000; 00058 static const int kMaxDocDawgEdges = 250000; 00059 static const int kMaxUserDawgEdges = 50000; 00060 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling 00061 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset 00062 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on 00063 static const int kDocDictMaxRepChars = 4; 00064 00065 struct DawgArgs { 00066 DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud, 00067 DawgInfoVector *uc, float r, PermuterType p, int len, int e) : 00068 active_dawgs(d), constraints(c), updated_active_dawgs(ud), 00069 updated_constraints(uc), rating_margin(r) { 00070 for (int i = 0; i < MAX_WERD_LENGTH; ++i) { 00071 rating_array[i] = NO_RATING; 00072 } 00073 permuter = p; 00074 sought_word_length = len; 00075 end_char_choice_index = e; 00076 } 00077 DawgInfoVector *active_dawgs; 00078 DawgInfoVector *constraints; 00079 DawgInfoVector *updated_active_dawgs; 00080 DawgInfoVector *updated_constraints; 00081 PermuterType permuter; 00082 int sought_word_length; 00083 00084 // TODO(daria): remove these fields when permdawg is deprecated. 00085 float rating_margin; 00086 float rating_array[MAX_WERD_LENGTH]; 00087 int end_char_choice_index; 00088 }; 00089 00090 class Dict { 00091 public: 00092 Dict(Image* image_ptr); 00093 ~Dict(); 00094 const Image* getImage() const { 00095 return image_ptr_; 00096 } 00097 Image* getImage() { 00098 return image_ptr_; 00099 } 00100 const UNICHARSET& getUnicharset() const { 00101 return getImage()->getCCUtil()->unicharset; 00102 } 00103 UNICHARSET& getUnicharset() { 00104 return getImage()->getCCUtil()->unicharset; 00105 } 00106 const UnicharAmbigs &getUnicharAmbigs() { 00107 return getImage()->getCCUtil()->unichar_ambigs; 00108 } 00109 00110 inline bool compound_marker(UNICHAR_ID unichar_id) { 00111 return (unichar_id == getUnicharset().unichar_to_id("-") || 00112 unichar_id == getUnicharset().unichar_to_id("/")); 00113 } 00114 00115 /* hyphen.cpp ************************************************************/ 00116 00118 inline bool hyphenated() const { return 00119 !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0; 00120 } 00122 inline int hyphen_base_size() const { 00123 return this->hyphenated() ? hyphen_word_->length() : 0; 00124 } 00128 inline void copy_hyphen_info(WERD_CHOICE *word) const { 00129 if (this->hyphenated()) { 00130 *word = *hyphen_word_; 00131 if (hyphen_debug_level) word->print("copy_hyphen_info: "); 00132 } 00133 } 00137 inline void remove_hyphen_head(WERD_CHOICE *word) const { 00138 if (this->hyphenated()) { 00139 word->remove_unichar_ids(0, hyphen_word_->length()); 00140 if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); 00141 } 00142 } 00144 inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const { 00145 return (last_word_on_line_ && !first_pos && 00146 unichar_id == hyphen_unichar_id_); 00147 } 00149 inline bool has_hyphen_end(const WERD_CHOICE &word) const { 00150 int word_index = word.length() - 1; 00151 return has_hyphen_end(word.unichar_id(word_index), word_index == 0); 00152 } 00156 void reset_hyphen_vars(bool last_word_on_line); 00159 void set_hyphen_word(const WERD_CHOICE &word, 00160 const DawgInfoVector &active_dawgs, 00161 const DawgInfoVector &constraints); 00162 00163 /* permdawg.cpp ************************************************************/ 00166 inline void update_best_choice(const WERD_CHOICE &word, 00167 WERD_CHOICE *best_choice) { 00168 if (word.rating() < best_choice->rating()) *best_choice = word; 00169 } 00173 void init_active_dawgs(int sought_word_length, 00174 DawgInfoVector *active_dawgs, 00175 bool ambigs_mode) const; 00178 void init_constraints(DawgInfoVector *constraints) const; 00180 inline bool ambigs_mode(float rating_limit) { 00181 return rating_limit <= 0.0; 00182 } 00188 WERD_CHOICE *dawg_permute_and_select( 00189 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, 00190 int sought_word_length, int end_char_choice_index); 00191 WERD_CHOICE *dawg_permute_and_select( 00192 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) { 00193 return dawg_permute_and_select(char_choices, rating_limit, 00194 kAnyWordLength, 0); 00195 } 00203 void go_deeper_dawg_fxn( 00204 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00205 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00206 bool word_ending, WERD_CHOICE *word, float certainties[], 00207 float *limit, WERD_CHOICE *best_choice, int *attempts_left, 00208 void *void_more_args); 00209 00210 /* permute.cpp *************************************************************/ 00211 WERD_CHOICE *get_top_choice_word( 00212 const BLOB_CHOICE_LIST_VECTOR &char_choices); 00213 WERD_CHOICE *permute_top_choice( 00214 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00215 float* rating_limit, 00216 WERD_CHOICE *raw_choice, 00217 BOOL8 *any_alpha); 00218 const char* choose_il1(const char *first_char, //first choice 00219 const char *second_char, //second choice 00220 const char *third_char, //third choice 00221 const char *prev_char, //prev in word 00222 const char *next_char, //next in word 00223 const char *next_next_char); //after next next in word 00224 WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00225 const WERD_CHOICE *best_choice, 00226 WERD_CHOICE *raw_choice); 00227 void end_permute(); 00228 void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00229 float rating_limit, 00230 int start, 00231 int end, 00232 WERD_CHOICE *current_word); 00233 bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00234 WERD_CHOICE *best_choice, 00235 WERD_CHOICE *raw_choice); 00236 WERD_CHOICE *permute_compound_words( 00237 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00238 float rating_limit); 00242 WERD_CHOICE *permute_fixed_length_words( 00243 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00244 PermuterState *permuter_state); 00246 void incorporate_segcost(WERD_CHOICE* word); 00250 WERD_CHOICE *permute_script_words( 00251 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00252 PermuterState *permuter_state); 00254 WERD_CHOICE *permute_chartype_words( 00255 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00256 PermuterState *permuter_state); 00257 00261 char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00262 char* pos_chartypes); 00263 00264 WERD_CHOICE *top_fragments_permute_and_select( 00265 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00266 float rating_limit); 00271 void go_deeper_top_fragments_fxn( 00272 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00273 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00274 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, 00275 WERD_CHOICE *best_choice, int *attempts_left, void *more_args); 00276 00278 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, 00279 float curr_rating, float curr_certainty, 00280 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00281 const char *debug, int word_ending, 00282 CHAR_FRAGMENT_INFO *char_frag_info); 00283 void permute_choices( 00284 const char *debug, 00285 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00286 int char_choice_index, 00287 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00288 WERD_CHOICE *word, 00289 float certainties[], 00290 float *limit, 00291 WERD_CHOICE *best_choice, 00292 int *attempts_left, 00293 void *more_args); 00294 00295 void append_choices( 00296 const char *debug, 00297 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00298 const BLOB_CHOICE &blob_choice, 00299 int char_choice_index, 00300 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00301 WERD_CHOICE *word, 00302 float certainties[], 00303 float *limit, 00304 WERD_CHOICE *best_choice, 00305 int *attempts_left, 00306 void *more_args); 00308 void (Dict::*go_deeper_fxn_)(const char *debug, 00309 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00310 int char_choice_index, 00311 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00312 bool word_ending, WERD_CHOICE *word, 00313 float certainties[], float *limit, 00314 WERD_CHOICE *best_choice, int *attempts_left, 00315 void *void_more_args); 00316 /* stopper.cpp *************************************************************/ 00317 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, 00318 DANGERR *fixpt, 00319 bool fix_replaceable, 00320 BLOB_CHOICE_LIST_VECTOR *Choices, 00321 bool *modified_blobs); 00322 double StopperAmbigThreshold(double f1, double f2) { 00323 return (f2 - f1) * stopper_ambiguity_threshold_gain - 00324 stopper_ambiguity_threshold_offset; 00325 } 00326 // If the certainty of any chunk in Choice (item1) is not ambiguous with the 00327 // corresponding chunk in the best choice (item2), frees Choice and 00328 // returns true. 00329 int FreeBadChoice(void *item1, // VIABLE_CHOICE Choice 00330 void *item2); // EXPANDED_CHOICE *BestChoice 00339 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, 00340 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, 00341 BLOB_CHOICE_LIST_VECTOR *blob_choices, 00342 bool *modified_blobs); 00343 00344 inline void DisableChoiceAccum() { keep_word_choices_ = false; } 00345 inline void EnableChoiceAccum() { keep_word_choices_ = true; } 00346 inline bool ChoiceAccumEnabled() { return keep_word_choices_; } 00347 00349 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); 00356 VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, 00357 FLOAT32 AdjustFactor, 00358 const float Certainties[]); 00360 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice); 00363 bool StringSameAs(const WERD_CHOICE &WordChoice, 00364 VIABLE_CHOICE ViableChoice); 00366 bool StringSameAs(const char *String, 00367 const char *String_lengths, 00368 VIABLE_CHOICE ViableChoice); 00376 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, 00377 const WERD_CHOICE &BestChoice); 00379 bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, 00380 WERD_CHOICE *BestChoice, 00381 DANGERR *fixpt, 00382 ACCEPTABLE_CHOICE_CALLER caller, 00383 bool *modified_blobs); 00387 bool AcceptableResult(const WERD_CHOICE &BestChoice); 00390 int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); 00398 void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], 00399 bool raw_choice, WERD_CHOICE *WordChoice); 00400 void EndDangerousAmbigs(); 00402 bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice); 00404 FLOAT32 CurrentBestChoiceAdjustFactor(); 00406 bool CurrentWordAmbig(); 00408 void DebugWordChoices(); 00410 void PrintAmbigAlternatives(FILE *file, const char *label, 00411 int label_num_unichars); 00414 void FillViableChoice(const WERD_CHOICE &WordChoice, 00415 FLOAT32 AdjustFactor, const float Certainties[], 00416 VIABLE_CHOICE ViableChoice); 00419 bool AlternativeChoicesWorseThan(FLOAT32 Threshold); 00422 void FilterWordChoices(); 00437 void FindClassifierErrors(FLOAT32 MinRating, 00438 FLOAT32 MaxRating, 00439 FLOAT32 RatingMargin, 00440 FLOAT32 Thresholds[]); 00443 void InitChoiceAccum(); 00445 void ClearBestChoiceAccum(); 00449 void LogNewSegmentation(PIECES_STATE BlobWidth); 00452 void LogNewSplit(int Blob); 00455 void AddNewChunk(VIABLE_CHOICE Choice, int Blob); 00457 void SettupStopperPass1(); 00459 void SettupStopperPass2(); 00460 /* context.cpp *************************************************************/ 00462 int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00465 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00466 00467 /* dict.cpp ****************************************************************/ 00468 00471 void Load(); 00472 void End(); 00473 00474 // Resets the document dictionary analogous to ResetAdaptiveClassifier. 00475 void ResetDocumentDictionary() { 00476 if (pending_words_ != NULL) 00477 pending_words_->clear(); 00478 if (document_words_ != NULL) 00479 document_words_->clear(); 00480 } 00481 00482 // Create unicharset adaptations of known, short lists of UTF-8 equivalent 00483 // characters (think all hyphen-like symbols). The first version of the 00484 // list is taken as equivalent for matching against the dictionary. 00485 void LoadEquivalenceList(const char *unichar_strings[]); 00486 00487 // Normalize all hyphen and apostrophes to the canonicalized one for 00488 // matching; pass everything else through as is. See LoadEquivalenceList(). 00489 UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const; 00490 00553 // 00554 int def_letter_is_okay(void* void_dawg_args, 00555 UNICHAR_ID unichar_id, bool word_end) const; 00556 00557 int (Dict::*letter_is_okay_)(void* void_dawg_args, 00558 UNICHAR_ID unichar_id, bool word_end) const; 00560 int LetterIsOkay(void* void_dawg_args, 00561 UNICHAR_ID unichar_id, bool word_end) const { 00562 return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); 00563 } 00564 00565 00567 double (Dict::*probability_in_context_)(const char* lang, 00568 const char* context, 00569 int context_bytes, 00570 const char* character, 00571 int character_bytes); 00573 double ProbabilityInContext(const char* context, 00574 int context_bytes, 00575 const char* character, 00576 int character_bytes) { 00577 return (this->*probability_in_context_)( 00578 getImage()->getCCUtil()->lang.string(), 00579 context, context_bytes, 00580 character, character_bytes); 00581 } 00582 00584 double def_probability_in_context( 00585 const char* lang, const char* context, int context_bytes, 00586 const char* character, int character_bytes) { 00587 (void) context; 00588 (void) context_bytes; 00589 (void) character; 00590 (void) character_bytes; 00591 return 0.0; 00592 } 00593 double ngram_probability_in_context(const char* lang, 00594 const char* context, 00595 int context_bytes, 00596 const char* character, 00597 int character_bytes); 00598 00600 inline const int NumDawgs() const { return dawgs_.size(); } 00602 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } 00604 inline const Dawg *GetPuncDawg() const { return punc_dawg_; } 00606 inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; } 00608 inline const Dawg *GetFixedLengthDawg(int word_length) const { 00609 if (word_length > max_fixed_length_dawgs_wdlen_) return NULL; 00610 assert(dawgs_.size() > word_length); 00611 return dawgs_[word_length]; 00612 } 00613 inline const int GetMaxFixedLengthDawgIndex() const { 00614 return max_fixed_length_dawgs_wdlen_; 00615 } 00617 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { 00618 if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg 00619 NODE_REF node = dawg->next_node(edge_ref); 00620 if (node == 0) node = NO_EDGE; // end of word 00621 return node; 00622 } 00628 inline bool ConstraintsOk(const DawgInfoVector &constraints, 00629 int word_end, DawgType current_dawg_type) const { 00630 if (!word_end) return true; 00631 if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; 00632 for (int c = 0; c < constraints.length(); ++c) { 00633 const DawgInfo &cinfo = constraints[c]; 00634 Dawg *cdawg = dawgs_[cinfo.dawg_index]; 00635 if (!cdawg->end_of_word(cinfo.ref)) { 00636 if (dawg_debug_level >= 3) { 00637 tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n", 00638 cinfo.dawg_index, cinfo.ref); 00639 } 00640 return false; 00641 } 00642 } 00643 return true; 00644 } 00645 00651 void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, 00652 UNICHAR_ID unichar_id, bool word_end, 00653 DawgArgs *dawg_args, 00654 PermuterType *current_permuter) const; 00655 00659 00665 static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, 00666 PermuterType perm, int debug_level, 00667 FILE *file, DawgVector *dawg_vec, 00668 int *max_wdlen); 00671 static void WriteFixedLengthDawgs( 00672 const GenericVector<SquishedDawg *> &dawg_vec, 00673 int num_dawgs, int debug_level, FILE *output_file); 00674 00676 inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) { 00677 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || 00678 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || 00679 perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM)); 00680 } 00681 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const; 00682 int valid_word(const WERD_CHOICE &word) const { 00683 return valid_word(word, false); // return NO_PERM for words with digits 00684 } 00685 int valid_word_or_number(const WERD_CHOICE &word) const { 00686 return valid_word(word, true); // return NUMBER_PERM for valid numbers 00687 } 00689 int valid_word(const char *string) const { 00690 WERD_CHOICE word(string, getUnicharset()); 00691 return valid_word(word); 00692 } 00693 // Do the two WERD_CHOICEs form a meaningful bigram? 00694 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const; 00699 bool valid_punctuation(const WERD_CHOICE &word); 00701 int good_choice(const WERD_CHOICE &choice); 00703 void add_document_word(const WERD_CHOICE &best_choice); 00704 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00705 const UNICHARSET &unicharset); 00707 void adjust_word(WERD_CHOICE *word, float *certainty_array, 00708 const BLOB_CHOICE_LIST_VECTOR *char_choices, 00709 bool nonword, float additional_adjust, bool debug); 00710 void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) { 00711 adjust_word(word, certainty_array, NULL, false, 0.0f, debug); 00712 } 00713 void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) { 00714 adjust_word(word, certainty_array, NULL, true, 0.0f, debug); 00715 } 00717 inline void SetWordsegRatingAdjustFactor(float f) { 00718 wordseg_rating_adjust_factor_ = f; 00719 } 00720 // Accessor for best_choices_. 00721 const LIST &getBestChoices() { return best_choices_; } 00722 00723 private: 00725 Image* image_ptr_; 00732 UnicharAmbigs *dang_ambigs_table_; 00734 UnicharAmbigs *replace_ambigs_table_; 00739 bool keep_word_choices_; 00741 FLOAT32 reject_offset_; 00743 PIECES_STATE current_segmentation_; 00745 VIABLE_CHOICE best_raw_choice_; 00746 LIST raw_choices_; 00747 LIST best_choices_; 00748 // Hyphen-related variables. 00749 UNICHAR_ID hyphen_unichar_id_; 00750 WERD_CHOICE *hyphen_word_; 00751 DawgInfoVector hyphen_active_dawgs_; 00752 DawgInfoVector hyphen_constraints_; 00753 bool last_word_on_line_; 00754 // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary 00755 // matching. The first member of each list is taken as canonical. For 00756 // example, the first list contains hyphens and dashes with the first symbol 00757 // being the ASCII hyphen minus. 00758 GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_; 00759 // Dawgs. 00760 DawgVector dawgs_; 00761 SuccessorListsVector successors_; 00762 Trie *pending_words_; 00763 // bigram_dawg_ points to a dawg of two-word bigrams which always supercede if 00764 // any of them are present on the best choices list for a word pair. 00765 // the bigrams are stored as space-separated words where: 00766 // (1) leading and trailing punctuation has been removed from each word and 00767 // (2) any digits have been replaced with '?' marks. 00768 Dawg *bigram_dawg_; 00771 // TODO(daria): need to support multiple languages in the future, 00772 // so maybe will need to maintain a list of dawgs of each kind. 00773 Dawg *freq_dawg_; 00774 Dawg *unambig_dawg_; 00775 Dawg *punc_dawg_; 00776 Trie *document_words_; 00779 int max_fixed_length_dawgs_wdlen_; 00782 float wordseg_rating_adjust_factor_; 00783 // File for recording ambiguities discovered during dictionary search. 00784 FILE *output_ambig_words_file_; 00785 00786 public: 00790 STRING_VAR_H(user_words_suffix, "", "A list of user-provided words."); 00791 STRING_VAR_H(user_patterns_suffix, "", 00792 "A list of user-provided patterns."); 00793 BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); 00794 BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); 00795 BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg."); 00796 BOOL_VAR_H(load_punc_dawg, true, 00797 "Load dawg with punctuation patterns."); 00798 BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); 00799 BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" 00800 " dawgs (e.g. for non-space delimited languages)"); 00801 BOOL_VAR_H(load_bigram_dawg, false, 00802 "Load dawg with special word bigrams."); 00803 double_VAR_H(segment_penalty_dict_frequent_word, 1.0, 00804 "Score multiplier for word matches which have good case and" 00805 "are frequent in the given language (lower is better)."); 00806 00807 double_VAR_H(segment_penalty_dict_case_ok, 1.1, 00808 "Score multiplier for word matches that have good case " 00809 "(lower is better)."); 00810 00811 double_VAR_H(segment_penalty_dict_case_bad, 1.3125, 00812 "Default score multiplier for word matches, which may have " 00813 "case issues (lower is better)."); 00814 00815 // TODO(daria): remove this param when ngram permuter is deprecated. 00816 double_VAR_H(segment_penalty_ngram_best_choice, 1.24, 00817 "Multipler to for the best choice from the ngram model."); 00818 00819 double_VAR_H(segment_penalty_dict_nonword, 1.25, 00820 "Score multiplier for glyph fragment segmentations which " 00821 "do not match a dictionary word (lower is better)."); 00822 00823 double_VAR_H(segment_penalty_garbage, 1.50, 00824 "Score multiplier for poorly cased strings that are not in" 00825 " the dictionary and generally look like garbage (lower is" 00826 " better)."); 00827 STRING_VAR_H(output_ambig_words_file, "", 00828 "Output file for ambiguities found in the dictionary"); 00829 INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" 00830 ", to 2 for more details, to 3 to see all the debug messages"); 00831 INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); 00832 INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list."); 00833 BOOL_VAR_H(use_only_first_uft8_step, false, 00834 "Use only the first UTF8 step of the given string" 00835 " when computing log probabilities."); 00836 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); 00837 double_VAR_H(stopper_nondict_certainty_base, -2.50, 00838 "Certainty threshold for non-dict words"); 00839 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0, 00840 "Reject certainty offset"); 00841 INT_VAR_H(stopper_smallword_size, 2, 00842 "Size of dict word to be treated as non-dict word"); 00843 double_VAR_H(stopper_certainty_per_char, -0.50, 00844 "Certainty to add for each dict char above small word size."); 00845 double_VAR_H(stopper_allowable_character_badness, 3.0, 00846 "Max certaintly variation allowed in a word (in sigma)"); 00847 INT_VAR_H(stopper_debug_level, 0, "Stopper debug level"); 00848 BOOL_VAR_H(stopper_no_acceptable_choices, false, 00849 "Make AcceptableChoice() always return false. Useful" 00850 " when there is a need to explore all segmentations"); 00851 double_VAR_H(stopper_ambiguity_threshold_gain, 8.0, 00852 "Gain factor for ambiguity threshold."); 00853 double_VAR_H(stopper_ambiguity_threshold_offset, 1.5, 00854 "Certainty offset for ambiguity threshold."); 00855 BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices"); 00856 INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); 00857 STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" 00858 " should be printed to stdout"); 00859 STRING_VAR_H(word_to_debug_lengths, "", 00860 "Lengths of unichars in word_to_debug"); 00861 INT_VAR_H(fragments_debug, 0, "Debug character fragments"); 00862 INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); 00863 BOOL_VAR_H(permute_debug, 0, "Debug char permutation process"); 00864 double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of" 00865 " current best rate to prune other hypotheses"); 00866 BOOL_VAR_H(permute_script_word, 0, 00867 "Turn on word script consistency permuter"); 00868 BOOL_VAR_H(segment_segcost_rating, 0, 00869 "incorporate segmentation cost in word rating?"); 00870 BOOL_VAR_H(segment_nonalphabetic_script, false, 00871 "Don't use any alphabetic-specific tricks." 00872 "Set to true in the traineddata config file for" 00873 " scripts that are cursive or inherently fixed-pitch"); 00874 double_VAR_H(segment_reward_script, 0.95, 00875 "Score multipler for script consistency within a word. " 00876 "Being a 'reward' factor, it should be <= 1. " 00877 "Smaller value implies bigger reward."); 00878 BOOL_VAR_H(permute_fixed_length_dawg, 0, 00879 "Turn on fixed-length phrasebook search permuter"); 00880 BOOL_VAR_H(permute_chartype_word, 0, 00881 "Turn on character type (property) consistency permuter"); 00882 double_VAR_H(segment_reward_chartype, 0.97, 00883 "Score multipler for char type consistency within a word. "); 00884 // TODO(daria): remove this param when ngram permuter is deprecated. 00885 double_VAR_H(segment_reward_ngram_best_choice, 0.99, 00886 "Score multipler for ngram permuter's best choice" 00887 " (only used in the Han script path)."); 00888 BOOL_VAR_H(save_doc_words, 0, "Save Document Words"); 00889 BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary "); 00890 double_VAR_H(doc_dict_pending_threshold, 0.0, 00891 "Worst certainty for using pending dictionary"); 00892 double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty" 00893 " for words that can be inserted into the document dictionary"); 00894 BOOL_VAR_H(ngram_permuter_activated, false, 00895 "Activate character-level n-gram-based permuter"); 00896 INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different" 00897 " character choices to consider during permutation." 00898 " This limit is especially useful when user patterns" 00899 " are specified, since overly generic patterns can result in" 00900 " dawg search exploring an overly large number of options."); 00901 BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter"); 00902 }; 00903 } // namespace tesseract 00904 00905 #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_