// Tesseract 3.02 -- dict.cpp (Dict class implementation, source listing).
00001 00002 // File: dict.cpp 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include <stdio.h> 00020 00021 #include "dict.h" 00022 #include "unicodes.h" 00023 00024 #ifdef _MSC_VER 00025 #pragma warning(disable:4244) // Conversion warnings 00026 #endif 00027 #include "tprintf.h" 00028 00029 namespace tesseract { 00030 00031 class Image; 00032 00033 Dict::Dict(Image* image_ptr) 00034 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), 00035 probability_in_context_(&tesseract::Dict::def_probability_in_context), 00036 image_ptr_(image_ptr), 00037 STRING_INIT_MEMBER(user_words_suffix, "", 00038 "A list of user-provided words.", 00039 getImage()->getCCUtil()->params()), 00040 STRING_INIT_MEMBER(user_patterns_suffix, "", 00041 "A list of user-provided patterns.", 00042 getImage()->getCCUtil()->params()), 00043 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", 00044 getImage()->getCCUtil()->params()), 00045 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", 00046 getImage()->getCCUtil()->params()), 00047 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", 00048 getImage()->getCCUtil()->params()), 00049 BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" 00050 " patterns.", getImage()->getCCUtil()->params()), 00051 
BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" 00052 " patterns.", getImage()->getCCUtil()->params()), 00053 BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" 00054 " (e.g. for non-space delimited languages)", 00055 getImage()->getCCUtil()->params()), 00056 BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word " 00057 "bigrams.", getImage()->getCCUtil()->params()), 00058 double_MEMBER(segment_penalty_dict_frequent_word, 1.0, 00059 "Score multiplier for word matches which have good case and" 00060 "are frequent in the given language (lower is better).", 00061 getImage()->getCCUtil()->params()), 00062 double_MEMBER(segment_penalty_dict_case_ok, 1.1, 00063 "Score multiplier for word matches that have good case " 00064 "(lower is better).", getImage()->getCCUtil()->params()), 00065 double_MEMBER(segment_penalty_dict_case_bad, 1.3125, 00066 "Default score multiplier for word matches, which may have " 00067 "case issues (lower is better).", 00068 getImage()->getCCUtil()->params()), 00069 double_MEMBER(segment_penalty_ngram_best_choice, 1.24, 00070 "Multipler to for the best choice from the ngram model.", 00071 getImage()->getCCUtil()->params()), 00072 double_MEMBER(segment_penalty_dict_nonword, 1.25, 00073 "Score multiplier for glyph fragment segmentations which " 00074 "do not match a dictionary word (lower is better).", 00075 getImage()->getCCUtil()->params()), 00076 double_MEMBER(segment_penalty_garbage, 1.50, 00077 "Score multiplier for poorly cased strings that are not in" 00078 " the dictionary and generally look like garbage (lower is" 00079 " better).", getImage()->getCCUtil()->params()), 00080 STRING_MEMBER(output_ambig_words_file, "", 00081 "Output file for ambiguities found in the dictionary", 00082 getImage()->getCCUtil()->params()), 00083 INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" 00084 ", to 2 for more details, to 3 to see all the debug messages", 00085 
getImage()->getCCUtil()->params()), 00086 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", 00087 getImage()->getCCUtil()->params()), 00088 INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.", 00089 getImage()->getCCUtil()->params()), 00090 BOOL_MEMBER(use_only_first_uft8_step, false, 00091 "Use only the first UTF8 step of the given string" 00092 " when computing log probabilities.", 00093 getImage()->getCCUtil()->params()), 00094 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", 00095 getImage()->getCCUtil()->params()), 00096 double_MEMBER(stopper_nondict_certainty_base, -2.50, 00097 "Certainty threshold for non-dict words", 00098 getImage()->getCCUtil()->params()), 00099 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, 00100 "Reject certainty offset", 00101 getImage()->getCCUtil()->params()), 00102 INT_MEMBER(stopper_smallword_size, 2, 00103 "Size of dict word to be treated as non-dict word", 00104 getImage()->getCCUtil()->params()), 00105 double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add" 00106 " for each dict char above small word size.", 00107 getImage()->getCCUtil()->params()), 00108 double_MEMBER(stopper_allowable_character_badness, 3.0, 00109 "Max certaintly variation allowed in a word (in sigma)", 00110 getImage()->getCCUtil()->params()), 00111 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", 00112 getImage()->getCCUtil()->params()), 00113 BOOL_MEMBER(stopper_no_acceptable_choices, false, 00114 "Make AcceptableChoice() always return false. 
Useful" 00115 " when there is a need to explore all segmentations", 00116 getImage()->getCCUtil()->params()), 00117 double_MEMBER(stopper_ambiguity_threshold_gain, 8.0, 00118 "Gain factor for ambiguity threshold.", 00119 getImage()->getCCUtil()->params()), 00120 double_MEMBER(stopper_ambiguity_threshold_offset, 1.5, 00121 "Certainty offset for ambiguity threshold.", 00122 getImage()->getCCUtil()->params()), 00123 BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices", 00124 getImage()->getCCUtil()->params()), 00125 INT_MEMBER(tessedit_truncate_wordchoice_log, 10, 00126 "Max words to keep in list", 00127 getImage()->getCCUtil()->params()), 00128 STRING_MEMBER(word_to_debug, "", "Word for which stopper debug" 00129 " information should be printed to stdout", 00130 getImage()->getCCUtil()->params()), 00131 STRING_MEMBER(word_to_debug_lengths, "", 00132 "Lengths of unichars in word_to_debug", 00133 getImage()->getCCUtil()->params()), 00134 INT_MEMBER(fragments_debug, 0, "Debug character fragments", 00135 getImage()->getCCUtil()->params()), 00136 INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", 00137 getImage()->getCCUtil()->params()), 00138 BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", 00139 getImage()->getCCUtil()->params()), 00140 double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" 00141 " current best rate to prune other hypotheses", 00142 getImage()->getCCUtil()->params()), 00143 BOOL_MEMBER(permute_script_word, 0, 00144 "Turn on word script consistency permuter", 00145 getImage()->getCCUtil()->params()), 00146 BOOL_MEMBER(segment_segcost_rating, 0, 00147 "incorporate segmentation cost in word rating?", 00148 getImage()->getCCUtil()->params()), 00149 BOOL_MEMBER(segment_nonalphabetic_script, false, 00150 "Don't use any alphabetic-specific tricks." 
00151 "Set to true in the traineddata config file for" 00152 " scripts that are cursive or inherently fixed-pitch", 00153 getImage()->getCCUtil()->params()), 00154 double_MEMBER(segment_reward_script, 0.95, 00155 "Score multipler for script consistency within a word. " 00156 "Being a 'reward' factor, it should be <= 1. " 00157 "Smaller value implies bigger reward.", 00158 getImage()->getCCUtil()->params()), 00159 BOOL_MEMBER(permute_fixed_length_dawg, 0, 00160 "Turn on fixed-length phrasebook search permuter", 00161 getImage()->getCCUtil()->params()), 00162 BOOL_MEMBER(permute_chartype_word, 0, 00163 "Turn on character type (property) consistency permuter", 00164 getImage()->getCCUtil()->params()), 00165 double_MEMBER(segment_reward_chartype, 0.97, 00166 "Score multipler for char type consistency within a word. ", 00167 getImage()->getCCUtil()->params()), 00168 double_MEMBER(segment_reward_ngram_best_choice, 0.99, 00169 "Score multipler for ngram permuter's best choice" 00170 " (only used in the Han script path).", 00171 getImage()->getCCUtil()->params()), 00172 BOOL_MEMBER(save_doc_words, 0, "Save Document Words", 00173 getImage()->getCCUtil()->params()), 00174 BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ", 00175 getImage()->getCCUtil()->params()), 00176 double_MEMBER(doc_dict_pending_threshold, 0.0, 00177 "Worst certainty for using pending dictionary", 00178 getImage()->getCCUtil()->params()), 00179 double_MEMBER(doc_dict_certainty_threshold, -2.25, 00180 "Worst certainty for words that can be inserted into the" 00181 "document dictionary", getImage()->getCCUtil()->params()), 00182 BOOL_MEMBER(ngram_permuter_activated, false, 00183 "Activate character-level n-gram-based permuter", 00184 getImage()->getCCUtil()->params()), 00185 INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different" 00186 " character choices to consider during permutation." 
00187 " This limit is especially useful when user patterns" 00188 " are specified, since overly generic patterns can result in" 00189 " dawg search exploring an overly large number of options.", 00190 getImage()->getCCUtil()->params()), 00191 BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", 00192 getImage()->getCCUtil()->params()) { 00193 dang_ambigs_table_ = NULL; 00194 replace_ambigs_table_ = NULL; 00195 keep_word_choices_ = false; 00196 reject_offset_ = 0.0; 00197 best_raw_choice_ = NULL; 00198 best_choices_ = NIL_LIST; 00199 raw_choices_ = NIL_LIST; 00200 go_deeper_fxn_ = NULL; 00201 hyphen_word_ = NULL; 00202 last_word_on_line_ = false; 00203 hyphen_unichar_id_ = INVALID_UNICHAR_ID; 00204 document_words_ = NULL; 00205 pending_words_ = NULL; 00206 bigram_dawg_ = NULL; 00207 freq_dawg_ = NULL; 00208 punc_dawg_ = NULL; 00209 max_fixed_length_dawgs_wdlen_ = -1; 00210 wordseg_rating_adjust_factor_ = -1.0f; 00211 output_ambig_words_file_ = NULL; 00212 } 00213 00214 Dict::~Dict() { 00215 if (hyphen_word_ != NULL) delete hyphen_word_; 00216 if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); 00217 } 00218 00219 void Dict::Load() { 00220 STRING name; 00221 STRING &lang = getImage()->getCCUtil()->lang; 00222 00223 if (dawgs_.length() != 0) this->End(); 00224 00225 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); 00226 00227 LoadEquivalenceList(kHyphenLikeUTF8); 00228 LoadEquivalenceList(kApostropheLikeUTF8); 00229 00230 TessdataManager &tessdata_manager = 00231 getImage()->getCCUtil()->tessdata_manager; 00232 00233 // Load dawgs_. 
00234 if (load_punc_dawg && tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) { 00235 punc_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00236 DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM, 00237 dawg_debug_level); 00238 dawgs_ += punc_dawg_; 00239 } 00240 if (load_system_dawg && tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) { 00241 dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00242 DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00243 dawg_debug_level); 00244 } 00245 if (load_number_dawg && tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) { 00246 dawgs_ += 00247 new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00248 DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level); 00249 } 00250 if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) { 00251 bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00252 DAWG_TYPE_WORD, // doesn't actually matter. 00253 lang, 00254 COMPOUND_PERM, // doesn't actually matter. 00255 dawg_debug_level); 00256 } 00257 if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) { 00258 freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00259 DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM, 00260 dawg_debug_level); 00261 dawgs_ += freq_dawg_; 00262 } 00263 if (load_unambig_dawg && 00264 tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) { 00265 unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00266 DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00267 dawg_debug_level); 00268 dawgs_ += unambig_dawg_; 00269 } 00270 00271 if (((STRING &)user_words_suffix).length() > 0) { 00272 Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, 00273 kMaxUserDawgEdges, getUnicharset().size(), 00274 dawg_debug_level); 00275 name = getImage()->getCCUtil()->language_data_path_prefix; 00276 name += user_words_suffix; 00277 if (!trie_ptr->read_word_list(name.string(), getUnicharset(), 00278 Trie::RRP_REVERSE_IF_HAS_RTL)) { 00279 tprintf("Error: failed to load 
%s\n", name.string()); 00280 exit(1); 00281 } 00282 dawgs_ += trie_ptr; 00283 } 00284 00285 if (((STRING &)user_patterns_suffix).length() > 0) { 00286 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, 00287 kMaxUserDawgEdges, getUnicharset().size(), 00288 dawg_debug_level); 00289 trie_ptr->initialize_patterns(&(getUnicharset())); 00290 name = getImage()->getCCUtil()->language_data_path_prefix; 00291 name += user_patterns_suffix; 00292 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { 00293 tprintf("Error: failed to load %s\n", name.string()); 00294 exit(1); 00295 } 00296 dawgs_ += trie_ptr; 00297 } 00298 00299 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, 00300 kMaxDocDawgEdges, getUnicharset().size(), 00301 dawg_debug_level); 00302 dawgs_ += document_words_; 00303 00304 // This dawg is temporary and should not be searched by letter_is_ok. 00305 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, 00306 kMaxDocDawgEdges, getUnicharset().size(), 00307 dawg_debug_level); 00308 00309 // Load fixed length dawgs if necessary (used for phrase search 00310 // for non-space delimited languages). 00311 if (load_fixed_length_dawgs && 00312 tessdata_manager.SeekToStart(TESSDATA_FIXED_LENGTH_DAWGS)) { 00313 ReadFixedLengthDawgs(DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00314 dawg_debug_level, tessdata_manager.GetDataFilePtr(), 00315 &dawgs_, &max_fixed_length_dawgs_wdlen_); 00316 } 00317 00318 // Construct a list of corresponding successors for each dawg. Each entry i 00319 // in the successors_ vector is a vector of integers that represent the 00320 // indices into the dawgs_ vector of the successors for dawg i. 
00321 successors_.reserve(dawgs_.length()); 00322 for (int i = 0; i < dawgs_.length(); ++i) { 00323 const Dawg *dawg = dawgs_[i]; 00324 SuccessorList *lst = new SuccessorList(); 00325 for (int j = 0; j < dawgs_.length(); ++j) { 00326 const Dawg *other = dawgs_[j]; 00327 if (dawg != NULL && other != NULL && 00328 (dawg->lang() == other->lang()) && 00329 kDawgSuccessors[dawg->type()][other->type()]) *lst += j; 00330 } 00331 successors_ += lst; 00332 } 00333 } 00334 00335 void Dict::End() { 00336 if (dawgs_.length() == 0) 00337 return; // Not safe to call twice. 00338 dawgs_.delete_data_pointers(); 00339 successors_.delete_data_pointers(); 00340 dawgs_.clear(); 00341 delete bigram_dawg_; 00342 successors_.clear(); 00343 document_words_ = NULL; 00344 max_fixed_length_dawgs_wdlen_ = -1; 00345 if (pending_words_ != NULL) { 00346 delete pending_words_; 00347 pending_words_ = NULL; 00348 } 00349 } 00350 00351 // Create unicharset adaptations of known, short lists of UTF-8 equivalent 00352 // characters (think all hyphen-like symbols). The first version of the 00353 // list is taken as equivalent for matching against the dictionary. 00354 void Dict::LoadEquivalenceList(const char *unichar_strings[]) { 00355 equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>()); 00356 const UNICHARSET &unicharset = getUnicharset(); 00357 GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back(); 00358 for (int i = 0; unichar_strings[i] != 0; i++) { 00359 UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]); 00360 if (unichar_id != INVALID_UNICHAR_ID) { 00361 equiv_list->push_back(unichar_id); 00362 } 00363 } 00364 } 00365 00366 // Normalize all hyphen and apostrophes to the canonicalized one for 00367 // matching; pass everything else through as is. 
00368 UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const { 00369 for (int i = 0; i < equivalent_symbols_.size(); i++) { 00370 if (equivalent_symbols_[i].contains(unichar_id)) { 00371 return equivalent_symbols_[i][0]; 00372 } 00373 } 00374 return unichar_id; 00375 } 00376 00377 // Returns true if in light of the current state unichar_id is allowed 00378 // according to at least one of the dawgs in the dawgs_ vector. 00379 // See more extensive comments in dict.h where this function is declared. 00380 int Dict::def_letter_is_okay(void* void_dawg_args, 00381 UNICHAR_ID unichar_id, 00382 bool word_end) const { 00383 DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); 00384 00385 if (dawg_debug_level >= 3) { 00386 tprintf("def_letter_is_okay: current unichar=%s word_end=%d" 00387 " num active dawgs=%d num constraints=%d\n", 00388 getUnicharset().debug_str(unichar_id).string(), word_end, 00389 dawg_args->active_dawgs->length(), 00390 dawg_args->constraints->length()); 00391 } 00392 00393 // Do not accept words that contain kPatternUnicharID. 00394 // (otherwise pattern dawgs would not function correctly). 00395 // Do not accept words containing INVALID_UNICHAR_IDs. 00396 if (unichar_id == Dawg::kPatternUnicharID || 00397 unichar_id == INVALID_UNICHAR_ID) { 00398 dawg_args->permuter = NO_PERM; 00399 return NO_PERM; 00400 } 00401 00402 // Initialization. 00403 PermuterType curr_perm = NO_PERM; 00404 dawg_args->updated_active_dawgs->clear(); 00405 const DawgInfoVector &constraints = *(dawg_args->constraints); 00406 *dawg_args->updated_constraints = constraints; 00407 00408 // Go over the active_dawgs vector and insert DawgInfo records with the 00409 // updated ref (an edge with the corresponding unichar id) into 00410 // dawg_args->updated_active_dawgs. 
00411 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { 00412 const DawgInfo &info = (*dawg_args->active_dawgs)[a]; 00413 const Dawg *dawg = dawgs_[info.dawg_index]; 00414 // dawg_unichar_id will contain the literal unichar_id to be found in the 00415 // dawgs (e.g. didgit pattern if unichar_id is a digit and dawg contains 00416 // number patterns, word pattern if dawg is a puncutation dawg and we 00417 // reached an end of beginning puntuation pattern, etc). 00418 UNICHAR_ID dawg_unichar_id = unichar_id; 00419 00420 // If we are dealing with the pattern dawg, look up all the 00421 // possible edges, not only for the exact unichar_id, but also 00422 // for all its character classes (alpha, digit, etc). 00423 if (dawg->type() == DAWG_TYPE_PATTERN) { 00424 ProcessPatternEdges(dawg, info, dawg_unichar_id, word_end, 00425 dawg_args, &curr_perm); 00426 // There can't be any successors to dawg that is of type 00427 // DAWG_TYPE_PATTERN, so we are done examining this DawgInfo. 00428 continue; 00429 } 00430 00431 // The number dawg generalizes all digits to be kPatternUnicharID, 00432 // so try to match kPatternUnicharID if the current unichar is a digit. 00433 if (dawg->type() == DAWG_TYPE_NUMBER && 00434 getUnicharset().get_isdigit(dawg_unichar_id)) { 00435 dawg_unichar_id = Dawg::kPatternUnicharID; 00436 } 00437 00438 // Find the edge out of the node for the dawg_unichar_id. 00439 NODE_REF node = GetStartingNode(dawg, info.ref); 00440 EDGE_REF edge = (node != NO_EDGE) ? 
00441 dawg->edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE; 00442 00443 if (dawg_debug_level >= 3) { 00444 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00445 info.dawg_index, node, edge); 00446 } 00447 00448 if (edge != NO_EDGE) { // the unichar was found in the current dawg 00449 if (ConstraintsOk(*(dawg_args->updated_constraints), 00450 word_end, dawg->type())) { 00451 if (dawg_debug_level >=3) { 00452 tprintf("Letter found in dawg %d\n", info.dawg_index); 00453 } 00454 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00455 dawg_args->updated_active_dawgs->add_unique( 00456 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00457 "Append current dawg to updated active dawgs: "); 00458 } 00459 } else if (dawg_args->sought_word_length == kAnyWordLength) { 00460 // The unichar was not found in the current dawg. 00461 // Explore the successor dawgs (but only if we are not 00462 // just searching one dawg with a fixed word length). 00463 00464 // Handle leading/trailing punctuation dawgs that denote a word pattern 00465 // as an edge with kPatternUnicharID. If such an edge is found we add a 00466 // constraint denoting the state of the dawg before the word pattern. 00467 // This constraint will be applied later when this dawg is found among 00468 // successor dawgs as well potentially at the end of the word. 00469 if (dawg->type() == DAWG_TYPE_PUNCTUATION) { 00470 edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end); 00471 if (edge != NO_EDGE) { 00472 dawg_args->updated_constraints->add_unique( 00473 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00474 "Recording constraint: "); 00475 } else { 00476 // Do not explore successors of this dawg, since this 00477 // must be invalid leading or trailing punctuation. 
00478 if (dawg_debug_level >= 3) { 00479 tprintf("Invalid punctuation from dawg %d\n", info.dawg_index); 00480 } 00481 continue; 00482 } 00483 } 00484 00485 if (info.ref == NO_EDGE) { 00486 if (dawg_debug_level >= 3) { 00487 tprintf("No letters matched in dawg %d\n", info.dawg_index); 00488 } 00489 continue; 00490 } 00491 00492 // Discard the dawg if the pattern can not end at previous letter. 00493 if (edge == NO_EDGE && // previous part is not leading punctuation 00494 !dawg->end_of_word(info.ref)) { 00495 if (dawg_debug_level >= 3) { 00496 tprintf("No valid pattern end in dawg %d\n", info.dawg_index); 00497 } 00498 continue; 00499 } 00500 00501 // Look for the unichar in each of this dawg's successors 00502 // and append those in which it is found to active_dawgs. 00503 const SuccessorList &slist = *(successors_[info.dawg_index]); 00504 for (int s = 0; s < slist.length(); ++s) { 00505 int sdawg_index = slist[s]; 00506 const Dawg *sdawg = dawgs_[sdawg_index]; 00507 NODE_REF snode = 0; 00508 // Apply constraints to the successor dawg. 00509 for (int c = 0; c < constraints.length(); ++c) { 00510 // If the successor dawg is described in the constraints change 00511 // the start ref from 0 to the one recorded as the constraint. 00512 const DawgInfo &cinfo = constraints[c]; 00513 if (cinfo.dawg_index == sdawg_index) { 00514 snode = sdawg->next_node(cinfo.ref); 00515 // Make sure we do not search the successor dawg if after 00516 // applying the saved constraint we are at the end of the word. 00517 if (snode == 0) snode = NO_EDGE; 00518 if (dawg_debug_level >= 3) { 00519 tprintf("Applying constraint [%d, " REFFORMAT "]\n", 00520 sdawg_index, snode); 00521 } 00522 } 00523 } 00524 // Look for the letter in this successor dawg. 00525 EDGE_REF sedge = sdawg->edge_char_of(snode, unichar_id, word_end); 00526 // If we found the letter append sdawg to the active_dawgs list. 
00527 if (sedge != NO_EDGE && 00528 ConstraintsOk(*(dawg_args->updated_constraints), word_end, 00529 dawgs_[sdawg_index]->type())) { 00530 if (dawg_debug_level >= 3) { 00531 tprintf("Letter found in the successor dawg %d\n", sdawg_index); 00532 } 00533 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); 00534 if (sdawg->next_node(sedge) != 0) { // if not word end 00535 dawg_args->updated_active_dawgs->add_unique( 00536 DawgInfo(sdawg_index, sedge), dawg_debug_level > 0, 00537 "Append successor to updated active dawgs: "); 00538 } 00539 } 00540 } // end successors loop 00541 } // end if/else 00542 } // end for 00543 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM 00544 // or if we found the current letter in a non-punctuation dawg. This 00545 // allows preserving information on which dawg the "core" word came from. 00546 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM. 00547 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM || 00548 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) { 00549 dawg_args->permuter = curr_perm; 00550 } 00551 return dawg_args->permuter; 00552 } 00553 00554 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, 00555 UNICHAR_ID unichar_id, bool word_end, 00556 DawgArgs *dawg_args, 00557 PermuterType *curr_perm) const { 00558 NODE_REF node = GetStartingNode(dawg, info.ref); 00559 // Try to find the edge corresponding to the exact unichar_id and to all the 00560 // edges corresponding to the character class of unichar_id. 00561 GenericVector<UNICHAR_ID> unichar_id_patterns; 00562 unichar_id_patterns.push_back(unichar_id); 00563 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), 00564 &unichar_id_patterns); 00565 for (int i = 0; i < unichar_id_patterns.size(); ++i) { 00566 // On the first iteration check all the outgoing edges. 00567 // On the second iteration check all self-loops. 
00568 for (int k = 0; k < 2; ++k) { 00569 EDGE_REF edge = (k == 0) ? 00570 dawg->edge_char_of(node, unichar_id_patterns[i], word_end) 00571 : dawg->pattern_loop_edge(info.ref, unichar_id_patterns[i], word_end); 00572 if (edge != NO_EDGE) { 00573 if (dawg_debug_level >= 3) { 00574 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00575 info.dawg_index, node, edge); 00576 } 00577 if (ConstraintsOk(*(dawg_args->updated_constraints), 00578 word_end, dawg->type())) { 00579 if (dawg_debug_level >=3) { 00580 tprintf("Letter found in pattern dawg %d\n", info.dawg_index); 00581 } 00582 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); 00583 dawg_args->updated_active_dawgs->add_unique( 00584 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00585 "Append current dawg to updated active dawgs: "); 00586 } 00587 } 00588 } 00589 } 00590 } 00591 00592 void Dict::ReadFixedLengthDawgs(DawgType type, const STRING &lang, 00593 PermuterType perm, int debug_level, 00594 FILE *file, DawgVector *dawg_vec, 00595 int *max_wdlen) { 00596 int i; 00597 DawgVector dawg_vec_copy; 00598 dawg_vec_copy.move(dawg_vec); // save the input dawg_vec. 00599 inT32 num_dawgs; 00600 fread(&num_dawgs, sizeof(inT32), 1, file); 00601 bool swap = (num_dawgs > MAX_WERD_LENGTH); 00602 if (swap) num_dawgs = reverse32(num_dawgs); 00603 inT32 word_length; 00604 int max_word_length = 0; 00605 // Read and record pointers to fixed-length dawgs such that: 00606 // dawg_vec[word_length] = pointer to dawg with word length of word_length, 00607 // NULL if such fixed-length dawg does not exist. 
00608 for (i = 0; i < num_dawgs; ++i) { 00609 fread(&word_length, sizeof(inT32), 1, file); 00610 if (swap) word_length = reverse32(word_length); 00611 ASSERT_HOST(word_length > 0 && word_length <= MAX_WERD_LENGTH); 00612 while (word_length >= dawg_vec->size()) dawg_vec->push_back(NULL); 00613 (*dawg_vec)[word_length] = 00614 new SquishedDawg(file, type, lang, perm, debug_level); 00615 if (word_length > max_word_length) max_word_length = word_length; 00616 } 00617 *max_wdlen = max_word_length; 00618 // Entries dawg_vec[0] to dawg_vec[max_word_length] now hold pointers 00619 // to fixed-length dawgs. The rest of the vector will contain the dawg 00620 // pointers from the original input dawg_vec. 00621 for (i = 0; i < dawg_vec_copy.size(); ++i) { 00622 dawg_vec->push_back(dawg_vec_copy[i]); 00623 } 00624 } 00625 00626 void Dict::WriteFixedLengthDawgs( 00627 const GenericVector<SquishedDawg *> &dawg_vec, 00628 int num_dawgs, int debug_level, FILE *output_file) { 00629 fwrite(&num_dawgs, sizeof(inT32), 1, output_file); 00630 if (debug_level) tprintf("Writing %d split length dawgs\n", num_dawgs); 00631 for (int i = 1; i < dawg_vec.size(); ++i) { 00632 if ((dawg_vec)[i] != NULL) { 00633 fwrite(&i, sizeof(inT32), 1, output_file); 00634 dawg_vec[i]->write_squished_dawg(output_file); 00635 if (debug_level) tprintf("Wrote Dawg with word length %d\n", i); 00636 } 00637 } 00638 } 00639 00640 // Fill the given active_dawgs vector with dawgs that could contain the 00641 // beginning of the word. If hyphenated() returns true, copy the entries 00642 // from hyphen_active_dawgs_ instead. 00643 void Dict::init_active_dawgs(int sought_word_length, 00644 DawgInfoVector *active_dawgs, 00645 bool ambigs_mode) const { 00646 int i; 00647 if (sought_word_length != kAnyWordLength) { 00648 // Only search one fixed word length dawg. 
00649 if (sought_word_length <= max_fixed_length_dawgs_wdlen_ && 00650 dawgs_[sought_word_length] != NULL) { 00651 *active_dawgs += DawgInfo(sought_word_length, NO_EDGE); 00652 } 00653 } else if (hyphenated()) { 00654 *active_dawgs = hyphen_active_dawgs_; 00655 if (dawg_debug_level >= 3) { 00656 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { 00657 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", 00658 hyphen_active_dawgs_[i].dawg_index, 00659 hyphen_active_dawgs_[i].ref); 00660 } 00661 } 00662 } else { 00663 for (i = 0; i < dawgs_.length(); ++i) { 00664 if (dawgs_[i] != NULL && kBeginningDawgsType[(dawgs_[i])->type()] && 00665 !(ambigs_mode && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { 00666 *active_dawgs += DawgInfo(i, NO_EDGE); 00667 if (dawg_debug_level >= 3) { 00668 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); 00669 } 00670 } 00671 } 00672 } 00673 } 00674 00675 // If hyphenated() returns true, copy the entries from hyphen_constraints_ 00676 // into the given constraints vector. 00677 void Dict::init_constraints(DawgInfoVector *constraints) const { 00678 if (hyphenated()) { 00679 *constraints = hyphen_constraints_; 00680 if (dawg_debug_level >= 3) { 00681 for (int i = 0; i < hyphen_constraints_.size(); ++i) { 00682 tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n", 00683 hyphen_constraints_[i].dawg_index, 00684 hyphen_constraints_[i].ref); 00685 } 00686 } 00687 } 00688 } 00689 00690 void Dict::add_document_word(const WERD_CHOICE &best_choice) { 00691 // Do not add hyphenated word parts to the document dawg. 00692 // hyphen_word_ will be non-NULL after the set_hyphen_word() is 00693 // called when the first part of the hyphenated word is 00694 // discovered and while the second part of the word is recognized. 00695 // hyphen_word_ is cleared in cc_recg() before the next word on 00696 // the line is recognized. 
00697 if (hyphen_word_) return; 00698 00699 char filename[CHARS_PER_LINE]; 00700 FILE *doc_word_file; 00701 int stringlen = best_choice.length(); 00702 00703 if (!doc_dict_enable || valid_word(best_choice) || 00704 CurrentWordAmbig() || stringlen < 2) 00705 return; 00706 00707 // Discard words that contain >= kDocDictMaxRepChars repeating unichars. 00708 if (best_choice.length() >= kDocDictMaxRepChars) { 00709 int num_rep_chars = 1; 00710 UNICHAR_ID uch_id = best_choice.unichar_id(0); 00711 for (int i = 1; i < best_choice.length(); ++i) { 00712 if (best_choice.unichar_id(i) != uch_id) { 00713 num_rep_chars = 1; 00714 uch_id = best_choice.unichar_id(i); 00715 } else { 00716 ++num_rep_chars; 00717 if (num_rep_chars == kDocDictMaxRepChars) return; 00718 } 00719 } 00720 } 00721 00722 if (best_choice.certainty() < doc_dict_certainty_threshold || 00723 stringlen == 2) { 00724 if (best_choice.certainty() < doc_dict_pending_threshold) 00725 return; 00726 00727 if (!pending_words_->word_in_dawg(best_choice)) { 00728 if (stringlen > 2 || 00729 (stringlen == 2 && 00730 getUnicharset().get_isupper(best_choice.unichar_id(0)) && 00731 getUnicharset().get_isupper(best_choice.unichar_id(1)))) { 00732 pending_words_->add_word_to_dawg(best_choice); 00733 } 00734 return; 00735 } 00736 } 00737 00738 if (save_doc_words) { 00739 strcpy(filename, getImage()->getCCUtil()->imagefile.string()); 00740 strcat(filename, ".doc"); 00741 doc_word_file = open_file (filename, "a"); 00742 fprintf(doc_word_file, "%s\n", 00743 best_choice.debug_string().string()); 00744 fclose(doc_word_file); 00745 } 00746 document_words_->add_word_to_dawg(best_choice); 00747 } 00748 00749 void Dict::adjust_word(WERD_CHOICE *word, 00750 float *certainty_array, 00751 const BLOB_CHOICE_LIST_VECTOR *char_choices, 00752 bool nonword, 00753 float additional_adjust, 00754 bool debug) { 00755 bool is_han = (char_choices != NULL && 00756 getUnicharset().han_sid() != getUnicharset().null_sid() && 00757 
get_top_word_script(*char_choices, getUnicharset()) == 00758 getUnicharset().han_sid()); 00759 bool case_is_ok = (is_han || case_ok(*word, getUnicharset())); 00760 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); 00761 00762 float adjust_factor = additional_adjust; 00763 float new_rating = word->rating(); 00764 if (debug) { 00765 tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "", 00766 word->debug_string().string(), word->rating()); 00767 } 00768 new_rating += kRatingPad; 00769 if (nonword) { // non-dictionary word 00770 if (case_is_ok && punc_is_ok) { 00771 adjust_factor += segment_penalty_dict_nonword; 00772 new_rating *= adjust_factor; 00773 if (debug) tprintf(", W"); 00774 } else { 00775 adjust_factor += segment_penalty_garbage; 00776 new_rating *= adjust_factor; 00777 if (debug) { 00778 if (!case_is_ok) tprintf(", C"); 00779 if (!punc_is_ok) tprintf(", P"); 00780 } 00781 } 00782 } else { // dictionary word 00783 if (case_is_ok) { 00784 if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) { 00785 word->set_permuter(FREQ_DAWG_PERM); 00786 adjust_factor += segment_penalty_dict_frequent_word; 00787 new_rating *= adjust_factor; 00788 if (debug) tprintf(", F"); 00789 } else { 00790 adjust_factor += segment_penalty_dict_case_ok; 00791 new_rating *= adjust_factor; 00792 if (debug) tprintf(", "); 00793 } 00794 } else { 00795 adjust_factor += segment_penalty_dict_case_bad; 00796 new_rating *= adjust_factor; 00797 if (debug) tprintf(", C"); 00798 } 00799 } 00800 new_rating -= kRatingPad; 00801 word->set_rating(new_rating); 00802 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); 00803 LogNewChoice(adjust_factor, certainty_array, false, word); 00804 } 00805 00806 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { 00807 const WERD_CHOICE *word_ptr = &word; 00808 WERD_CHOICE temp_word(word.unicharset()); 00809 if (hyphenated()) { 00810 copy_hyphen_info(&temp_word); 00811 temp_word += word; 00812 
word_ptr = &temp_word; 00813 } 00814 if (word_ptr->length() == 0) return NO_PERM; 00815 // Allocate vectors for holding current and updated 00816 // active_dawgs and constraints and initialize them. 00817 DawgInfoVector *active_dawgs = new DawgInfoVector[2]; 00818 DawgInfoVector *constraints = new DawgInfoVector[2]; 00819 init_active_dawgs(kAnyWordLength, &(active_dawgs[0]), false); 00820 init_constraints(&(constraints[0])); 00821 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]), 00822 &(active_dawgs[1]), &(constraints[1]), 00823 0.0, NO_PERM, kAnyWordLength, 0); 00824 int last_index = word_ptr->length() - 1; 00825 // Call leter_is_okay for each letter in the word. 00826 for (int i = hyphen_base_size(); i <= last_index; ++i) { 00827 if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i), 00828 i == last_index))) break; 00829 // Swap active_dawgs, constraints with the corresponding updated vector. 00830 if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) { 00831 dawg_args.updated_active_dawgs = &(active_dawgs[0]); 00832 dawg_args.updated_constraints = &(constraints[0]); 00833 ++(dawg_args.active_dawgs); 00834 ++(dawg_args.constraints); 00835 } else { 00836 ++(dawg_args.updated_active_dawgs); 00837 ++(dawg_args.updated_constraints); 00838 dawg_args.active_dawgs = &(active_dawgs[0]); 00839 dawg_args.constraints = &(constraints[0]); 00840 } 00841 } 00842 delete[] active_dawgs; 00843 delete[] constraints; 00844 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? 00845 dawg_args.permuter : NO_PERM; 00846 } 00847 00848 bool Dict::valid_bigram(const WERD_CHOICE &word1, 00849 const WERD_CHOICE &word2) const { 00850 if (bigram_dawg_ == NULL) return false; 00851 00852 // Extract the core word from the middle of each word with any digits 00853 // replaced with question marks. 
00854 int w1start, w1end, w2start, w2end; 00855 word1.punct_stripped(&w1start, &w1end); 00856 word2.punct_stripped(&w2start, &w2end); 00857 00858 // We don't want to penalize a single guillemet, hyphen, etc. 00859 // But our bigram list doesn't have any information about punctuation. 00860 if (w1start >= w1end) return word1.length() < 3; 00861 if (w2start >= w2end) return word2.length() < 3; 00862 00863 const UNICHARSET& uchset = getUnicharset(); 00864 STRING bigram_string; 00865 for (int i = w1start; i < w1end; i++) { 00866 UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i)); 00867 bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch); 00868 } 00869 bigram_string += " "; 00870 for (int i = w2start; i < w2end; i++) { 00871 UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i)); 00872 bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch); 00873 } 00874 WERD_CHOICE normalized_word(bigram_string.string(), uchset); 00875 return bigram_dawg_->word_in_dawg(normalized_word); 00876 } 00877 00878 bool Dict::valid_punctuation(const WERD_CHOICE &word) { 00879 if (word.length() == 0) return NO_PERM; 00880 int i; 00881 WERD_CHOICE new_word(word.unicharset()); 00882 int last_index = word.length() - 1; 00883 int new_len = 0; 00884 for (i = 0; i <= last_index; ++i) { 00885 UNICHAR_ID unichar_id = (word.unichar_id(i)); 00886 if (getUnicharset().get_ispunctuation(unichar_id)) { 00887 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); 00888 } else if (!getUnicharset().get_isalpha(unichar_id) && 00889 !getUnicharset().get_isdigit(unichar_id)) { 00890 return false; // neither punc, nor alpha, nor digit 00891 } else if ((new_len = new_word.length()) == 0 || 00892 new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) { 00893 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); 00894 } 00895 } 00896 for (i = 0; i < dawgs_.size(); ++i) { 00897 if (dawgs_[i] != NULL && 00898 dawgs_[i]->type() == 
DAWG_TYPE_PUNCTUATION && 00899 dawgs_[i]->word_in_dawg(new_word)) return true; 00900 } 00901 return false; 00902 } 00903 00904 // Returns the "dominant" script ID for the word. By "dominant", the script 00905 // must account for at least half the characters. Otherwise, it returns 0. 00906 // Note that for Japanese, Hiragana and Katakana are simply treated as Han. 00907 int Dict::get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00908 const UNICHARSET &unicharset) { 00909 int max_script = unicharset.get_script_table_size(); 00910 int *sid = new int[max_script]; 00911 int x; 00912 for (x = 0; x < max_script; x++) sid[x] = 0; 00913 for (x = 0; x < char_choices.length(); ++x) { 00914 BLOB_CHOICE_IT blob_choice_it(char_choices.get(x)); 00915 sid[blob_choice_it.data()->script_id()]++; 00916 } 00917 if (unicharset.han_sid() != unicharset.null_sid()) { 00918 // Add the Hiragana & Katakana counts to Han and zero them out. 00919 if (unicharset.hiragana_sid() != unicharset.null_sid()) { 00920 sid[unicharset.han_sid()] += sid[unicharset.hiragana_sid()]; 00921 sid[unicharset.hiragana_sid()] = 0; 00922 } 00923 if (unicharset.katakana_sid() != unicharset.null_sid()) { 00924 sid[unicharset.han_sid()] += sid[unicharset.katakana_sid()]; 00925 sid[unicharset.katakana_sid()] = 0; 00926 } 00927 } 00928 // Note that high script ID overrides lower one on a tie, thus biasing 00929 // towards non-Common script (if sorted that way in unicharset file). 00930 int max_sid = 0; 00931 for (x = 1; x < max_script; x++) 00932 if (sid[x] >= sid[max_sid]) max_sid = x; 00933 if (sid[max_sid] < char_choices.length() / 2) 00934 max_sid = unicharset.null_sid(); 00935 delete[] sid; 00936 return max_sid; 00937 } 00938 00939 } // namespace tesseract