// Tesseract 3.02 -- dict.cpp (Dict class implementation, source listing).
00001 00002 // File: dict.cpp 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include <stdio.h> 00020 00021 #include "dict.h" 00022 #include "unicodes.h" 00023 00024 #ifdef _MSC_VER 00025 #pragma warning(disable:4244) // Conversion warnings 00026 #endif 00027 #include "tprintf.h" 00028 00029 namespace tesseract { 00030 00031 class Image; 00032 00033 Dict::Dict(Image* image_ptr) 00034 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), 00035 probability_in_context_(&tesseract::Dict::def_probability_in_context), 00036 image_ptr_(image_ptr), 00037 STRING_INIT_MEMBER(user_words_suffix, "", 00038 "A list of user-provided words.", 00039 getImage()->getCCUtil()->params()), 00040 STRING_INIT_MEMBER(user_patterns_suffix, "", 00041 "A list of user-provided patterns.", 00042 getImage()->getCCUtil()->params()), 00043 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", 00044 getImage()->getCCUtil()->params()), 00045 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", 00046 getImage()->getCCUtil()->params()), 00047 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", 00048 getImage()->getCCUtil()->params()), 00049 BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" 00050 " patterns.", getImage()->getCCUtil()->params()), 00051 
BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" 00052 " patterns.", getImage()->getCCUtil()->params()), 00053 BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" 00054 " (e.g. for non-space delimited languages)", 00055 getImage()->getCCUtil()->params()), 00056 BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word " 00057 "bigrams.", getImage()->getCCUtil()->params()), 00058 double_MEMBER(segment_penalty_dict_frequent_word, 1.0, 00059 "Score multiplier for word matches which have good case and" 00060 "are frequent in the given language (lower is better).", 00061 getImage()->getCCUtil()->params()), 00062 double_MEMBER(segment_penalty_dict_case_ok, 1.1, 00063 "Score multiplier for word matches that have good case " 00064 "(lower is better).", getImage()->getCCUtil()->params()), 00065 double_MEMBER(segment_penalty_dict_case_bad, 1.3125, 00066 "Default score multiplier for word matches, which may have " 00067 "case issues (lower is better).", 00068 getImage()->getCCUtil()->params()), 00069 double_MEMBER(segment_penalty_ngram_best_choice, 1.24, 00070 "Multipler to for the best choice from the ngram model.", 00071 getImage()->getCCUtil()->params()), 00072 double_MEMBER(segment_penalty_dict_nonword, 1.25, 00073 "Score multiplier for glyph fragment segmentations which " 00074 "do not match a dictionary word (lower is better).", 00075 getImage()->getCCUtil()->params()), 00076 double_MEMBER(segment_penalty_garbage, 1.50, 00077 "Score multiplier for poorly cased strings that are not in" 00078 " the dictionary and generally look like garbage (lower is" 00079 " better).", getImage()->getCCUtil()->params()), 00080 STRING_MEMBER(output_ambig_words_file, "", 00081 "Output file for ambiguities found in the dictionary", 00082 getImage()->getCCUtil()->params()), 00083 INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" 00084 ", to 2 for more details, to 3 to see all the debug messages", 00085 
getImage()->getCCUtil()->params()), 00086 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", 00087 getImage()->getCCUtil()->params()), 00088 INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.", 00089 getImage()->getCCUtil()->params()), 00090 BOOL_MEMBER(use_only_first_uft8_step, false, 00091 "Use only the first UTF8 step of the given string" 00092 " when computing log probabilities.", 00093 getImage()->getCCUtil()->params()), 00094 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", 00095 getImage()->getCCUtil()->params()), 00096 double_MEMBER(stopper_nondict_certainty_base, -2.50, 00097 "Certainty threshold for non-dict words", 00098 getImage()->getCCUtil()->params()), 00099 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, 00100 "Reject certainty offset", 00101 getImage()->getCCUtil()->params()), 00102 INT_MEMBER(stopper_smallword_size, 2, 00103 "Size of dict word to be treated as non-dict word", 00104 getImage()->getCCUtil()->params()), 00105 double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add" 00106 " for each dict char above small word size.", 00107 getImage()->getCCUtil()->params()), 00108 double_MEMBER(stopper_allowable_character_badness, 3.0, 00109 "Max certaintly variation allowed in a word (in sigma)", 00110 getImage()->getCCUtil()->params()), 00111 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", 00112 getImage()->getCCUtil()->params()), 00113 BOOL_MEMBER(stopper_no_acceptable_choices, false, 00114 "Make AcceptableChoice() always return false. 
Useful" 00115 " when there is a need to explore all segmentations", 00116 getImage()->getCCUtil()->params()), 00117 double_MEMBER(stopper_ambiguity_threshold_gain, 8.0, 00118 "Gain factor for ambiguity threshold.", 00119 getImage()->getCCUtil()->params()), 00120 double_MEMBER(stopper_ambiguity_threshold_offset, 1.5, 00121 "Certainty offset for ambiguity threshold.", 00122 getImage()->getCCUtil()->params()), 00123 BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices", 00124 getImage()->getCCUtil()->params()), 00125 INT_MEMBER(tessedit_truncate_wordchoice_log, 10, 00126 "Max words to keep in list", 00127 getImage()->getCCUtil()->params()), 00128 STRING_MEMBER(word_to_debug, "", "Word for which stopper debug" 00129 " information should be printed to stdout", 00130 getImage()->getCCUtil()->params()), 00131 STRING_MEMBER(word_to_debug_lengths, "", 00132 "Lengths of unichars in word_to_debug", 00133 getImage()->getCCUtil()->params()), 00134 INT_MEMBER(fragments_debug, 0, "Debug character fragments", 00135 getImage()->getCCUtil()->params()), 00136 INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", 00137 getImage()->getCCUtil()->params()), 00138 BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", 00139 getImage()->getCCUtil()->params()), 00140 double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" 00141 " current best rate to prune other hypotheses", 00142 getImage()->getCCUtil()->params()), 00143 BOOL_MEMBER(permute_script_word, 0, 00144 "Turn on word script consistency permuter", 00145 getImage()->getCCUtil()->params()), 00146 BOOL_MEMBER(segment_segcost_rating, 0, 00147 "incorporate segmentation cost in word rating?", 00148 getImage()->getCCUtil()->params()), 00149 BOOL_MEMBER(segment_nonalphabetic_script, false, 00150 "Don't use any alphabetic-specific tricks." 
00151 "Set to true in the traineddata config file for" 00152 " scripts that are cursive or inherently fixed-pitch", 00153 getImage()->getCCUtil()->params()), 00154 double_MEMBER(segment_reward_script, 0.95, 00155 "Score multipler for script consistency within a word. " 00156 "Being a 'reward' factor, it should be <= 1. " 00157 "Smaller value implies bigger reward.", 00158 getImage()->getCCUtil()->params()), 00159 BOOL_MEMBER(permute_fixed_length_dawg, 0, 00160 "Turn on fixed-length phrasebook search permuter", 00161 getImage()->getCCUtil()->params()), 00162 BOOL_MEMBER(permute_chartype_word, 0, 00163 "Turn on character type (property) consistency permuter", 00164 getImage()->getCCUtil()->params()), 00165 double_MEMBER(segment_reward_chartype, 0.97, 00166 "Score multipler for char type consistency within a word. ", 00167 getImage()->getCCUtil()->params()), 00168 double_MEMBER(segment_reward_ngram_best_choice, 0.99, 00169 "Score multipler for ngram permuter's best choice" 00170 " (only used in the Han script path).", 00171 getImage()->getCCUtil()->params()), 00172 BOOL_MEMBER(save_doc_words, 0, "Save Document Words", 00173 getImage()->getCCUtil()->params()), 00174 BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ", 00175 getImage()->getCCUtil()->params()), 00176 double_MEMBER(doc_dict_pending_threshold, 0.0, 00177 "Worst certainty for using pending dictionary", 00178 getImage()->getCCUtil()->params()), 00179 double_MEMBER(doc_dict_certainty_threshold, -2.25, 00180 "Worst certainty for words that can be inserted into the" 00181 "document dictionary", getImage()->getCCUtil()->params()), 00182 BOOL_MEMBER(ngram_permuter_activated, false, 00183 "Activate character-level n-gram-based permuter", 00184 getImage()->getCCUtil()->params()), 00185 INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different" 00186 " character choices to consider during permutation." 
00187 " This limit is especially useful when user patterns" 00188 " are specified, since overly generic patterns can result in" 00189 " dawg search exploring an overly large number of options.", 00190 getImage()->getCCUtil()->params()), 00191 BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", 00192 getImage()->getCCUtil()->params()) { 00193 dang_ambigs_table_ = NULL; 00194 replace_ambigs_table_ = NULL; 00195 keep_word_choices_ = false; 00196 reject_offset_ = 0.0; 00197 best_raw_choice_ = NULL; 00198 best_choices_ = NIL_LIST; 00199 raw_choices_ = NIL_LIST; 00200 go_deeper_fxn_ = NULL; 00201 hyphen_word_ = NULL; 00202 last_word_on_line_ = false; 00203 hyphen_unichar_id_ = INVALID_UNICHAR_ID; 00204 document_words_ = NULL; 00205 pending_words_ = NULL; 00206 bigram_dawg_ = NULL; 00207 freq_dawg_ = NULL; 00208 punc_dawg_ = NULL; 00209 max_fixed_length_dawgs_wdlen_ = -1; 00210 wordseg_rating_adjust_factor_ = -1.0f; 00211 output_ambig_words_file_ = NULL; 00212 } 00213 00214 Dict::~Dict() { 00215 if (hyphen_word_ != NULL) delete hyphen_word_; 00216 if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); 00217 } 00218 00219 void Dict::Load() { 00220 STRING name; 00221 STRING &lang = getImage()->getCCUtil()->lang; 00222 00223 if (dawgs_.length() != 0) this->End(); 00224 00225 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); 00226 00227 LoadEquivalenceList(kHyphenLikeUTF8); 00228 LoadEquivalenceList(kApostropheLikeUTF8); 00229 00230 TessdataManager &tessdata_manager = 00231 getImage()->getCCUtil()->tessdata_manager; 00232 00233 // Load dawgs_. 
00234 if (load_punc_dawg && tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) { 00235 punc_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00236 DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM, 00237 dawg_debug_level); 00238 dawgs_ += punc_dawg_; 00239 } 00240 if (load_system_dawg && tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) { 00241 dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00242 DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00243 dawg_debug_level); 00244 } 00245 if (load_number_dawg && tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) { 00246 dawgs_ += 00247 new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00248 DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level); 00249 } 00250 if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) { 00251 bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00252 DAWG_TYPE_WORD, // doesn't actually matter. 00253 lang, 00254 COMPOUND_PERM, // doesn't actually matter. 00255 dawg_debug_level); 00256 } 00257 if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) { 00258 freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00259 DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM, 00260 dawg_debug_level); 00261 dawgs_ += freq_dawg_; 00262 } 00263 if (load_unambig_dawg && 00264 tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) { 00265 unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(), 00266 DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00267 dawg_debug_level); 00268 dawgs_ += unambig_dawg_; 00269 } 00270 00271 if (((STRING &)user_words_suffix).length() > 0) { 00272 Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, 00273 kMaxUserDawgEdges, getUnicharset().size(), 00274 dawg_debug_level); 00275 name = getImage()->getCCUtil()->language_data_path_prefix; 00276 name += user_words_suffix; 00277 if (!trie_ptr->read_word_list(name.string(), getUnicharset(), 00278 Trie::RRP_REVERSE_IF_HAS_RTL)) { 00279 tprintf("Error: failed to load 
%s\n", name.string()); 00280 exit(1); 00281 } 00282 dawgs_ += trie_ptr; 00283 } 00284 00285 if (((STRING &)user_patterns_suffix).length() > 0) { 00286 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, 00287 kMaxUserDawgEdges, getUnicharset().size(), 00288 dawg_debug_level); 00289 trie_ptr->initialize_patterns(&(getUnicharset())); 00290 name = getImage()->getCCUtil()->language_data_path_prefix; 00291 name += user_patterns_suffix; 00292 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { 00293 tprintf("Error: failed to load %s\n", name.string()); 00294 exit(1); 00295 } 00296 dawgs_ += trie_ptr; 00297 } 00298 00299 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, 00300 kMaxDocDawgEdges, getUnicharset().size(), 00301 dawg_debug_level); 00302 dawgs_ += document_words_; 00303 00304 // This dawg is temporary and should not be searched by letter_is_ok. 00305 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, 00306 kMaxDocDawgEdges, getUnicharset().size(), 00307 dawg_debug_level); 00308 00309 // Load fixed length dawgs if necessary (used for phrase search 00310 // for non-space delimited languages). 00311 if (load_fixed_length_dawgs && 00312 tessdata_manager.SeekToStart(TESSDATA_FIXED_LENGTH_DAWGS)) { 00313 ReadFixedLengthDawgs(DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM, 00314 dawg_debug_level, tessdata_manager.GetDataFilePtr(), 00315 &dawgs_, &max_fixed_length_dawgs_wdlen_); 00316 } 00317 00318 // Construct a list of corresponding successors for each dawg. Each entry i 00319 // in the successors_ vector is a vector of integers that represent the 00320 // indices into the dawgs_ vector of the successors for dawg i. 
00321 successors_.reserve(dawgs_.length()); 00322 for (int i = 0; i < dawgs_.length(); ++i) { 00323 const Dawg *dawg = dawgs_[i]; 00324 SuccessorList *lst = new SuccessorList(); 00325 for (int j = 0; j < dawgs_.length(); ++j) { 00326 const Dawg *other = dawgs_[j]; 00327 if (dawg != NULL && other != NULL && 00328 (dawg->lang() == other->lang()) && 00329 kDawgSuccessors[dawg->type()][other->type()]) *lst += j; 00330 } 00331 successors_ += lst; 00332 } 00333 } 00334 00335 void Dict::End() { 00336 if (dawgs_.length() == 0) 00337 return; // Not safe to call twice. 00338 dawgs_.delete_data_pointers(); 00339 successors_.delete_data_pointers(); 00340 dawgs_.clear(); 00341 delete bigram_dawg_; 00342 successors_.clear(); 00343 document_words_ = NULL; 00344 max_fixed_length_dawgs_wdlen_ = -1; 00345 if (pending_words_ != NULL) { 00346 delete pending_words_; 00347 pending_words_ = NULL; 00348 } 00349 } 00350 00351 // Create unicharset adaptations of known, short lists of UTF-8 equivalent 00352 // characters (think all hyphen-like symbols). The first version of the 00353 // list is taken as equivalent for matching against the dictionary. 00354 void Dict::LoadEquivalenceList(const char *unichar_strings[]) { 00355 equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>()); 00356 const UNICHARSET &unicharset = getUnicharset(); 00357 GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back(); 00358 for (int i = 0; unichar_strings[i] != 0; i++) { 00359 UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]); 00360 if (unichar_id != INVALID_UNICHAR_ID) { 00361 equiv_list->push_back(unichar_id); 00362 } 00363 } 00364 } 00365 00366 // Normalize all hyphen and apostrophes to the canonicalized one for 00367 // matching; pass everything else through as is. 
00368 UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const { 00369 for (int i = 0; i < equivalent_symbols_.size(); i++) { 00370 if (equivalent_symbols_[i].contains(unichar_id)) { 00371 return equivalent_symbols_[i][0]; 00372 } 00373 } 00374 return unichar_id; 00375 } 00376 00377 // Returns true if in light of the current state unichar_id is allowed 00378 // according to at least one of the dawgs in the dawgs_ vector. 00379 // See more extensive comments in dict.h where this function is declared. 00380 int Dict::def_letter_is_okay(void* void_dawg_args, 00381 UNICHAR_ID unichar_id, 00382 bool word_end) const { 00383 DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); 00384 00385 if (dawg_debug_level >= 3) { 00386 tprintf("def_letter_is_okay: current unichar=%s word_end=%d" 00387 " num active dawgs=%d num constraints=%d\n", 00388 getUnicharset().debug_str(unichar_id).string(), word_end, 00389 dawg_args->active_dawgs->length(), 00390 dawg_args->constraints->length()); 00391 } 00392 00393 // Do not accept words that contain kPatternUnicharID. 00394 // (otherwise pattern dawgs would not function correctly). 00395 // Do not accept words containing INVALID_UNICHAR_IDs. 00396 if (unichar_id == Dawg::kPatternUnicharID || 00397 unichar_id == INVALID_UNICHAR_ID) { 00398 dawg_args->permuter = NO_PERM; 00399 return NO_PERM; 00400 } 00401 00402 // Initialization. 00403 PermuterType curr_perm = NO_PERM; 00404 dawg_args->updated_active_dawgs->clear(); 00405 const DawgInfoVector &constraints = *(dawg_args->constraints); 00406 *dawg_args->updated_constraints = constraints; 00407 00408 // Go over the active_dawgs vector and insert DawgInfo records with the 00409 // updated ref (an edge with the corresponding unichar id) into 00410 // dawg_args->updated_active_dawgs. 
00411 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { 00412 const DawgInfo &info = (*dawg_args->active_dawgs)[a]; 00413 const Dawg *dawg = dawgs_[info.dawg_index]; 00414 // dawg_unichar_id will contain the literal unichar_id to be found in the 00415 // dawgs (e.g. didgit pattern if unichar_id is a digit and dawg contains 00416 // number patterns, word pattern if dawg is a puncutation dawg and we 00417 // reached an end of beginning puntuation pattern, etc). 00418 UNICHAR_ID dawg_unichar_id = unichar_id; 00419 00420 // If we are dealing with the pattern dawg, look up all the 00421 // possible edges, not only for the exact unichar_id, but also 00422 // for all its character classes (alpha, digit, etc). 00423 if (dawg->type() == DAWG_TYPE_PATTERN) { 00424 ProcessPatternEdges(dawg, info, dawg_unichar_id, word_end, 00425 dawg_args, &curr_perm); 00426 // There can't be any successors to dawg that is of type 00427 // DAWG_TYPE_PATTERN, so we are done examining this DawgInfo. 00428 continue; 00429 } 00430 00431 // The number dawg generalizes all digits to be kPatternUnicharID, 00432 // so try to match kPatternUnicharID if the current unichar is a digit. 00433 if (dawg->type() == DAWG_TYPE_NUMBER && 00434 getUnicharset().get_isdigit(dawg_unichar_id)) { 00435 dawg_unichar_id = Dawg::kPatternUnicharID; 00436 } 00437 00438 // Find the edge out of the node for the dawg_unichar_id. 00439 NODE_REF node = GetStartingNode(dawg, info.ref); 00440 EDGE_REF edge = (node != NO_EDGE) ? 
00441 dawg->edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE; 00442 00443 if (dawg_debug_level >= 3) { 00444 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00445 info.dawg_index, node, edge); 00446 } 00447 00448 if (edge != NO_EDGE) { // the unichar was found in the current dawg 00449 if (ConstraintsOk(*(dawg_args->updated_constraints), 00450 word_end, dawg->type())) { 00451 if (dawg_debug_level >=3) { 00452 tprintf("Letter found in dawg %d\n", info.dawg_index); 00453 } 00454 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00455 dawg_args->updated_active_dawgs->add_unique( 00456 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00457 "Append current dawg to updated active dawgs: "); 00458 } 00459 } else if (dawg_args->sought_word_length == kAnyWordLength) { 00460 // The unichar was not found in the current dawg. 00461 // Explore the successor dawgs (but only if we are not 00462 // just searching one dawg with a fixed word length). 00463 00464 // Handle leading/trailing punctuation dawgs that denote a word pattern 00465 // as an edge with kPatternUnicharID. If such an edge is found we add a 00466 // constraint denoting the state of the dawg before the word pattern. 00467 // This constraint will be applied later when this dawg is found among 00468 // successor dawgs as well potentially at the end of the word. 00469 if (dawg->type() == DAWG_TYPE_PUNCTUATION) { 00470 edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end); 00471 if (edge != NO_EDGE) { 00472 dawg_args->updated_constraints->add_unique( 00473 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00474 "Recording constraint: "); 00475 } else { 00476 // Do not explore successors of this dawg, since this 00477 // must be invalid leading or trailing punctuation. 
00478 if (dawg_debug_level >= 3) { 00479 tprintf("Invalid punctuation from dawg %d\n", info.dawg_index); 00480 } 00481 continue; 00482 } 00483 } 00484 00485 if (info.ref == NO_EDGE) { 00486 if (dawg_debug_level >= 3) { 00487 tprintf("No letters matched in dawg %d\n", info.dawg_index); 00488 } 00489 continue; 00490 } 00491 00492 // Discard the dawg if the pattern can not end at previous letter. 00493 if (edge == NO_EDGE && // previous part is not leading punctuation 00494 !dawg->end_of_word(info.ref)) { 00495 if (dawg_debug_level >= 3) { 00496 tprintf("No valid pattern end in dawg %d\n", info.dawg_index); 00497 } 00498 continue; 00499 } 00500 00501 // Look for the unichar in each of this dawg's successors 00502 // and append those in which it is found to active_dawgs. 00503 const SuccessorList &slist = *(successors_[info.dawg_index]); 00504 for (int s = 0; s < slist.length(); ++s) { 00505 int sdawg_index = slist[s]; 00506 const Dawg *sdawg = dawgs_[sdawg_index]; 00507 NODE_REF snode = 0; 00508 // Apply constraints to the successor dawg. 00509 for (int c = 0; c < constraints.length(); ++c) { 00510 // If the successor dawg is described in the constraints change 00511 // the start ref from 0 to the one recorded as the constraint. 00512 const DawgInfo &cinfo = constraints[c]; 00513 if (cinfo.dawg_index == sdawg_index) { 00514 snode = sdawg->next_node(cinfo.ref); 00515 // Make sure we do not search the successor dawg if after 00516 // applying the saved constraint we are at the end of the word. 00517 if (snode == 0) snode = NO_EDGE; 00518 if (dawg_debug_level >= 3) { 00519 tprintf("Applying constraint [%d, " REFFORMAT "]\n", 00520 sdawg_index, snode); 00521 } 00522 } 00523 } 00524 // Look for the letter in this successor dawg. 00525 EDGE_REF sedge = sdawg->edge_char_of(snode, unichar_id, word_end); 00526 // If we found the letter append sdawg to the active_dawgs list. 
00527 if (sedge != NO_EDGE && 00528 ConstraintsOk(*(dawg_args->updated_constraints), word_end, 00529 dawgs_[sdawg_index]->type())) { 00530 if (dawg_debug_level >= 3) { 00531 tprintf("Letter found in the successor dawg %d\n", sdawg_index); 00532 } 00533 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); 00534 if (sdawg->next_node(sedge) != 0) { // if not word end 00535 dawg_args->updated_active_dawgs->add_unique( 00536 DawgInfo(sdawg_index, sedge), dawg_debug_level > 0, 00537 "Append successor to updated active dawgs: "); 00538 } 00539 } 00540 } // end successors loop 00541 } // end if/else 00542 } // end for 00543 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM 00544 // or if we found the current letter in a non-punctuation dawg. This 00545 // allows preserving information on which dawg the "core" word came from. 00546 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM. 00547 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM || 00548 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) { 00549 dawg_args->permuter = curr_perm; 00550 } 00551 return dawg_args->permuter; 00552 } 00553 00554 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, 00555 UNICHAR_ID unichar_id, bool word_end, 00556 DawgArgs *dawg_args, 00557 PermuterType *curr_perm) const { 00558 NODE_REF node = GetStartingNode(dawg, info.ref); 00559 // Try to find the edge corresponding to the exact unichar_id and to all the 00560 // edges corresponding to the character class of unichar_id. 00561 GenericVector<UNICHAR_ID> unichar_id_patterns; 00562 unichar_id_patterns.push_back(unichar_id); 00563 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), 00564 &unichar_id_patterns); 00565 for (int i = 0; i < unichar_id_patterns.size(); ++i) { 00566 // On the first iteration check all the outgoing edges. 00567 // On the second iteration check all self-loops. 
00568 for (int k = 0; k < 2; ++k) { 00569 EDGE_REF edge = (k == 0) ? 00570 dawg->edge_char_of(node, unichar_id_patterns[i], word_end) 00571 : dawg->pattern_loop_edge(info.ref, unichar_id_patterns[i], word_end); 00572 if (edge != NO_EDGE) { 00573 if (dawg_debug_level >= 3) { 00574 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00575 info.dawg_index, node, edge); 00576 } 00577 if (ConstraintsOk(*(dawg_args->updated_constraints), 00578 word_end, dawg->type())) { 00579 if (dawg_debug_level >=3) { 00580 tprintf("Letter found in pattern dawg %d\n", info.dawg_index); 00581 } 00582 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); 00583 dawg_args->updated_active_dawgs->add_unique( 00584 DawgInfo(info.dawg_index, edge), dawg_debug_level > 0, 00585 "Append current dawg to updated active dawgs: "); 00586 } 00587 } 00588 } 00589 } 00590 } 00591 00592 void Dict::ReadFixedLengthDawgs(DawgType type, const STRING &lang, 00593 PermuterType perm, int debug_level, 00594 FILE *file, DawgVector *dawg_vec, 00595 int *max_wdlen) { 00596 int i; 00597 DawgVector dawg_vec_copy; 00598 dawg_vec_copy.move(dawg_vec); // save the input dawg_vec. 00599 inT32 num_dawgs; 00600 fread(&num_dawgs, sizeof(inT32), 1, file); 00601 bool swap = (num_dawgs > MAX_WERD_LENGTH); 00602 if (swap) num_dawgs = reverse32(num_dawgs); 00603 inT32 word_length; 00604 int max_word_length = 0; 00605 // Read and record pointers to fixed-length dawgs such that: 00606 // dawg_vec[word_length] = pointer to dawg with word length of word_length, 00607 // NULL if such fixed-length dawg does not exist. 
00608 for (i = 0; i < num_dawgs; ++i) { 00609 fread(&word_length, sizeof(inT32), 1, file); 00610 if (swap) word_length = reverse32(word_length); 00611 ASSERT_HOST(word_length > 0 && word_length <= MAX_WERD_LENGTH); 00612 while (word_length >= dawg_vec->size()) dawg_vec->push_back(NULL); 00613 (*dawg_vec)[word_length] = 00614 new SquishedDawg(file, type, lang, perm, debug_level); 00615 if (word_length > max_word_length) max_word_length = word_length; 00616 } 00617 *max_wdlen = max_word_length; 00618 // Entries dawg_vec[0] to dawg_vec[max_word_length] now hold pointers 00619 // to fixed-length dawgs. The rest of the vector will contain the dawg 00620 // pointers from the original input dawg_vec. 00621 for (i = 0; i < dawg_vec_copy.size(); ++i) { 00622 dawg_vec->push_back(dawg_vec_copy[i]); 00623 } 00624 } 00625 00626 void Dict::WriteFixedLengthDawgs( 00627 const GenericVector<SquishedDawg *> &dawg_vec, 00628 int num_dawgs, int debug_level, FILE *output_file) { 00629 fwrite(&num_dawgs, sizeof(inT32), 1, output_file); 00630 if (debug_level) tprintf("Writing %d split length dawgs\n", num_dawgs); 00631 for (int i = 1; i < dawg_vec.size(); ++i) { 00632 if ((dawg_vec)[i] != NULL) { 00633 fwrite(&i, sizeof(inT32), 1, output_file); 00634 dawg_vec[i]->write_squished_dawg(output_file); 00635 if (debug_level) tprintf("Wrote Dawg with word length %d\n", i); 00636 } 00637 } 00638 } 00639 00640 // Fill the given active_dawgs vector with dawgs that could contain the 00641 // beginning of the word. If hyphenated() returns true, copy the entries 00642 // from hyphen_active_dawgs_ instead. 00643 void Dict::init_active_dawgs(int sought_word_length, 00644 DawgInfoVector *active_dawgs, 00645 bool ambigs_mode) const { 00646 int i; 00647 if (sought_word_length != kAnyWordLength) { 00648 // Only search one fixed word length dawg. 
00649 if (sought_word_length <= max_fixed_length_dawgs_wdlen_ && 00650 dawgs_[sought_word_length] != NULL) { 00651 *active_dawgs += DawgInfo(sought_word_length, NO_EDGE); 00652 } 00653 } else if (hyphenated()) { 00654 *active_dawgs = hyphen_active_dawgs_; 00655 if (dawg_debug_level >= 3) { 00656 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { 00657 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", 00658 hyphen_active_dawgs_[i].dawg_index, 00659 hyphen_active_dawgs_[i].ref); 00660 } 00661 } 00662 } else { 00663 for (i = 0; i < dawgs_.length(); ++i) { 00664 if (dawgs_[i] != NULL && kBeginningDawgsType[(dawgs_[i])->type()] && 00665 !(ambigs_mode && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { 00666 *active_dawgs += DawgInfo(i, NO_EDGE); 00667 if (dawg_debug_level >= 3) { 00668 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); 00669 } 00670 } 00671 } 00672 } 00673 } 00674 00675 // If hyphenated() returns true, copy the entries from hyphen_constraints_ 00676 // into the given constraints vector. 00677 void Dict::init_constraints(DawgInfoVector *constraints) const { 00678 if (hyphenated()) { 00679 *constraints = hyphen_constraints_; 00680 if (dawg_debug_level >= 3) { 00681 for (int i = 0; i < hyphen_constraints_.size(); ++i) { 00682 tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n", 00683 hyphen_constraints_[i].dawg_index, 00684 hyphen_constraints_[i].ref); 00685 } 00686 } 00687 } 00688 } 00689 00690 void Dict::add_document_word(const WERD_CHOICE &best_choice) { 00691 // Do not add hyphenated word parts to the document dawg. 00692 // hyphen_word_ will be non-NULL after the set_hyphen_word() is 00693 // called when the first part of the hyphenated word is 00694 // discovered and while the second part of the word is recognized. 00695 // hyphen_word_ is cleared in cc_recg() before the next word on 00696 // the line is recognized. 
00697 if (hyphen_word_) return; 00698 00699 char filename[CHARS_PER_LINE]; 00700 FILE *doc_word_file; 00701 int stringlen = best_choice.length(); 00702 00703 if (!doc_dict_enable || valid_word(best_choice) || 00704 CurrentWordAmbig() || stringlen < 2) 00705 return; 00706 00707 // Discard words that contain >= kDocDictMaxRepChars repeating unichars. 00708 if (best_choice.length() >= kDocDictMaxRepChars) { 00709 int num_rep_chars = 1; 00710 UNICHAR_ID uch_id = best_choice.unichar_id(0); 00711 for (int i = 1; i < best_choice.length(); ++i) { 00712 if (best_choice.unichar_id(i) != uch_id) { 00713 num_rep_chars = 1; 00714 uch_id = best_choice.unichar_id(i); 00715 } else { 00716 ++num_rep_chars; 00717 if (num_rep_chars == kDocDictMaxRepChars) return; 00718 } 00719 } 00720 } 00721 00722 if (best_choice.certainty() < doc_dict_certainty_threshold || 00723 stringlen == 2) { 00724 if (best_choice.certainty() < doc_dict_pending_threshold) 00725 return; 00726 00727 if (!pending_words_->word_in_dawg(best_choice)) { 00728 if (stringlen > 2 || 00729 (stringlen == 2 && 00730 getUnicharset().get_isupper(best_choice.unichar_id(0)) && 00731 getUnicharset().get_isupper(best_choice.unichar_id(1)))) { 00732 pending_words_->add_word_to_dawg(best_choice); 00733 } 00734 return; 00735 } 00736 } 00737 00738 if (save_doc_words) { 00739 strcpy(filename, getImage()->getCCUtil()->imagefile.string()); 00740 strcat(filename, ".doc"); 00741 doc_word_file = open_file (filename, "a"); 00742 fprintf(doc_word_file, "%s\n", 00743 best_choice.debug_string().string()); 00744 fclose(doc_word_file); 00745 } 00746 document_words_->add_word_to_dawg(best_choice); 00747 } 00748 00749 void Dict::adjust_word(WERD_CHOICE *word, 00750 float *certainty_array, 00751 const BLOB_CHOICE_LIST_VECTOR *char_choices, 00752 bool nonword, 00753 float additional_adjust, 00754 bool debug) { 00755 bool is_han = (char_choices != NULL && 00756 getUnicharset().han_sid() != getUnicharset().null_sid() && 00757 
get_top_word_script(*char_choices, getUnicharset()) == 00758 getUnicharset().han_sid()); 00759 bool case_is_ok = (is_han || case_ok(*word, getUnicharset())); 00760 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); 00761 00762 float adjust_factor = additional_adjust; 00763 float new_rating = word->rating(); 00764 if (debug) { 00765 tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "", 00766 word->debug_string().string(), word->rating()); 00767 } 00768 new_rating += kRatingPad; 00769 if (nonword) { // non-dictionary word 00770 if (case_is_ok && punc_is_ok) { 00771 adjust_factor += segment_penalty_dict_nonword; 00772 new_rating *= adjust_factor; 00773 if (debug) tprintf(", W"); 00774 } else { 00775 adjust_factor += segment_penalty_garbage; 00776 new_rating *= adjust_factor; 00777 if (debug) { 00778 if (!case_is_ok) tprintf(", C"); 00779 if (!punc_is_ok) tprintf(", P"); 00780 } 00781 } 00782 } else { // dictionary word 00783 if (case_is_ok) { 00784 if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) { 00785 word->set_permuter(FREQ_DAWG_PERM); 00786 adjust_factor += segment_penalty_dict_frequent_word; 00787 new_rating *= adjust_factor; 00788 if (debug) tprintf(", F"); 00789 } else { 00790 adjust_factor += segment_penalty_dict_case_ok; 00791 new_rating *= adjust_factor; 00792 if (debug) tprintf(", "); 00793 } 00794 } else { 00795 adjust_factor += segment_penalty_dict_case_bad; 00796 new_rating *= adjust_factor; 00797 if (debug) tprintf(", C"); 00798 } 00799 } 00800 new_rating -= kRatingPad; 00801 word->set_rating(new_rating); 00802 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); 00803 LogNewChoice(adjust_factor, certainty_array, false, word); 00804 } 00805 00806 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { 00807 const WERD_CHOICE *word_ptr = &word; 00808 WERD_CHOICE temp_word(word.unicharset()); 00809 if (hyphenated()) { 00810 copy_hyphen_info(&temp_word); 00811 temp_word += word; 00812 
word_ptr = &temp_word; 00813 } 00814 if (word_ptr->length() == 0) return NO_PERM; 00815 // Allocate vectors for holding current and updated 00816 // active_dawgs and constraints and initialize them. 00817 DawgInfoVector *active_dawgs = new DawgInfoVector[2]; 00818 DawgInfoVector *constraints = new DawgInfoVector[2]; 00819 init_active_dawgs(kAnyWordLength, &(active_dawgs[0]), false); 00820 init_constraints(&(constraints[0])); 00821 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]), 00822 &(active_dawgs[1]), &(constraints[1]), 00823 0.0, NO_PERM, kAnyWordLength, 0); 00824 int last_index = word_ptr->length() - 1; 00825 // Call leter_is_okay for each letter in the word. 00826 for (int i = hyphen_base_size(); i <= last_index; ++i) { 00827 if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i), 00828 i == last_index))) break; 00829 // Swap active_dawgs, constraints with the corresponding updated vector. 00830 if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) { 00831 dawg_args.updated_active_dawgs = &(active_dawgs[0]); 00832 dawg_args.updated_constraints = &(constraints[0]); 00833 ++(dawg_args.active_dawgs); 00834 ++(dawg_args.constraints); 00835 } else { 00836 ++(dawg_args.updated_active_dawgs); 00837 ++(dawg_args.updated_constraints); 00838 dawg_args.active_dawgs = &(active_dawgs[0]); 00839 dawg_args.constraints = &(constraints[0]); 00840 } 00841 } 00842 delete[] active_dawgs; 00843 delete[] constraints; 00844 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? 00845 dawg_args.permuter : NO_PERM; 00846 } 00847 00848 bool Dict::valid_bigram(const WERD_CHOICE &word1, 00849 const WERD_CHOICE &word2) const { 00850 if (bigram_dawg_ == NULL) return false; 00851 00852 // Extract the core word from the middle of each word with any digits 00853 // replaced with question marks. 
00854 int w1start, w1end, w2start, w2end; 00855 word1.punct_stripped(&w1start, &w1end); 00856 word2.punct_stripped(&w2start, &w2end); 00857 00858 // We don't want to penalize a single guillemet, hyphen, etc. 00859 // But our bigram list doesn't have any information about punctuation. 00860 if (w1start >= w1end) return word1.length() < 3; 00861 if (w2start >= w2end) return word2.length() < 3; 00862 00863 const UNICHARSET& uchset = getUnicharset(); 00864 STRING bigram_string; 00865 for (int i = w1start; i < w1end; i++) { 00866 UNICHAR_ID ch = NormalizeUnicharIdForMatch(word1.unichar_id(i)); 00867 bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch); 00868 } 00869 bigram_string += " "; 00870 for (int i = w2start; i < w2end; i++) { 00871 UNICHAR_ID ch = NormalizeUnicharIdForMatch(word2.unichar_id(i)); 00872 bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch); 00873 } 00874 WERD_CHOICE normalized_word(bigram_string.string(), uchset); 00875 return bigram_dawg_->word_in_dawg(normalized_word); 00876 } 00877 00878 bool Dict::valid_punctuation(const WERD_CHOICE &word) { 00879 if (word.length() == 0) return NO_PERM; 00880 int i; 00881 WERD_CHOICE new_word(word.unicharset()); 00882 int last_index = word.length() - 1; 00883 int new_len = 0; 00884 for (i = 0; i <= last_index; ++i) { 00885 UNICHAR_ID unichar_id = (word.unichar_id(i)); 00886 if (getUnicharset().get_ispunctuation(unichar_id)) { 00887 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); 00888 } else if (!getUnicharset().get_isalpha(unichar_id) && 00889 !getUnicharset().get_isdigit(unichar_id)) { 00890 return false; // neither punc, nor alpha, nor digit 00891 } else if ((new_len = new_word.length()) == 0 || 00892 new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) { 00893 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); 00894 } 00895 } 00896 for (i = 0; i < dawgs_.size(); ++i) { 00897 if (dawgs_[i] != NULL && 00898 dawgs_[i]->type() == 
DAWG_TYPE_PUNCTUATION && 00899 dawgs_[i]->word_in_dawg(new_word)) return true; 00900 } 00901 return false; 00902 } 00903 00904 // Returns the "dominant" script ID for the word. By "dominant", the script 00905 // must account for at least half the characters. Otherwise, it returns 0. 00906 // Note that for Japanese, Hiragana and Katakana are simply treated as Han. 00907 int Dict::get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00908 const UNICHARSET &unicharset) { 00909 int max_script = unicharset.get_script_table_size(); 00910 int *sid = new int[max_script]; 00911 int x; 00912 for (x = 0; x < max_script; x++) sid[x] = 0; 00913 for (x = 0; x < char_choices.length(); ++x) { 00914 BLOB_CHOICE_IT blob_choice_it(char_choices.get(x)); 00915 sid[blob_choice_it.data()->script_id()]++; 00916 } 00917 if (unicharset.han_sid() != unicharset.null_sid()) { 00918 // Add the Hiragana & Katakana counts to Han and zero them out. 00919 if (unicharset.hiragana_sid() != unicharset.null_sid()) { 00920 sid[unicharset.han_sid()] += sid[unicharset.hiragana_sid()]; 00921 sid[unicharset.hiragana_sid()] = 0; 00922 } 00923 if (unicharset.katakana_sid() != unicharset.null_sid()) { 00924 sid[unicharset.han_sid()] += sid[unicharset.katakana_sid()]; 00925 sid[unicharset.katakana_sid()] = 0; 00926 } 00927 } 00928 // Note that high script ID overrides lower one on a tie, thus biasing 00929 // towards non-Common script (if sorted that way in unicharset file). 00930 int max_sid = 0; 00931 for (x = 1; x < max_script; x++) 00932 if (sid[x] >= sid[max_sid]) max_sid = x; 00933 if (sid[max_sid] < char_choices.length() / 2) 00934 max_sid = unicharset.null_sid(); 00935 delete[] sid; 00936 return max_sid; 00937 } 00938 00939 } // namespace tesseract