Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: permdawg.c (Formerly permdawg.c) 00005 * Description: Scale word choices by a dictionary 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #include "cutil.h" 00030 #include "dawg.h" 00031 #include "freelist.h" 00032 #include "globals.h" 00033 #include "ndminx.h" 00034 #include "permute.h" 00035 #include "stopper.h" 00036 #include "tprintf.h" 00037 #include "params.h" 00038 00039 #include <ctype.h> 00040 #include "dict.h" 00041 #include "image.h" 00042 00043 /*---------------------------------------------------------------------- 00044 F u n c t i o n s 00045 ----------------------------------------------------------------------*/ 00046 namespace tesseract { 00047 00048 static const float kPermDawgRatingPad = 5.0; 00049 00067 void Dict::go_deeper_dawg_fxn( 00068 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00069 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00070 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, 00071 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { 00072 DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args); 00073 word_ending = (char_choice_index == more_args->end_char_choice_index); 00074 int word_index = word->length() - 1; 00075 00076 if (ambigs_mode(*limit)) { 00077 if (best_choice->rating() < *limit) return; 00078 } else { 00079 // Prune bad subwords 00080 if (more_args->rating_array[word_index] == NO_RATING) { 00081 more_args->rating_array[word_index] = word->rating(); 00082 } else { 00083 float permdawg_limit = more_args->rating_array[word_index] * 00084 more_args->rating_margin + kPermDawgRatingPad; 00085 if (permdawg_limit < word->rating()) { 00086 if (permute_debug && dawg_debug_level) { 00087 tprintf("early pruned word rating=%4.2f," 00088 " permdawg_limit=%4.2f, word=%s\n", word->rating(), 00089 permdawg_limit, word->debug_string().string()); 00090 } 00091 return; 00092 } 00093 } 00094 } 00095 // Deal with hyphens 00096 if (word_ending && more_args->sought_word_length == kAnyWordLength && 00097 has_hyphen_end(*word) && !ambigs_mode(*limit)) { 00098 // Copy more_args->active_dawgs to clean_active_dawgs removing 00099 // dawgs of type DAWG_TYPE_PATTERN. 00100 DawgInfoVector clean_active_dawgs; 00101 const DawgInfoVector &active_dawgs = *(more_args->active_dawgs); 00102 for (int i = 0; i < active_dawgs.size(); ++i) { 00103 if (dawgs_[active_dawgs[i].dawg_index]->type() != DAWG_TYPE_PATTERN) { 00104 clean_active_dawgs += active_dawgs[i]; 00105 } 00106 } 00107 if (clean_active_dawgs.size() > 0) { 00108 if (permute_debug && dawg_debug_level) 00109 tprintf("new hyphen choice = %s\n", word->debug_string().string()); 00110 word->set_permuter(more_args->permuter); 00111 adjust_word(word, certainties, permute_debug); 00112 set_hyphen_word(*word, *(more_args->active_dawgs), 00113 *(more_args->constraints)); 00114 update_best_choice(*word, best_choice); 00115 } 00116 } else { // Look up char in DAWG 00117 // TODO(daria): update the rest of the code that specifies alternative 00118 // letter_is_okay_ functions (e.g. TessCharNgram class) to work with 00119 // multi-byte unichars and/or unichar ids. 00120 00121 // If the current unichar is an ngram first try calling 00122 // letter_is_okay() for each unigram it contains separately. 00123 UNICHAR_ID orig_uch_id = word->unichar_id(word_index); 00124 bool checked_unigrams = false; 00125 if (getUnicharset().get_isngram(orig_uch_id)) { 00126 if (permute_debug && dawg_debug_level) { 00127 tprintf("checking unigrams in an ngram %s\n", 00128 getUnicharset().debug_str(orig_uch_id).string()); 00129 } 00130 int orig_num_fragments = word->fragment_length(word_index); 00131 int num_unigrams = 0; 00132 word->remove_last_unichar_id(); 00133 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); 00134 const char *ngram_str_end = ngram_str + strlen(ngram_str); 00135 const char *ngram_ptr = ngram_str; 00136 bool unigrams_ok = true; 00137 // Construct DawgArgs that reflect the current state. 00138 DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs); 00139 DawgInfoVector unigram_constraints = *(more_args->constraints); 00140 DawgInfoVector unigram_updated_active_dawgs; 00141 DawgInfoVector unigram_updated_constraints; 00142 DawgArgs unigram_dawg_args(&unigram_active_dawgs, 00143 &unigram_constraints, 00144 &unigram_updated_active_dawgs, 00145 &unigram_updated_constraints, 0.0, 00146 more_args->permuter, 00147 more_args->sought_word_length, 00148 more_args->end_char_choice_index); 00149 // Check unigrams in the ngram with letter_is_okay(). 00150 while (unigrams_ok && ngram_ptr < ngram_str_end) { 00151 int step = getUnicharset().step(ngram_ptr); 00152 UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID : 00153 getUnicharset().unichar_to_id(ngram_ptr, step); 00154 ngram_ptr += step; 00155 ++num_unigrams; 00156 word->append_unichar_id(uch_id, 1, 0.0, 0.0); 00157 unigrams_ok = unigrams_ok && (this->*letter_is_okay_)( 00158 &unigram_dawg_args, 00159 word->unichar_id(word_index+num_unigrams-1), 00160 word_ending && (ngram_ptr == ngram_str_end)); 00161 (*unigram_dawg_args.active_dawgs) = 00162 *(unigram_dawg_args.updated_active_dawgs); 00163 (*unigram_dawg_args.constraints) = 00164 *(unigram_dawg_args.updated_constraints); 00165 if (permute_debug && dawg_debug_level) { 00166 tprintf("unigram %s is %s\n", 00167 getUnicharset().debug_str(uch_id).string(), 00168 unigrams_ok ? "OK" : "not OK"); 00169 } 00170 } 00171 // Restore the word and copy the updated dawg state if needed. 00172 while (num_unigrams-- > 0) word->remove_last_unichar_id(); 00173 word->append_unichar_id_space_allocated( 00174 orig_uch_id, orig_num_fragments, 0.0, 0.0); 00175 if (unigrams_ok) { 00176 checked_unigrams = true; 00177 more_args->permuter = unigram_dawg_args.permuter; 00178 *(more_args->updated_active_dawgs) = 00179 *(unigram_dawg_args.updated_active_dawgs); 00180 *(more_args->updated_constraints) = 00181 *(unigram_dawg_args.updated_constraints); 00182 } 00183 } 00184 00185 // Check which dawgs from the dawgs_ vector contain the word 00186 // up to and including the current unichar. 00187 if (checked_unigrams || (this->*letter_is_okay_)( 00188 more_args, word->unichar_id(word_index), word_ending)) { 00189 // Add a new word choice 00190 if (word_ending) { 00191 if (permute_debug && dawg_debug_level) { 00192 tprintf("found word = %s\n", word->debug_string().string()); 00193 } 00194 if (ambigs_mode(*limit) && 00195 strcmp(output_ambig_words_file.string(), "") != 0) { 00196 if (output_ambig_words_file_ == NULL) { 00197 output_ambig_words_file_ = 00198 fopen(output_ambig_words_file.string(), "wb+"); 00199 if (output_ambig_words_file_ == NULL) { 00200 tprintf("Failed to open output_ambig_words_file %s\n", 00201 output_ambig_words_file.string()); 00202 exit(1); 00203 } 00204 } 00205 STRING word_str; 00206 word->string_and_lengths(&word_str, NULL); 00207 word_str += " "; 00208 fprintf(output_ambig_words_file_, word_str.string()); 00209 } 00210 WERD_CHOICE *adjusted_word = word; 00211 WERD_CHOICE hyphen_tail_word(&getUnicharset()); 00212 if (hyphen_base_size() > 0) { 00213 hyphen_tail_word = *word; 00214 remove_hyphen_head(&hyphen_tail_word); 00215 adjusted_word = &hyphen_tail_word; 00216 } 00217 adjusted_word->set_permuter(more_args->permuter); 00218 if (!ambigs_mode(*limit)) { 00219 adjust_word(adjusted_word, &certainties[hyphen_base_size()], 00220 permute_debug); 00221 } 00222 update_best_choice(*adjusted_word, best_choice); 00223 } else { // search the next letter 00224 // Make updated_* point to the next entries in the DawgInfoVector 00225 // arrays (that were originally created in dawg_permute_and_select) 00226 ++(more_args->updated_active_dawgs); 00227 ++(more_args->updated_constraints); 00228 // Make active_dawgs and constraints point to the updated ones. 00229 ++(more_args->active_dawgs); 00230 ++(more_args->constraints); 00231 permute_choices(debug, char_choices, char_choice_index + 1, 00232 prev_char_frag_info, word, certainties, limit, 00233 best_choice, attempts_left, more_args); 00234 // Restore previous state to explore another letter in this position. 00235 --(more_args->updated_active_dawgs); 00236 --(more_args->updated_constraints); 00237 --(more_args->active_dawgs); 00238 --(more_args->constraints); 00239 } 00240 } else { 00241 if (permute_debug && dawg_debug_level) { 00242 tprintf("last unichar not OK at index %d in %s\n", 00243 word_index, word->debug_string().string()); 00244 } 00245 } 00246 } 00247 } 00248 00263 WERD_CHOICE *Dict::dawg_permute_and_select( 00264 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, 00265 int sought_word_length, int start_char_choice_index) { 00266 WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset()); 00267 best_choice->make_bad(); 00268 best_choice->set_rating(rating_limit); 00269 if (char_choices.length() == 0) return best_choice; 00270 DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1]; 00271 DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1]; 00272 init_active_dawgs(sought_word_length, &(active_dawgs[0]), 00273 ambigs_mode(rating_limit)); 00274 init_constraints(&(constraints[0])); 00275 int end_char_choice_index = (sought_word_length == kAnyWordLength) ? 00276 char_choices.length()-1 : start_char_choice_index+sought_word_length-1; 00277 // Need to skip accumulating word choices if we are only searching a part of 00278 // the word (e.g. for the phrase search in non-space delimited languages). 00279 // Also need to skip accumulating choices if char_choices are expanded 00280 // with ambiguities. 00281 bool re_enable_choice_accum = ChoiceAccumEnabled(); 00282 if (sought_word_length != kAnyWordLength || 00283 ambigs_mode(rating_limit)) DisableChoiceAccum(); 00284 DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]), 00285 &(active_dawgs[1]), &(constraints[1]), 00286 (segment_penalty_dict_case_bad / 00287 segment_penalty_dict_case_ok), 00288 NO_PERM, sought_word_length, end_char_choice_index); 00289 WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); 00290 copy_hyphen_info(&word); 00291 // Discard rating and certainty of the hyphen base (if any). 00292 word.set_rating(0.0); 00293 word.set_certainty(0.0); 00294 if (word.length() + char_choices.length() > MAX_WERD_LENGTH) { 00295 delete[] active_dawgs; 00296 delete[] constraints; 00297 return best_choice; // the word is too long to permute 00298 } 00299 float certainties[MAX_WERD_LENGTH]; 00300 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; 00301 int attempts_left = max_permuter_attempts; 00302 permute_choices((permute_debug && dawg_debug_level) ? 00303 "permute_dawg_debug" : NULL, 00304 char_choices, start_char_choice_index, NULL, &word, 00305 certainties, &rating_limit, best_choice, &attempts_left, 00306 &dawg_args); 00307 delete[] active_dawgs; 00308 delete[] constraints; 00309 if (re_enable_choice_accum) EnableChoiceAccum(); 00310 return best_choice; 00311 } 00312 00313 } // namespace tesseract