Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: permute.c (Formerly permute.c) 00005 * Description: Choose OCR text given character-probability maps 00006 * for sequences of glyph fragments and a dictionary provided as 00007 * a Dual Acyclic Word Graph. 00008 * In this file, "permute" should be read "combine." 00009 * Author: Mark Seaman, OCR Technology 00010 * Created: Fri Sep 22 14:05:51 1989 00011 * Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt 00012 * Language: C 00013 * Package: N/A 00014 * Status: Experimental (Do Not Distribute) 00015 * 00016 * (c) Copyright 1989, Hewlett-Packard Company. 00017 ** Licensed under the Apache License, Version 2.0 (the "License"); 00018 ** you may not use this file except in compliance with the License. 00019 ** You may obtain a copy of the License at 00020 ** http://www.apache.org/licenses/LICENSE-2.0 00021 ** Unless required by applicable law or agreed to in writing, software 00022 ** distributed under the License is distributed on an "AS IS" BASIS, 00023 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00024 ** See the License for the specific language governing permissions and 00025 ** limitations under the License. 00026 * 00027 *********************************************************************************/ 00028 /*---------------------------------------------------------------------- 00029 I n c l u d e s 00030 ---------------------------------------------------------------------*/ 00031 00032 #ifdef _MSC_VER 00033 #pragma warning(disable:4244) // Conversion warnings 00034 #pragma warning(disable:4800) // int/bool warnings 00035 #endif 00036 00037 #include <assert.h> 00038 #include <math.h> 00039 00040 #include "const.h" 00041 00042 #include "permute.h" 00043 00044 #include "callcpp.h" 00045 #include "ccutil.h" 00046 #include "dict.h" 00047 #include "freelist.h" 00048 #include "helpers.h" 00049 #include "image.h" 00050 #include "globals.h" 00051 #include "ndminx.h" 00052 #include "ratngs.h" 00053 #include "stopper.h" 00054 #include "tprintf.h" 00055 #include "trie.h" 00056 #include "params.h" 00057 #include "unicharset.h" 00058 00059 00060 namespace tesseract { 00061 00062 /*---------------------------------------------------------------------- 00063 F u n c t i o n s 00064 ----------------------------------------------------------------------*/ 00065 00074 WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1, 00075 WERD_CHOICE *choice2) { 00076 if (!choice1) return choice2; 00077 if (!choice2) return choice1; 00078 if (choice1->rating() < choice2->rating() || choice2->length() == 0) { 00079 delete choice2; 00080 return choice1; 00081 } else { 00082 delete choice1; 00083 return choice2; 00084 } 00085 } 00086 00091 BLOB_CHOICE* get_nth_choice(BLOB_CHOICE_LIST* blob_list, int n) { 00092 BLOB_CHOICE_IT c_it(blob_list); 00093 while (n-- > 0 && !c_it.at_last()) 00094 c_it.forward(); 00095 return c_it.data(); 00096 } 00097 00099 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) { 00100 if (!blob_list) return INVALID_UNICHAR_ID; 00101 BLOB_CHOICE_IT blob_choice_it(blob_list); 00102 return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id() 00103 : INVALID_UNICHAR_ID; 00104 } 00105 00110 int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) { 00111 BLOB_CHOICE_IT c_it(blob_list); 00112 int pos = 0; 00113 while (1) { 00114 if (c_it.data()->unichar_id() == target_uid) return pos; 00115 if (c_it.at_last()) break; 00116 c_it.forward(); 00117 pos++; 00118 } 00119 return -1; 00120 } 00121 00129 WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset, 00130 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00131 int start_pos, 00132 const char* pos_str, 00133 float *certainties) { 00134 int pos_str_len = strlen(pos_str); 00135 WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset); 00136 if (start_pos + pos_str_len > char_choices.length()) { 00137 wchoice->make_bad(); 00138 return wchoice; 00139 } 00140 for (int x = 0; x < pos_str_len; x++) { 00141 int pos = pos_str[x]-'0'; 00142 if (pos < 0) pos = 0; // use the top choice by default, eg. '.' 00143 if (pos >= 10) 00144 tprintf("PosStr[%d](%d)=%c %d\n", x, pos_str_len, pos_str[x], pos); 00145 ASSERT_HOST(pos < 10); 00146 BLOB_CHOICE* blob_it = get_nth_choice(char_choices.get(start_pos+x), pos); 00147 wchoice->set_permuter(NO_PERM); 00148 wchoice->append_unichar_id(blob_it->unichar_id(), 1, 00149 blob_it->rating(), 00150 blob_it->certainty()); 00151 if (certainties != NULL) certainties[x] = blob_it->certainty(); 00152 } 00153 return wchoice; 00154 } 00155 00161 void get_posstr_from_choice(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00162 WERD_CHOICE* word_choice, 00163 int start_pos, 00164 char* pos_str) { 00165 for (int i = 0; i < word_choice->length(); i++) { 00166 UNICHAR_ID target_id = word_choice->unichar_id(i); 00167 BLOB_CHOICE_LIST* blob_choice_list = char_choices.get(start_pos + i); 00168 int pos = find_choice_by_uid(blob_choice_list, target_id); 00169 if (pos < 0) pos = 0; 00170 pos_str[i] = pos + '0'; 00171 } 00172 pos_str[word_choice->length()] = '\0'; 00173 } 00174 00181 BLOB_CHOICE* find_choice_by_type( 00182 BLOB_CHOICE_LIST *blob_choices, 00183 char target_type, 00184 const UNICHARSET &unicharset) { 00185 BLOB_CHOICE_IT c_it(blob_choices); 00186 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00187 if (c_it.data() && 00188 unicharset.get_chartype(c_it.data()->unichar_id()) == target_type) 00189 return c_it.data(); 00190 } 00191 return NULL; 00192 } 00193 00206 BLOB_CHOICE* find_choice_by_script( 00207 BLOB_CHOICE_LIST *blob_choices, 00208 int target_sid, 00209 int backup_sid, 00210 int secondary_sid) { 00211 BLOB_CHOICE_IT c_it(blob_choices); 00212 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00213 bool found = false; 00214 if (c_it.data()->script_id() == 0) continue; 00215 if (c_it.data()->script_id() == target_sid) found = true; 00216 if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true; 00217 if (found) return c_it.data(); 00218 } 00219 if (secondary_sid > 0) { 00220 c_it.set_to_list(blob_choices); 00221 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00222 if (c_it.data()->script_id() == 0) continue; 00223 if (c_it.data()->script_id() == secondary_sid) 00224 return c_it.data(); 00225 } 00226 } 00227 return NULL; 00228 } 00229 00230 00231 PermuterState::PermuterState() { 00232 unicharset_ = NULL; 00233 char_choices_ = NULL; 00234 adjust_factor_ = 1.0f; 00235 allow_collision_ = false; 00236 word_length_ = 0; 00237 debug_ = false; 00238 } 00239 00240 void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices, 00241 const UNICHARSET& unicharset, 00242 float default_bias, 00243 bool debug) { 00244 ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH); 00245 unicharset_ = &unicharset; 00246 char_choices_ = &char_choices; 00247 word_length_ = char_choices.length(); 00248 for (int i = 0; i < word_length_; ++i) 00249 perm_state_[i] = kPosFree; 00250 perm_state_[word_length_] = '\0'; 00251 // Skip fragment choice and the mark positions so they remain unchanged. 00252 for (int i = 0; i < word_length_; ++i) { 00253 UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(i)); 00254 if (unicharset.get_fragment(unichar_id) != NULL) 00255 perm_state_[i] = '1'; 00256 } 00257 adjust_factor_ = default_bias; 00258 allow_collision_ = false; 00259 debug_ = debug; 00260 } 00261 00262 // Promote char positions specified in pos_str with given weight. 00263 // For example, AddPreference(5, "234", 0.95f) 00264 // would promote the 3rd, 4th and 5th choice for character 5, 6, 7 00265 // (starting at 0) in the word with a rating adjustment of 0.95. 00266 void PermuterState::AddPreference(int start_pos, char* pos_str, float weight) { 00267 ASSERT_HOST(char_choices_ != NULL); 00268 ASSERT_HOST(start_pos + strlen(pos_str) - 1 < word_length_); 00269 if (debug_) { 00270 tprintf("Copy over %s -> %s @ %d ", pos_str, perm_state_, start_pos); 00271 } 00272 // copy over preferred position without the terminating null 00273 if (!allow_collision_) { 00274 int len = strlen(pos_str); 00275 for (int i = 0; i < len; ++i) 00276 if (position_marked(start_pos + i)) return; 00277 } 00278 strncpy(&perm_state_[start_pos], pos_str, strlen(pos_str)); 00279 adjust_factor_ *= weight; 00280 if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_); 00281 } 00282 00283 // Promote the input blob_choice at specified position with given weight. 00284 void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice, 00285 float weight) { 00286 ASSERT_HOST(char_choices_ != NULL); 00287 ASSERT_HOST(char_pos < word_length_); 00288 // avoid collision (but this doesn't work if the first choice is favored! 00289 if (!allow_collision_ && position_marked(char_pos)) return; 00290 00291 if (debug_) { 00292 tprintf("Set UID %d -> %s @ %d ", 00293 blob_choice->unichar_id(), perm_state_, char_pos); 00294 } 00295 int pos = find_choice_by_uid(char_choices_->get(char_pos), 00296 blob_choice->unichar_id()); 00297 perm_state_[char_pos] = pos + '0'; 00298 adjust_factor_ *= weight; 00299 if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_); 00300 } 00301 00302 // Get the best word permutation so far. Caller should destroy WERD_CHOICE. 00303 WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties, 00304 float *adjust_factor) { 00305 ASSERT_HOST(char_choices_ != NULL); 00306 WERD_CHOICE *word_choice = get_choice_from_posstr( 00307 unicharset_, *char_choices_, 0, perm_state_, certainties); 00308 float rating = word_choice->rating() * adjust_factor_; 00309 word_choice->set_rating(rating); 00310 *adjust_factor = adjust_factor_; 00311 return word_choice; 00312 } 00313 00314 00315 /********************************************************************** 00316 * permute_all 00317 * 00318 * Permute all the characters together using all of the different types 00319 * of permuters/selectors available. Each of the characters must have 00320 * a non-NULL choice list. 00321 * 00322 * Note: order of applying permuters does matter, since the latter 00323 * permuter will be recorded if the resulting word ratings are the same. 00324 * 00325 * Note: the function assumes that best_choice and raw_choice are not NULL. 00326 * 00327 * Note: Since permuter_all maybe called recursively (through permuter_ 00328 * compound_words), there must be a separate instance of permuter_state 00329 * for each invocation. 00330 **********************************************************************/ 00331 WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00332 const WERD_CHOICE *best_choice, 00333 WERD_CHOICE *raw_choice) { 00334 WERD_CHOICE *result1 = NULL; 00335 WERD_CHOICE *result2 = NULL; 00336 BOOL8 any_alpha; 00337 float top_choice_rating_limit = best_choice->rating(); 00338 int word_script_id = get_top_word_script(char_choices, getUnicharset()); 00339 00340 PermuterState permuter_state; 00341 if (getUnicharset().han_sid() != getUnicharset().null_sid() && 00342 word_script_id == getUnicharset().han_sid()) { 00343 permuter_state.Init(char_choices, getUnicharset(), 1.0f, permute_debug); 00344 00345 result1 = get_top_choice_word(char_choices); 00346 00347 // Note that we no longer need the returned word from these permuters, 00348 // except to delete the memory. The word choice from all permutations 00349 // is returned by permuter_state.GetpermutedWord() at the end. 00350 if (permute_fixed_length_dawg) { 00351 result2 = permute_fixed_length_words(char_choices, &permuter_state); 00352 delete result2; 00353 } 00354 if (permute_chartype_word) { 00355 result2 = permute_chartype_words(char_choices, &permuter_state); 00356 delete result2; 00357 } 00358 if (permute_script_word) { 00359 result2 = permute_script_words(char_choices, &permuter_state); 00360 delete result2; 00361 } 00362 00363 float certainties[MAX_PERM_LENGTH]; 00364 float adjust_factor; 00365 result2 = permuter_state.GetPermutedWord(certainties, &adjust_factor); 00366 LogNewChoice(adjust_factor, certainties, false, result2); 00367 result1 = get_best_delete_other(result1, result2); 00368 00369 if (segment_segcost_rating) incorporate_segcost(result1); 00370 } else { 00371 result1 = permute_top_choice(char_choices, &top_choice_rating_limit, 00372 raw_choice, &any_alpha); 00373 if (result1 == NULL) 00374 return (NULL); 00375 if (permute_only_top) 00376 return result1; 00377 00378 if (permute_chartype_word) { 00379 permuter_state.Init(char_choices, getUnicharset(), 00380 segment_penalty_garbage, permute_debug); 00381 result2 = permute_chartype_words(char_choices, &permuter_state); 00382 result1 = get_best_delete_other(result1, result2); 00383 } 00384 00385 // Permute character fragments if necessary. 00386 if (result1 == NULL || result1->fragment_mark()) { 00387 result2 = top_fragments_permute_and_select(char_choices, 00388 top_choice_rating_limit); 00389 result1 = get_best_delete_other(result1, result2); 00390 } 00391 00392 result2 = dawg_permute_and_select(char_choices, best_choice->rating()); 00393 result1 = get_best_delete_other(result1, result2); 00394 00395 result2 = permute_compound_words(char_choices, best_choice->rating()); 00396 result1 = get_best_delete_other(result1, result2); 00397 } 00398 return result1; 00399 } 00400 00409 void Dict::incorporate_segcost(WERD_CHOICE *word) { 00410 if (!word || wordseg_rating_adjust_factor_ <= 0) return; 00411 00412 float old_rating = word->rating(); 00413 float new_rating = old_rating * wordseg_rating_adjust_factor_; 00414 word->set_rating(new_rating); 00415 if (permute_debug) 00416 tprintf("Permute segadjust %f * %f --> %f\n", 00417 old_rating, wordseg_rating_adjust_factor_, new_rating); 00418 } 00419 00420 00430 WERD_CHOICE* Dict::permute_fixed_length_words( 00431 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00432 PermuterState *permuter_state) { 00433 if (permute_debug) 00434 print_char_choices_list("\n\nPermute FixedLength Word", 00435 char_choices, getUnicharset(), false); 00436 WERD_CHOICE* best_choice = 00437 new WERD_CHOICE(&getUnicharset(), char_choices.length()); 00438 const int max_dict_len = max_fixed_length_dawgs_wdlen_; 00439 const int min_dict_len = 2; 00440 char posstr[256]; 00441 int match_score = 0; 00442 int anchor_pos = 0; 00443 while (anchor_pos < char_choices.length()) { 00444 // search from longest phrase to shortest, stop when we find a match 00445 WERD_CHOICE* part_choice = NULL; 00446 int step = max_dict_len; 00447 while (step >= min_dict_len) { 00448 int end_pos = anchor_pos + step - 1; 00449 if (end_pos < char_choices.length()) { 00450 part_choice = dawg_permute_and_select(char_choices, 00451 200.0, // rate limit 00452 step, 00453 anchor_pos); 00454 if (part_choice->length() == step) { 00455 if (permute_debug) 00456 tprintf("match found at pos=%d len=%d\n%s\n", anchor_pos, step, 00457 part_choice->unichar_string().string()); 00458 break; 00459 } 00460 delete part_choice; 00461 part_choice = NULL; 00462 } 00463 step--; 00464 } 00465 00466 if (part_choice && step > 1) { // found lexicon match 00467 get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr); 00468 float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length()); 00469 if (permuter_state) 00470 permuter_state->AddPreference(anchor_pos, posstr, adjust_factor); 00471 match_score += step - 1; // single chars don't count 00472 if (permute_debug) 00473 tprintf("Promote word rating %d-len%d\n%s\n", anchor_pos, step, 00474 part_choice->unichar_string().string()); 00475 } else { // no lexicon match 00476 step = 1; 00477 part_choice = get_choice_from_posstr(&getUnicharset(), char_choices, 00478 anchor_pos, "0", NULL); 00479 if (permute_debug) 00480 tprintf("Single char %d %s\n", anchor_pos, 00481 part_choice->unichar_string().string()); 00482 } 00483 if (part_choice && part_choice->length() > 0) 00484 (*best_choice) += (*part_choice); 00485 if (part_choice) delete part_choice; 00486 anchor_pos += step; 00487 } 00488 00489 if (match_score > 0) { 00490 float adjust_factor = pow(0.8, // 1.0/segment_penalty_dict_nonword, 00491 match_score * 2.0 / char_choices.length()); 00492 float adjusted_score = best_choice->rating() * adjust_factor; 00493 if (permute_debug) 00494 tprintf("Adjusting score %f @ %d -> %f\n", 00495 best_choice->rating(), match_score, adjusted_score); 00496 best_choice->set_rating(adjusted_score); 00497 } 00498 if (permute_debug) 00499 tprintf("Found Best CJK word %f: %s\n", 00500 best_choice->rating(), best_choice->unichar_string().string()); 00501 return best_choice; 00502 } 00503 00504 00505 /********************************************************************** 00506 * Returns the dominant chartype for the word. Only the "main" chartype 00507 * of each character is used, and a consistent chartype is defined by 00508 * the majority chartype from non-ambiguous character positions. 00509 * If pos_chartypes is not NULL, then the "main" chartype at each pos 00510 * is also returned. The caller must allocate and deallocate the space. 00511 **********************************************************************/ 00512 char Dict::top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00513 char* pos_chartypes) { 00514 const UNICHARSET &unicharset = getUnicharset(); 00515 const int hist_size = 128; // to contain 'A','a','0','x','p' 00516 int chprop[hist_size]; 00517 int x; 00518 for (x = 0; x < hist_size; x++) chprop[x] = 0; 00519 for (x = 0; x < char_choices.length(); ++x) { 00520 UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(x)); 00521 char ctype = unicharset.get_chartype(unichar_id); 00522 if (pos_chartypes) pos_chartypes[x] = ctype; 00523 if (ctype == 0 || ctype == 'p') continue; 00524 if (getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id) != NULL) continue; 00525 chprop[ctype]++; 00526 if (x == 0 && ctype == 'A') // first-cap also counts as lower 00527 chprop['a']++; 00528 } 00529 int max_prop = 0; 00530 for (x = 1; x < hist_size; x++) 00531 if (chprop[x] >= chprop[max_prop]) max_prop = x; 00532 return (chprop[max_prop] > 0) ? max_prop : 0; 00533 } 00534 00535 /********************************************************************** 00536 * Promote consistent character type based on neighboring characters. 00537 * For each character position, if the top choice property is inconsistent 00538 * with that of the word or previous character, then its likely 00539 * subsitutions, as defined by DangAmbigs, will be examined and the one 00540 * with a matching property will be selected. 00541 **********************************************************************/ 00542 WERD_CHOICE* Dict::permute_chartype_words( 00543 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00544 PermuterState *permuter_state) { 00545 00546 if (char_choices.length() >= MAX_PERM_LENGTH) 00547 return NULL; 00548 // Store main character property of top choice at every position 00549 char pos_chartypes[MAX_PERM_LENGTH]; 00550 char word_type = top_word_chartype(char_choices, pos_chartypes); 00551 if (word_type == 0 || word_type == 'p') 00552 return NULL; // skip if word type is punctuation or unknown 00553 if (permute_debug) { 00554 tprintf("\n\nPermuteCharType[%c]\n", word_type); 00555 print_char_choices_list("", char_choices, getUnicharset(), true); 00556 } 00557 00558 WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset()); 00559 BLOB_CHOICE_IT blob_choice_it; 00560 const UNICHARSET& unicharset = getUnicharset(); 00561 bool replaced = false; // has any character choice been replaced 00562 int prev_unambig_type = 0; // the last chartype of an unambiguous char 00563 float certainties[MAX_PERM_LENGTH + 1]; 00564 for (int x = 0; x < char_choices.length(); ++x) { 00565 BLOB_CHOICE_LIST* pos_choice = char_choices.get(x); 00566 UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice); 00567 if (unichar_id == 0) { 00568 delete current_word; 00569 return NULL; 00570 } 00571 blob_choice_it.set_to_list(pos_choice); 00572 BLOB_CHOICE *first_choice = blob_choice_it.data(); 00573 ASSERT_HOST(first_choice != NULL); 00574 00575 const UnicharIdVector* ambig_uids = 00576 getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id); 00577 bool is_ambiguous = (ambig_uids != NULL); 00578 bool is_punct = unicharset.get_ispunctuation(unichar_id); 00579 bool is_consistent = is_punct || 00580 unicharset.get_chartype(unichar_id) == prev_unambig_type || 00581 unicharset.get_chartype(unichar_id) == word_type; 00582 bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL; 00583 if (permute_debug) 00584 tprintf("char[%d]:%s is_ambig %c is_punct %c is_consistent %c\n", 00585 x, unicharset.id_to_unichar(unichar_id), 00586 is_ambiguous?'T':'F', is_punct?'T':'F', is_consistent?'T':'F'); 00587 00588 if (is_fragment) { 00589 // Ignore any fragmented characters by skipping them to next choice 00590 // (originally first choice). 00591 first_choice = get_nth_choice(pos_choice, 1); 00592 ASSERT_HOST(first_choice != NULL); 00593 } else if (is_ambiguous && !is_consistent) { 00594 // Check every confusable blob choice where the top choice is inconsistent 00595 // with the character type of the previous unambiguous character. 00596 if (permute_debug) { 00597 tprintf("Checking %s r%g PrevCharType %c\n", 00598 unicharset.id_to_unichar(unichar_id), 00599 first_choice->rating(), prev_unambig_type); 00600 print_ratings_list("\t", pos_choice, getUnicharset()); 00601 } 00602 BLOB_CHOICE* c_it = NULL; 00603 if (c_it == NULL) { 00604 c_it = find_choice_by_type(pos_choice, word_type, unicharset); 00605 } 00606 00607 // Prefer a character choice whose type is the same as the previous 00608 // unambiguous character and the confusion appears in the ambig list. 00609 if (c_it == NULL && prev_unambig_type > 0) { 00610 c_it = find_choice_by_type(pos_choice, prev_unambig_type, unicharset); 00611 if (c_it && 00612 UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0) 00613 c_it = NULL; 00614 } 00615 00616 // Otherwise, perfer a punctuation 00617 if (c_it == NULL) { 00618 c_it = find_choice_by_type(pos_choice, 'p', unicharset); 00619 if (c_it && 00620 UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0) 00621 c_it = NULL; 00622 } 00623 00624 // save any preference other than the top choice 00625 if (c_it != NULL) { 00626 if (permute_debug) { 00627 tprintf("Replacing %s r%g ==> %s r%g\n", 00628 unicharset.id_to_unichar(unichar_id), first_choice->rating(), 00629 unicharset.id_to_unichar(c_it->unichar_id()), c_it->rating()); 00630 tprintf("\n\nPermuteCharType[%c]\n", word_type); 00631 print_char_choices_list("", char_choices, getUnicharset(), false); 00632 } 00633 if (permuter_state) 00634 permuter_state->AddPreference(x, c_it, segment_reward_chartype); 00635 first_choice = c_it; 00636 replaced = true; 00637 } 00638 } else if (!is_ambiguous && !is_punct) { 00639 // keep the last unambiguous character type 00640 prev_unambig_type = pos_chartypes[x]; 00641 } 00642 current_word->append_unichar_id(first_choice->unichar_id(), 1, 00643 first_choice->rating(), 00644 first_choice->certainty()); 00645 certainties[x] = first_choice->certainty(); 00646 } 00647 // All permuter choices should go through adjust_non_word so the choice 00648 // rating would be adjusted on the same scale. 00649 adjust_non_word(current_word, certainties, permute_debug); 00650 if (replaced) { 00651 // Apply a reward multiplier on rating if an chartype permutation is made. 00652 float rating = current_word->rating(); 00653 current_word->set_rating(rating * segment_reward_chartype); 00654 if (permute_debug) 00655 current_word->print("<== permute_chartype_word **"); 00656 } 00657 return current_word; 00658 } 00659 00669 WERD_CHOICE* Dict::permute_script_words( 00670 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00671 PermuterState *permuter_state) { 00672 if (char_choices.length() >= MAX_WERD_LENGTH) 00673 return NULL; 00674 00675 int word_sid = get_top_word_script(char_choices, getUnicharset()); 00676 if (word_sid == getUnicharset().null_sid()) 00677 return NULL; 00678 00679 if (permute_debug) { 00680 tprintf("\n\nPermuteScript %s\n", 00681 getUnicharset().get_script_from_script_id(word_sid)); 00682 print_char_choices_list("", char_choices, getUnicharset(), 00683 permute_debug > 1); 00684 } 00685 00686 WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset()); 00687 BLOB_CHOICE_IT blob_choice_it; 00688 bool replaced = false; 00689 bool prev_is_consistent = false; 00690 float certainties[MAX_PERM_LENGTH + 1]; 00691 for (int x = 0; x < char_choices.length(); ++x) { 00692 blob_choice_it.set_to_list(char_choices.get(x)); 00693 BLOB_CHOICE *first_choice = blob_choice_it.data(); 00694 if (!first_choice) { 00695 delete current_word; 00696 return NULL; 00697 } 00698 UNICHAR_ID unichar_id = first_choice->unichar_id(); 00699 if (unichar_id == 0) { 00700 delete current_word; 00701 return NULL; 00702 } 00703 bool sid_consistent = (getUnicharset().get_script(unichar_id) == word_sid); 00704 bool this_is_punct = getUnicharset().get_chartype(unichar_id) == 'p'; 00705 bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL; 00706 00707 if (is_fragment) { 00708 // Ignore any fragmented characters by skipping them to next choice 00709 // (originally first choice). 00710 first_choice = get_nth_choice(char_choices.get(x), 1); 00711 ASSERT_HOST(first_choice != NULL); 00712 } else if (!sid_consistent && !this_is_punct && prev_is_consistent) { 00713 // If the previous char is CJK, we prefer a cjk over non-cjk char 00714 if (permute_debug) { 00715 tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id), 00716 first_choice->rating()); 00717 print_ratings_list("\t", char_choices.get(x), getUnicharset()); 00718 } 00719 // prefer a script consistent choice 00720 BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x), 00721 word_sid, 0, 0); 00722 // otherwise, prefer a punctuation 00723 if (c_it == NULL) 00724 c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset()); 00725 00726 if (c_it != NULL) { 00727 if (permute_debug) 00728 tprintf("Replacing %s r%g ==> %s r%g\n", 00729 getUnicharset().id_to_unichar(unichar_id), 00730 first_choice->rating(), 00731 getUnicharset().id_to_unichar(c_it->unichar_id()), 00732 c_it->rating()); 00733 if (permuter_state) 00734 permuter_state->AddPreference(x, c_it, segment_reward_script); 00735 first_choice = c_it; 00736 replaced = true; 00737 } 00738 } 00739 current_word->append_unichar_id(first_choice->unichar_id(), 1, 00740 first_choice->rating(), 00741 first_choice->certainty()); 00742 certainties[x] = first_choice->certainty(); 00743 prev_is_consistent = sid_consistent; 00744 } 00745 // All permuter choices should go through adjust_non_word so the choice 00746 // rating would be adjusted on the same scale. 00747 adjust_non_word(current_word, certainties, permute_debug); 00748 if (replaced) { 00749 // Apply a reward multiplier on rating if an script permutation is made. 00750 float rating = current_word->rating(); 00751 current_word->set_rating(rating * segment_reward_script); 00752 if (permute_debug) 00753 current_word->print("<== permute_script_word **"); 00754 } 00755 return current_word; 00756 } 00757 00765 bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00766 WERD_CHOICE *best_choice, 00767 WERD_CHOICE *raw_choice) { 00768 if (permute_debug) { 00769 tprintf("\n\n\n##### Permute_Characters #######\n"); 00770 print_char_choices_list("\n==> Input CharChoices", char_choices, 00771 getUnicharset(), segment_debug > 1); 00772 tprintf("\n"); 00773 } 00774 00775 if (char_choices.length() == 1 && 00776 get_top_choice_uid(char_choices.get(0)) == 0) return false; 00777 WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice); 00778 00779 if (this_choice && this_choice->rating() < best_choice->rating()) { 00780 *best_choice = *this_choice; 00781 00782 if (permute_debug) { 00783 best_choice->print("\n**** Populate BestChoice"); 00784 cprintf("populate best_choice\n\t%s\n", 00785 best_choice->debug_string().string()); 00786 } 00787 delete this_choice; 00788 return true; 00789 } 00790 delete this_choice; 00791 return false; 00792 } 00793 00799 WERD_CHOICE *Dict::permute_compound_words( 00800 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00801 float rating_limit) { 00802 BLOB_CHOICE *first_choice; 00803 WERD_CHOICE *best_choice = NULL; 00804 WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH); 00805 int first_index = 0; 00806 int x; 00807 BLOB_CHOICE_IT blob_choice_it; 00808 00809 if (char_choices.length() > MAX_WERD_LENGTH) { 00810 WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset()); 00811 bad_word_choice->make_bad(); 00812 return bad_word_choice; 00813 } 00814 00815 UNICHAR_ID slash = getUnicharset().unichar_to_id("/"); 00816 UNICHAR_ID dash = getUnicharset().unichar_to_id("-"); 00817 for (x = 0; x < char_choices.length(); ++x) { 00818 blob_choice_it.set_to_list(char_choices.get(x)); 00819 first_choice = blob_choice_it.data(); 00820 if (first_choice->unichar_id() == slash || 00821 first_choice->unichar_id() == dash) { 00822 if (x > first_index) { 00823 if (segment_debug) 00824 cprintf ("Hyphenated word found\n"); 00825 permute_subword(char_choices, rating_limit, first_index, 00826 x - 1, ¤t_word); 00827 if (current_word.rating() > rating_limit) 00828 break; 00829 } 00830 // Append hyphen/slash separator to current_word. 00831 current_word.append_unichar_id_space_allocated( 00832 first_choice->unichar_id(), 1, 00833 first_choice->rating(), first_choice->certainty()); 00834 00835 first_index = x + 1; // update first_index 00836 } 00837 } 00838 00839 if (first_index > 0 && first_index < x && 00840 current_word.rating() <= rating_limit) { 00841 permute_subword(char_choices, rating_limit, first_index, 00842 x - 1, ¤t_word); 00843 best_choice = new WERD_CHOICE(current_word); 00844 best_choice->set_permuter(COMPOUND_PERM); 00845 } 00846 return (best_choice); 00847 } 00848 00849 00859 void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00860 float rating_limit, 00861 int start, 00862 int end, 00863 WERD_CHOICE *current_word) { 00864 int x; 00865 BLOB_CHOICE_LIST_VECTOR subchoices; 00866 WERD_CHOICE *best_choice = NULL; 00867 WERD_CHOICE raw_choice(&getUnicharset()); 00868 raw_choice.make_bad(); 00869 00870 DisableChoiceAccum(); 00871 00872 for (x = start; x <= end; x++) { 00873 if (char_choices.get(x) != NULL) { 00874 subchoices += char_choices.get(x); 00875 } 00876 } 00877 00878 if (!subchoices.empty()) { 00879 WERD_CHOICE initial_choice(&getUnicharset()); 00880 initial_choice.make_bad(); 00881 initial_choice.set_rating(rating_limit); 00882 00883 best_choice = permute_all(subchoices, &initial_choice, &raw_choice); 00884 00885 if (best_choice && best_choice->length() > 0) { 00886 *current_word += *best_choice; 00887 } else { 00888 current_word->set_rating(MAX_FLOAT32); 00889 } 00890 } else { 00891 current_word->set_rating(MAX_FLOAT32); 00892 } 00893 00894 if (best_choice) 00895 delete best_choice; 00896 00897 if (segment_debug && current_word->rating() < MAX_FLOAT32) { 00898 cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n", 00899 current_word->debug_string().string(), 00900 current_word->rating(), current_word->certainty()); 00901 } 00902 EnableChoiceAccum(); 00903 } 00904 00908 WERD_CHOICE *Dict::get_top_choice_word( 00909 const BLOB_CHOICE_LIST_VECTOR &char_choices) { 00910 WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH); 00911 float certainties[MAX_PERM_LENGTH]; 00912 top_word->set_permuter(TOP_CHOICE_PERM); 00913 for (int x = 0; x < char_choices.length(); x++) { 00914 BLOB_CHOICE_IT blob_choice_it; 00915 blob_choice_it.set_to_list(char_choices.get(x)); 00916 BLOB_CHOICE *top_choice = blob_choice_it.data(); 00917 top_word->append_unichar_id_space_allocated(top_choice->unichar_id(), 1, 00918 top_choice->rating(), 00919 top_choice->certainty()); 00920 certainties[x] = top_choice->certainty(); 00921 } 00922 LogNewChoice(1.0, certainties, true, top_word); 00923 return top_word; 00924 } 00925 00934 WERD_CHOICE *Dict::permute_top_choice( 00935 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00936 float* rating_limit, 00937 WERD_CHOICE *raw_choice, 00938 BOOL8 *any_alpha) { 00939 BLOB_CHOICE *first_choice; 00940 const char *first_char; //first choice 00941 const char *second_char; //second choice 00942 const char *third_char; //third choice 00943 char prev_char[UNICHAR_LEN + 1]; //prev in word 00944 const char *next_char = ""; //next in word 00945 const char *next_next_char = ""; //after next next in word 00946 00947 WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH); 00948 word.set_permuter(TOP_CHOICE_PERM); 00949 WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH); 00950 capital_word.set_permuter(UPPER_CASE_PERM); 00951 WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH); 00952 lower_word.set_permuter(LOWER_CASE_PERM); 00953 00954 int x; 00955 BOOL8 char_alpha; 00956 float first_rating = 0; 00957 00958 float certainties[MAX_PERM_LENGTH + 1]; 00959 float lower_certainties[MAX_PERM_LENGTH + 1]; 00960 float upper_certainties[MAX_PERM_LENGTH + 1]; 00961 00962 BLOB_CHOICE_IT blob_choice_it; 00963 UNICHAR_ID temp_id; 00964 UNICHAR_ID unichar_id; 00965 UNICHAR_ID space = getUnicharset().unichar_to_id(" "); 00966 register const char* ch; 00967 register inT8 lower_done; 00968 register inT8 upper_done; 00969 00970 prev_char[0] = '\0'; 00971 00972 if (any_alpha != NULL) 00973 *any_alpha = FALSE; 00974 00975 if (char_choices.length() > MAX_PERM_LENGTH) { 00976 return (NULL); 00977 } 00978 00979 for (x = 0; x < char_choices.length(); ++x) { 00980 if (x + 1 < char_choices.length()) { 00981 unichar_id = get_top_choice_uid(char_choices.get(x+1)); 00982 next_char = unichar_id != INVALID_UNICHAR_ID ? 00983 getUnicharset().id_to_unichar(unichar_id) : ""; 00984 } else { 00985 next_char = ""; 00986 } 00987 00988 if (x + 2 < char_choices.length()) { 00989 unichar_id = get_top_choice_uid(char_choices.get(x+2)); 00990 next_next_char = unichar_id != INVALID_UNICHAR_ID ? 00991 getUnicharset().id_to_unichar(unichar_id) : ""; 00992 } else { 00993 next_next_char = ""; 00994 } 00995 00996 blob_choice_it.set_to_list(char_choices.get(x)); 00997 ASSERT_HOST(!blob_choice_it.empty()); 00998 first_choice = NULL; 00999 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 01000 blob_choice_it.forward()) { // find the best non-fragment char choice 01001 temp_id = blob_choice_it.data()->unichar_id(); 01002 if (!(getUnicharset().get_fragment(temp_id))) { 01003 first_choice = blob_choice_it.data(); 01004 break; 01005 } else if (char_choices.length() > 1) { 01006 word.set_fragment_mark(true); 01007 capital_word.set_fragment_mark(true); 01008 lower_word.set_fragment_mark(true); 01009 } 01010 } 01011 if (first_choice == NULL) { 01012 cprintf("Permuter found only fragments for" 01013 " character at position %d; word=%s\n", 01014 x, word.debug_string().string()); 01015 } 01016 ASSERT_HOST(first_choice != NULL); 01017 01018 unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ? 01019 first_choice->unichar_id() : space; 01020 first_char = getUnicharset().id_to_unichar(unichar_id); 01021 first_rating = first_choice->rating(); 01022 word.append_unichar_id_space_allocated( 01023 unichar_id, 1, first_choice->rating(), first_choice->certainty()); 01024 capital_word.append_unichar_id_space_allocated( 01025 unichar_id, 1, first_choice->rating(), first_choice->certainty()); 01026 lower_word.append_unichar_id_space_allocated( 01027 unichar_id, 1, first_choice->rating(), first_choice->certainty()); 01028 01029 certainties[x] = first_choice->certainty(); 01030 lower_certainties[x] = first_choice->certainty(); 01031 upper_certainties[x] = first_choice->certainty(); 01032 01033 lower_done = FALSE; 01034 upper_done = FALSE; 01035 char_alpha = FALSE; 01036 second_char = ""; 01037 third_char = ""; 01038 for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) { 01039 unichar_id = blob_choice_it.data()->unichar_id(); 01040 if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() && 01041 blob_choice_it.data_relative(1)->rating() == first_rating) { 01042 temp_id = blob_choice_it.data_relative(1)->unichar_id(); 01043 if (getUnicharset().eq(temp_id, "1") || 01044 getUnicharset().eq(temp_id, "I")) { 01045 second_char = getUnicharset().id_to_unichar(temp_id); 01046 blob_choice_it.forward(); 01047 if (!blob_choice_it.at_last() && 01048 blob_choice_it.data_relative(1)->rating() == first_rating) { 01049 temp_id = blob_choice_it.data_relative(1)->unichar_id(); 01050 if (getUnicharset().eq(temp_id, "1") || 01051 getUnicharset().eq(temp_id, "I")) { 01052 third_char = getUnicharset().id_to_unichar(temp_id); 01053 blob_choice_it.forward(); 01054 } 01055 } 01056 ch = choose_il1 (first_char, second_char, third_char, 01057 prev_char, next_char, next_next_char); 01058 unichar_id = (ch != NULL && *ch != '\0') ? 01059 getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID; 01060 if (strcmp(ch, "l") != 0 && 01061 getUnicharset().eq(word.unichar_id(x), "l")) { 01062 word.set_unichar_id(unichar_id, x); 01063 lower_word.set_unichar_id(unichar_id, x); 01064 capital_word.set_unichar_id(unichar_id, x); 01065 } 01066 } 01067 } 01068 if (unichar_id != INVALID_UNICHAR_ID) { 01069 /* Find lower case */ 01070 if (!lower_done && 01071 (getUnicharset().get_islower(unichar_id) || 01072 (getUnicharset().get_isupper(unichar_id) && x == 0))) { 01073 lower_word.set_unichar_id(unichar_id, x); 01074 lower_word.set_rating(lower_word.rating() - 01075 first_choice->rating() + blob_choice_it.data()->rating()); 01076 if (blob_choice_it.data()->certainty() < lower_word.certainty()) { 01077 lower_word.set_certainty(blob_choice_it.data()->certainty()); 01078 } 01079 lower_certainties[x] = blob_choice_it.data()->certainty(); 01080 lower_done = TRUE; 01081 } 01082 /* Find upper case */ 01083 if (!upper_done && getUnicharset().get_isupper(unichar_id)) { 01084 capital_word.set_unichar_id(unichar_id, x); 01085 capital_word.set_rating(capital_word.rating() - 01086 first_choice->rating() + blob_choice_it.data()->rating()); 01087 if (blob_choice_it.data()->certainty() < capital_word.certainty()) { 01088 capital_word.set_certainty(blob_choice_it.data()->certainty()); 01089 } 01090 upper_certainties[x] = blob_choice_it.data()->certainty(); 01091 upper_done = TRUE; 01092 } 01093 if (!char_alpha) { 01094 const CHAR_FRAGMENT *fragment = 01095 getUnicharset().get_fragment(unichar_id); 01096 temp_id = !fragment ? unichar_id : 01097 getUnicharset().unichar_to_id(fragment->get_unichar()); 01098 if (getUnicharset().get_isalpha(temp_id)) { 01099 char_alpha = TRUE; 01100 } 01101 } 01102 if (lower_done && upper_done) 01103 break; 01104 } 01105 } 01106 if (char_alpha && any_alpha != NULL) 01107 *any_alpha = TRUE; 01108 01109 if (word.rating() > bestrate_pruning_factor * *rating_limit) { 01110 if (permute_debug) 01111 tprintf("\n***** Aborting high-cost word: %g > limit %g\n", 01112 word.rating(), bestrate_pruning_factor * *rating_limit); 01113 return (NULL); 01114 } 01115 01116 *prev_char = '\0'; 01117 temp_id = word.unichar_id(word.length()-1); 01118 if (temp_id != INVALID_UNICHAR_ID) { 01119 strcpy(prev_char, getUnicharset().id_to_unichar(temp_id)); 01120 } 01121 } 01122 01123 if (raw_choice != NULL && word.rating() < raw_choice->rating()) { 01124 *raw_choice = word; 01125 LogNewChoice(1.0, certainties, true, raw_choice); 01126 } 01127 float rating = word.rating(); 01128 adjust_non_word(&word, certainties, permute_debug); 01129 01130 float lower_rating = lower_word.rating(); 01131 adjust_non_word(&lower_word, lower_certainties, permute_debug); 01132 01133 float upper_rating = capital_word.rating(); 01134 adjust_non_word(&capital_word, upper_certainties, permute_debug); 01135 01136 WERD_CHOICE *best_choice = &word; 01137 *rating_limit = rating; 01138 if (lower_word.rating() < best_choice->rating()) { 01139 best_choice = &lower_word; 01140 *rating_limit = lower_rating; 01141 } 01142 if (capital_word.rating() < best_choice->rating()) { 01143 best_choice = &capital_word; 01144 *rating_limit = upper_rating; 01145 } 01146 return new WERD_CHOICE(*best_choice); 01147 } 01148 01149 01161 const char* Dict::choose_il1(const char *first_char, 01162 const char *second_char, 01163 const char *third_char, 01164 const char *prev_char, 01165 const char *next_char, 01166 const char *next_next_char) { 01167 inT32 type1; //1/I/l type of first choice 01168 inT32 type2; //1/I/l type of second choice 01169 inT32 type3; //1/I/l type of third choice 01170 01171 int first_char_length = strlen(first_char); 01172 int prev_char_length = strlen(prev_char); 01173 int next_char_length = strlen(next_char); 01174 int next_next_char_length = strlen(next_next_char); 01175 01176 if (*first_char == 'l' && *second_char != '\0') { 01177 if (*second_char == 'I' 01178 && (((prev_char_length != 0 && 01179 getUnicharset().get_isupper (prev_char, prev_char_length)) && 01180 (next_char_length == 0 || 01181 !getUnicharset().get_islower (next_char, next_char_length)) && 01182 (next_char_length == 0 || 01183 !getUnicharset().get_isdigit (next_char, next_char_length))) || 01184 ((next_char_length != 0 && 01185 getUnicharset().get_isupper (next_char, next_char_length)) && 01186 (prev_char_length == 0 || 01187 !getUnicharset().get_islower (prev_char, prev_char_length)) && 01188 (prev_char_length == 0 || 01189 !getUnicharset().get_isdigit (prev_char, prev_char_length))))) 01190 first_char = second_char; //override 01191 else if (*second_char == '1' || *third_char == '1') { 01192 if ((next_char_length != 0 && 01193 getUnicharset().get_isdigit (next_char, next_char_length)) || 01194 (prev_char_length != 0 && 01195 getUnicharset().get_isdigit (prev_char, prev_char_length)) 01196 || (*next_char == 'l' && 01197 (next_next_char_length != 0 && 01198 getUnicharset().get_isdigit (next_next_char, 01199 next_next_char_length)))) { 01200 first_char = "1"; 01201 first_char_length = 1; 01202 } 01203 else if ((prev_char_length == 0 || 01204 !getUnicharset().get_islower (prev_char, prev_char_length)) && 01205 ((next_char_length == 0 || 01206 !getUnicharset().get_islower (next_char, next_char_length)) || 01207 (*next_char == 's' && 01208 *next_next_char == 't'))) { 01209 if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0') 01210 && ((*next_char != '\'' && *next_char != '`') 01211 || *prev_char != '\0')) { 01212 first_char = "1"; 01213 first_char_length = 1; 01214 } 01215 } 01216 } 01217 if (*first_char == 'l' && *next_char != '\0' && 01218 (prev_char_length == 0 || 01219 !getUnicharset().get_isalpha (prev_char, prev_char_length))) { 01220 type1 = 2; 01221 01222 if (*second_char == '1') 01223 type2 = 0; 01224 else if (*second_char == 'I') 01225 type2 = 1; 01226 else if (*second_char == 'l') 01227 type2 = 2; 01228 else 01229 type2 = type1; 01230 01231 if (*third_char == '1') 01232 type3 = 0; 01233 else if (*third_char == 'I') 01234 type3 = 1; 01235 else if (*third_char == 'l') 01236 type3 = 2; 01237 else 01238 type3 = type1; 01239 01240 #if 0 01241 if (bigram_counts[*next_char][type2] > 01242 bigram_counts[*next_char][type1]) { 01243 first_char = second_char; 01244 type1 = type2; 01245 } 01246 if (bigram_counts[*next_char][type3] > 01247 bigram_counts[*next_char][type1]) { 01248 first_char = third_char; 01249 } 01250 #endif 01251 } 01252 } 01253 return first_char; 01254 } 01255 01281 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, 01282 float curr_rating, float curr_certainty, 01283 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 01284 const char *debug, int word_ending, 01285 CHAR_FRAGMENT_INFO *char_frag_info) { 01286 const CHAR_FRAGMENT *this_fragment = 01287 getUnicharset().get_fragment(curr_unichar_id); 01288 const CHAR_FRAGMENT *prev_fragment = 01289 prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL; 01290 01291 // Print debug info for fragments. 01292 if (debug && (prev_fragment || this_fragment)) { 01293 cprintf("%s check fragments: choice=%s word_ending=%d\n", debug, 01294 getUnicharset().debug_str(curr_unichar_id).string(), 01295 word_ending); 01296 if (prev_fragment) { 01297 cprintf("prev_fragment %s\n", prev_fragment->to_string().string()); 01298 } 01299 if (this_fragment) { 01300 cprintf("this_fragment %s\n", this_fragment->to_string().string()); 01301 } 01302 } 01303 01304 char_frag_info->unichar_id = curr_unichar_id; 01305 char_frag_info->fragment = this_fragment; 01306 char_frag_info->rating = curr_rating; 01307 char_frag_info->certainty = curr_certainty; 01308 char_frag_info->num_fragments = 1; 01309 if (prev_fragment && !this_fragment) { 01310 if (debug) tprintf("Skip choice with incomplete fragment\n"); 01311 return false; 01312 } 01313 if (this_fragment) { 01314 // We are dealing with a fragment. 01315 char_frag_info->unichar_id = INVALID_UNICHAR_ID; 01316 if (prev_fragment) { 01317 if (!this_fragment->is_continuation_of(prev_fragment)) { 01318 if (debug) tprintf("Non-matching fragment piece\n"); 01319 return false; 01320 } 01321 if (this_fragment->is_ending()) { 01322 char_frag_info->unichar_id = 01323 getUnicharset().unichar_to_id(this_fragment->get_unichar()); 01324 char_frag_info->fragment = NULL; 01325 if (debug) { 01326 tprintf("Built character %s from fragments\n", 01327 getUnicharset().debug_str( 01328 char_frag_info->unichar_id).string()); 01329 } 01330 } else { 01331 if (debug) tprintf("Record fragment continuation\n"); 01332 char_frag_info->fragment = this_fragment; 01333 } 01334 // Update certainty and rating. 01335 char_frag_info->rating = 01336 prev_char_frag_info->rating + curr_rating; 01337 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; 01338 char_frag_info->certainty = 01339 MIN(curr_certainty, prev_char_frag_info->certainty); 01340 } else { 01341 if (this_fragment->is_beginning()) { 01342 if (debug) cprintf("Record fragment beginning\n"); 01343 } else { 01344 if (debug) { 01345 tprintf("Non-starting fragment piece with no prev_fragment\n"); 01346 } 01347 return false; 01348 } 01349 } 01350 } 01351 if (word_ending && char_frag_info->fragment) { 01352 if (debug) tprintf("Word can not end with a fragment\n"); 01353 return false; 01354 } 01355 return true; 01356 } 01365 WERD_CHOICE *Dict::top_fragments_permute_and_select( 01366 const BLOB_CHOICE_LIST_VECTOR &char_choices, 01367 float rating_limit) { 01368 if (char_choices.length() <= 1 || 01369 char_choices.length() > MAX_PERM_LENGTH) { 01370 return NULL; 01371 } 01372 // See it would be possible to benefit from permuting fragments. 01373 int x; 01374 float min_rating = 0.0; 01375 BLOB_CHOICE_IT blob_choice_it; 01376 for (x = 0; x < char_choices.length(); ++x) { 01377 blob_choice_it.set_to_list(char_choices.get(x)); 01378 if (blob_choice_it.data()) { 01379 min_rating += blob_choice_it.data()->rating(); 01380 } 01381 if (min_rating >= rating_limit) { 01382 return NULL; 01383 } 01384 } 01385 if (fragments_debug > 1) { 01386 tprintf("A choice with fragment beats top choice\n"); 01387 tprintf("Running fragment permuter...\n"); 01388 } 01389 01390 // Construct a modified choices list that contains (for each position): 01391 // the best choice, all fragments and at least one choice for 01392 // a non-fragmented character. 01393 BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length()); 01394 for (x = 0; x < char_choices.length(); ++x) { 01395 bool need_nonfrag_char = true; 01396 BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST(); 01397 BLOB_CHOICE_IT frag_choices_it; 01398 frag_choices_it.set_to_list(frag_choices); 01399 blob_choice_it.set_to_list(char_choices.get(x)); 01400 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 01401 blob_choice_it.forward()) { 01402 if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) { 01403 frag_choices_it.add_after_then_move( 01404 new BLOB_CHOICE(*(blob_choice_it.data()))); 01405 } else if (need_nonfrag_char) { 01406 frag_choices_it.add_after_then_move( 01407 new BLOB_CHOICE(*(blob_choice_it.data()))); 01408 need_nonfrag_char = false; 01409 } 01410 } 01411 frag_char_choices += frag_choices; 01412 } 01413 01414 WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset()); 01415 best_choice->make_bad(); 01416 WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH); 01417 word.set_permuter(TOP_CHOICE_PERM); 01418 float certainties[MAX_PERM_LENGTH]; 01419 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn; 01420 int attempts_left = max_permuter_attempts; 01421 permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL, 01422 frag_char_choices, 0, NULL, &word, certainties, 01423 &rating_limit, best_choice, &attempts_left, NULL); 01424 01425 frag_char_choices.delete_data_pointers(); 01426 return best_choice; 01427 } 01428 01435 void Dict::permute_choices( 01436 const char *debug, 01437 const BLOB_CHOICE_LIST_VECTOR &char_choices, 01438 int char_choice_index, 01439 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 01440 WERD_CHOICE *word, 01441 float certainties[], 01442 float *limit, 01443 WERD_CHOICE *best_choice, 01444 int *attempts_left, 01445 void *more_args) { 01446 if (debug) { 01447 tprintf("%s permute_choices: char_choice_index=%d" 01448 " limit=%g rating=%g, certainty=%g word=%s\n", 01449 debug, char_choice_index, *limit, word->rating(), 01450 word->certainty(), word->debug_string().string()); 01451 } 01452 if (char_choice_index < char_choices.length()) { 01453 BLOB_CHOICE_IT blob_choice_it; 01454 blob_choice_it.set_to_list(char_choices.get(char_choice_index)); 01455 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 01456 blob_choice_it.forward()) { 01457 (*attempts_left)--; 01458 append_choices(debug, char_choices, *(blob_choice_it.data()), 01459 char_choice_index, prev_char_frag_info, word, 01460 certainties, limit, best_choice, attempts_left, more_args); 01461 if (*attempts_left <= 0) { 01462 if (debug) tprintf("permute_choices(): attempts_left is 0\n"); 01463 break; 01464 } 01465 } 01466 } 01467 } 01468 01477 void Dict::append_choices( 01478 const char *debug, 01479 const BLOB_CHOICE_LIST_VECTOR &char_choices, 01480 const BLOB_CHOICE &blob_choice, 01481 int char_choice_index, 01482 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 01483 WERD_CHOICE *word, 01484 float certainties[], 01485 float *limit, 01486 WERD_CHOICE *best_choice, 01487 int *attempts_left, 01488 void *more_args) { 01489 int word_ending = 01490 (char_choice_index == char_choices.length() - 1) ? true : false; 01491 01492 // Deal with fragments. 01493 CHAR_FRAGMENT_INFO char_frag_info; 01494 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), 01495 blob_choice.certainty(), prev_char_frag_info, debug, 01496 word_ending, &char_frag_info)) { 01497 return; // blob_choice must be an invalid fragment 01498 } 01499 // Search the next letter if this character is a fragment. 01500 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { 01501 permute_choices(debug, char_choices, char_choice_index + 1, 01502 &char_frag_info, word, certainties, limit, 01503 best_choice, attempts_left, more_args); 01504 return; 01505 } 01506 01507 // Add the next unichar. 01508 float old_rating = word->rating(); 01509 float old_certainty = word->certainty(); 01510 uinT8 old_permuter = word->permuter(); 01511 certainties[word->length()] = char_frag_info.certainty; 01512 word->append_unichar_id_space_allocated( 01513 char_frag_info.unichar_id, char_frag_info.num_fragments, 01514 char_frag_info.rating, char_frag_info.certainty); 01515 01516 // Explore the next unichar. 01517 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, 01518 &char_frag_info, word_ending, word, certainties, 01519 limit, best_choice, attempts_left, more_args); 01520 01521 // Remove the unichar we added to explore other choices in it's place. 01522 word->remove_last_unichar_id(); 01523 word->set_rating(old_rating); 01524 word->set_certainty(old_certainty); 01525 word->set_permuter(old_permuter); 01526 } 01527 01536 void Dict::go_deeper_top_fragments_fxn( 01537 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 01538 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 01539 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, 01540 WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { 01541 if (word->rating() < *limit) { 01542 if (word_ending) { 01543 if (fragments_debug > 1) { 01544 tprintf("fragments_debug new choice = %s\n", 01545 word->debug_string().string()); 01546 } 01547 *limit = word->rating(); 01548 adjust_non_word(word, certainties, permute_debug); 01549 update_best_choice(*word, best_choice); 01550 } else { // search the next letter 01551 permute_choices(debug, char_choices, char_choice_index + 1, 01552 prev_char_frag_info, word, certainties, limit, 01553 best_choice, attempts_left, more_args); 01554 } 01555 } else { 01556 if (fragments_debug > 1) { 01557 tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n", 01558 word->debug_string().string(), word->rating(), *limit); 01559 } 01560 } 01561 } 01562 01563 } // namespace tesseract