Tesseract
3.02
|
00001 /****************************************************************************** 00002 ** Filename: stopper.c 00003 ** Purpose: Stopping criteria for word classifier. 00004 ** Author: Dan Johnson 00005 ** History: Mon Apr 29 14:56:49 1991, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 00019 #include "stopper.h" 00020 #include "matchdefs.h" 00021 #include "callcpp.h" 00022 #include "permute.h" 00023 #include "danerror.h" 00024 #include "const.h" 00025 #include "efio.h" 00026 #include "scanutils.h" 00027 #include "unichar.h" 00028 #include "params.h" 00029 #include "dict.h" 00030 #include "image.h" 00031 #include "ccutil.h" 00032 #include "ratngs.h" 00033 #include "ambigs.h" 00034 00035 #include <stdio.h> 00036 #include <string.h> 00037 #include <ctype.h> 00038 #include <math.h> 00039 #ifdef __UNIX__ 00040 #include <assert.h> 00041 #endif 00042 00043 #ifdef _MSC_VER 00044 #pragma warning(disable:4244) // Conversion warnings 00045 #pragma warning(disable:4800) // int/bool warnings 00046 #endif 00047 00048 /* these are kludges - add appropriate .h file later */ 00049 /* from adaptmatch.cpp */ 00050 #define MAX_WERD_SIZE 100 00051 00052 typedef struct 00053 { 00054 VIABLE_CHOICE Choice; 00055 float ChunkCertainty[MAX_NUM_CHUNKS]; 00056 UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS]; 00057 } EXPANDED_CHOICE; 00058 00059 void DeleteViableChoiceStruct(void *vcs) { 00060 delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs)); 00061 } 00062 00063 #define BestCertainty(Choices) \ 00064 (((VIABLE_CHOICE) first_node (Choices))->Certainty) 00065 00066 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating) 00067 00068 #define BestFactor(Choices) \ 00069 (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor) 00070 00074 // Returns -1 if the rating for Choice1 is less than the rating for Choice2, 00075 // otherwise returns 1. 00076 static int CmpChoiceRatings(void *arg1, // VIABLE_CHOICE Choice1 00077 void *arg2) { // VIABLE_CHOICE Choice2 00078 float R1, R2; 00079 VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1; 00080 VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2; 00081 R1 = Choice1->Rating; 00082 R2 = Choice2->Rating; 00083 return (R1 < R2) ? -1 : 1; 00084 } 00085 00086 // Expands Choice and places the results in ExpandedChoice. The primary 00087 // function of expansion is to create an two arrays, one which holds the 00088 // corresponding certainty for each chunk in Choice, and one which holds 00089 // the class for each chunk. 00090 static void ExpandChoice(VIABLE_CHOICE Choice, 00091 EXPANDED_CHOICE *ExpandedChoice) { 00092 int i, j, Chunk; 00093 ExpandedChoice->Choice = Choice; 00094 for (i = 0, Chunk = 0; i < Choice->Length; i++) 00095 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { 00096 ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty; 00097 ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class; 00098 } 00099 } 00100 00101 VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length) 00102 : Length(length) { 00103 Blob = new CHAR_CHOICE[length]; 00104 } 00105 00106 VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) { 00107 Blob = NULL; 00108 } 00109 00110 VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() { 00111 delete []Blob; 00112 } 00113 00114 void VIABLE_CHOICE_STRUCT::Init( 00115 const WERD_CHOICE &word_choice, 00116 const PIECES_STATE &pieces_state, 00117 const float certainties[], 00118 FLOAT32 adjust_factor) { 00119 this->Rating = word_choice.rating(); 00120 this->Certainty = word_choice.certainty(); 00121 this->AdjustFactor = adjust_factor; 00122 this->ComposedFromCharFragments = false; 00123 ASSERT_HOST(this->Length == word_choice.length()); 00124 00125 for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) { 00126 int blob_width = pieces_state[bw_idx]; 00127 CHAR_CHOICE *blob_choice = &this->Blob[i]; 00128 blob_choice->Class = word_choice.unichar_id(i); 00129 blob_choice->NumChunks = blob_width; 00130 blob_choice->Certainty = certainties[i]; 00131 for (int f = 1; f < word_choice.fragment_length(i); ++f) { 00132 blob_width = pieces_state[++bw_idx]; 00133 assert(blob_width > 0); 00134 blob_choice->NumChunks += blob_width; 00135 this->ComposedFromCharFragments = true; 00136 } 00137 } 00138 } 00139 00140 00141 namespace tesseract { 00142 00143 // If the certainty of any chunk in Choice (item1) is not ambiguous with the 00144 // corresponding chunk in the best choice (item2), frees Choice and 00145 // returns true. 00146 int Dict::FreeBadChoice( 00147 void *item1, // VIABLE_CHOICE Choice, 00148 void *item2) { // EXPANDED_CHOICE *BestChoice 00149 int i, j, Chunk; 00150 FLOAT32 Threshold; 00151 VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1); 00152 EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2); 00153 Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor, 00154 Choice->AdjustFactor); 00155 for (i = 0, Chunk = 0; i < Choice->Length; i++) { 00156 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { 00157 if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] && 00158 Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] < 00159 Threshold) { 00160 if (stopper_debug_level >= 2) 00161 PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice); 00162 delete Choice; 00163 return true; 00164 } 00165 } 00166 } 00167 return false; 00168 } 00169 00170 bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, 00171 WERD_CHOICE *BestChoice, 00172 DANGERR *fixpt, 00173 ACCEPTABLE_CHOICE_CALLER caller, 00174 bool *modified_blobs) { 00175 float CertaintyThreshold = stopper_nondict_certainty_base; 00176 int WordSize; 00177 if (modified_blobs != NULL) *modified_blobs = false; 00178 00179 if (stopper_no_acceptable_choices) return false; 00180 00181 if (fixpt != NULL) fixpt->clear(); 00182 if (BestChoice->length() == 0) 00183 return false; 00184 if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) { 00185 if (stopper_debug_level >= 1) { 00186 cprintf("AcceptableChoice(): a choice with fragments beats BestChoice"); 00187 } 00188 return false; 00189 } 00190 00191 bool no_dang_ambigs = (GetMaxFixedLengthDawgIndex() >= 0 || 00192 NoDangerousAmbig(BestChoice, fixpt, true, 00193 Choices, modified_blobs)); 00194 bool is_valid_word = valid_word_permuter(BestChoice->permuter(), false); 00195 bool is_case_ok = case_ok(*BestChoice, getUnicharset()); 00196 00197 if (stopper_debug_level >= 1) 00198 tprintf("\nStopper: %s (word=%c, case=%c)\n", 00199 BestChoice->debug_string().string(), 00200 (is_valid_word ? 'y' : 'n'), 00201 (is_case_ok ? 'y' : 'n')); 00202 00203 // Do not accept invalid words in PASS1. 00204 if (reject_offset_ <= 0.0f && !is_valid_word) return false; 00205 if (is_valid_word && is_case_ok) { 00206 WordSize = LengthOfShortestAlphaRun(*BestChoice); 00207 WordSize -= stopper_smallword_size; 00208 if (WordSize < 0) 00209 WordSize = 0; 00210 CertaintyThreshold += WordSize * stopper_certainty_per_char; 00211 } 00212 00213 if (stopper_debug_level >= 1) 00214 tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n", 00215 BestChoice->certainty(), CertaintyThreshold); 00216 00217 if (no_dang_ambigs && 00218 BestChoice->certainty() > CertaintyThreshold && 00219 UniformCertainties(*Choices, *BestChoice)) { 00220 return true; 00221 } else { 00222 if (stopper_debug_level >= 2) { 00223 tprintf("AcceptableChoice() returned false" 00224 " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n", 00225 no_dang_ambigs, BestChoice->certainty(), 00226 CertaintyThreshold, 00227 UniformCertainties(*Choices, *BestChoice)); 00228 } 00229 return false; 00230 } 00231 } 00232 00233 bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) { 00234 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; 00235 int WordSize; 00236 00237 if (stopper_debug_level >= 1) { 00238 tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", 00239 BestChoice.debug_string().string(), 00240 (valid_word(BestChoice) ? 'y' : 'n'), 00241 (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), 00242 ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y')); 00243 } 00244 00245 if (BestChoice.length() == 0 || CurrentWordAmbig()) 00246 return false; 00247 if (BestChoice.fragment_mark()) { 00248 if (stopper_debug_level >= 1) { 00249 cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n"); 00250 } 00251 return false; 00252 } 00253 if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) { 00254 WordSize = LengthOfShortestAlphaRun(BestChoice); 00255 WordSize -= stopper_smallword_size; 00256 if (WordSize < 0) 00257 WordSize = 0; 00258 CertaintyThreshold += WordSize * stopper_certainty_per_char; 00259 } 00260 00261 if (stopper_debug_level >= 1) 00262 cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", 00263 BestChoice.certainty(), CertaintyThreshold); 00264 00265 if (BestChoice.certainty() > CertaintyThreshold && 00266 !stopper_no_acceptable_choices) { 00267 if (stopper_debug_level >= 1) 00268 cprintf("ACCEPTED\n"); 00269 return true; 00270 } 00271 else { 00272 if (stopper_debug_level >= 1) 00273 cprintf("REJECTED\n"); 00274 return false; 00275 } 00276 } 00277 00278 bool Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) { 00279 LIST Alternatives; 00280 VIABLE_CHOICE Choice; 00281 Alternatives = list_rest (best_choices_); 00282 iterate(Alternatives) { 00283 Choice = (VIABLE_CHOICE) first_node (Alternatives); 00284 if (Choice->AdjustFactor <= Threshold) 00285 return false; 00286 } 00287 return true; 00288 } 00289 00290 bool Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) { 00291 return (best_choices_ != NIL_LIST && 00292 StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_))); 00293 } 00294 00295 FLOAT32 Dict::CurrentBestChoiceAdjustFactor() { 00296 VIABLE_CHOICE BestChoice; 00297 if (best_choices_ == NIL_LIST) 00298 return (MAX_FLOAT32); 00299 BestChoice = (VIABLE_CHOICE) first_node (best_choices_); 00300 return (BestChoice->AdjustFactor); 00301 } 00302 00303 00304 bool Dict::CurrentWordAmbig() { 00305 return (list_rest (best_choices_) != NIL_LIST); 00306 } 00307 00308 00309 void Dict::DebugWordChoices() { 00310 LIST Choices; 00311 int i; 00312 char LabelString[80]; 00313 VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_); 00314 bool force_debug = 00315 fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments; 00316 00317 if (stopper_debug_level >= 1 || force_debug || 00318 (((STRING)word_to_debug).length() > 0 && best_choices_ && 00319 StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(), 00320 (VIABLE_CHOICE)first_node(best_choices_)))) { 00321 if (best_raw_choice_) 00322 PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_); 00323 00324 i = 1; 00325 Choices = best_choices_; 00326 if (Choices) 00327 cprintf("\nBest Cooked Choices:\n"); 00328 iterate(Choices) { 00329 sprintf(LabelString, "Cooked Choice #%d: ", i); 00330 PrintViableChoice(stderr, LabelString, 00331 (VIABLE_CHOICE)first_node(Choices)); 00332 i++; 00333 } 00334 } 00335 } 00336 00337 void Dict::PrintAmbigAlternatives(FILE *file, const char *label, 00338 int label_num_unichars) { 00339 iterate(raw_choices_) { 00340 VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_); 00341 if (Choice->Length > 0 && 00342 (label_num_unichars > 1 || Choice->Length > 1)) { 00343 for (int i = 0; i < Choice->Length; i++) { 00344 fprintf(file, "%s", 00345 getUnicharset().id_to_unichar(Choice->Blob[i].Class)); 00346 } 00347 fflush(file); 00348 fprintf(file, "\t%s\t%.4f\t%.4f\n", label, 00349 Choice->Rating, Choice->Certainty); 00350 } 00351 } 00352 } 00353 00354 void Dict::FilterWordChoices() { 00355 EXPANDED_CHOICE BestChoice; 00356 00357 if (best_choices_ == NIL_LIST || second_node (best_choices_) == NIL_LIST) 00358 return; 00359 00360 // Compute certainties and class for each chunk in best choice. 00361 VIABLE_CHOICE_STRUCT *best_choice = 00362 (VIABLE_CHOICE_STRUCT *)first_node(best_choices_); 00363 ExpandChoice(best_choice, &BestChoice); 00364 if (stopper_debug_level >= 2) 00365 PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice); 00366 TessResultCallback2<int, void*, void*>* is_bad = 00367 NewPermanentTessCallback(this, &Dict::FreeBadChoice); 00368 set_rest(best_choices_, delete_d(list_rest(best_choices_), 00369 &BestChoice, is_bad)); 00370 delete is_bad; 00371 } 00372 00373 void Dict::FindClassifierErrors(FLOAT32 MinRating, 00374 FLOAT32 MaxRating, 00375 FLOAT32 RatingMargin, 00376 FLOAT32 Thresholds[]) { 00377 EXPANDED_CHOICE BestRaw; 00378 VIABLE_CHOICE Choice; 00379 int i, j, Chunk; 00380 FLOAT32 AvgRating; 00381 int NumErrorChunks; 00382 00383 assert (best_choices_ != NIL_LIST); 00384 assert (best_raw_choice_ != NULL); 00385 00386 ExpandChoice(best_raw_choice_, &BestRaw); 00387 Choice = (VIABLE_CHOICE) first_node (best_choices_); 00388 00389 for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) { 00390 AvgRating = 0.0; 00391 NumErrorChunks = 0; 00392 00393 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { 00394 if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) { 00395 AvgRating += BestRaw.ChunkCertainty[Chunk]; 00396 NumErrorChunks++; 00397 } 00398 } 00399 00400 if (NumErrorChunks > 0) { 00401 AvgRating /= NumErrorChunks; 00402 *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin); 00403 } 00404 else 00405 *Thresholds = MaxRating; 00406 00407 if (*Thresholds > MaxRating) 00408 *Thresholds = MaxRating; 00409 if (*Thresholds < MinRating) 00410 *Thresholds = MinRating; 00411 } 00412 } 00413 00414 void Dict::InitChoiceAccum() { 00415 BLOB_WIDTH *BlobWidth, *End; 00416 00417 if (best_raw_choice_) 00418 delete best_raw_choice_; 00419 best_raw_choice_ = NULL; 00420 00421 if (best_choices_) 00422 destroy_nodes(best_choices_, DeleteViableChoiceStruct); 00423 best_choices_ = NIL_LIST; 00424 00425 if (raw_choices_) 00426 destroy_nodes(raw_choices_, DeleteViableChoiceStruct); 00427 raw_choices_ = NIL_LIST; 00428 00429 EnableChoiceAccum(); 00430 00431 for (BlobWidth = current_segmentation_, 00432 End = current_segmentation_ + MAX_NUM_CHUNKS; 00433 BlobWidth < End; *BlobWidth++ = 1); 00434 00435 } 00436 00437 void Dict::ClearBestChoiceAccum() { 00438 if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct); 00439 best_choices_ = NIL_LIST; 00440 } 00441 00442 void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) { 00443 BLOB_WIDTH *Segmentation; 00444 for (Segmentation = current_segmentation_; *BlobWidth != 0; 00445 BlobWidth++, Segmentation++) 00446 *Segmentation = *BlobWidth; 00447 *Segmentation = 0; 00448 } 00449 00450 void Dict::LogNewSplit(int Blob) { 00451 LIST Choices; 00452 if (best_raw_choice_) AddNewChunk(best_raw_choice_, Blob); 00453 Choices = best_choices_; 00454 iterate(Choices) { 00455 AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob); 00456 } 00457 Choices = raw_choices_; 00458 iterate(Choices) { 00459 AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob); 00460 } 00461 } 00462 00463 void Dict::LogNewChoice(FLOAT32 AdjustFactor, 00464 const float Certainties[], 00465 bool raw_choice, 00466 WERD_CHOICE *WordChoice) { 00467 LIST ChoicesList; 00468 LIST Choices; 00469 FLOAT32 Threshold; 00470 00471 if (!keep_word_choices_) 00472 return; 00473 00474 if (raw_choice) { 00475 if (!best_raw_choice_) { 00476 best_raw_choice_ = 00477 NewViableChoice(*WordChoice, AdjustFactor, Certainties); 00478 } else if (WordChoice->rating() < best_raw_choice_->Rating) { 00479 if (ChoiceSameAs(*WordChoice, best_raw_choice_)) { 00480 FillViableChoice(*WordChoice, AdjustFactor, Certainties, 00481 best_raw_choice_); 00482 } else { 00483 delete best_raw_choice_; 00484 best_raw_choice_ = 00485 NewViableChoice(*WordChoice, AdjustFactor, Certainties); 00486 } 00487 } 00488 if (!save_raw_choices) return; 00489 ChoicesList = raw_choices_; 00490 } else { 00491 ChoicesList = best_choices_; 00492 } 00493 00494 // Throw out obviously bad choices to save some work. 00495 if (ChoicesList != NIL_LIST) { 00496 Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor); 00497 if (Threshold > -stopper_ambiguity_threshold_offset) 00498 Threshold = -stopper_ambiguity_threshold_offset; 00499 if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) { 00500 // Set the rating of the word to be terrible, so that it does not 00501 // get chosen as the best choice. 00502 if (stopper_debug_level >= 2) { 00503 STRING bad_string; 00504 WordChoice->string_and_lengths(&bad_string, NULL); 00505 tprintf("Discarding choice \"%s\" with an overly low certainty" 00506 " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n", 00507 bad_string.string(), WordChoice->certainty(), 00508 BestCertainty(ChoicesList), 00509 Threshold + BestCertainty(ChoicesList)); 00510 } 00511 WordChoice->set_rating(WERD_CHOICE::kBadRating); 00512 return; 00513 } 00514 } 00515 00516 // See if a choice with the same text string has already been found. 00517 VIABLE_CHOICE NewChoice = NULL; 00518 Choices = ChoicesList; 00519 00520 iterate(Choices) { 00521 if (ChoiceSameAs (*WordChoice, (VIABLE_CHOICE) first_node (Choices))) { 00522 if (WordChoice->rating() < BestRating (Choices)) { 00523 NewChoice = (VIABLE_CHOICE) first_node (Choices); 00524 } else { 00525 return; 00526 } 00527 } 00528 } 00529 00530 if (NewChoice) { 00531 FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice); 00532 ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node); 00533 } else { 00534 NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties); 00535 } 00536 00537 ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings); 00538 if (stopper_debug_level >= 2) 00539 raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) : 00540 PrintViableChoice (stderr, "New Word Choice: ", NewChoice); 00541 if (count (ChoicesList) > tessedit_truncate_wordchoice_log) { 00542 Choices = 00543 (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log); 00544 destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct); 00545 set_rest(Choices, NIL_LIST); 00546 } 00547 00548 // Update raw_choices_/best_choices_ pointer. 00549 if (raw_choice) { 00550 raw_choices_ = ChoicesList; 00551 } else { 00552 best_choices_ = ChoicesList; 00553 } 00554 } 00555 00556 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, 00557 DANGERR *fixpt, 00558 bool fix_replaceable, 00559 BLOB_CHOICE_LIST_VECTOR *blob_choices, 00560 bool *modified_blobs) { 00561 if (stopper_debug_level > 2) { 00562 tprintf("\nRunning NoDangerousAmbig() for %s\n", 00563 best_choice->debug_string().string()); 00564 } 00565 00566 // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities 00567 // for each unichar id in BestChoice. 00568 BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; 00569 int i; 00570 bool modified_best_choice = false; 00571 bool ambigs_found = false; 00572 // For each position in best_choice: 00573 // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] 00574 // -- initialize wrong_ngram with a single unichar_id at best_choice[i] 00575 // -- look for ambiguities corresponding to wrong_ngram in the list while 00576 // adding the following unichar_ids from best_choice to wrong_ngram 00577 // 00578 // Repeat the above procedure twice: first time look through 00579 // ambigs to be replaced and replace all the ambiguities found; 00580 // second time look through dangerous ambiguities and construct 00581 // ambig_blob_choices with fake a blob choice for each ambiguity 00582 // and pass them to dawg_permute_and_select() to search for 00583 // ambiguous words in the dictionaries. 00584 // 00585 // Note that during the execution of the for loop (on the first pass) 00586 // if replacements are made the length of best_choice might change. 00587 for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { 00588 bool replace = (fix_replaceable && pass == 0); 00589 const UnicharAmbigsVector &table = replace ? 00590 getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); 00591 if (!replace) { 00592 // Initialize ambig_blob_choices with lists containing a single 00593 // unichar id for the correspoding position in best_choice. 00594 // best_choice consisting from only the original letters will 00595 // have a rating of 0.0. 00596 for (i = 0; i < best_choice->length(); ++i) { 00597 BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST(); 00598 BLOB_CHOICE_IT lst_it(lst); 00599 // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights 00600 // or are these fake ones good enough? 00601 lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), 00602 0.0, 0.0, -1, -1, -1, 0, 1, false)); 00603 ambig_blob_choices.push_back(lst); 00604 } 00605 } 00606 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; 00607 int wrong_ngram_index; 00608 int next_index; 00609 int blob_index = 0; 00610 for (i = 0; i < best_choice->length(); ++i) { 00611 if (i > 0) blob_index += best_choice->fragment_length(i-1); 00612 UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i); 00613 if (stopper_debug_level > 2) { 00614 tprintf("Looking for %s ngrams starting with %s:\n", 00615 replace ? "replaceable" : "ambiguous", 00616 getUnicharset().debug_str(curr_unichar_id).string()); 00617 } 00618 wrong_ngram_index = 0; 00619 wrong_ngram[wrong_ngram_index] = curr_unichar_id; 00620 if (curr_unichar_id == INVALID_UNICHAR_ID || 00621 curr_unichar_id >= table.size() || 00622 table[curr_unichar_id] == NULL) { 00623 continue; // there is no ambig spec for this unichar id 00624 } 00625 AmbigSpec_IT spec_it(table[curr_unichar_id]); 00626 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { 00627 const AmbigSpec *ambig_spec = spec_it.data(); 00628 wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID; 00629 int compare = UnicharIdArrayUtils::compare(wrong_ngram, 00630 ambig_spec->wrong_ngram); 00631 if (stopper_debug_level > 2) { 00632 tprintf("candidate ngram: "); 00633 UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); 00634 tprintf("current ngram from spec: "); 00635 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); 00636 tprintf("comparison result: %d\n", compare); 00637 } 00638 if (compare == 0) { 00639 // Record the place where we found an ambiguity. 00640 if (fixpt != NULL) { 00641 fixpt->push_back(DANGERR_INFO( 00642 blob_index, blob_index+wrong_ngram_index, replace, 00643 getUnicharset().get_isngram(ambig_spec->correct_ngram_id))); 00644 if (stopper_debug_level > 1) { 00645 tprintf("fixpt+=(%d %d %d %d)\n", blob_index, 00646 blob_index+wrong_ngram_index, false, 00647 getUnicharset().get_isngram( 00648 ambig_spec->correct_ngram_id)); 00649 } 00650 } 00651 00652 if (replace) { 00653 if (stopper_debug_level > 2) { 00654 tprintf("replace ambiguity with: "); 00655 UnicharIdArrayUtils::print( 00656 ambig_spec->correct_fragments, getUnicharset()); 00657 } 00658 ReplaceAmbig(i, ambig_spec->wrong_ngram_size, 00659 ambig_spec->correct_ngram_id, 00660 best_choice, blob_choices, modified_blobs); 00661 modified_best_choice = true; 00662 } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { 00663 // We found dang ambig - update ambig_blob_choices. 00664 if (stopper_debug_level > 2) { 00665 tprintf("found ambiguity: "); 00666 UnicharIdArrayUtils::print( 00667 ambig_spec->correct_fragments, getUnicharset()); 00668 } 00669 ambigs_found = true; 00670 for (int tmp_index = 0; tmp_index <= wrong_ngram_index; 00671 ++tmp_index) { 00672 // Add a blob choice for the corresponding fragment of the 00673 // ambiguity. These fake blob choices are initialized with 00674 // negative ratings (which are not possible for real blob 00675 // choices), so that dawg_permute_and_select() considers any 00676 // word not consisting of only the original letters a better 00677 // choice and stops searching for alternatives once such a 00678 // choice is found. 00679 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); 00680 bc_it.add_to_end(new BLOB_CHOICE( 00681 ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, 00682 -1, -1, -1, 0, 1, false)); 00683 } 00684 } 00685 spec_it.forward(); 00686 } else if (compare == -1) { 00687 if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size && 00688 ((next_index = wrong_ngram_index+1+i) < best_choice->length())) { 00689 // Add the next unichar id to wrong_ngram and keep looking for 00690 // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. 00691 wrong_ngram[++wrong_ngram_index] = 00692 best_choice->unichar_id(next_index); 00693 } else { 00694 break; // no more matching ambigs in this AMBIG_SPEC_LIST 00695 } 00696 } else { 00697 spec_it.forward(); 00698 } 00699 } // end searching AmbigSpec_LIST 00700 } // end searching best_choice 00701 } // end searching replace and dangerous ambigs 00702 00703 // If any ambiguities were found permute the constructed ambig_blob_choices 00704 // to see if an alternative dictionary word can be found. 00705 if (ambigs_found) { 00706 if (stopper_debug_level > 2) { 00707 tprintf("\nResulting ambig_blob_choices:\n"); 00708 for (i = 0; i < ambig_blob_choices.length(); ++i) { 00709 print_ratings_list("", ambig_blob_choices.get(i), getUnicharset()); 00710 tprintf("\n"); 00711 } 00712 } 00713 WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); 00714 ambigs_found = (alt_word->rating() < 0.0); 00715 if (ambigs_found) { 00716 if (stopper_debug_level >= 1) { 00717 tprintf ("Stopper: Possible ambiguous word = %s\n", 00718 alt_word->debug_string().string()); 00719 } 00720 if (fixpt != NULL) { 00721 // Note: Currently character choices combined from fragments can only 00722 // be generated by NoDangrousAmbigs(). This code should be updated if 00723 // the capability to produce classifications combined from character 00724 // fragments is added to other functions. 00725 int orig_i = 0; 00726 for (i = 0; i < alt_word->length(); ++i) { 00727 bool replacement_is_ngram = 00728 getUnicharset().get_isngram(alt_word->unichar_id(i)); 00729 int end_i = orig_i + alt_word->fragment_length(i) - 1; 00730 if (alt_word->fragment_length(i) > 1 || 00731 (orig_i == end_i && replacement_is_ngram)) { 00732 fixpt->push_back(DANGERR_INFO(orig_i, end_i, true, 00733 replacement_is_ngram)); 00734 if (stopper_debug_level > 1) { 00735 tprintf("fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i, 00736 true, replacement_is_ngram); 00737 } 00738 } 00739 orig_i += alt_word->fragment_length(i); 00740 } 00741 } 00742 } 00743 delete alt_word; 00744 } 00745 if (output_ambig_words_file_ != NULL) { 00746 fprintf(output_ambig_words_file_, "\n"); 00747 } 00748 00749 ambig_blob_choices.delete_data_pointers(); 00750 return !ambigs_found; 00751 } 00752 00753 void Dict::EndDangerousAmbigs() {} 00754 00755 void Dict::SettupStopperPass1() { 00756 reject_offset_ = 0.0; 00757 } 00758 00759 void Dict::SettupStopperPass2() { 00760 reject_offset_ = stopper_phase2_certainty_rejection_offset; 00761 } 00762 00763 void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) { 00764 int i, LastChunk; 00765 for (i = 0, LastChunk = 0; i < Choice->Length; i++) { 00766 LastChunk += Choice->Blob[i].NumChunks; 00767 if (Blob < LastChunk) { 00768 (Choice->Blob[i].NumChunks)++; 00769 return; 00770 } 00771 } 00772 cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n", 00773 Choice->Length, LastChunk, Blob); 00774 assert(false); // this should never get executed 00775 } 00776 00777 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, 00778 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, 00779 BLOB_CHOICE_LIST_VECTOR *blob_choices, 00780 bool *modified_blobs) { 00781 int num_blobs_to_replace = 0; 00782 int begin_blob_index = 0; 00783 int i; 00784 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { 00785 if (i >= wrong_ngram_begin_index) { 00786 num_blobs_to_replace += werd_choice->fragment_length(i); 00787 } else { 00788 begin_blob_index += werd_choice->fragment_length(i); 00789 } 00790 } 00791 BLOB_CHOICE_IT bit; 00792 int temp_blob_index = begin_blob_index; 00793 const char *temp_uch = NULL; 00794 const char *correct_ngram_str = 00795 getUnicharset().id_to_unichar(correct_ngram_id); 00796 for (int replaced_count = 0; replaced_count < wrong_ngram_size; 00797 ++replaced_count) { 00798 if (blob_choices != NULL) { 00799 UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index); 00800 int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index); 00801 if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id); 00802 for (i = 0; i < fraglen; ++i) { 00803 if (fraglen > 1) { 00804 STRING frag_str = 00805 CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false); 00806 getUnicharset().unichar_insert(frag_str.string()); 00807 uch_id = getUnicharset().unichar_to_id(frag_str.string()); 00808 } 00809 bit.set_to_list(blob_choices->get(temp_blob_index)); 00810 STRING correct_frag_uch = 00811 CHAR_FRAGMENT::to_string(correct_ngram_str, 00812 temp_blob_index - begin_blob_index, 00813 num_blobs_to_replace, false); 00814 getUnicharset().unichar_insert(correct_frag_uch.string()); 00815 UNICHAR_ID correct_frag_uch_id = 00816 getUnicharset().unichar_to_id(correct_frag_uch.string()); 00817 // Find the WERD_CHOICE corresponding to the original unichar in 00818 // the list of blob choices, add the derived character fragment 00819 // before it with the same rating and certainty. 00820 for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) { 00821 if (bit.data()->unichar_id() == correct_frag_uch_id) { 00822 break; // the unichar we want to insert is already there 00823 } 00824 if (bit.data()->unichar_id() == uch_id) { 00825 bit.add_before_then_move(new BLOB_CHOICE(*(bit.data()))); 00826 bit.data()->set_unichar_id(correct_frag_uch_id); 00827 if (modified_blobs != NULL) *modified_blobs = true; 00828 break; 00829 } 00830 } 00831 temp_blob_index++; 00832 } 00833 } 00834 // Remove current unichar from werd_choice. On the last iteration 00835 // set the correct replacement unichar instead of removing a unichar. 00836 if (replaced_count + 1 == wrong_ngram_size) { 00837 werd_choice->set_unichar_id(correct_ngram_id, 00838 num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index); 00839 } else { 00840 werd_choice->remove_unichar_id(wrong_ngram_begin_index); 00841 } 00842 } 00843 if (stopper_debug_level >= 1 && modified_blobs != NULL && 00844 *modified_blobs && blob_choices != NULL) { 00845 werd_choice->print("ReplaceAmbig() "); 00846 tprintf("Modified blob_choices: "); 00847 for (int i = 0; i < blob_choices->size(); ++i) { 00848 print_ratings_list("\n", blob_choices->get(i), getUnicharset()); 00849 } 00850 } 00851 } 00852 00853 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice, 00854 VIABLE_CHOICE ViableChoice) { 00855 return (StringSameAs(WordChoice, ViableChoice)); 00856 } 00857 00858 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) { 00859 int shortest = MAX_INT32; 00860 int curr_len = 0; 00861 for (int w = 0; w < WordChoice.length(); ++w) { 00862 if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) { 00863 curr_len++; 00864 } else if (curr_len > 0) { 00865 if (curr_len < shortest) shortest = curr_len; 00866 curr_len = 0; 00867 } 00868 } 00869 if (curr_len > 0 && curr_len < shortest) { 00870 shortest = curr_len; 00871 } else if (shortest == MAX_INT32) { 00872 shortest = 0; 00873 } 00874 return shortest; 00875 } 00876 00877 VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice, 00878 FLOAT32 AdjustFactor, 00879 const float Certainties[]) { 00880 int Length = WordChoice.length(); 00881 assert (Length <= MAX_NUM_CHUNKS && Length > 0); 00882 VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length); 00883 FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice); 00884 return NewChoice; 00885 } 00886 00887 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) { 00888 int i, j; 00889 fprintf (File, "%s", Label); 00890 fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ", 00891 Choice->Rating, Choice->Certainty, 00892 Choice->AdjustFactor, Choice->ComposedFromCharFragments); 00893 00894 for (i = 0; i < Choice->Length; i++) 00895 fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class)); 00896 fprintf(File, "\n"); 00897 00898 for (i = 0; i < Choice->Length; i++) { 00899 fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class)); 00900 for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++) 00901 fprintf(File, " "); 00902 } 00903 fprintf(File, "\n"); 00904 00905 for (i = 0; i < Choice->Length; i++) { 00906 for (j = 0; j < Choice->Blob[i].NumChunks; j++) 00907 fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0)); 00908 } 00909 fprintf(File, "\n"); 00910 00911 for (i = 0; i < Choice->Length; i++) { 00912 for (j = 0; j < Choice->Blob[i].NumChunks; j++) 00913 fprintf(File, "%3d ", Choice->Blob[i].NumChunks); 00914 } 00915 fprintf(File, "\n"); 00916 } 00917 00918 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice, 00919 FLOAT32 AdjustFactor, const float Certainties[], 00920 VIABLE_CHOICE ViableChoice) { 00921 ViableChoice->Init(WordChoice, current_segmentation_, Certainties, 00922 AdjustFactor); 00923 00924 } 00925 00926 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice, 00927 VIABLE_CHOICE ViableChoice) { 00928 if (WordChoice.length() != ViableChoice->Length) { 00929 return false; 00930 } 00931 int i; 00932 CHAR_CHOICE *CharChoice; 00933 for (i = 0, CharChoice = &(ViableChoice->Blob[0]); 00934 i < ViableChoice->Length; CharChoice++, i++) { 00935 if (CharChoice->Class != WordChoice.unichar_id(i)) { 00936 return false; 00937 } 00938 } 00939 return true; 00940 } 00941 00942 bool Dict::StringSameAs(const char *String, 00943 const char *String_lengths, 00944 VIABLE_CHOICE ViableChoice) { 00945 CHAR_CHOICE *Char; 00946 int i; 00947 int current_unichar_length; 00948 00949 for (Char = &(ViableChoice->Blob[0]), i = 0; 00950 i < ViableChoice->Length; 00951 String += *(String_lengths++), Char++, i++) { 00952 current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class)); 00953 if (current_unichar_length != *String_lengths || 00954 strncmp(String, getUnicharset().id_to_unichar(Char->Class), 00955 current_unichar_length) != 0) 00956 return false; 00957 } 00958 return (*String == 0) ? true : false; 00959 } 00960 00961 int Dict::UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, 00962 const WERD_CHOICE &BestChoice) { 00963 float Certainty; 00964 float WorstCertainty = MAX_FLOAT32; 00965 float CertaintyThreshold; 00966 FLOAT64 TotalCertainty; 00967 FLOAT64 TotalCertaintySquared; 00968 FLOAT64 Variance; 00969 FLOAT32 Mean, StdDev; 00970 int WordLength; 00971 00972 WordLength = Choices.length(); 00973 if (WordLength < 3) 00974 return true; 00975 00976 TotalCertainty = TotalCertaintySquared = 0.0; 00977 BLOB_CHOICE_IT BlobChoiceIt; 00978 for (int i = 0; i < Choices.length(); ++i) { 00979 BlobChoiceIt.set_to_list(Choices.get(i)); 00980 Certainty = BlobChoiceIt.data()->certainty(); 00981 TotalCertainty += Certainty; 00982 TotalCertaintySquared += Certainty * Certainty; 00983 if (Certainty < WorstCertainty) 00984 WorstCertainty = Certainty; 00985 } 00986 00987 // Subtract off worst certainty from statistics. 00988 WordLength--; 00989 TotalCertainty -= WorstCertainty; 00990 TotalCertaintySquared -= WorstCertainty * WorstCertainty; 00991 00992 Mean = TotalCertainty / WordLength; 00993 Variance = ((WordLength * TotalCertaintySquared - 00994 TotalCertainty * TotalCertainty) / 00995 (WordLength * (WordLength - 1))); 00996 if (Variance < 0.0) 00997 Variance = 0.0; 00998 StdDev = sqrt (Variance); 00999 01000 CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; 01001 if (CertaintyThreshold > stopper_nondict_certainty_base) 01002 CertaintyThreshold = stopper_nondict_certainty_base; 01003 01004 if (BestChoice.certainty() < CertaintyThreshold) { 01005 if (stopper_debug_level >= 1) 01006 cprintf("Stopper: Non-uniform certainty = %4.1f" 01007 " (m=%4.1f, s=%4.1f, t=%4.1f)\n", 01008 BestChoice.certainty(), Mean, StdDev, CertaintyThreshold); 01009 return false; 01010 } else { 01011 return true; 01012 } 01013 } 01014 01015 } // namespace tesseract