Tesseract  3.02
tesseract-ocr/dict/stopper.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    stopper.c
00003  **     Purpose:     Stopping criteria for word classifier.
00004  **     Author:      Dan Johnson
00005  **     History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00019 #include "stopper.h"
00020 #include "matchdefs.h"
00021 #include "callcpp.h"
00022 #include "permute.h"
00023 #include "danerror.h"
00024 #include "const.h"
00025 #include "efio.h"
00026 #include "scanutils.h"
00027 #include "unichar.h"
00028 #include "params.h"
00029 #include "dict.h"
00030 #include "image.h"
00031 #include "ccutil.h"
00032 #include "ratngs.h"
00033 #include "ambigs.h"
00034 
00035 #include <stdio.h>
00036 #include <string.h>
00037 #include <ctype.h>
00038 #include <math.h>
00039 #ifdef __UNIX__
00040 #include <assert.h>
00041 #endif
00042 
00043 #ifdef _MSC_VER
00044 #pragma warning(disable:4244)  // Conversion warnings
00045 #pragma warning(disable:4800)  // int/bool warnings
00046 #endif
00047 
00048 /* these are kludges - add appropriate .h file later */
00049 /* from adaptmatch.cpp */
00050 #define MAX_WERD_SIZE   100
00051 
00052 typedef struct
00053 {
00054   VIABLE_CHOICE Choice;
00055   float ChunkCertainty[MAX_NUM_CHUNKS];
00056   UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
00057 } EXPANDED_CHOICE;
00058 
00059 void DeleteViableChoiceStruct(void *vcs) {
00060   delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs));
00061 }
00062 
00063 #define BestCertainty(Choices) \
00064   (((VIABLE_CHOICE) first_node (Choices))->Certainty)
00065 
00066 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
00067 
00068 #define BestFactor(Choices) \
00069   (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
00070 
00074 // Returns -1 if the rating for Choice1 is less than the rating for Choice2,
00075 // otherwise returns 1.
00076 static int CmpChoiceRatings(void *arg1,    // VIABLE_CHOICE Choice1
00077                             void *arg2) {  // VIABLE_CHOICE Choice2
00078   float R1, R2;
00079   VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
00080   VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
00081   R1 = Choice1->Rating;
00082   R2 = Choice2->Rating;
00083   return (R1 < R2) ? -1 : 1;
00084 }
00085 
00086 // Expands Choice and places the results in ExpandedChoice. The primary
00087 // function of expansion is to create an two arrays, one which holds the
00088 // corresponding certainty for each chunk in Choice, and one which holds
00089 // the class for each chunk.
00090 static void ExpandChoice(VIABLE_CHOICE Choice,
00091                          EXPANDED_CHOICE *ExpandedChoice) {
00092   int i, j, Chunk;
00093   ExpandedChoice->Choice = Choice;
00094   for (i = 0, Chunk = 0; i < Choice->Length; i++)
00095   for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
00096     ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
00097     ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
00098   }
00099 }
00100 
00101 VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length)
00102     : Length(length) {
00103   Blob = new CHAR_CHOICE[length];
00104 }
00105 
00106 VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) {
00107   Blob = NULL;
00108 }
00109 
00110 VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() {
00111   delete []Blob;
00112 }
00113 
00114 void VIABLE_CHOICE_STRUCT::Init(
00115     const WERD_CHOICE &word_choice,
00116     const PIECES_STATE &pieces_state,
00117     const float certainties[],
00118     FLOAT32 adjust_factor) {
00119   this->Rating = word_choice.rating();
00120   this->Certainty = word_choice.certainty();
00121   this->AdjustFactor = adjust_factor;
00122   this->ComposedFromCharFragments = false;
00123   ASSERT_HOST(this->Length == word_choice.length());
00124 
00125   for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) {
00126     int blob_width = pieces_state[bw_idx];
00127     CHAR_CHOICE *blob_choice = &this->Blob[i];
00128     blob_choice->Class = word_choice.unichar_id(i);
00129     blob_choice->NumChunks = blob_width;
00130     blob_choice->Certainty = certainties[i];
00131     for (int f = 1; f < word_choice.fragment_length(i); ++f) {
00132       blob_width = pieces_state[++bw_idx];
00133       assert(blob_width > 0);
00134       blob_choice->NumChunks += blob_width;
00135       this->ComposedFromCharFragments = true;
00136     }
00137   }
00138 }
00139 
00140 
00141 namespace tesseract {
00142 
00143 // If the certainty of any chunk in Choice (item1) is not ambiguous with the
00144 // corresponding chunk in the best choice (item2), frees Choice and
00145 // returns true.
00146 int Dict::FreeBadChoice(
00147     void *item1,    // VIABLE_CHOICE Choice,
00148     void *item2) {  // EXPANDED_CHOICE *BestChoice
00149   int i, j, Chunk;
00150   FLOAT32 Threshold;
00151   VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
00152   EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
00153   Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
00154                                     Choice->AdjustFactor);
00155   for (i = 0, Chunk = 0; i < Choice->Length; i++) {
00156     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
00157       if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
00158           Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
00159           Threshold) {
00160         if (stopper_debug_level >= 2)
00161           PrintViableChoice(stderr, "\nDiscarding bad choice:  ", Choice);
00162         delete Choice;
00163         return true;
00164       }
00165     }
00166   }
00167   return false;
00168 }
00169 
00170 bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
00171                             WERD_CHOICE *BestChoice,
00172                             DANGERR *fixpt,
00173                             ACCEPTABLE_CHOICE_CALLER caller,
00174                             bool *modified_blobs) {
00175   float CertaintyThreshold = stopper_nondict_certainty_base;
00176   int WordSize;
00177   if (modified_blobs != NULL) *modified_blobs = false;
00178 
00179   if (stopper_no_acceptable_choices) return false;
00180 
00181   if (fixpt != NULL) fixpt->clear();
00182   if (BestChoice->length() == 0)
00183     return false;
00184   if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
00185     if (stopper_debug_level >= 1) {
00186       cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
00187     }
00188     return false;
00189   }
00190 
00191   bool no_dang_ambigs = (GetMaxFixedLengthDawgIndex() >= 0 ||
00192                          NoDangerousAmbig(BestChoice, fixpt, true,
00193                                           Choices, modified_blobs));
00194   bool is_valid_word = valid_word_permuter(BestChoice->permuter(), false);
00195   bool is_case_ok = case_ok(*BestChoice, getUnicharset());
00196 
00197   if (stopper_debug_level >= 1)
00198     tprintf("\nStopper:  %s (word=%c, case=%c)\n",
00199             BestChoice->debug_string().string(),
00200             (is_valid_word ? 'y' : 'n'),
00201             (is_case_ok ? 'y' : 'n'));
00202 
00203   // Do not accept invalid words in PASS1.
00204   if (reject_offset_ <= 0.0f && !is_valid_word) return false;
00205   if (is_valid_word && is_case_ok) {
00206     WordSize = LengthOfShortestAlphaRun(*BestChoice);
00207     WordSize -= stopper_smallword_size;
00208     if (WordSize < 0)
00209       WordSize = 0;
00210     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00211   }
00212 
00213   if (stopper_debug_level >= 1)
00214     tprintf("Stopper:  Certainty = %4.1f, Threshold = %4.1f\n",
00215             BestChoice->certainty(), CertaintyThreshold);
00216 
00217   if (no_dang_ambigs &&
00218       BestChoice->certainty() > CertaintyThreshold &&
00219       UniformCertainties(*Choices, *BestChoice)) {
00220     return true;
00221   } else {
00222     if (stopper_debug_level >= 2) {
00223       tprintf("AcceptableChoice() returned false"
00224               " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n",
00225               no_dang_ambigs, BestChoice->certainty(),
00226               CertaintyThreshold,
00227               UniformCertainties(*Choices, *BestChoice));
00228     }
00229     return false;
00230   }
00231 }
00232 
00233 bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
00234   float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
00235   int WordSize;
00236 
00237   if (stopper_debug_level >= 1) {
00238     tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
00239             BestChoice.debug_string().string(),
00240             (valid_word(BestChoice) ? 'y' : 'n'),
00241             (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
00242             ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
00243   }
00244 
00245   if (BestChoice.length() == 0 || CurrentWordAmbig())
00246     return false;
00247   if (BestChoice.fragment_mark()) {
00248     if (stopper_debug_level >= 1) {
00249       cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
00250     }
00251     return false;
00252   }
00253   if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) {
00254     WordSize = LengthOfShortestAlphaRun(BestChoice);
00255     WordSize -= stopper_smallword_size;
00256     if (WordSize < 0)
00257       WordSize = 0;
00258     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00259   }
00260 
00261   if (stopper_debug_level >= 1)
00262     cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
00263       BestChoice.certainty(), CertaintyThreshold);
00264 
00265   if (BestChoice.certainty() > CertaintyThreshold &&
00266       !stopper_no_acceptable_choices) {
00267     if (stopper_debug_level >= 1)
00268       cprintf("ACCEPTED\n");
00269     return true;
00270   }
00271   else {
00272     if (stopper_debug_level >= 1)
00273       cprintf("REJECTED\n");
00274     return false;
00275   }
00276 }
00277 
00278 bool Dict::AlternativeChoicesWorseThan(FLOAT32 Threshold) {
00279   LIST Alternatives;
00280   VIABLE_CHOICE Choice;
00281   Alternatives = list_rest (best_choices_);
00282   iterate(Alternatives) {
00283     Choice = (VIABLE_CHOICE) first_node (Alternatives);
00284     if (Choice->AdjustFactor <= Threshold)
00285       return false;
00286   }
00287   return true;
00288 }
00289 
00290 bool Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
00291   return (best_choices_ != NIL_LIST &&
00292           StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
00293 }
00294 
00295 FLOAT32 Dict::CurrentBestChoiceAdjustFactor() {
00296   VIABLE_CHOICE BestChoice;
00297   if (best_choices_ == NIL_LIST)
00298     return (MAX_FLOAT32);
00299   BestChoice = (VIABLE_CHOICE) first_node (best_choices_);
00300   return (BestChoice->AdjustFactor);
00301 }
00302 
00303 
00304 bool Dict::CurrentWordAmbig() {
00305   return (list_rest (best_choices_) != NIL_LIST);
00306 }
00307 
00308 
00309 void Dict::DebugWordChoices() {
00310   LIST Choices;
00311   int i;
00312   char LabelString[80];
00313   VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
00314   bool force_debug =
00315     fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
00316 
00317   if (stopper_debug_level >= 1 || force_debug ||
00318   (((STRING)word_to_debug).length() > 0 && best_choices_ &&
00319        StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
00320                     (VIABLE_CHOICE)first_node(best_choices_)))) {
00321     if (best_raw_choice_)
00322       PrintViableChoice(stderr, "\nBest Raw Choice:   ", best_raw_choice_);
00323 
00324     i = 1;
00325     Choices = best_choices_;
00326     if (Choices)
00327       cprintf("\nBest Cooked Choices:\n");
00328     iterate(Choices) {
00329       sprintf(LabelString, "Cooked Choice #%d:  ", i);
00330       PrintViableChoice(stderr, LabelString,
00331                         (VIABLE_CHOICE)first_node(Choices));
00332       i++;
00333     }
00334   }
00335 }
00336 
00337 void Dict::PrintAmbigAlternatives(FILE *file, const char *label,
00338                                   int label_num_unichars) {
00339   iterate(raw_choices_) {
00340     VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
00341     if (Choice->Length > 0 &&
00342         (label_num_unichars > 1 || Choice->Length > 1)) {
00343       for (int i = 0; i < Choice->Length; i++) {
00344         fprintf(file, "%s",
00345                 getUnicharset().id_to_unichar(Choice->Blob[i].Class));
00346       }
00347       fflush(file);
00348       fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
00349               Choice->Rating, Choice->Certainty);
00350     }
00351   }
00352 }
00353 
00354 void Dict::FilterWordChoices() {
00355   EXPANDED_CHOICE BestChoice;
00356 
00357   if (best_choices_ == NIL_LIST || second_node (best_choices_) == NIL_LIST)
00358     return;
00359 
00360   // Compute certainties and class for each chunk in best choice.
00361   VIABLE_CHOICE_STRUCT *best_choice =
00362       (VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
00363   ExpandChoice(best_choice, &BestChoice);
00364   if (stopper_debug_level >= 2)
00365     PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
00366   TessResultCallback2<int, void*, void*>* is_bad =
00367       NewPermanentTessCallback(this, &Dict::FreeBadChoice);
00368   set_rest(best_choices_, delete_d(list_rest(best_choices_),
00369                                    &BestChoice, is_bad));
00370   delete is_bad;
00371 }
00372 
00373 void Dict::FindClassifierErrors(FLOAT32 MinRating,
00374                                 FLOAT32 MaxRating,
00375                                 FLOAT32 RatingMargin,
00376                                 FLOAT32 Thresholds[]) {
00377   EXPANDED_CHOICE BestRaw;
00378   VIABLE_CHOICE Choice;
00379   int i, j, Chunk;
00380   FLOAT32 AvgRating;
00381   int NumErrorChunks;
00382 
00383   assert (best_choices_ != NIL_LIST);
00384   assert (best_raw_choice_ != NULL);
00385 
00386   ExpandChoice(best_raw_choice_, &BestRaw);
00387   Choice = (VIABLE_CHOICE) first_node (best_choices_);
00388 
00389   for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
00390     AvgRating = 0.0;
00391     NumErrorChunks = 0;
00392 
00393     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
00394       if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
00395         AvgRating += BestRaw.ChunkCertainty[Chunk];
00396         NumErrorChunks++;
00397       }
00398     }
00399 
00400     if (NumErrorChunks > 0) {
00401       AvgRating /= NumErrorChunks;
00402       *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
00403     }
00404     else
00405       *Thresholds = MaxRating;
00406 
00407     if (*Thresholds > MaxRating)
00408       *Thresholds = MaxRating;
00409     if (*Thresholds < MinRating)
00410       *Thresholds = MinRating;
00411   }
00412 }
00413 
00414 void Dict::InitChoiceAccum() {
00415   BLOB_WIDTH *BlobWidth, *End;
00416 
00417   if (best_raw_choice_)
00418     delete best_raw_choice_;
00419   best_raw_choice_ = NULL;
00420 
00421   if (best_choices_)
00422     destroy_nodes(best_choices_, DeleteViableChoiceStruct);
00423   best_choices_ = NIL_LIST;
00424 
00425   if (raw_choices_)
00426     destroy_nodes(raw_choices_, DeleteViableChoiceStruct);
00427   raw_choices_ = NIL_LIST;
00428 
00429   EnableChoiceAccum();
00430 
00431   for (BlobWidth = current_segmentation_,
00432     End = current_segmentation_ + MAX_NUM_CHUNKS;
00433     BlobWidth < End; *BlobWidth++ = 1);
00434 
00435 }
00436 
00437 void Dict::ClearBestChoiceAccum() {
00438   if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
00439   best_choices_ = NIL_LIST;
00440 }
00441 
00442 void Dict::LogNewSegmentation(PIECES_STATE BlobWidth) {
00443   BLOB_WIDTH *Segmentation;
00444   for (Segmentation = current_segmentation_; *BlobWidth != 0;
00445     BlobWidth++, Segmentation++)
00446   *Segmentation = *BlobWidth;
00447   *Segmentation = 0;
00448 }
00449 
00450 void Dict::LogNewSplit(int Blob) {
00451   LIST Choices;
00452   if (best_raw_choice_) AddNewChunk(best_raw_choice_, Blob);
00453   Choices = best_choices_;
00454   iterate(Choices) {
00455     AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
00456   }
00457   Choices = raw_choices_;
00458   iterate(Choices) {
00459     AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
00460   }
00461 }
00462 
00463 void Dict::LogNewChoice(FLOAT32 AdjustFactor,
00464                         const float Certainties[],
00465                         bool raw_choice,
00466                         WERD_CHOICE *WordChoice) {
00467   LIST ChoicesList;
00468   LIST Choices;
00469   FLOAT32 Threshold;
00470 
00471   if (!keep_word_choices_)
00472     return;
00473 
00474   if (raw_choice) {
00475     if (!best_raw_choice_) {
00476       best_raw_choice_ =
00477           NewViableChoice(*WordChoice, AdjustFactor, Certainties);
00478     } else if (WordChoice->rating() < best_raw_choice_->Rating) {
00479       if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
00480         FillViableChoice(*WordChoice, AdjustFactor, Certainties,
00481                          best_raw_choice_);
00482       } else {
00483         delete best_raw_choice_;
00484         best_raw_choice_ =
00485           NewViableChoice(*WordChoice, AdjustFactor, Certainties);
00486       }
00487     }
00488     if (!save_raw_choices) return;
00489     ChoicesList = raw_choices_;
00490   } else {
00491     ChoicesList = best_choices_;
00492   }
00493 
00494   // Throw out obviously bad choices to save some work.
00495   if (ChoicesList != NIL_LIST) {
00496     Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
00497     if (Threshold > -stopper_ambiguity_threshold_offset)
00498       Threshold = -stopper_ambiguity_threshold_offset;
00499     if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
00500       // Set the rating of the word to be terrible, so that it does not
00501       // get chosen as the best choice.
00502       if (stopper_debug_level >= 2) {
00503         STRING bad_string;
00504         WordChoice->string_and_lengths(&bad_string, NULL);
00505         tprintf("Discarding choice \"%s\" with an overly low certainty"
00506                 " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
00507                 bad_string.string(), WordChoice->certainty(),
00508                 BestCertainty(ChoicesList),
00509                 Threshold + BestCertainty(ChoicesList));
00510       }
00511       WordChoice->set_rating(WERD_CHOICE::kBadRating);
00512       return;
00513     }
00514   }
00515 
00516   // See if a choice with the same text string has already been found.
00517   VIABLE_CHOICE NewChoice = NULL;
00518   Choices = ChoicesList;
00519 
00520   iterate(Choices) {
00521     if (ChoiceSameAs (*WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
00522       if (WordChoice->rating() < BestRating (Choices)) {
00523         NewChoice = (VIABLE_CHOICE) first_node (Choices);
00524       } else {
00525         return;
00526       }
00527     }
00528   }
00529 
00530   if (NewChoice) {
00531     FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
00532     ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
00533   } else {
00534     NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
00535   }
00536 
00537   ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
00538   if (stopper_debug_level >= 2)
00539     raw_choice ? PrintViableChoice (stderr, "New Raw Choice:  ", NewChoice) :
00540       PrintViableChoice (stderr, "New Word Choice:  ", NewChoice);
00541   if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
00542     Choices =
00543       (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
00544     destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct);
00545     set_rest(Choices, NIL_LIST);
00546   }
00547 
00548   // Update raw_choices_/best_choices_ pointer.
00549   if (raw_choice) {
00550     raw_choices_ = ChoicesList;
00551   } else {
00552     best_choices_ = ChoicesList;
00553   }
00554 }
00555 
00556 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
00557                             DANGERR *fixpt,
00558                             bool fix_replaceable,
00559                             BLOB_CHOICE_LIST_VECTOR *blob_choices,
00560                             bool *modified_blobs) {
00561   if (stopper_debug_level > 2) {
00562     tprintf("\nRunning NoDangerousAmbig() for %s\n",
00563             best_choice->debug_string().string());
00564   }
00565 
00566   // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
00567   // for each unichar id in BestChoice.
00568   BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
00569   int i;
00570   bool modified_best_choice = false;
00571   bool ambigs_found = false;
00572   // For each position in best_choice:
00573   // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
00574   // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
00575   // -- look for ambiguities corresponding to wrong_ngram in the list while
00576   //    adding the following unichar_ids from best_choice to wrong_ngram
00577   //
00578   // Repeat the above procedure twice: first time look through
00579   // ambigs to be replaced and replace all the ambiguities found;
00580   // second time look through dangerous ambiguities and construct
00581   // ambig_blob_choices with fake a blob choice for each ambiguity
00582   // and pass them to dawg_permute_and_select() to search for
00583   // ambiguous words in the dictionaries.
00584   //
00585   // Note that during the execution of the for loop (on the first pass)
00586   // if replacements are made the length of best_choice might change.
00587   for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
00588     bool replace = (fix_replaceable && pass == 0);
00589     const UnicharAmbigsVector &table = replace ?
00590       getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
00591     if (!replace) {
00592       // Initialize ambig_blob_choices with lists containing a single
00593       // unichar id for the correspoding position in best_choice.
00594       // best_choice consisting from only the original letters will
00595       // have a rating of 0.0.
00596       for (i = 0; i < best_choice->length(); ++i) {
00597         BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
00598         BLOB_CHOICE_IT lst_it(lst);
00599         // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
00600         // or are these fake ones good enough?
00601         lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
00602                                           0.0, 0.0, -1, -1, -1, 0, 1, false));
00603         ambig_blob_choices.push_back(lst);
00604       }
00605     }
00606     UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00607     int wrong_ngram_index;
00608     int next_index;
00609     int blob_index = 0;
00610     for (i = 0; i < best_choice->length(); ++i) {
00611       if (i > 0) blob_index += best_choice->fragment_length(i-1);
00612       UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
00613       if (stopper_debug_level > 2) {
00614         tprintf("Looking for %s ngrams starting with %s:\n",
00615                 replace ? "replaceable" : "ambiguous",
00616                 getUnicharset().debug_str(curr_unichar_id).string());
00617       }
00618       wrong_ngram_index = 0;
00619       wrong_ngram[wrong_ngram_index] = curr_unichar_id;
00620       if (curr_unichar_id == INVALID_UNICHAR_ID ||
00621           curr_unichar_id >= table.size() ||
00622           table[curr_unichar_id] == NULL) {
00623         continue;  // there is no ambig spec for this unichar id
00624       }
00625       AmbigSpec_IT spec_it(table[curr_unichar_id]);
00626       for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
00627         const AmbigSpec *ambig_spec = spec_it.data();
00628         wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
00629         int compare = UnicharIdArrayUtils::compare(wrong_ngram,
00630                                                    ambig_spec->wrong_ngram);
00631         if (stopper_debug_level > 2) {
00632           tprintf("candidate ngram: ");
00633           UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
00634           tprintf("current ngram from spec: ");
00635           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
00636           tprintf("comparison result: %d\n", compare);
00637         }
00638         if (compare == 0) {
00639           // Record the place where we found an ambiguity.
00640           if (fixpt != NULL) {
00641             fixpt->push_back(DANGERR_INFO(
00642                 blob_index, blob_index+wrong_ngram_index, replace,
00643                 getUnicharset().get_isngram(ambig_spec->correct_ngram_id)));
00644             if (stopper_debug_level > 1) {
00645               tprintf("fixpt+=(%d %d %d %d)\n", blob_index,
00646                       blob_index+wrong_ngram_index, false,
00647                       getUnicharset().get_isngram(
00648                           ambig_spec->correct_ngram_id));
00649             }
00650           }
00651 
00652           if (replace) {
00653             if (stopper_debug_level > 2) {
00654               tprintf("replace ambiguity with: ");
00655               UnicharIdArrayUtils::print(
00656                   ambig_spec->correct_fragments, getUnicharset());
00657             }
00658             ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
00659                          ambig_spec->correct_ngram_id,
00660                          best_choice, blob_choices, modified_blobs);
00661             modified_best_choice = true;
00662           } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
00663             // We found dang ambig - update ambig_blob_choices.
00664             if (stopper_debug_level > 2) {
00665               tprintf("found ambiguity: ");
00666               UnicharIdArrayUtils::print(
00667                   ambig_spec->correct_fragments, getUnicharset());
00668             }
00669             ambigs_found = true;
00670             for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
00671                  ++tmp_index) {
00672               // Add a blob choice for the corresponding fragment of the
00673               // ambiguity. These fake blob choices are initialized with
00674               // negative ratings (which are not possible for real blob
00675               // choices), so that dawg_permute_and_select() considers any
00676               // word not consisting of only the original letters a better
00677               // choice and stops searching for alternatives once such a
00678               // choice is found.
00679               BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
00680               bc_it.add_to_end(new BLOB_CHOICE(
00681                   ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
00682                   -1, -1, -1, 0, 1, false));
00683             }
00684           }
00685           spec_it.forward();
00686         } else if (compare == -1) {
00687           if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
00688               ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
00689             // Add the next unichar id to wrong_ngram and keep looking for
00690             // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
00691             wrong_ngram[++wrong_ngram_index] =
00692               best_choice->unichar_id(next_index);
00693           } else {
00694             break;  // no more matching ambigs in this AMBIG_SPEC_LIST
00695           }
00696         } else {
00697           spec_it.forward();
00698         }
00699       }  // end searching AmbigSpec_LIST
00700     }  // end searching best_choice
00701   }  // end searching replace and dangerous ambigs
00702 
00703   // If any ambiguities were found permute the constructed ambig_blob_choices
00704   // to see if an alternative dictionary word can be found.
00705   if (ambigs_found) {
00706     if (stopper_debug_level > 2) {
00707       tprintf("\nResulting ambig_blob_choices:\n");
00708       for (i = 0; i < ambig_blob_choices.length(); ++i) {
00709         print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
00710         tprintf("\n");
00711       }
00712     }
00713     WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
00714     ambigs_found = (alt_word->rating() < 0.0);
00715     if (ambigs_found) {
00716       if (stopper_debug_level >= 1) {
00717         tprintf ("Stopper: Possible ambiguous word = %s\n",
00718                  alt_word->debug_string().string());
00719       }
00720       if (fixpt != NULL) {
00721         // Note: Currently character choices combined from fragments can only
00722         // be generated by NoDangrousAmbigs(). This code should be updated if
00723         // the capability to produce classifications combined from character
00724         // fragments is added to other functions.
00725         int orig_i = 0;
00726         for (i = 0; i < alt_word->length(); ++i) {
00727           bool replacement_is_ngram =
00728               getUnicharset().get_isngram(alt_word->unichar_id(i));
00729           int end_i = orig_i + alt_word->fragment_length(i) - 1;
00730           if (alt_word->fragment_length(i) > 1 ||
00731               (orig_i == end_i && replacement_is_ngram)) {
00732             fixpt->push_back(DANGERR_INFO(orig_i, end_i, true,
00733                                           replacement_is_ngram));
00734              if (stopper_debug_level > 1) {
00735                tprintf("fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i,
00736                        true, replacement_is_ngram);
00737              }
00738           }
00739           orig_i += alt_word->fragment_length(i);
00740         }
00741       }
00742     }
00743     delete alt_word;
00744   }
00745   if (output_ambig_words_file_ != NULL) {
00746     fprintf(output_ambig_words_file_, "\n");
00747   }
00748 
00749   ambig_blob_choices.delete_data_pointers();
00750   return !ambigs_found;
00751 }
00752 
00753 void Dict::EndDangerousAmbigs() {}
00754 
00755 void Dict::SettupStopperPass1() {
00756   reject_offset_ = 0.0;
00757 }
00758 
00759 void Dict::SettupStopperPass2() {
00760   reject_offset_ = stopper_phase2_certainty_rejection_offset;
00761 }
00762 
00763 void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
00764   int i, LastChunk;
00765   for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
00766     LastChunk += Choice->Blob[i].NumChunks;
00767     if (Blob < LastChunk) {
00768       (Choice->Blob[i].NumChunks)++;
00769       return;
00770     }
00771   }
00772   cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
00773            Choice->Length, LastChunk, Blob);
00774   assert(false);  // this should never get executed
00775 }
00776 
00777 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00778                         UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00779                         BLOB_CHOICE_LIST_VECTOR *blob_choices,
00780                         bool *modified_blobs) {
00781   int num_blobs_to_replace = 0;
00782   int begin_blob_index = 0;
00783   int i;
00784   for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
00785     if (i >= wrong_ngram_begin_index) {
00786       num_blobs_to_replace +=  werd_choice->fragment_length(i);
00787     } else {
00788       begin_blob_index += werd_choice->fragment_length(i);
00789     }
00790   }
00791   BLOB_CHOICE_IT bit;
00792   int temp_blob_index = begin_blob_index;
00793   const char *temp_uch = NULL;
00794   const char *correct_ngram_str =
00795     getUnicharset().id_to_unichar(correct_ngram_id);
00796   for (int replaced_count = 0; replaced_count < wrong_ngram_size;
00797        ++replaced_count) {
00798     if (blob_choices != NULL) {
00799       UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
00800       int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
00801       if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
00802       for (i = 0; i < fraglen; ++i) {
00803         if (fraglen > 1) {
00804           STRING frag_str =
00805             CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
00806           getUnicharset().unichar_insert(frag_str.string());
00807           uch_id = getUnicharset().unichar_to_id(frag_str.string());
00808         }
00809         bit.set_to_list(blob_choices->get(temp_blob_index));
00810         STRING correct_frag_uch =
00811           CHAR_FRAGMENT::to_string(correct_ngram_str,
00812                                    temp_blob_index - begin_blob_index,
00813                                    num_blobs_to_replace, false);
00814         getUnicharset().unichar_insert(correct_frag_uch.string());
00815         UNICHAR_ID correct_frag_uch_id =
00816           getUnicharset().unichar_to_id(correct_frag_uch.string());
00817         // Find the WERD_CHOICE corresponding to the original unichar in
00818         // the list of blob choices, add the derived character fragment
00819         // before it with the same rating and certainty.
00820         for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
00821           if (bit.data()->unichar_id() == correct_frag_uch_id) {
00822             break;  // the unichar we want to insert is already there
00823           }
00824           if (bit.data()->unichar_id() == uch_id) {
00825             bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
00826             bit.data()->set_unichar_id(correct_frag_uch_id);
00827             if (modified_blobs != NULL) *modified_blobs = true;
00828             break;
00829           }
00830         }
00831         temp_blob_index++;
00832       }
00833     }
00834     // Remove current unichar from werd_choice. On the last iteration
00835     // set the correct replacement unichar instead of removing a unichar.
00836     if (replaced_count + 1 == wrong_ngram_size) {
00837       werd_choice->set_unichar_id(correct_ngram_id,
00838           num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
00839     } else {
00840       werd_choice->remove_unichar_id(wrong_ngram_begin_index);
00841     }
00842   }
00843   if (stopper_debug_level >= 1 && modified_blobs != NULL &&
00844       *modified_blobs && blob_choices != NULL) {
00845       werd_choice->print("ReplaceAmbig() ");
00846       tprintf("Modified blob_choices: ");
00847       for (int i = 0; i < blob_choices->size(); ++i) {
00848         print_ratings_list("\n", blob_choices->get(i), getUnicharset());
00849     }
00850   }
00851 }
00852 
00853 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
00854                        VIABLE_CHOICE ViableChoice) {
00855   return (StringSameAs(WordChoice, ViableChoice));
00856 }
00857 
00858 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
00859   int shortest = MAX_INT32;
00860   int curr_len = 0;
00861   for (int w = 0; w < WordChoice.length(); ++w) {
00862     if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
00863       curr_len++;
00864     } else if (curr_len > 0) {
00865       if (curr_len < shortest) shortest = curr_len;
00866       curr_len = 0;
00867     }
00868   }
00869   if (curr_len > 0 && curr_len < shortest) {
00870     shortest = curr_len;
00871   } else if (shortest == MAX_INT32) {
00872     shortest = 0;
00873   }
00874   return shortest;
00875 }
00876 
00877 VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
00878                                     FLOAT32 AdjustFactor,
00879                                     const float Certainties[]) {
00880   int Length = WordChoice.length();
00881   assert (Length <= MAX_NUM_CHUNKS && Length > 0);
00882   VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
00883   FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
00884   return NewChoice;
00885 }
00886 
00887 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
00888   int i, j;
00889   fprintf (File, "%s", Label);
00890   fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d)  ",
00891     Choice->Rating, Choice->Certainty,
00892     Choice->AdjustFactor, Choice->ComposedFromCharFragments);
00893 
00894   for (i = 0; i < Choice->Length; i++)
00895     fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
00896   fprintf(File, "\n");
00897 
00898   for (i = 0; i < Choice->Length; i++) {
00899     fprintf(File, "  %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
00900     for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
00901       fprintf(File, "    ");
00902   }
00903   fprintf(File, "\n");
00904 
00905   for (i = 0; i < Choice->Length; i++) {
00906     for (j = 0; j < Choice->Blob[i].NumChunks; j++)
00907       fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
00908   }
00909   fprintf(File, "\n");
00910 
00911   for (i = 0; i < Choice->Length; i++) {
00912     for (j = 0; j < Choice->Blob[i].NumChunks; j++)
00913       fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
00914   }
00915   fprintf(File, "\n");
00916 }
00917 
00918 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
00919                             FLOAT32 AdjustFactor, const float Certainties[],
00920                             VIABLE_CHOICE ViableChoice) {
00921   ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
00922                      AdjustFactor);
00923 
00924 }
00925 
00926 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
00927                         VIABLE_CHOICE ViableChoice) {
00928   if (WordChoice.length() != ViableChoice->Length) {
00929     return false;
00930   }
00931   int i;
00932   CHAR_CHOICE *CharChoice;
00933   for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
00934        i < ViableChoice->Length; CharChoice++, i++) {
00935     if (CharChoice->Class != WordChoice.unichar_id(i)) {
00936       return false;
00937     }
00938   }
00939   return true;
00940 }
00941 
00942 bool Dict::StringSameAs(const char *String,
00943                         const char *String_lengths,
00944                         VIABLE_CHOICE ViableChoice) {
00945   CHAR_CHOICE *Char;
00946   int i;
00947   int current_unichar_length;
00948 
00949   for (Char = &(ViableChoice->Blob[0]), i = 0;
00950     i < ViableChoice->Length;
00951        String += *(String_lengths++), Char++, i++) {
00952     current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
00953   if (current_unichar_length != *String_lengths ||
00954       strncmp(String, getUnicharset().id_to_unichar(Char->Class),
00955               current_unichar_length) != 0)
00956     return false;
00957   }
00958   return (*String == 0) ? true : false;
00959 }
00960 
00961 int Dict::UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
00962                              const WERD_CHOICE &BestChoice) {
00963   float Certainty;
00964   float WorstCertainty = MAX_FLOAT32;
00965   float CertaintyThreshold;
00966   FLOAT64 TotalCertainty;
00967   FLOAT64 TotalCertaintySquared;
00968   FLOAT64 Variance;
00969   FLOAT32 Mean, StdDev;
00970   int WordLength;
00971 
00972   WordLength = Choices.length();
00973   if (WordLength < 3)
00974     return true;
00975 
00976   TotalCertainty = TotalCertaintySquared = 0.0;
00977   BLOB_CHOICE_IT BlobChoiceIt;
00978   for (int i = 0; i < Choices.length(); ++i) {
00979     BlobChoiceIt.set_to_list(Choices.get(i));
00980     Certainty = BlobChoiceIt.data()->certainty();
00981     TotalCertainty += Certainty;
00982     TotalCertaintySquared += Certainty * Certainty;
00983     if (Certainty < WorstCertainty)
00984       WorstCertainty = Certainty;
00985   }
00986 
00987   // Subtract off worst certainty from statistics.
00988   WordLength--;
00989   TotalCertainty -= WorstCertainty;
00990   TotalCertaintySquared -= WorstCertainty * WorstCertainty;
00991 
00992   Mean = TotalCertainty / WordLength;
00993   Variance = ((WordLength * TotalCertaintySquared -
00994     TotalCertainty * TotalCertainty) /
00995     (WordLength * (WordLength - 1)));
00996   if (Variance < 0.0)
00997     Variance = 0.0;
00998   StdDev = sqrt (Variance);
00999 
01000   CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
01001   if (CertaintyThreshold > stopper_nondict_certainty_base)
01002     CertaintyThreshold = stopper_nondict_certainty_base;
01003 
01004   if (BestChoice.certainty() < CertaintyThreshold) {
01005     if (stopper_debug_level >= 1)
01006       cprintf("Stopper: Non-uniform certainty = %4.1f"
01007               " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
01008               BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
01009     return false;
01010   } else {
01011     return true;
01012   }
01013 }
01014 
01015 } // namespace tesseract