Tesseract  3.02
tesseract-ocr/classify/adaptmatch.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  ** Filename:    adaptmatch.c
00003  ** Purpose:     High level adaptive matcher.
00004  ** Author:      Dan Johnson
00005  ** History:     Mon Mar 11 10:00:10 1991, DSJ, Created.
00006  **
00007  ** (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00019 /*-----------------------------------------------------------------------------
00020           Include Files and Type Defines
00021 -----------------------------------------------------------------------------*/
00022 #include <ctype.h>
00023 #include "ambigs.h"
00024 #include "blobclass.h"
00025 #include "blobs.h"
00026 #include "helpers.h"
00027 #include "normfeat.h"
00028 #include "mfoutline.h"
00029 #include "picofeat.h"
00030 #include "float2int.h"
00031 #include "outfeat.h"
00032 #include "emalloc.h"
00033 #include "intfx.h"
00034 #include "speckle.h"
00035 #include "efio.h"
00036 #include "normmatch.h"
00037 #include "permute.h"
00038 #include "ndminx.h"
00039 #include "intproto.h"
00040 #include "const.h"
00041 #include "globals.h"
00042 #include "werd.h"
00043 #include "callcpp.h"
00044 #include "pageres.h"
00045 #include "params.h"
00046 #include "classify.h"
00047 #include "shapetable.h"
00048 #include "tessclassifier.h"
00049 #include "trainingsample.h"
00050 #include "unicharset.h"
00051 #include "dict.h"
00052 #include "featdefs.h"
00053 #include "genericvector.h"
00054 
00055 #include <stdio.h>
00056 #include <string.h>
00057 #include <stdlib.h>
00058 #include <math.h>
00059 #ifdef __UNIX__
00060 #include <assert.h>
00061 #endif
00062 
00063 // Include automatically generated configuration file if running autoconf.
00064 #ifdef HAVE_CONFIG_H
00065 #include "config_auto.h"
00066 #endif
00067 
00068 #define ADAPT_TEMPLATE_SUFFIX ".a"
00069 
00070 #define MAX_MATCHES         10
00071 #define UNLIKELY_NUM_FEAT 200
00072 #define NO_DEBUG      0
00073 #define MAX_ADAPTABLE_WERD_SIZE 40
00074 
00075 #define ADAPTABLE_WERD_ADJUSTMENT    (0.05)
00076 
00077 #define Y_DIM_OFFSET    (Y_SHIFT - BASELINE_Y_SHIFT)
00078 
00079 #define WORST_POSSIBLE_RATING (1.0)
00080 
// One classifier hypothesis: a unichar (plus optional shape/font info)
// together with its match rating.
struct ScoredClass {
  CLASS_ID unichar_id;   // class (unichar) this score belongs to
  int shape_id;          // shape-table index; -1 when no shape is attached
  FLOAT32 rating;        // match rating; WORST_POSSIBLE_RATING (1.0) is worst
  bool adapted;          // true if produced by the adapted (not pre-trained) templates
  inT16 config;          // winning config within the class
  inT16 fontinfo_id;     // font info id, kBlankFontinfoId when unset
  inT16 fontinfo_id2;    // secondary font info id, kBlankFontinfoId when unset
};
00090 
// Aggregated results of one adaptive-classification call on one blob.
struct ADAPT_RESULTS {
  inT32 BlobLength;                    // normalized blob length; MAX_INT32 until set
  int NumMatches;                      // number of valid entries in match[]
  bool HasNonfragment;                 // NOTE(review): presumably true once a
                                       // non-fragment match is seen — confirm at setters
  ScoredClass match[MAX_NUM_CLASSES];  // candidate matches (sorted by callers)
  ScoredClass best_match;              // running best entry
  CLASS_PRUNER_RESULTS CPResults;      // raw class-pruner output

  // Resets all fields to pre-classification defaults; best_match gets the
  // worst possible rating so any real match replaces it.
  inline void Initialize() {
     BlobLength = MAX_INT32;
     NumMatches = 0;
     HasNonfragment = false;
     best_match.unichar_id = NO_CLASS;
     best_match.shape_id = -1;
     best_match.rating = WORST_POSSIBLE_RATING;
     best_match.adapted = false;
     best_match.config = 0;
     best_match.fontinfo_id = kBlankFontinfoId;
     best_match.fontinfo_id2 = kBlankFontinfoId;
  }
};
00114 
// Identifies one config of one class within a set of adaptive templates;
// used as the search key by MakeTempProtoPerm (see prototype above).
struct PROTO_KEY {
  ADAPT_TEMPLATES Templates;  // template set the class lives in
  CLASS_ID ClassId;           // class being made permanent
  int ConfigId;               // config within that class
};
00120 
00121 /*-----------------------------------------------------------------------------
00122           Private Macros
00123 -----------------------------------------------------------------------------*/
// True when Rating is only a marginal match, i.e. worse than
// matcher_great_threshold (ratings here behave like distances:
// larger is worse, cf. WORST_POSSIBLE_RATING above).
#define MarginalMatch(Rating)       \
((Rating) > matcher_great_threshold)

// Clears the cached integer-feature extraction flag so features are
// re-extracted for the next blob.
#define InitIntFX() (FeaturesHaveBeenExtracted = FALSE)
00128 
00129 /*-----------------------------------------------------------------------------
00130           Private Function Prototypes
00131 -----------------------------------------------------------------------------*/
// qsort() comparator for ScoredClass entries (definition later in file).
int CompareByRating(const void *arg1, const void *arg2);

// Looks up the entry for unichar id in results; definition not in view —
// presumably returns NULL/a placeholder when absent.
ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);

// Returns (by value) the scored entry for unichar id in results.
ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id);

// Initializes a matcher-ratings array.
void InitMatcherRatings(register FLOAT32 *Rating);

// List-search callback used when making temporary protos permanent;
// item1/item2 follow the generic list-search convention.
int MakeTempProtoPerm(void *item1, void *item2);

// Installs Threshold as the adaptive matcher's acceptance threshold.
void SetAdaptiveThreshold(FLOAT32 Threshold);
00143 
00144 
00145 /*-----------------------------------------------------------------------------
00146               Public Code
00147 -----------------------------------------------------------------------------*/
00148 /*---------------------------------------------------------------------------*/
00149 namespace tesseract {
00178 void Classify::AdaptiveClassifier(TBLOB *Blob,
00179                                   const DENORM& denorm,
00180                                   BLOB_CHOICE_LIST *Choices,
00181                                   CLASS_PRUNER_RESULTS CPResults) {
00182   assert(Choices != NULL);
00183   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
00184 
00185   if (AdaptedTemplates == NULL)
00186     AdaptedTemplates = NewAdaptedTemplates (true);
00187 
00188   Results->Initialize();
00189 
00190   DoAdaptiveMatch(Blob, denorm, Results);
00191   if (CPResults != NULL)
00192     memcpy(CPResults, Results->CPResults,
00193            sizeof(CPResults[0]) * Results->NumMatches);
00194 
00195   RemoveBadMatches(Results);
00196   qsort((void *)Results->match, Results->NumMatches,
00197         sizeof(ScoredClass), CompareByRating);
00198   RemoveExtraPuncs(Results);
00199   ConvertMatchesToChoices(denorm, Blob->bounding_box(), Results, Choices);
00200 
00201   if (matcher_debug_level >= 1) {
00202     cprintf ("AD Matches =  ");
00203     PrintAdaptiveMatchResults(stdout, Results);
00204   }
00205 
00206   if (LargeSpeckle(Blob))
00207     AddLargeSpeckleTo(Choices);
00208 
00209 #ifndef GRAPHICS_DISABLED
00210   if (classify_enable_adaptive_debugger)
00211     DebugAdaptiveClassifier(Blob, denorm, Results);
00212 #endif
00213 
00214   NumClassesOutput += Choices->length();
00215   if (Choices->length() == 0) {
00216     if (!classify_bln_numeric_mode)
00217       tprintf ("Empty classification!\n");  // Should never normally happen.
00218     Choices = new BLOB_CHOICE_LIST();
00219     BLOB_CHOICE_IT temp_it;
00220     temp_it.set_to_list(Choices);
00221     temp_it.add_to_end(
00222         new BLOB_CHOICE(0, 50.0f, -20.0f, -1, -1, NULL, 0, 0, false));
00223   }
00224 
00225   delete Results;
00226 }                                /* AdaptiveClassifier */
00227 
00228 // If *win is NULL, sets it to a new ScrollView() object with title msg.
00229 // Clears the window and draws baselines.
00230 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
00231                                   int y_offset, const TBOX &wbox) {
00232   #ifndef GRAPHICS_DISABLED
00233   const int kSampleSpaceWidth = 500;
00234   if (*win == NULL) {
00235     *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
00236                           kSampleSpaceWidth * 2, 200, true);
00237   }
00238   (*win)->Clear();
00239   (*win)->Pen(64, 64, 64);
00240   (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
00241                kSampleSpaceWidth, kBlnBaselineOffset);
00242   (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
00243                kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
00244   (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
00245                           wbox.right(), wbox.bottom());
00246   #endif  // GRAPHICS_DISABLED
00247 }
00248 
// Learns the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
// be learned, otherwise all chars with good correct_text are learned.
void Classify::LearnWord(const char* filename, const char *rejmap,
                         WERD_RES *word) {
  int word_len = word->correct_text.size();
  if (word_len == 0) return;

  // Per-character adaption thresholds; only allocated in adaption mode.
  float* thresholds = NULL;
  if (filename == NULL) {
    // Adaption mode.
    if (!EnableLearning || word->best_choice == NULL ||
        // If word->best_choice is not recorded at the top of accumulator's
        // best choices (which could happen for choices that are
        // altered with ReplaceAmbig()) we skip the adaption.
        !getDict().CurrentBestChoiceIs(*(word->best_choice)))
      return;  // Can't or won't adapt.

    NumWordsAdaptedTo++;
    if (classify_learning_debug_level >= 1)
      tprintf("\n\nAdapting to word = %s\n",
              word->best_choice->debug_string().string());
    thresholds = new float[word_len];
    GetAdaptThresholds(word->rebuild_word, word->denorm, *word->best_choice,
                       *word->raw_choice, thresholds);
  }
  int start_blob = 0;           // first chopped blob of the current char
  char prev_map_char = '0';     // only used by the disabled junk-training code below

  #ifndef GRAPHICS_DISABLED
  if (classify_debug_character_fragments) {
    if (learn_fragmented_word_debug_win_ != NULL) {
      window_wait(learn_fragmented_word_debug_win_);
    }
    RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
                       word->chopped_word->bounding_box());
    RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
                       word->chopped_word->bounding_box());
    word->chopped_word->plot(learn_fragmented_word_debug_win_);
    ScrollView::Update();
  }
  #endif  // GRAPHICS_DISABLED

  for (int ch = 0; ch < word_len; ++ch) {
    if (classify_debug_character_fragments) {
      tprintf("\nLearning %s\n",  word->correct_text[ch].string());
    }
    // With no rejmap every char is treated as accepted ('1').
    char rej_map_char = rejmap != NULL ? *rejmap++ : '1';

    if (word->correct_text[ch].length() > 0 && rej_map_char == '1') {
      float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;

      // Learn the whole character first.
      LearnPieces(filename, start_blob, word->best_state[ch],
                  threshold, CST_WHOLE, word->correct_text[ch].string(), word);

      // A char made of multiple chopped blobs can also be learned as
      // fragments, one per blob.
      if (word->best_state[ch] > 1 && !disable_character_fragments) {
        // Check that the character breaks into meaningful fragments
        // that each match a whole character with at least
        // classify_character_fragments_garbage_certainty_threshold
        bool garbage = false;
        // Walk the singly-linked blob list to this char's first blob.
        TBLOB* frag_blob = word->chopped_word->blobs;
        for (int i = 0; i < start_blob; ++i) frag_blob = frag_blob->next;
        int frag;
        for (frag = 0; frag < word->best_state[ch]; ++frag) {
          if (classify_character_fragments_garbage_certainty_threshold < 0) {
            garbage |= LooksLikeGarbage(word->denorm, frag_blob);
          }
          frag_blob = frag_blob->next;
        }
        // Learn the fragments.
        if (!garbage) {
          bool pieces_all_natural = word->PiecesAllNatural(start_blob,
              word->best_state[ch]);
          if (pieces_all_natural || !prioritize_division) {
            for (frag = 0; frag < word->best_state[ch]; ++frag) {
              GenericVector<STRING> tokens;
              word->correct_text[ch].split(' ', &tokens);

              // Rewrite the first token as "fragment frag of N" notation.
              tokens[0] = CHAR_FRAGMENT::to_string(
                  tokens[0].string(), frag, word->best_state[ch],
                  pieces_all_natural);

              // Re-join the tokens with single spaces.
              STRING full_string;
              for (int i = 0; i < tokens.size(); i++) {
                full_string += tokens[i];
                if (i != tokens.size() - 1)
                  full_string += ' ';
              }
              LearnPieces(filename, start_blob + frag, 1,
                          threshold, CST_FRAGMENT, full_string.string(), word);
            }
          }
        }
      }

      // TODO(rays): re-enable this part of the code when we switch to the
      // new classifier that needs to see examples of garbage.
      /*
      char next_map_char = ch + 1 < word_len
                           ? (rejmap != NULL ? *rejmap : '1')
                           : '0';
      if (word->best_state[ch] > 1) {
        // If the next blob is good, make junk with the rightmost fragment.
        if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
            next_map_char == '1') {
          LearnPieces(filename, start_blob + word->best_state[ch] - 1,
                      word->best_state[ch + 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
        // If the previous blob is good, make junk with the leftmost fragment.
        if (ch > 0 && word->correct_text[ch - 1].length() > 0 &&
            prev_map_char == '1') {
          LearnPieces(filename, start_blob - word->best_state[ch - 1],
                      word->best_state[ch - 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
      }
      // If the next blob is good, make a join with it.
      if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
          next_map_char == '1') {
        STRING joined_text = word->correct_text[ch];
        joined_text += word->correct_text[ch + 1];
        LearnPieces(filename, start_blob,
                    word->best_state[ch] + word->best_state[ch + 1],
                    threshold, CST_NGRAM, joined_text.string(), word);
      }
      */
    }
    start_blob += word->best_state[ch];
    prev_map_char = rej_map_char;
  }
  delete [] thresholds;
}  // LearnWord.
00386 
// Builds a blob of length fragments, from the word, starting at start,
// and then learns it, as having the given correct_text.
// If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// threshold is a magic number required by AdaptToChar and generated by
// GetAdaptThresholds.
// Although it can be partly inferred from the string, segmentation is
// provided to explicitly clarify the character segmentation.
void Classify::LearnPieces(const char* filename, int start, int length,
                           float threshold, CharSegmentationType segmentation,
                           const char* correct_text, WERD_RES *word) {
  // TODO(daria) Remove/modify this if/when we want
  // to train and/or adapt to n-grams.
  if (segmentation != CST_WHOLE &&
      (segmentation != CST_FRAGMENT || disable_character_fragments))
    return;

  // Temporarily fuse the pieces into one blob; undone by break_pieces()
  // at the bottom of this function.
  if (length > 1) {
    join_pieces(word->chopped_word->blobs, word->seam_array,
                start, start + length - 1);
  }
  // Walk the singly-linked blob list to the blob at index 'start'.
  TBLOB* blob = word->chopped_word->blobs;
  for (int i = 0; i < start; ++i)
    blob = blob->next;
  // Rotate the blob if needed for classification.
  const DENORM* denorm = &word->denorm;
  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded(&denorm);
  if (rotated_blob == NULL)
    rotated_blob = blob;

  #ifndef GRAPHICS_DISABLED
  // Draw debug windows showing the blob that is being learned if needed.
  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
                       word->chopped_word->bounding_box());
    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
    learn_debug_win_->Update();
    window_wait(learn_debug_win_);
  }
  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
    ASSERT_HOST(learn_fragments_debug_win_ != NULL);  // set up in LearnWord
    blob->plot(learn_fragments_debug_win_,
               ScrollView::BLUE, ScrollView::BROWN);
    learn_fragments_debug_win_->Update();
  }
  #endif  // GRAPHICS_DISABLED

  if (filename != NULL) {
    // Static-training path: write the sample to file.
    classify_norm_method.set_value(character);  // force char norm spc 30/11/93
    tess_bn_matching.set_value(false);    // turn it off
    tess_cn_matching.set_value(false);
    LearnBlob(feature_defs_, filename, rotated_blob, *denorm,
              correct_text);
  } else if (unicharset.contains_unichar(correct_text)) {
    // Adaption path: adapt the templates to this char.
    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
    int font_id = word->fontinfo != NULL
                ? fontinfo_table_.get_id(*word->fontinfo)
                : 0;
    if (classify_learning_debug_level >= 1)
      tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
              unicharset.id_to_unichar(class_id), threshold, font_id);
    // If filename is not NULL we are doing recognition
    // (as opposed to training), so we must have already set word fonts.
    AdaptToChar(rotated_blob, *denorm, class_id, font_id, threshold);
  } else if (classify_debug_level >= 1) {
    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
  }
  // rotated_blob/denorm differ from the originals only when
  // ClassifyNormalizeIfNeeded allocated them — presumably it transfers
  // ownership in that case; free them here. TODO(review): confirm.
  if (rotated_blob != blob) {
    delete rotated_blob;
    delete denorm;
  }

  // Undo the join_pieces() above, restoring the chopped word.
  break_pieces(blob, word->seam_array, start, start + length - 1);
}  // LearnPieces.
00462 
00463 /*---------------------------------------------------------------------------*/
00478 void Classify::EndAdaptiveClassifier() {
00479   STRING Filename;
00480   FILE *File;
00481 
00482   #ifndef SECURE_NAMES
00483   if (AdaptedTemplates != NULL &&
00484       classify_enable_adaptive_matcher && classify_save_adapted_templates) {
00485     Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
00486     File = fopen (Filename.string(), "wb");
00487     if (File == NULL)
00488       cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
00489     else {
00490       cprintf ("\nSaving adapted templates to %s ...", Filename.string());
00491       fflush(stdout);
00492       WriteAdaptedTemplates(File, AdaptedTemplates);
00493       cprintf ("\n");
00494       fclose(File);
00495     }
00496   }
00497   #endif
00498 
00499   if (AdaptedTemplates != NULL) {
00500     free_adapted_templates(AdaptedTemplates);
00501     AdaptedTemplates = NULL;
00502   }
00503 
00504   if (PreTrainedTemplates != NULL) {
00505     free_int_templates(PreTrainedTemplates);
00506     PreTrainedTemplates = NULL;
00507   }
00508   getDict().EndDangerousAmbigs();
00509   FreeNormProtos();
00510   if (AllProtosOn != NULL) {
00511     FreeBitVector(AllProtosOn);
00512     FreeBitVector(PrunedProtos);
00513     FreeBitVector(AllConfigsOn);
00514     FreeBitVector(AllProtosOff);
00515     FreeBitVector(AllConfigsOff);
00516     FreeBitVector(TempProtoMask);
00517     AllProtosOn = NULL;
00518     PrunedProtos = NULL;
00519     AllConfigsOn = NULL;
00520     AllProtosOff = NULL;
00521     AllConfigsOff = NULL;
00522     TempProtoMask = NULL;
00523   }
00524   delete shape_table_;
00525   shape_table_ = NULL;
00526 }                                /* EndAdaptiveClassifier */
00527 
00528 
00529 /*---------------------------------------------------------------------------*/
// Initializes the adaptive classifier. If load_pre_trained_templates is
// true (and a language data path exists), loads inttemp, the optional
// shape table, pffmtable cutoffs and normproto from the tessdata
// manager; otherwise the classifier is adaptive-only. Also allocates the
// shared proto/config bit vectors and the adapted templates.
void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
  if (!classify_enable_adaptive_matcher)
    return;
  if (AllProtosOn != NULL)
    EndAdaptiveClassifier();  // Don't leak with multiple inits.

  // If there is no language_data_path_prefix, the classifier will be
  // adaptive only.
  if (language_data_path_prefix.length() > 0 &&
      load_pre_trained_templates) {
    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
    PreTrainedTemplates =
      ReadIntTemplates(tessdata_manager.GetDataFilePtr());
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");

    // The shape table is optional; a failed load just disables it.
    if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) {
      shape_table_ = new ShapeTable(unicharset);
      if (!shape_table_->DeSerialize(tessdata_manager.swap(),
                                     tessdata_manager.GetDataFilePtr())) {
        tprintf("Error loading shape table!\n");
        delete shape_table_;
        shape_table_ = NULL;
      } else if (tessdata_manager.DebugLevel() > 0) {
        tprintf("Successfully loaded shape table!\n");
      }
    }

    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
    ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
                   tessdata_manager.swap(),
                   tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
                   CharNormCutoffs);
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");

    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
    NormProtos =
      ReadNormProtos(tessdata_manager.GetDataFilePtr(),
                     tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
  }

  im_.Init(&classify_debug_level, classify_integer_matcher_multiplier);
  InitIntegerFX();

  // Allocate the shared bit masks (freed in EndAdaptiveClassifier) and
  // preset the all-on / all-off vectors.
  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
  PrunedProtos = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
  AllProtosOff = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(PrunedProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
  zero_all_bits(AllProtosOff, WordsInVectorOfSize(MAX_NUM_PROTOS));
  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));

  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
     BaselineCutoffs[i] = 0;
  }

  if (classify_use_pre_adapted_templates) {
    // Try to resume from templates saved by a previous run on this image.
    FILE *File;
    STRING Filename;

    Filename = imagefile;
    Filename += ADAPT_TEMPLATE_SUFFIX;
    File = fopen(Filename.string(), "rb");
    if (File == NULL) {
      AdaptedTemplates = NewAdaptedTemplates(true);
    } else {
      #ifndef SECURE_NAMES
      cprintf("\nReading pre-adapted templates from %s ...\n",
              Filename.string());
      fflush(stdout);
      #endif
      AdaptedTemplates = ReadAdaptedTemplates(File);
      cprintf("\n");
      fclose(File);
      PrintAdaptedTemplates(stdout, AdaptedTemplates);

      // Seed the baseline cutoffs from the char-norm cutoffs for the
      // classes already present in the pre-adapted templates.
      for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
        BaselineCutoffs[i] = CharNormCutoffs[i];
      }
    }
  } else {
    if (AdaptedTemplates != NULL)
      free_adapted_templates(AdaptedTemplates);
    AdaptedTemplates = NewAdaptedTemplates(true);
  }
}                                /* InitAdaptiveClassifier */
00637 
00638 void Classify::ResetAdaptiveClassifierInternal() {
00639   if (classify_learning_debug_level > 0) {
00640     tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
00641             NumAdaptationsFailed);
00642   }
00643   free_adapted_templates(AdaptedTemplates);
00644   AdaptedTemplates = NULL;
00645   NumAdaptationsFailed = 0;
00646 }
00647 
00648 
00649 /*---------------------------------------------------------------------------*/
00661 void Classify::PrintAdaptiveStatistics(FILE *File) {
00662   #ifndef SECURE_NAMES
00663 
00664   fprintf (File, "\nADAPTIVE MATCHER STATISTICS:\n");
00665   fprintf (File, "\tNum blobs classified = %d\n", AdaptiveMatcherCalls);
00666   fprintf (File, "\tNum classes output   = %d (Avg = %4.2f)\n",
00667     NumClassesOutput,
00668     ((AdaptiveMatcherCalls == 0) ? (0.0) :
00669   ((float) NumClassesOutput / AdaptiveMatcherCalls)));
00670   fprintf (File, "\t\tBaseline Classifier: %4d calls (%4.2f classes/call)\n",
00671     BaselineClassifierCalls,
00672     ((BaselineClassifierCalls == 0) ? (0.0) :
00673   ((float) NumBaselineClassesTried / BaselineClassifierCalls)));
00674   fprintf (File, "\t\tCharNorm Classifier: %4d calls (%4.2f classes/call)\n",
00675     CharNormClassifierCalls,
00676     ((CharNormClassifierCalls == 0) ? (0.0) :
00677   ((float) NumCharNormClassesTried / CharNormClassifierCalls)));
00678   fprintf (File, "\t\tAmbig    Classifier: %4d calls (%4.2f classes/call)\n",
00679     AmbigClassifierCalls,
00680     ((AmbigClassifierCalls == 0) ? (0.0) :
00681   ((float) NumAmbigClassesTried / AmbigClassifierCalls)));
00682 
00683   fprintf (File, "\nADAPTIVE LEARNER STATISTICS:\n");
00684   fprintf (File, "\tNumber of words adapted to: %d\n", NumWordsAdaptedTo);
00685   fprintf (File, "\tNumber of chars adapted to: %d\n", NumCharsAdaptedTo);
00686 
00687   PrintAdaptedTemplates(File, AdaptedTemplates);
00688   #endif
00689 }                                /* PrintAdaptiveStatistics */
00690 
00691 
00692 /*---------------------------------------------------------------------------*/
// Configures the classifier for OCR pass 1: learning is enabled per the
// classify_enable_learning parameter and the dictionary's pass-1 stopper
// settings are installed.
void Classify::SettupPass1() {
  EnableLearning = classify_enable_learning;

  getDict().SettupStopperPass1();

}                                /* SettupPass1 */
00718 
00719 
00720 /*---------------------------------------------------------------------------*/
// Configures the classifier for OCR pass 2: learning is switched off
// and the dictionary's pass-2 stopper settings are installed.
void Classify::SettupPass2() {
  EnableLearning = FALSE;
  getDict().SettupStopperPass2();

}                                /* SettupPass2 */
00737 
00738 
00739 /*---------------------------------------------------------------------------*/
// Creates the first temporary config for an empty adapted class from the
// outline features of Blob: one temp proto per feature plus a single
// config covering them all, inserted into both the class and its proto
// pruner. No-op if the blob yields an unusable number of features.
void Classify::InitAdaptedClass(TBLOB *Blob,
                                const DENORM& denorm,
                                CLASS_ID ClassId,
                                int FontinfoId,
                                ADAPT_CLASS Class,
                                ADAPT_TEMPLATES Templates) {
  FEATURE_SET Features;
  int Fid, Pid;
  FEATURE Feature;
  int NumFeatures;
  TEMP_PROTO TempProto;
  PROTO Proto;
  INT_CLASS IClass;
  TEMP_CONFIG Config;

  classify_norm_method.set_value(baseline);
  Features = ExtractOutlineFeatures(Blob);
  NumFeatures = Features->NumFeatures;
  // Bail out on degenerate or absurdly large feature sets.
  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
    FreeFeatureSet(Features);
    return;
  }

  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
  TempConfigFor(Class, 0) = Config;

  /* this is a kludge to construct cutoffs for adapted templates */
  if (Templates == AdaptedTemplates)
    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];

  IClass = ClassForClassId (Templates->Templates, ClassId);

  // Convert each outline feature into a temporary proto and register it
  // with the class and the proto pruner.
  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Pid = AddIntProto (IClass);
    assert (Pid != NO_PROTO);

    Feature = Features->Features[Fid];
    TempProto = NewTempProto ();
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Angle = Feature->Params[OutlineFeatDir];
    Proto->X = Feature->Params[OutlineFeatX];
    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
    Proto->Length = Feature->Params[OutlineFeatLength];
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT (Config->Protos, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push (Class->TempProtos, TempProto);
  }
  FreeFeatureSet(Features);

  // One config that uses every proto just added.
  AddIntConfig(IClass);
  ConvertConfig (AllProtosOn, 0, IClass);

  if (classify_learning_debug_level >= 1) {
    cprintf ("Added new class '%s' with class id %d and %d protos.\n",
             unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
    if (classify_learning_debug_level > 1)
      DisplayAdaptedChar(Blob, denorm, IClass);
  }

  // NOTE(review): incrementing NumNonEmptyClasses while the class still
  // reports empty looks inverted — confirm IsEmptyAdaptedClass semantics
  // before changing.
  if (IsEmptyAdaptedClass(Class))
    (Templates->NumNonEmptyClasses)++;
}                                /* InitAdaptedClass */
00833 
00834 
00835 /*---------------------------------------------------------------------------*/
00856 int Classify::GetAdaptiveFeatures(TBLOB *Blob,
00857                                   INT_FEATURE_ARRAY IntFeatures,
00858                                   FEATURE_SET *FloatFeatures) {
00859   FEATURE_SET Features;
00860   int NumFeatures;
00861 
00862   classify_norm_method.set_value(baseline);
00863   Features = ExtractPicoFeatures(Blob);
00864 
00865   NumFeatures = Features->NumFeatures;
00866   if (NumFeatures > UNLIKELY_NUM_FEAT) {
00867     FreeFeatureSet(Features);
00868     return 0;
00869   }
00870 
00871   ComputeIntFeatures(Features, IntFeatures);
00872   *FloatFeatures = Features;
00873 
00874   return NumFeatures;
00875 }                                /* GetAdaptiveFeatures */
00876 
00877 
00878 /*-----------------------------------------------------------------------------
00879               Private Code
00880 -----------------------------------------------------------------------------*/
00881 /*---------------------------------------------------------------------------*/
00896 int Classify::AdaptableWord(TWERD *Word,
00897                             const WERD_CHOICE &BestChoiceWord,
00898                             const WERD_CHOICE &RawChoiceWord) {
00899   int BestChoiceLength = BestChoiceWord.length();
00900   float adaptable_score =
00901     getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
00902   return   // rules that apply in general - simplest to compute first
00903       BestChoiceLength > 0 &&
00904       BestChoiceLength == Word->NumBlobs() &&
00905       BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
00906       getDict().CurrentBestChoiceAdjustFactor() <= adaptable_score &&
00907       getDict().AlternativeChoicesWorseThan(adaptable_score) &&
00908       getDict().CurrentBestChoiceIs(BestChoiceWord);
00909 }
00910 
00911 /*---------------------------------------------------------------------------*/
/*
 * Adapts the adapted templates for ClassId to include Blob.  If the
 * adapted class is still empty, a new adapted class is initialized from
 * the blob.  Otherwise the blob is matched against the configs built
 * from the same font: a good match reinforces the matched config (and
 * may promote it to permanent), while a poor match spawns a new
 * temporary config.
 *
 * Blob       - blob to adapt to
 * denorm     - normalization that was applied to the blob
 * ClassId    - class to adapt to
 * FontinfoId - font id the blob was recognized as
 * Threshold  - worst rating (distance) still counted as a good match
 */
void Classify::AdaptToChar(TBLOB *Blob,
                           const DENORM& denorm,
                           CLASS_ID ClassId,
                           int FontinfoId,
                           FLOAT32 Threshold) {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  INT_RESULT_STRUCT IntResult;
  INT_CLASS IClass;
  ADAPT_CLASS Class;
  TEMP_CONFIG TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  ResetFeaturesHaveBeenExtracted();
  NumCharsAdaptedTo++;
  // Nothing to do for an illegal class id.
  if (!LegalClassId (ClassId))
    return;

  Class = AdaptedTemplates->Class[ClassId];
  assert(Class != NULL);
  if (IsEmptyAdaptedClass(Class)) {
    // First sample of this class: build the initial adapted class from it.
    InitAdaptedClass(Blob, denorm, ClassId, FontinfoId, Class,
                     AdaptedTemplates);
  }
  else {
    IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    // No usable features could be extracted; give up on this blob.
    if (NumFeatures <= 0)
      return;

    im_.SetBaseLineMatch();
    // Only match configs with the matching font.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
              NumFeatures, IntFeatures,
              &IntResult, classify_adapt_feature_threshold,
              NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    if (IntResult.Rating <= Threshold) {
      // Good match: reinforce the config that matched.
      if (ConfigIsPermanent (Class, IntResult.Config)) {
        // Permanent configs need no further reinforcement.
        if (classify_learning_debug_level >= 1)
          cprintf ("Found good match to perm config %d = %4.1f%%.\n",
            IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
        FreeFeatureSet(FloatFeatures);
        return;
      }

      // Bump the temporary config's confidence count.
      TempConfig = TempConfigFor (Class, IntResult.Config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1)
        cprintf ("Increasing reliability of temp config %d to %d.\n",
          IntResult.Config, TempConfig->NumTimesSeen);

      // Promote a temp config that has been seen often enough to trust.
      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, denorm,
                      Blob);
        UpdateAmbigsGroup(ClassId, denorm, Blob);
      }
    }
    else {
      // Poor match: start a new temporary config from this blob.
      if (classify_learning_debug_level >= 1) {
        cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
          IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
        if (classify_learning_debug_level > 2)
          DisplayAdaptedChar(Blob, denorm, IClass);
      }
      NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates,
                                               ClassId,
                                               FontinfoId,
                                               NumFeatures,
                                               IntFeatures,
                                               FloatFeatures);
      // The fresh config may already qualify as reliable and get promoted
      // immediately.
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(AdaptedTemplates, ClassId, NewTempConfigId, denorm, Blob);
        UpdateAmbigsGroup(ClassId, denorm, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, denorm, IClass);
      }
#endif
    }
    FreeFeatureSet(FloatFeatures);
  }
}                                /* AdaptToChar */
01030 
/*
 * Debug helper: matches blob against int_class and prints the best
 * config's rating; at higher debug levels also renders the match in the
 * match display window.  Compiled out when GRAPHICS_DISABLED is defined.
 */
void Classify::DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
                                  INT_CLASS_STRUCT* int_class) {
#ifndef GRAPHICS_DISABLED
  int bloblength = 0;
  INT_FEATURE_ARRAY features;
  // The norm array is only needed as scratch space for feature extraction.
  uinT8* norm_array = new uinT8[unicharset.size()];
  int num_features = GetBaselineFeatures(blob, denorm, PreTrainedTemplates,
                                         features,
                                         norm_array, &bloblength);
  delete [] norm_array;
  INT_RESULT_STRUCT IntResult;

  // First match quietly (NO_DEBUG) against all protos/configs to find the
  // best-matching config.
  im_.Match(int_class, AllProtosOn, AllConfigsOn,
            num_features, features,
            &IntResult, classify_adapt_feature_threshold,
            NO_DEBUG, matcher_debug_separate_windows);
  cprintf ("Best match to temp config %d = %4.1f%%.\n",
    IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
  if (classify_learning_debug_level >= 2) {
    uinT32 ConfigMask;
    // Enable only the winning config for the visual re-match.
    ConfigMask = 1 << IntResult.Config;
    ShowMatchDisplay();
    // Re-match with debug flags set so the match is drawn in the display
    // window opened above.
    im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
              num_features, features,
              &IntResult, classify_adapt_feature_threshold,
              6 | 0x19, matcher_debug_separate_windows);
    UpdateMatchDisplay();
  }
#endif
}
01061 
01062 
01063 /*---------------------------------------------------------------------------*/
01077 void Classify::AdaptToPunc(TBLOB *Blob,
01078                            const DENORM& denorm,
01079                            CLASS_ID ClassId,
01080                            int FontinfoId,
01081                            FLOAT32 Threshold) {
01082   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
01083   int i;
01084 
01085   Results->Initialize();
01086   CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
01087   RemoveBadMatches(Results);
01088 
01089   if (Results->NumMatches != 1) {
01090     if (classify_learning_debug_level >= 1) {
01091       cprintf ("Rejecting punc = %s (Alternatives = ",
01092                unicharset.id_to_unichar(ClassId));
01093 
01094       for (i = 0; i < Results->NumMatches; i++)
01095         tprintf("%s", unicharset.id_to_unichar(Results->match[i].unichar_id));
01096       tprintf(")\n");
01097     }
01098   } else {
01099     #ifndef SECURE_NAMES
01100     if (classify_learning_debug_level >= 1)
01101       cprintf ("Adapting to punc = %s, thr= %g\n",
01102                unicharset.id_to_unichar(ClassId), Threshold);
01103     #endif
01104     AdaptToChar(Blob, denorm, ClassId, FontinfoId, Threshold);
01105   }
01106   delete Results;
01107 }                                /* AdaptToPunc */
01108 
01109 
01110 /*---------------------------------------------------------------------------*/
01137 void Classify::AddNewResult(ADAPT_RESULTS *results,
01138                             CLASS_ID class_id,
01139                             int shape_id,
01140                             FLOAT32 rating,
01141                             bool adapted,
01142                             int config,
01143                             int fontinfo_id,
01144                             int fontinfo_id2) {
01145   ScoredClass *old_match = FindScoredUnichar(results, class_id);
01146   ScoredClass match =
01147       { class_id,
01148         shape_id,
01149         rating,
01150         adapted,
01151         static_cast<inT16>(config),
01152         static_cast<inT16>(fontinfo_id),
01153         static_cast<inT16>(fontinfo_id2) };
01154 
01155   if (rating > results->best_match.rating + matcher_bad_match_pad ||
01156       (old_match && rating >= old_match->rating))
01157     return;
01158 
01159   if (!unicharset.get_fragment(class_id))
01160     results->HasNonfragment = true;
01161 
01162   if (old_match)
01163     old_match->rating = rating;
01164   else
01165     results->match[results->NumMatches++] = match;
01166 
01167   if (rating < results->best_match.rating &&
01168       // Ensure that fragments do not affect best rating, class and config.
01169       // This is needed so that at least one non-fragmented character is
01170       // always present in the results.
01171       // TODO(daria): verify that this helps accuracy and does not
01172       // hurt performance.
01173       !unicharset.get_fragment(class_id)) {
01174     results->best_match = match;
01175   }
01176 }                                /* AddNewResult */
01177 
01178 
01179 /*---------------------------------------------------------------------------*/
/*
 * Matches Blob against each class listed in Ambiguities using the
 * pre-trained templates and merges the corrected ratings into Results.
 *
 * Blob        - blob to be classified
 * denorm      - normalization applied to the blob
 * Templates   - built-in (pre-trained) templates to match against
 * Classes     - adapted class array (not referenced in this routine)
 * Ambiguities - array of class ids to try, terminated by a negative id
 * Results     - accumulator for match results
 */
void Classify::AmbigClassifier(TBLOB *Blob,
                               const DENORM& denorm,
                               INT_TEMPLATES Templates,
                               ADAPT_CLASS *Classes,
                               UNICHAR_ID *Ambiguities,
                               ADAPT_RESULTS *Results) {
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  uinT8* CharNormArray = new uinT8[unicharset.size()];
  INT_RESULT_STRUCT IntResult;
  CLASS_ID ClassId;

  AmbigClassifierCalls++;

  NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures,
                                    NULL, CharNormArray,
                                    &(Results->BlobLength), NULL);
  if (NumFeatures <= 0) {
    // No usable features; nothing to match against.
    delete [] CharNormArray;
    return;
  }

  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
  if (debug)
    tprintf("AM Matches =  ");

  int top = Blob->bounding_box().top();
  int bottom = Blob->bounding_box().bottom();
  // The ambiguity list is terminated by a negative class id.
  while (*Ambiguities >= 0) {
    ClassId = *Ambiguities;

    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    im_.Match(ClassForClassId(Templates, ClassId),
              AllProtosOn, AllConfigsOn,
              NumFeatures, IntFeatures,
              &IntResult,
              classify_adapt_feature_threshold, NO_DEBUG,
              matcher_debug_separate_windows);

    // No class-pruner rating is available here, so pass 0 for cp_rating.
    ExpandShapesAndApplyCorrections(NULL, debug, ClassId, bottom, top, 0,
                                    Results->BlobLength, CharNormArray,
                                    IntResult, Results);
    Ambiguities++;

    NumAmbigClassesTried++;
  }
  delete [] CharNormArray;
}                                /* AmbigClassifier */
01248 
01249 /*---------------------------------------------------------------------------*/
01252 void Classify::MasterMatcher(INT_TEMPLATES templates,
01253                              inT16 num_features,
01254                              const INT_FEATURE_STRUCT* features,
01255                              const uinT8* norm_factors,
01256                              ADAPT_CLASS* classes,
01257                              int debug,
01258                              int num_classes,
01259                              const TBOX& blob_box,
01260                              CLASS_PRUNER_RESULTS results,
01261                              ADAPT_RESULTS* final_results) {
01262   int top = blob_box.top();
01263   int bottom = blob_box.bottom();
01264   for (int c = 0; c < num_classes; c++) {
01265     CLASS_ID class_id = results[c].Class;
01266     INT_RESULT_STRUCT& int_result = results[c].IMResult;
01267     BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
01268                                         : AllProtosOn;
01269     BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
01270                                          : AllConfigsOn;
01271 
01272     im_.Match(ClassForClassId(templates, class_id),
01273               protos, configs,
01274               num_features, features,
01275               &int_result, classify_adapt_feature_threshold, debug,
01276               matcher_debug_separate_windows);
01277     bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
01278     ExpandShapesAndApplyCorrections(classes, debug, class_id, bottom, top,
01279                                     results[c].Rating,
01280                                     final_results->BlobLength, norm_factors,
01281                                     int_result, final_results);
01282   }
01283 }
01284 
01285 // Converts configs to fonts, and if the result is not adapted, and a
01286 // shape_table_ is present, the shape is expanded to include all
01287 // unichar_ids represented, before applying a set of corrections to the
01288 // distance rating in int_result, (see ComputeCorrectedRating.)
01289 // The results are added to the final_results output.
void Classify::ExpandShapesAndApplyCorrections(
    ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
    float cp_rating, int blob_length, const uinT8* cn_factors,
    INT_RESULT_STRUCT& int_result, ADAPT_RESULTS* final_results) {
  // Compute the fontinfo_ids.
  int fontinfo_id = kBlankFontinfoId;
  int fontinfo_id2 = kBlankFontinfoId;
  if (classes != NULL) {
    // Adapted result.
    fontinfo_id = GetFontinfoId(classes[class_id], int_result.Config);
    if (int_result.Config2 >= 0)
      fontinfo_id2 = GetFontinfoId(classes[class_id], int_result.Config2);
  } else {
    // Pre-trained result.
    fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, int_result.Config);
    if (int_result.Config2 >= 0) {
      fontinfo_id2 = ClassAndConfigIDToFontOrShapeID(class_id,
                                                     int_result.Config2);
    }
    if (shape_table_ != NULL) {
      // Actually fontinfo_id is an index into the shape_table_ and it
      // contains a list of unchar_id/font_id pairs.
      int shape_id = fontinfo_id;
      const Shape& shape = shape_table_->GetShape(fontinfo_id);
      double min_rating = 0.0;
      // Expand the shape: emit one result per contained unichar.
      for (int c = 0; c < shape.size(); ++c) {
        int unichar_id = shape[c].unichar_id;
        // First font of this entry; the second font comes from the entry
        // itself if present, otherwise map the second shape id (computed
        // above from Config2) to its first font.
        fontinfo_id = shape[c].font_ids[0];
        if (shape[c].font_ids.size() > 1)
          fontinfo_id2 = shape[c].font_ids[1];
        else if (fontinfo_id2 != kBlankFontinfoId)
          fontinfo_id2 = shape_table_->GetShape(fontinfo_id2)[0].font_ids[0];
        double rating = ComputeCorrectedRating(debug, unichar_id, cp_rating,
                                               int_result.Rating,
                                               int_result.FeatureMisses,
                                               bottom, top, blob_length,
                                               cn_factors);
        // Track the best (lowest) corrected rating over the whole shape.
        if (c == 0 || rating < min_rating)
          min_rating = rating;
        if (unicharset.get_enabled(unichar_id)) {
          AddNewResult(final_results, unichar_id, shape_id, rating,
                       classes != NULL, int_result.Config,
                       fontinfo_id, fontinfo_id2);
        }
      }
      // Report the shape's best rating back through int_result.
      int_result.Rating = min_rating;
      return;
    }
  }
  // No shape expansion: apply corrections to the single class directly.
  double rating = ComputeCorrectedRating(debug, class_id, cp_rating,
                                         int_result.Rating,
                                         int_result.FeatureMisses,
                                         bottom, top, blob_length,
                                         cn_factors);
  if (unicharset.get_enabled(class_id)) {
    AddNewResult(final_results, class_id, -1, rating,
                 classes != NULL, int_result.Config,
                 fontinfo_id, fontinfo_id2);
  }
  int_result.Rating = rating;
}
01351 
01352 // Applies a set of corrections to the distance im_rating,
01353 // including the cn_correction, miss penalty and additional penalty
01354 // for non-alnums being vertical misfits. Returns the corrected distance.
01355 double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
01356                                         double cp_rating, double im_rating,
01357                                         int feature_misses,
01358                                         int bottom, int top,
01359                                         int blob_length,
01360                                         const uinT8* cn_factors) {
01361   // Compute class feature corrections.
01362   double cn_corrected = im_.ApplyCNCorrection(im_rating, blob_length,
01363                                               cn_factors[unichar_id]);
01364   double miss_penalty = tessedit_class_miss_scale * feature_misses;
01365   double vertical_penalty = 0.0;
01366   // Penalize non-alnums for being vertical misfits.
01367   if (!unicharset.get_isalpha(unichar_id) &&
01368       !unicharset.get_isdigit(unichar_id) &&
01369       cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
01370     int min_bottom, max_bottom, min_top, max_top;
01371     unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
01372                               &min_top, &max_top);
01373     if (debug) {
01374       tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
01375               top, min_top, max_top, bottom, min_bottom, max_bottom);
01376     }
01377     if (top < min_top || top > max_top ||
01378         bottom < min_bottom || bottom > max_bottom) {
01379       vertical_penalty = classify_misfit_junk_penalty;
01380     }
01381   }
01382   double result =cn_corrected + miss_penalty + vertical_penalty;
01383   if (result > WORST_POSSIBLE_RATING)
01384     result = WORST_POSSIBLE_RATING;
01385   if (debug) {
01386     tprintf("%s: %2.1f(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
01387             unicharset.id_to_unichar(unichar_id),
01388             result * 100.0,
01389             cp_rating * 100.0,
01390             im_rating * 100.0,
01391             (cn_corrected - im_rating) * 100.0,
01392             cn_factors[unichar_id],
01393             miss_penalty * 100.0,
01394             vertical_penalty * 100.0);
01395   }
01396   return result;
01397 }
01398 
01399 /*---------------------------------------------------------------------------*/
/*
 * Classifies Blob against the adapted templates using the baseline
 * normalization match and accumulates candidates into Results.
 *
 * Blob      - blob to classify
 * denorm    - normalization applied to the blob
 * Templates - adapted templates to match against
 * Results   - accumulator for match results
 *
 * Returns the ambiguity list of the best matching class's permanent
 * config, or NULL if there was no match at all.
 */
UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
                                         const DENORM& denorm,
                                         ADAPT_TEMPLATES Templates,
                                         ADAPT_RESULTS *Results) {
  int NumFeatures;
  int NumClasses;
  INT_FEATURE_ARRAY IntFeatures;
  uinT8* CharNormArray = new uinT8[unicharset.size()];
  CLASS_ID ClassId;

  BaselineClassifierCalls++;

  NumFeatures = GetBaselineFeatures(
      Blob, denorm, Templates->Templates, IntFeatures, CharNormArray,
      &(Results->BlobLength));
  if (NumFeatures <= 0) {
    // No usable features; nothing to match.
    delete [] CharNormArray;
    return NULL;
  }

  // Class pruner narrows the field before the full integer match.
  NumClasses = PruneClasses(Templates->Templates, NumFeatures, IntFeatures,
                            CharNormArray, BaselineCutoffs, Results->CPResults);

  NumBaselineClassesTried += NumClasses;

  if (matcher_debug_level >= 2 || classify_debug_level > 1)
    cprintf ("BL Matches =  ");

  im_.SetBaseLineMatch();
  MasterMatcher(Templates->Templates, NumFeatures, IntFeatures, CharNormArray,
                Templates->Class, matcher_debug_flags, NumClasses,
                Blob->bounding_box(), Results->CPResults, Results);

  delete [] CharNormArray;
  ClassId = Results->best_match.unichar_id;
  if (ClassId == NO_CLASS)
    return (NULL);
  /* this is a bug - maybe should return "" */

  // Hand back the ambiguity list attached to the winning config.
  return Templates->Class[ClassId]->
      Config[Results->best_match.config].Perm->Ambigs;
}                                /* BaselineClassifier */
01460 
01461 
01462 /*---------------------------------------------------------------------------*/
/*
 * Classifies Blob against Templates using the character-normalization
 * match and accumulates candidates into Results.
 *
 * Blob      - blob to classify
 * denorm    - normalization applied to the blob
 * Templates - integer templates to match against
 * Results   - accumulator for match results
 *
 * Returns the number of features extracted (0 on failure).
 */
int Classify::CharNormClassifier(TBLOB *Blob,
                                 const DENORM& denorm,
                                 INT_TEMPLATES Templates,
                                 ADAPT_RESULTS *Results) {
  int NumFeatures;
  int NumClasses;
  INT_FEATURE_ARRAY IntFeatures;

  CharNormClassifierCalls++;

  uinT8* CharNormArray = new uinT8[unicharset.size()];
  // The pruner may index by shape id, which can exceed the unicharset
  // size, so size its array by the larger of the two.
  int num_pruner_classes = MAX(unicharset.size(),
                               PreTrainedTemplates->NumClasses);
  uinT8* PrunerNormArray = new uinT8[num_pruner_classes];
  NumFeatures = GetCharNormFeatures(Blob, denorm, Templates, IntFeatures,
                                    PrunerNormArray, CharNormArray,
                                    &(Results->BlobLength), NULL);
  if (NumFeatures <= 0) {
    // No usable features; clean up both scratch arrays and give up.
    delete [] CharNormArray;
    delete [] PrunerNormArray;
    return 0;
  }

  NumClasses = PruneClasses(Templates, NumFeatures, IntFeatures,
                            PrunerNormArray,
                            shape_table_ != NULL ? &shapetable_cutoffs_[0]
                                                 : CharNormCutoffs,
                            Results->CPResults);

  if (tessedit_single_match && NumClasses > 1)
    NumClasses = 1;
  NumCharNormClassesTried += NumClasses;

  im_.SetCharNormMatch(classify_integer_matcher_multiplier);
  MasterMatcher(Templates, NumFeatures, IntFeatures, CharNormArray,
                NULL, matcher_debug_flags, NumClasses,
                Blob->bounding_box(), Results->CPResults, Results);
  delete [] CharNormArray;
  delete [] PrunerNormArray;
  return NumFeatures;
}                                /* CharNormClassifier */
01523 
01524 // As CharNormClassifier, but operates on a TrainingSample and outputs to
01525 // a GenericVector of ShapeRating without conversion to classes.
int Classify::CharNormTrainingSample(bool pruner_only,
                                     const TrainingSample& sample,
                                     GenericVector<ShapeRating>* results) {
  results->clear();
  ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
  adapt_results->Initialize();
  // Compute the bounding box of the features.
  int num_features = sample.num_features();
  TBOX blob_box;
  for (int f = 0; f < num_features; ++f) {
    const INT_FEATURE_STRUCT feature = sample.features()[f];
    TBOX fbox(feature.X, feature.Y, feature.X, feature.Y);
    blob_box += fbox;
  }
  // Compute the char_norm_array from the saved cn_feature.
  FEATURE norm_feature = NewFeature(&CharNormDesc);
  norm_feature->Params[CharNormY] = sample.cn_feature(CharNormY);
  norm_feature->Params[CharNormLength] = sample.cn_feature(CharNormLength);
  norm_feature->Params[CharNormRx] = sample.cn_feature(CharNormRx);
  norm_feature->Params[CharNormRy] = sample.cn_feature(CharNormRy);
  uinT8* char_norm_array = new uinT8[unicharset.size()];
  // Pruner array may be indexed by shape id, which can exceed the
  // unicharset size.
  int num_pruner_classes = MAX(unicharset.size(),
                               PreTrainedTemplates->NumClasses);
  uinT8* pruner_norm_array = new uinT8[num_pruner_classes];
  adapt_results->BlobLength =
      static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
  // NOTE(review): norm_feature is not freed here - presumably
  // ComputeCharNormArrays takes ownership and frees it; confirm to rule
  // out a leak.
  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
                        pruner_norm_array);

  int num_classes = PruneClasses(PreTrainedTemplates, num_features,
                                 sample.features(),
                                 pruner_norm_array,
                                 shape_table_ != NULL ? &shapetable_cutoffs_[0]
                                                      : CharNormCutoffs,
                                 adapt_results->CPResults);
  delete [] pruner_norm_array;
  if (pruner_only) {
    // Convert pruner results to output format.
    for (int i = 0; i < num_classes; ++i) {
      int class_id = adapt_results->CPResults[i].Class;
      int shape_id = class_id;
      if (shape_table_ != NULL) {
        // All shapes in a class have the same combination of unichars, so
        // it doesn't really matter which config we give it, as we aren't
        // trying to get the font here.
        shape_id = ClassAndConfigIDToFontOrShapeID(class_id, 0);
      }
      results->push_back(
          ShapeRating(shape_id, 1.0f - adapt_results->CPResults[i].Rating));
    }
  } else {
    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
                  char_norm_array,
                  NULL, matcher_debug_flags, num_classes,
                  blob_box, adapt_results->CPResults, adapt_results);
    // Convert master matcher results to output format.
    for (int i = 0; i < adapt_results->NumMatches; i++) {
      ScoredClass next = adapt_results->match[i];
      results->push_back(ShapeRating(next.shape_id, 1.0f - next.rating));
    }
    results->sort(&ShapeRating::SortDescendingRating);
  }
  delete [] char_norm_array;
  delete adapt_results;
  return num_features;
}                                /* CharNormTrainingSample */
01593 
01594 
01595 /*---------------------------------------------------------------------------*/
01610 void Classify::ClassifyAsNoise(ADAPT_RESULTS *Results) {
01611   register FLOAT32 Rating;
01612 
01613   Rating = Results->BlobLength / matcher_avg_noise_size;
01614   Rating *= Rating;
01615   Rating /= 1.0 + Rating;
01616 
01617   AddNewResult(Results, NO_CLASS, -1, Rating, false, -1,
01618                kBlankFontinfoId, kBlankFontinfoId);
01619 }                                /* ClassifyAsNoise */
01620 }  // namespace tesseract
01621 
01622 
01623 /*---------------------------------------------------------------------------*/
01624 // Return a pointer to the scored unichar in results, or NULL if not present.
01625 ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
01626   for (int i = 0; i < results->NumMatches; i++) {
01627     if (results->match[i].unichar_id == id)
01628       return &results->match[i];
01629   }
01630   return NULL;
01631 }
01632 
01633 // Retrieve the current rating for a unichar id if we have rated it, defaulting
01634 // to WORST_POSSIBLE_RATING.
01635 ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) {
01636   ScoredClass poor_result =
01637       {id, -1, WORST_POSSIBLE_RATING, false, -1,
01638           kBlankFontinfoId, kBlankFontinfoId};
01639   ScoredClass *entry = FindScoredUnichar(results, id);
01640   return (entry == NULL) ? poor_result : *entry;
01641 }
01642 
01643 // Compare character classes by rating as for qsort(3).
01644 // For repeatability, use character class id as a tie-breaker.
01645 int CompareByRating(const void *arg1,    // ScoredClass *class1
01646                     const void *arg2) {  // ScoredClass *class2
01647   const ScoredClass *class1 = (const ScoredClass *)arg1;
01648   const ScoredClass *class2 = (const ScoredClass *)arg2;
01649 
01650   if (class1->rating < class2->rating)
01651     return -1;
01652   else if (class1->rating > class2->rating)
01653     return 1;
01654 
01655   if (class1->unichar_id < class2->unichar_id)
01656     return -1;
01657   else if (class1->unichar_id > class2->unichar_id)
01658     return 1;
01659   return 0;
01660 }
01661 
01662 /*---------------------------------------------------------------------------*/
01663 namespace tesseract {
/*
 * Converts the matches accumulated in Results into BLOB_CHOICEs appended
 * to Choices, scaling ratings/certainties by the blob length, and
 * ensuring at least one non-fragmented character makes it into the list.
 * Results->NumMatches is updated to the number of choices emitted.
 */
void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
                                       ADAPT_RESULTS *Results,
                                       BLOB_CHOICE_LIST *Choices) {
  assert(Choices != NULL);
  FLOAT32 Rating;
  FLOAT32 Certainty;
  BLOB_CHOICE_IT temp_it;
  bool contains_nonfrag = false;
  temp_it.set_to_list(Choices);
  int choices_length = 0;
  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
  // number of returned results, but with a shape_table_ we want to have room
  // for at least the biggest shape (which might contain hundreds of Indic
  // grapheme fragments) and more, so use double the size of the biggest shape
  // if that is more than the default.
  int max_matches = MAX_MATCHES;
  if (shape_table_ != NULL) {
    max_matches = shape_table_->MaxNumUnichars() * 2;
    if (max_matches < MAX_MATCHES)
      max_matches = MAX_MATCHES;
  }

  for (int i = 0; i < Results->NumMatches; i++) {
    ScoredClass next = Results->match[i];
    int fontinfo_id = next.fontinfo_id;
    int fontinfo_id2 = next.fontinfo_id2;
    bool adapted = next.adapted;
    bool current_is_frag = (unicharset.get_fragment(next.unichar_id) != NULL);
    if (temp_it.length()+1 == max_matches &&
        !contains_nonfrag && current_is_frag) {
      continue;  // look for a non-fragmented character to fill the
                 // last spot in Choices if only fragments are present
    }
    // BlobLength can never be legally 0, this means recognition failed.
    // But we must return a classification result because some invoking
    // functions (chopper/permuter) do not anticipate a null blob choice.
    // So we need to assign a poor, but not infinitely bad score.
    if (Results->BlobLength == 0) {
      Certainty = -20;
      Rating = 100;    // should be -certainty * real_blob_length
    } else {
      // Scale the raw match rating by blob length, and flip the sign of
      // the certainty (higher rating distance => more negative certainty).
      Rating = Certainty = next.rating;
      Rating *= rating_scale * Results->BlobLength;
      Certainty *= -(getDict().certainty_scale);
    }
    // The permissible x-height range tells later stages how this choice
    // constrains the character height.
    inT16 min_xheight, max_xheight;
    denorm.XHeightRange(next.unichar_id, unicharset, box,
                        &min_xheight, &max_xheight);
    temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty,
                                        fontinfo_id, fontinfo_id2,
                                        unicharset.get_script(next.unichar_id),
                                        min_xheight, max_xheight, adapted));
    contains_nonfrag |= !current_is_frag;  // update contains_nonfrag
    choices_length++;
    if (choices_length >= max_matches) break;
  }
  // Record how many matches were actually turned into choices.
  Results->NumMatches = choices_length;
}  // ConvertMatchesToChoices
01728 
01729 
01730 /*---------------------------------------------------------------------------*/
01731 #ifndef GRAPHICS_DISABLED
01732 
01743 void Classify::DebugAdaptiveClassifier(TBLOB *Blob,
01744                                        const DENORM& denorm,
01745                                        ADAPT_RESULTS *Results) {
01746   for (int i = 0; i < Results->NumMatches; i++) {
01747     if (Results->match[i].rating < Results->best_match.rating)
01748       Results->best_match = Results->match[i];
01749   }
01750   const char *Prompt =
01751     "Left-click in IntegerMatch Window to continue or right click to debug...";
01752   CLASS_ID unichar_id = Results->best_match.unichar_id;
01753   int shape_id = Results->best_match.shape_id;
01754   bool adaptive_on = true;
01755   bool pretrained_on = true;
01756 
01757   const char* debug_mode;
01758   do {
01759     if (!pretrained_on)
01760       debug_mode = "Adaptive Templates Only";
01761     else if (!adaptive_on)
01762       debug_mode = "PreTrained Templates Only";
01763     else
01764       debug_mode = "All Templates";
01765     ShowMatchDisplay();
01766     tprintf("Debugging class %d = %s in mode %s ...",
01767             unichar_id, unicharset.id_to_unichar(unichar_id), debug_mode);
01768     if (shape_id >= 0 && shape_table_ != NULL) {
01769       tprintf(" from shape %s\n", shape_table_->DebugStr(shape_id).string());
01770     }
01771     ShowBestMatchFor(Blob, denorm, unichar_id, shape_id, adaptive_on,
01772                      pretrained_on, Results);
01773     UpdateMatchDisplay();
01774   } while ((unichar_id = GetClassToDebug(Prompt, &adaptive_on,
01775                                          &pretrained_on, &shape_id)) != 0);
01776 }                                /* DebugAdaptiveClassifier */
01777 #endif
01778 
01779 /*---------------------------------------------------------------------------*/
01803 void Classify::DoAdaptiveMatch(TBLOB *Blob,
01804                                const DENORM& denorm,
01805                                ADAPT_RESULTS *Results) {
01806   UNICHAR_ID *Ambiguities;
01807 
01808   AdaptiveMatcherCalls++;
01809   InitIntFX();
01810 
01811   if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
01812       tess_cn_matching) {
01813     CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
01814   } else {
01815     Ambiguities = BaselineClassifier(Blob, denorm, AdaptedTemplates, Results);
01816     if ((Results->NumMatches > 0 &&
01817          MarginalMatch (Results->best_match.rating) &&
01818          !tess_bn_matching) ||
01819         Results->NumMatches == 0) {
01820       CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
01821     } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
01822       AmbigClassifier(Blob, denorm,
01823                       PreTrainedTemplates,
01824                       AdaptedTemplates->Class,
01825                       Ambiguities,
01826                       Results);
01827     }
01828   }
01829 
01830   // Force the blob to be classified as noise
01831   // if the results contain only fragments.
01832   // TODO(daria): verify that this is better than
01833   // just adding a NULL classification.
01834   if (!Results->HasNonfragment || Results->NumMatches == 0)
01835     ClassifyAsNoise(Results);
01836 }   /* DoAdaptiveMatch */
01837 
01838 /*---------------------------------------------------------------------------*/
/**
 * Fills Thresholds with the adaptation thresholds for the blobs of Word.
 * Thin wrapper: the actual computation is delegated to the dictionary's
 * classifier-error analysis.  Word, denorm, BestChoice and BestRawChoice
 * are unused here but kept for interface compatibility with callers.
 *
 * @param Thresholds output array of per-blob adaptation thresholds
 */
void Classify::GetAdaptThresholds(TWERD * Word,
                                  const DENORM& denorm,
                                  const WERD_CHOICE& BestChoice,
                                  const WERD_CHOICE& BestRawChoice,
                                  FLOAT32 Thresholds[]) {
  getDict().FindClassifierErrors(matcher_perfect_threshold,
                                 matcher_good_threshold,
                                 matcher_rating_margin,
                                 Thresholds);
}                              /* GetAdaptThresholds */
01874 
01875 /*---------------------------------------------------------------------------*/
01893 UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob,
01894                                      const DENORM& denorm,
01895                                      CLASS_ID CorrectClass) {
01896   ADAPT_RESULTS *Results = new ADAPT_RESULTS();
01897   UNICHAR_ID *Ambiguities;
01898   int i;
01899 
01900   Results->Initialize();
01901 
01902   CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
01903   RemoveBadMatches(Results);
01904   qsort((void *)Results->match, Results->NumMatches,
01905         sizeof(ScoredClass), CompareByRating);
01906 
01907   /* copy the class id's into an string of ambiguities - don't copy if
01908      the correct class is the only class id matched */
01909   Ambiguities = (UNICHAR_ID *) Emalloc (sizeof (UNICHAR_ID) *
01910                                         (Results->NumMatches + 1));
01911   if (Results->NumMatches > 1 ||
01912       (Results->NumMatches == 1 &&
01913           Results->match[0].unichar_id != CorrectClass)) {
01914     for (i = 0; i < Results->NumMatches; i++)
01915       Ambiguities[i] = Results->match[i].unichar_id;
01916     Ambiguities[i] = -1;
01917   } else {
01918     Ambiguities[0] = -1;
01919   }
01920 
01921   delete Results;
01922   return Ambiguities;
01923 }                              /* GetAmbiguities */
01924 
01925 /*---------------------------------------------------------------------------*/
01952 int Classify::GetBaselineFeatures(TBLOB *Blob,
01953                                   const DENORM& denorm,
01954                                   INT_TEMPLATES Templates,
01955                                   INT_FEATURE_ARRAY IntFeatures,
01956                                   uinT8* CharNormArray,
01957                                   inT32 *BlobLength) {
01958   register INT_FEATURE Src, Dest, End;
01959 
01960   if (!FeaturesHaveBeenExtracted) {
01961     FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
01962                                 CharNormFeatures, &FXInfo, NULL);
01963     FeaturesHaveBeenExtracted = TRUE;
01964   }
01965 
01966   if (!FeaturesOK) {
01967     *BlobLength = FXInfo.NumBL;
01968     return 0;
01969   }
01970 
01971   for (Src = BaselineFeatures, End = Src + FXInfo.NumBL, Dest = IntFeatures;
01972        Src < End;
01973        *Dest++ = *Src++);
01974 
01975   ClearCharNormArray(CharNormArray);
01976   *BlobLength = FXInfo.NumBL;
01977   return FXInfo.NumBL;
01978 }                              /* GetBaselineFeatures */
01979 
// Invalidates the per-blob feature cache so that the next call to
// GetBaselineFeatures()/GetCharNormFeatures() re-runs feature extraction.
void Classify::ResetFeaturesHaveBeenExtracted() {
  FeaturesHaveBeenExtracted = FALSE;
}
01983 
01984 // Returns true if the given blob looks too dissimilar to any character
01985 // present in the classifier templates.
01986 bool Classify::LooksLikeGarbage(const DENORM& denorm, TBLOB *blob) {
01987   BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
01988   AdaptiveClassifier(blob, denorm, ratings, NULL);
01989   BLOB_CHOICE_IT ratings_it(ratings);
01990   const UNICHARSET &unicharset = getDict().getUnicharset();
01991   if (classify_debug_character_fragments) {
01992     print_ratings_list("======================\nLooksLikeGarbage() got ",
01993                        ratings, unicharset);
01994   }
01995   for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
01996        ratings_it.forward()) {
01997     if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != NULL) {
01998       continue;
01999     }
02000     delete ratings;
02001     return (ratings_it.data()->certainty() <
02002             classify_character_fragments_garbage_certainty_threshold);
02003   }
02004   delete ratings;
02005   return true;  // no whole characters in ratings
02006 }
02007 
02008 /*---------------------------------------------------------------------------*/
02040 int Classify::GetCharNormFeatures(TBLOB *Blob,
02041                                   const DENORM& denorm,
02042                                   INT_TEMPLATES Templates,
02043                                   INT_FEATURE_ARRAY IntFeatures,
02044                                   uinT8* PrunerNormArray,
02045                                   uinT8* CharNormArray,
02046                                   inT32 *BlobLength,
02047                                   inT32 *FeatureOutlineArray) {
02048   register INT_FEATURE Src, Dest, End;
02049   FEATURE NormFeature;
02050   FLOAT32 Baseline, Scale;
02051   inT32 FeatureOutlineIndex[MAX_NUM_INT_FEATURES];
02052 
02053   if (!FeaturesHaveBeenExtracted) {
02054     FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
02055                                 CharNormFeatures, &FXInfo,
02056                                 FeatureOutlineIndex);
02057     FeaturesHaveBeenExtracted = TRUE;
02058   }
02059 
02060   if (!FeaturesOK) {
02061     *BlobLength = FXInfo.NumBL;
02062     return (0);
02063   }
02064 
02065   for (Src = CharNormFeatures, End = Src + FXInfo.NumCN, Dest = IntFeatures;
02066        Src < End;
02067        *Dest++ = *Src++);
02068   for (int i = 0;  FeatureOutlineArray && i < FXInfo.NumCN; ++i) {
02069     FeatureOutlineArray[i] = FeatureOutlineIndex[i];
02070   }
02071 
02072   NormFeature = NewFeature(&CharNormDesc);
02073   Baseline = BASELINE_OFFSET;
02074   Scale = MF_SCALE_FACTOR;
02075   NormFeature->Params[CharNormY] = (FXInfo.Ymean - Baseline) * Scale;
02076   NormFeature->Params[CharNormLength] =
02077     FXInfo.Length * Scale / LENGTH_COMPRESSION;
02078   NormFeature->Params[CharNormRx] = FXInfo.Rx * Scale;
02079   NormFeature->Params[CharNormRy] = FXInfo.Ry * Scale;
02080   ComputeCharNormArrays(NormFeature, Templates, CharNormArray, PrunerNormArray);
02081   *BlobLength = FXInfo.NumBL;
02082   return (FXInfo.NumCN);
02083 }                              /* GetCharNormFeatures */
02084 
02085 // Computes the char_norm_array for the unicharset and, if not NULL, the
02086 // pruner_array as appropriate according to the existence of the shape_table.
02087 void Classify::ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
02088                                      INT_TEMPLATES_STRUCT* templates,
02089                                      uinT8* char_norm_array,
02090                                      uinT8* pruner_array) {
02091   ComputeIntCharNormArray(*norm_feature, char_norm_array);
02092   if (pruner_array != NULL) {
02093     if (shape_table_ == NULL) {
02094       ComputeIntCharNormArray(*norm_feature, pruner_array);
02095     } else {
02096       memset(pruner_array, MAX_UINT8,
02097              templates->NumClasses * sizeof(pruner_array[0]));
02098       // Each entry in the pruner norm array is the MIN of all the entries of
02099       // the corresponding unichars in the CharNormArray.
02100       for (int id = 0; id < templates->NumClasses; ++id) {
02101         int font_set_id = templates->Class[id]->font_set_id;
02102         const FontSet &fs = fontset_table_.get(font_set_id);
02103         for (int config = 0; config < fs.size; ++config) {
02104           const Shape& shape = shape_table_->GetShape(fs.configs[config]);
02105           for (int c = 0; c < shape.size(); ++c) {
02106             if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
02107               pruner_array[id] = char_norm_array[shape[c].unichar_id];
02108           }
02109         }
02110       }
02111     }
02112   }
02113   FreeFeature(norm_feature);
02114 }
02115 
02116 /*---------------------------------------------------------------------------*/
/**
 * Creates a new temporary config in the adapted class for ClassId from the
 * given blob features: existing protos of the class that match the features
 * well are reused, and new temporary protos are created to cover the
 * features that no existing proto matched.
 *
 * @param Templates      adapted templates to add the new config to
 * @param ClassId        class id the config is being made for
 * @param FontinfoId     font id recorded in the new config
 * @param NumFeatures    number of entries in Features
 * @param Features       integer features extracted from the blob
 * @param FloatFeatures  float features used to build new protos
 * @return The new config id, or -1 if the class already has the maximum
 *         number of configs or protos.
 */
int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
                           CLASS_ID ClassId,
                           int FontinfoId,
                           int NumFeatures,
                           INT_FEATURE_ARRAY Features,
                           FEATURE_SET FloatFeatures) {
  INT_CLASS IClass;
  ADAPT_CLASS Class;
  PROTO_ID OldProtos[MAX_NUM_PROTOS];
  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
  int NumOldProtos;
  int NumBadFeatures;
  int MaxProtoId, OldMaxProtoId;
  int BlobLength = 0;
  int MaskSize;
  int ConfigId;
  TEMP_CONFIG Config;
  int i;
  int debug_level = NO_DEBUG;

  if (classify_learning_debug_level >= 3)
    debug_level =
        PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;

  IClass = ClassForClassId(Templates->Templates, ClassId);
  Class = Templates->Class[ClassId];

  // A class cannot hold more than MAX_NUM_CONFIGS configs.
  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temporary config: maximum number exceeded.\n");
    return -1;
  }

  OldMaxProtoId = IClass->NumProtos - 1;

  // Find the existing protos that match the features well enough to reuse.
  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
                                    BlobLength, NumFeatures, Features,
                                    OldProtos, classify_adapt_proto_threshold,
                                    debug_level);

  // Mark the reusable protos in TempProtoMask.
  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
  zero_all_bits(TempProtoMask, MaskSize);
  for (i = 0; i < NumOldProtos; i++)
    SET_BIT(TempProtoMask, OldProtos[i]);

  // Find the features that are not covered by the reusable protos.
  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
                                       BlobLength, NumFeatures, Features,
                                       BadFeatures,
                                       classify_adapt_feature_threshold,
                                       debug_level);

  // Build new temporary protos covering the unmatched features; they are
  // also set in TempProtoMask.
  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
                                 IClass, Class, TempProtoMask);
  if (MaxProtoId == NO_PROTO) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temp protos: maximum number exceeded.\n");
    return -1;
  }

  // Register the combined proto set (old + new) as a new config.
  ConfigId = AddIntConfig(IClass);
  ConvertConfig(TempProtoMask, ConfigId, IClass);
  Config = NewTempConfig(MaxProtoId, FontinfoId);
  TempConfigFor(Class, ConfigId) = Config;
  copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);

  if (classify_learning_debug_level >= 1)
    cprintf("Making new temp config %d fontinfo id %d"
            " using %d old and %d new protos.\n",
            ConfigId, Config->FontinfoId,
            NumOldProtos, MaxProtoId - OldMaxProtoId);

  return ConfigId;
}                              /* MakeNewTemporaryConfig */
02206 
02207 /*---------------------------------------------------------------------------*/
/**
 * Builds new temporary protos from the given list of badly-matched
 * features.  Consecutive bad features are grouped into a run while each
 * next feature stays within tolerance in direction and within the
 * accumulated segment length in position; each run becomes one new proto
 * (a line segment from the first to the last feature of the run), added to
 * IClass, to Class->TempProtos and to TempProtoMask.
 *
 * @param Features       float (pico) features of the blob
 * @param NumBadFeat     number of entries in BadFeat
 * @param BadFeat        indices of the features to cover with new protos
 * @param IClass         integer class to add the protos to
 * @param Class          adapted class whose TempProtos list receives them
 * @param TempProtoMask  bit vector in which the new protos are marked
 * @return Id of the highest numbered proto in IClass, or NO_PROTO if a
 *         proto could not be allocated.
 */
PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features,
                                     int NumBadFeat,
                                     FEATURE_ID BadFeat[],
                                     INT_CLASS IClass,
                                     ADAPT_CLASS Class,
                                     BIT_VECTOR TempProtoMask) {
  FEATURE_ID *ProtoStart;
  FEATURE_ID *ProtoEnd;
  FEATURE_ID *LastBad;
  TEMP_PROTO TempProto;
  PROTO Proto;
  FEATURE F1, F2;
  FLOAT32 X1, X2, Y1, Y2;
  FLOAT32 A1, A2, AngleDelta;
  FLOAT32 SegmentLength;
  PROTO_ID Pid;

  // Each outer iteration consumes one run [ProtoStart, ProtoEnd) of the
  // bad-feature list and turns it into a single proto.
  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
       ProtoStart < LastBad; ProtoStart = ProtoEnd) {
    F1 = Features->Features[*ProtoStart];
    X1 = F1->Params[PicoFeatX];
    Y1 = F1->Params[PicoFeatY];
    A1 = F1->Params[PicoFeatDir];

    // Extend the run while the next feature's direction and position stay
    // within tolerance; SegmentLength grows by one pico-feature length per
    // feature included.
    for (ProtoEnd = ProtoStart + 1,
         SegmentLength = GetPicoFeatureLength();
         ProtoEnd < LastBad;
         ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
      F2 = Features->Features[*ProtoEnd];
      X2 = F2->Params[PicoFeatX];
      Y2 = F2->Params[PicoFeatY];
      A2 = F2->Params[PicoFeatDir];

      // Directions wrap around at 1.0, so use the shorter angular distance.
      AngleDelta = fabs(A1 - A2);
      if (AngleDelta > 0.5)
        AngleDelta = 1.0 - AngleDelta;

      if (AngleDelta > matcher_clustering_max_angle_delta ||
          fabs(X1 - X2) > SegmentLength ||
          fabs(Y1 - Y2) > SegmentLength)
        break;
    }

    // End point of the run is the last feature actually included.
    F2 = Features->Features[*(ProtoEnd - 1)];
    X2 = F2->Params[PicoFeatX];
    Y2 = F2->Params[PicoFeatY];
    A2 = F2->Params[PicoFeatDir];

    Pid = AddIntProto(IClass);
    if (Pid == NO_PROTO)
      return (NO_PROTO);

    TempProto = NewTempProto();
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Length = SegmentLength;
    Proto->Angle = A1;
    Proto->X = (X1 + X2) / 2.0;
    Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(TempProtoMask, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);
  }
  return IClass->NumProtos - 1;
}                              /* MakeNewTempProtos */
02303 
02304 /*---------------------------------------------------------------------------*/
/**
 * Promotes the temporary config ConfigId of class ClassId to a permanent
 * config: computes its ambiguity list from Blob, replaces the TEMP_CONFIG
 * with a PERM_CONFIG, and makes the protos used by the config permanent.
 *
 * @param Templates  adapted templates holding the config
 * @param ClassId    class the config belongs to
 * @param ConfigId   config to make permanent
 * @param denorm     normalization info used to compute the ambiguities
 * @param Blob       blob the config was adapted to
 */
void Classify::MakePermanent(ADAPT_TEMPLATES Templates,
                             CLASS_ID ClassId,
                             int ConfigId,
                             const DENORM& denorm,
                             TBLOB *Blob) {
  UNICHAR_ID *Ambigs;
  TEMP_CONFIG Config;
  ADAPT_CLASS Class;
  PROTO_KEY ProtoKey;

  Class = Templates->Class[ClassId];
  Config = TempConfigFor(Class, ConfigId);

  MakeConfigPermanent(Class, ConfigId);
  // First permanent config in this class makes the whole class count as
  // permanent.
  if (Class->NumPermConfigs == 0)
    Templates->NumPermClasses++;
  Class->NumPermConfigs++;

  // Initialize permanent config.
  Ambigs = GetAmbiguities(Blob, denorm, ClassId);
  PERM_CONFIG Perm = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT),
                                                "PERM_CONFIG_STRUCT");
  Perm->Ambigs = Ambigs;
  Perm->FontinfoId = Config->FontinfoId;

  // Free memory associated with temporary config (since ADAPTED_CONFIG
  // is a union we need to clean up before we record permanent config).
  ProtoKey.Templates = Templates;
  ProtoKey.ClassId = ClassId;
  ProtoKey.ConfigId = ConfigId;
  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
  FreeTempConfig(Config);

  // Record permanent config.
  PermConfigFor(Class, ConfigId) = Perm;

  if (classify_learning_debug_level >= 1) {
    tprintf("Making config %d for %s (ClassId %d) permanent:"
            " fontinfo id %d, ambiguities '",
            ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
            ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
    // Ambigs is a -1 terminated list (see GetAmbiguities).
    for (UNICHAR_ID *AmbigsPointer = Ambigs;
        *AmbigsPointer >= 0; ++AmbigsPointer)
      tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
    tprintf("'.\n");
  }
}                              /* MakePermanent */
02365 }  // namespace tesseract
02366 
02367 /*---------------------------------------------------------------------------*/
02382 int MakeTempProtoPerm(void *item1, void *item2) {
02383   ADAPT_CLASS Class;
02384   TEMP_CONFIG Config;
02385   TEMP_PROTO TempProto;
02386   PROTO_KEY *ProtoKey;
02387 
02388   TempProto = (TEMP_PROTO) item1;
02389   ProtoKey = (PROTO_KEY *) item2;
02390 
02391   Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
02392   Config = TempConfigFor(Class, ProtoKey->ConfigId);
02393 
02394   if (TempProto->ProtoId > Config->MaxProtoId ||
02395       !test_bit (Config->Protos, TempProto->ProtoId))
02396     return FALSE;
02397 
02398   MakeProtoPermanent(Class, TempProto->ProtoId);
02399   AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
02400                          ProtoKey->Templates->Templates);
02401   FreeTempProto(TempProto);
02402 
02403   return TRUE;
02404 }                              /* MakeTempProtoPerm */
02405 
02406 /*---------------------------------------------------------------------------*/
02407 namespace tesseract {
02419 void Classify::PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results) {
02420   for (int i = 0; i < Results->NumMatches; ++i) {
02421     tprintf("%s(%d), shape %d, %.2f  ",
02422             unicharset.debug_str(Results->match[i].unichar_id).string(),
02423             Results->match[i].unichar_id, Results->match[i].shape_id,
02424             Results->match[i].rating * 100.0);
02425   }
02426   tprintf("\n");
02427 }                              /* PrintAdaptiveMatchResults */
02428 
02429 /*---------------------------------------------------------------------------*/
02445 void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
02446   int Next, NextGood;
02447   FLOAT32 BadMatchThreshold;
02448   static const char* romans = "i v x I V X";
02449   BadMatchThreshold = Results->best_match.rating + matcher_bad_match_pad;
02450 
02451   if (classify_bln_numeric_mode) {
02452     UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
02453         unicharset.unichar_to_id("1") : -1;
02454     UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
02455         unicharset.unichar_to_id("0") : -1;
02456     ScoredClass scored_one = ScoredUnichar(Results, unichar_id_one);
02457     ScoredClass scored_zero = ScoredUnichar(Results, unichar_id_zero);
02458 
02459     for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
02460       if (Results->match[Next].rating <= BadMatchThreshold) {
02461         ScoredClass match = Results->match[Next];
02462         if (!unicharset.get_isalpha(match.unichar_id) ||
02463             strstr(romans,
02464                    unicharset.id_to_unichar(match.unichar_id)) != NULL) {
02465           Results->match[NextGood++] = Results->match[Next];
02466         } else if (unicharset.eq(match.unichar_id, "l") &&
02467                    scored_one.rating >= BadMatchThreshold) {
02468           Results->match[NextGood] = scored_one;
02469           Results->match[NextGood].rating = match.rating;
02470           NextGood++;
02471         } else if (unicharset.eq(match.unichar_id, "O") &&
02472                    scored_zero.rating >= BadMatchThreshold) {
02473           Results->match[NextGood] = scored_zero;
02474           Results->match[NextGood].rating = match.rating;
02475           NextGood++;
02476         }
02477       }
02478     }
02479   } else {
02480     for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
02481       if (Results->match[Next].rating <= BadMatchThreshold)
02482         Results->match[NextGood++] = Results->match[Next];
02483     }
02484   }
02485   Results->NumMatches = NextGood;
02486 }                              /* RemoveBadMatches */
02487 
02488 /*----------------------------------------------------------------------------*/
02504 void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
02505   int Next, NextGood;
02506   int punc_count;              /*no of garbage characters */
02507   int digit_count;
02508   /*garbage characters */
02509   static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
02510   static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
02511 
02512   punc_count = 0;
02513   digit_count = 0;
02514   for (Next = NextGood = 0; Next < Results->NumMatches; Next++) {
02515     ScoredClass match = Results->match[Next];
02516     if (strstr(punc_chars,
02517                unicharset.id_to_unichar(match.unichar_id)) != NULL) {
02518       if (punc_count < 2)
02519         Results->match[NextGood++] = match;
02520       punc_count++;
02521     } else {
02522       if (strstr(digit_chars,
02523                  unicharset.id_to_unichar(match.unichar_id)) != NULL) {
02524         if (digit_count < 1)
02525           Results->match[NextGood++] = match;
02526         digit_count++;
02527       } else {
02528         Results->match[NextGood++] = match;
02529       }
02530     }
02531   }
02532   Results->NumMatches = NextGood;
02533 }                              /* RemoveExtraPuncs */
02534 
02535 /*---------------------------------------------------------------------------*/
02549 void Classify::SetAdaptiveThreshold(FLOAT32 Threshold) {
02550   Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
02551   classify_adapt_proto_threshold.set_value(
02552       ClipToRange<int>(255 * Threshold, 0, 255));
02553   classify_adapt_feature_threshold.set_value(
02554       ClipToRange<int>(255 * Threshold, 0, 255));
02555 }                              /* SetAdaptiveThreshold */
02556 
02557 /*---------------------------------------------------------------------------*/
/**
 * Debug helper: matches Blob against the requested template sets
 * (pre-trained and/or adapted), records the corrected results, and then
 * re-runs the better of the two matches with full matcher debug flags so
 * the match can be inspected.
 *
 * @param Blob          blob to match
 * @param denorm        normalization/denormalization info for the blob
 * @param ClassId       unichar class id to debug
 * @param shape_id      shape index; remapped below when a shape table exists
 * @param AdaptiveOn    true to match against the adapted templates
 * @param PreTrainedOn  true to match against the built-in templates
 * @param Results       receives the corrected match results
 */
void Classify::ShowBestMatchFor(TBLOB *Blob,
                                const DENORM& denorm,
                                CLASS_ID ClassId,
                                int shape_id,
                                BOOL8 AdaptiveOn,
                                BOOL8 PreTrainedOn,
                                ADAPT_RESULTS *Results) {
  int NumCNFeatures = 0, NumBLFeatures = 0;
  INT_FEATURE_ARRAY CNFeatures, BLFeatures;
  INT_RESULT_STRUCT CNResult, BLResult;
  inT32 BlobLength;
  uinT32 ConfigMask;
  // Persists across calls so repeated invocations step through the adapted
  // configs one at a time; reset whenever pre-trained matching is on.
  static int next_config = -1;

  if (PreTrainedOn) next_config = -1;

  // 2.0 is worse than any produced rating, so a branch that never runs
  // loses the BLResult-vs-CNResult comparison below.
  CNResult.Rating = BLResult.Rating = 2.0;

  if (!LegalClassId (ClassId)) {
    cprintf ("%d is not a legal class id!!\n", ClassId);
    return;
  }

  uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
  uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];

  // Without a shape table, pre-trained class indices are unichar class ids;
  // with one, map the shape index back to a classifier class index.
  if (shape_table_ == NULL)
    shape_id = ClassId;
  else
    shape_id = ShapeIDToClassID(shape_id);
  // --- Match against the built-in (char-norm) templates. ---
  if (PreTrainedOn && shape_id >= 0) {
    if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
      tprintf("No built-in templates for class/shape %d\n", shape_id);
    } else {
      NumCNFeatures = GetCharNormFeatures(Blob, denorm, PreTrainedTemplates,
                                          CNFeatures, NULL, CNAdjust,
                                          &BlobLength, NULL);
      if (NumCNFeatures <= 0) {
        tprintf("Illegal blob (char norm features)!\n");
      } else {
        im_.SetCharNormMatch(classify_integer_matcher_multiplier);
        im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
                  AllProtosOn, AllConfigsOn,
                  NumCNFeatures, CNFeatures,
                  &CNResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        ExpandShapesAndApplyCorrections(NULL, false, shape_id,
                                        Blob->bounding_box().bottom(),
                                        Blob->bounding_box().top(),
                                        0, BlobLength, CNAdjust,
                                        CNResult, Results);
      }
    }
  }

  // --- Match against the adapted (baseline) templates. ---
  if (AdaptiveOn) {
    if (ClassId < 0 || ClassId >= AdaptedTemplates->Templates->NumClasses) {
      tprintf("Invalid adapted class id: %d\n", ClassId);
    } else if (UnusedClassIdIn(AdaptedTemplates->Templates, ClassId) ||
               AdaptedTemplates->Class[ClassId] == NULL ||
               IsEmptyAdaptedClass(AdaptedTemplates->Class[ClassId])) {
      tprintf("No AD templates for class %d = %s\n",
              ClassId, unicharset.id_to_unichar(ClassId));
    } else {
      NumBLFeatures = GetBaselineFeatures(Blob,
                                          denorm,
                                          AdaptedTemplates->Templates,
                                          BLFeatures, BLAdjust,
                                          &BlobLength);
      if (NumBLFeatures <= 0)
        tprintf("Illegal blob (baseline features)!\n");
      else {
        im_.SetBaseLineMatch();
        im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
                  AllProtosOn, AllConfigsOn,
                  NumBLFeatures, BLFeatures,
                  &BLResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        // NOTE(review): this passes CNAdjust, not BLAdjust, for the
        // baseline result — looks intentional to mirror the call below,
        // but worth confirming.
        ExpandShapesAndApplyCorrections(
            AdaptedTemplates->Class, false,
            ClassId, Blob->bounding_box().bottom(),
            Blob->bounding_box().top(), 0, BlobLength, CNAdjust,
            BLResult, Results);
      }
    }
  }

  // --- Re-run the better match with debugging enabled. ---
  tprintf("\n");
  if (BLResult.Rating < CNResult.Rating) {
    // Step through adapted configs one per call: first call debugs the
    // best config, subsequent calls debug configs 0, 1, 2, ...
    if (next_config < 0) {
      ConfigMask = 1 << BLResult.Config;
      next_config = 0;
    } else {
      ConfigMask = 1 << next_config;
      ++next_config;
    }
    classify_norm_method.set_value(baseline);

    im_.SetBaseLineMatch();
    tprintf("Adaptive Class ID: %d\n", ClassId);
    im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
              AllProtosOn, (BIT_VECTOR) &ConfigMask,
              NumBLFeatures, BLFeatures,
              &BLResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
    ExpandShapesAndApplyCorrections(
        AdaptedTemplates->Class, true,
        ClassId, Blob->bounding_box().bottom(),
        Blob->bounding_box().top(), 0, BlobLength, CNAdjust,
        BLResult, Results);
  } else if (shape_id >= 0) {
    // Debug only the winning pre-trained config.
    ConfigMask = 1 << CNResult.Config;
    classify_norm_method.set_value(character);

    tprintf("Static Shape ID: %d\n", shape_id);
    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    im_.Match(ClassForClassId (PreTrainedTemplates, shape_id),
              AllProtosOn, (BIT_VECTOR) & ConfigMask,
              NumCNFeatures, CNFeatures,
              &CNResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
    ExpandShapesAndApplyCorrections(NULL, true, shape_id,
                                    Blob->bounding_box().bottom(),
                                    Blob->bounding_box().top(),
                                    0, BlobLength, CNAdjust,
                                    CNResult, Results);
  }

  // Clean up.
  delete[] CNAdjust;
  delete[] BLAdjust;
}                              /* ShowBestMatchFor */
02718 
02719 // Returns a string for the classifier class_id: either the corresponding
02720 // unicharset debug_str or the shape_table_ debug str.
02721 STRING Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
02722                                    int class_id, int config_id) const {
02723   STRING class_string;
02724   if (templates == PreTrainedTemplates && shape_table_ != NULL) {
02725     int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
02726     class_string = shape_table_->DebugStr(shape_id);
02727   } else {
02728     class_string = unicharset.debug_str(class_id);
02729   }
02730   return class_string;
02731 }
02732 
02733 // Converts a classifier class_id index to a shape_table_ index
02734 int Classify::ClassAndConfigIDToFontOrShapeID(int class_id,
02735                                               int int_result_config) const {
02736   int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
02737   // Older inttemps have no font_ids.
02738   if (font_set_id < 0)
02739     return kBlankFontinfoId;
02740   const FontSet &fs = fontset_table_.get(font_set_id);
02741   ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
02742   return fs.configs[int_result_config];
02743 }
02744 
02745 // Converts a shape_table_ index to a classifier class_id index (not a
02746 // unichar-id!). Uses a search, so not fast.
02747 int Classify::ShapeIDToClassID(int shape_id) const {
02748   for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
02749     int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
02750     ASSERT_HOST(font_set_id >= 0);
02751     const FontSet &fs = fontset_table_.get(font_set_id);
02752     for (int config = 0; config < fs.size; ++config) {
02753       if (fs.configs[config] == shape_id)
02754         return id;
02755     }
02756   }
02757   tprintf("Shape %d not found\n", shape_id);
02758   return -1;
02759 }
02760 
02761 // Returns true if the given TEMP_CONFIG is good enough to make it
02762 // a permanent config.
02763 bool Classify::TempConfigReliable(CLASS_ID class_id,
02764                                   const TEMP_CONFIG &config) {
02765   if (classify_learning_debug_level >= 1) {
02766     tprintf("NumTimesSeen for config of %s is %d\n",
02767             getDict().getUnicharset().debug_str(class_id).string(),
02768             config->NumTimesSeen);
02769   }
02770   if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
02771     return true;
02772   } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
02773     return false;
02774   } else if (use_ambigs_for_adaption) {
02775     // Go through the ambigs vector and see whether we have already seen
02776     // enough times all the characters represented by the ambigs vector.
02777     const UnicharIdVector *ambigs =
02778       getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
02779     int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
02780     for (int ambig = 0; ambig < ambigs_size; ++ambig) {
02781       ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
02782       assert(ambig_class != NULL);
02783       if (ambig_class->NumPermConfigs == 0 &&
02784           ambig_class->MaxNumTimesSeen <
02785           matcher_min_examples_for_prototyping) {
02786         if (classify_learning_debug_level >= 1) {
02787           tprintf("Ambig %s has not been seen enough times,"
02788                   " not making config for %s permanent\n",
02789                   getDict().getUnicharset().debug_str(
02790                       (*ambigs)[ambig]).string(),
02791                   getDict().getUnicharset().debug_str(class_id).string());
02792         }
02793         return false;
02794       }
02795     }
02796   }
02797   return true;
02798 }
02799 
02800 void Classify::UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm,
02801                                  TBLOB *Blob) {
02802   const UnicharIdVector *ambigs =
02803     getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
02804   int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
02805   if (classify_learning_debug_level >= 1) {
02806     tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
02807             getDict().getUnicharset().debug_str(class_id).string(), class_id);
02808   }
02809   for (int ambig = 0; ambig < ambigs_size; ++ambig) {
02810     CLASS_ID ambig_class_id = (*ambigs)[ambig];
02811     const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
02812     for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
02813       if (ConfigIsPermanent(ambigs_class, cfg)) continue;
02814       const TEMP_CONFIG config =
02815         TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
02816       if (config != NULL && TempConfigReliable(ambig_class_id, config)) {
02817         if (classify_learning_debug_level >= 1) {
02818           tprintf("Making config %d of %s permanent\n", cfg,
02819                   getDict().getUnicharset().debug_str(
02820                       ambig_class_id).string());
02821         }
02822         MakePermanent(AdaptedTemplates, ambig_class_id, cfg, denorm, Blob);
02823       }
02824     }
02825   }
02826 }
02827 
02828 }  // namespace tesseract