Tesseract  3.02
tesseract-ocr/classify/classify.h
Go to the documentation of this file.
00001 
00002 // File:        classify.h
00003 // Description: classify class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
00020 #define TESSERACT_CLASSIFY_CLASSIFY_H__
00021 
00022 #include "adaptive.h"
00023 #include "ccstruct.h"
00024 #include "classify.h"
00025 #include "dict.h"
00026 #include "featdefs.h"
00027 #include "fontinfo.h"
00028 #include "intfx.h"
00029 #include "intmatcher.h"
00030 #include "normalis.h"
00031 #include "ratngs.h"
00032 #include "ocrfeatures.h"
00033 #include "unicity_table.h"
00034 
// Forward declarations. Within this header these types are used only through
// pointers or references (e.g. WERD_RES*, const WERD_CHOICE&, ScrollView**),
// so their complete definitions are not needed at this point.
00035 class ScrollView;
00036 class WERD_CHOICE;
00037 class WERD_RES;
00038 struct ADAPT_RESULTS;
00039 struct NORM_PROTOS;
00040 
// Negative sentinel values for font-info ids.
// NOTE(review): judging by the names, -1 presumably means "font not known" and
// -2 "blank / no font info", as opposed to a valid table index -- confirm
// against the code that produces and consumes these values.
// Both are declared static const at global scope (outside namespace
// tesseract), so they have internal linkage in every including file.
00041 static const int kUnknownFontinfoId = -1;
00042 static const int kBlankFontinfoId = -2;
00043 
00044 namespace tesseract {
00045 
// Forward declarations. In this header ShapeRating appears only as the element
// type of a GenericVector passed by pointer, and ShapeTable only as a pointer.
00046 struct ShapeRating;
00047 class ShapeTable;
00048 
00049 // Describes how a blob has been segmented. In this enum, "character" is
00050 // shorthand for "classifiable unit", which is more accurate but too long.
      // Passed to Classify::LearnPieces to state the segmentation explicitly.
00051 enum CharSegmentationType {
00052   CST_FRAGMENT,  // A partial character.
00053   CST_WHOLE,     // A correctly segmented character.
00054   CST_IMPROPER,  // More than one but fewer than two characters.
00055   CST_NGRAM      // Multiple characters.
00056 };
00057 
// Classify: the classifier class declared by this header. It derives publicly
// from CCStruct and owns a Dict (dict_). The member functions are grouped
// under banner comments that name a .cpp file (presumably the file in which
// that group is implemented -- confirm). They are followed by the tunable
// parameters, declared through the *_VAR_H macros, and by the classifier
// state (templates, bit-vector masks, font tables, statistics counters).
00058 class Classify : public CCStruct {
00059  public:
00060   Classify();
00061   virtual ~Classify();
        // Returns a mutable reference to the dictionary held by this object.
00062   Dict& getDict() {
00063     return dict_;
00064   }
00065 
        // Read-only access to shape_table_ (documented at its declaration in
        // the protected section of this class).
00066   const ShapeTable* shape_table() const {
00067     return shape_table_;
00068   }
00069 
00070   /* adaptive.cpp ************************************************************/
00071   ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
00072   int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
00073   // Runs the class pruner from int_templates on the given features, returning
00074   // the number of classes output in results.
00075   //    int_templates          Class pruner tables
00076   //    num_features           Number of features in blob
00077   //    features               Array of features
00078   //    normalization_factors  (input) Array of int_templates->NumClasses fudge
00079   //                           factors from blob normalization process.
00080   //                           (Indexed by CLASS_INDEX)
00081   //    expected_num_features  (input) Array of int_templates->NumClasses
00082   //                           expected number of features for each class.
00083   //                           (Indexed by CLASS_INDEX)
00084   //    results                (output) Sorted Array of pruned classes.
00085   //                           Array must be sized to take the maximum possible
00086   //                           number of outputs : int_templates->NumClasses.
00087   int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates,
00088                    int num_features,
00089                    const INT_FEATURE_STRUCT* features,
00090                    const uinT8* normalization_factors,
00091                    const uinT16* expected_num_features,
00092                    CP_RESULT_STRUCT* results);
00093   void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
00094                       CLASS_CUTOFF_ARRAY Cutoffs);
00095   void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00096   void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00097   ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
00098   /* normmatch.cpp ************************************************************/
00099   FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
00100                            const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
00101   void FreeNormProtos();
00102   NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
00103   /* protos.cpp ***************************************************************/
00104   void ReadClassFile();
00105   void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
00106   INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
00107                                    const UNICHARSET& target_unicharset);
00108   /* adaptmatch.cpp ***********************************************************/
00109 
00110   // Learn the given word using its chopped_word, seam_array, denorm,
00111   // box_word, best_state, and correct_text to learn both correctly and
00112   // incorrectly segmented blobs. If filename is not NULL, then LearnBlob
00113   // is called and the data will be written to a file for static training.
00114   // Otherwise AdaptToBlob is called for adaptation within a document.
00115   // If rejmap is not NULL, then only chars with a rejmap entry of '1' will
00116   // be learned, otherwise all chars with good correct_text are learned.
00117   void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
00118 
00119   // Builds a blob of length fragments, from the word, starting at start,
00120   // and then learn it, as having the given correct_text.
00121   // If filename is not NULL, then LearnBlob
00122   // is called and the data will be written to a file for static training.
00123   // Otherwise AdaptToBlob is called for adaptation within a document.
00124   // threshold is a magic number required by AdaptToChar and generated by
00125   // GetAdaptThresholds.
00126   // Although it can be partly inferred from the string, segmentation is
00127   // provided to explicitly clarify the character segmentation.
00128   void LearnPieces(const char* filename, int start, int length,
00129                    float threshold, CharSegmentationType segmentation,
00130                    const char* correct_text, WERD_RES *word);
00131   void InitAdaptiveClassifier(bool load_pre_trained_templates);
00132   void InitAdaptedClass(TBLOB *Blob,
00133                         const DENORM& denorm,
00134                         CLASS_ID ClassId,
00135                         int FontinfoId,
00136                         ADAPT_CLASS Class,
00137                         ADAPT_TEMPLATES Templates);
00138   void AdaptToPunc(TBLOB *Blob,
00139                    const DENORM& denorm,
00140                    CLASS_ID ClassId,
00141                    int FontinfoId,
00142                    FLOAT32 Threshold);
00143   void AmbigClassifier(TBLOB *Blob,
00144                        const DENORM& denorm,
00145                        INT_TEMPLATES Templates,
00146                        ADAPT_CLASS *Classes,
00147                        UNICHAR_ID *Ambiguities,
00148                        ADAPT_RESULTS *Results);
00149   void MasterMatcher(INT_TEMPLATES templates,
00150                      inT16 num_features,
00151                      const INT_FEATURE_STRUCT* features,
00152                      const uinT8* norm_factors,
00153                      ADAPT_CLASS* classes,
00154                      int debug,
00155                      int num_classes,
00156                      const TBOX& blob_box,
00157                      CLASS_PRUNER_RESULTS results,
00158                      ADAPT_RESULTS* final_results);
00159   // Converts configs to fonts, and if the result is not adapted, and a
00160   // shape_table_ is present, the shape is expanded to include all
00161   // unichar_ids represented, before applying a set of corrections to the
00162   // distance rating in int_result, (see ComputeCorrectedRating.)
00163   // The results are added to the final_results output.
00164   void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
00165                                        bool debug,
00166                                        int class_id,
00167                                        int bottom, int top,
00168                                        float cp_rating,
00169                                        int blob_length,
00170                                        const uinT8* cn_factors,
00171                                        INT_RESULT_STRUCT& int_result,
00172                                        ADAPT_RESULTS* final_results);
00173   // Applies a set of corrections to the distance im_rating,
00174   // including the cn_correction, miss penalty and additional penalty
00175   // for non-alnums being vertical misfits. Returns the corrected distance.
00176   double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
00177                                 double im_rating, int feature_misses,
00178                                 int bottom, int top,
00179                                 int blob_length, const uinT8* cn_factors);
00180   void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
00181                                ADAPT_RESULTS *Results,
00182                                BLOB_CHOICE_LIST *Choices);
00183   void AddNewResult(ADAPT_RESULTS *results,
00184                     CLASS_ID class_id,
00185                     int shape_id,
00186                     FLOAT32 rating,
00187                     bool adapted,
00188                     int config,
00189                     int fontinfo_id,
00190                     int fontinfo_id2);
00191   int GetAdaptiveFeatures(TBLOB *Blob,
00192                           INT_FEATURE_ARRAY IntFeatures,
00193                           FEATURE_SET *FloatFeatures);
00194 
        // Declared only when graphics support is compiled in.
00195 #ifndef GRAPHICS_DISABLED
00196   void DebugAdaptiveClassifier(TBLOB *Blob,
00197                                const DENORM& denorm,
00198                                ADAPT_RESULTS *Results);
00199 #endif
00200   void GetAdaptThresholds (TWERD * Word,
00201                            const DENORM& denorm,
00202                            const WERD_CHOICE& BestChoice,
00203                            const WERD_CHOICE& BestRawChoice,
00204                            FLOAT32 Thresholds[]);
00205 
00206   PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
00207                              int NumBadFeat,
00208                              FEATURE_ID BadFeat[],
00209                              INT_CLASS IClass,
00210                              ADAPT_CLASS Class,
00211                              BIT_VECTOR TempProtoMask);
00212   int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
00213                              CLASS_ID ClassId,
00214                              int FontinfoId,
00215                              int NumFeatures,
00216                              INT_FEATURE_ARRAY Features,
00217                              FEATURE_SET FloatFeatures);
00218   void MakePermanent(ADAPT_TEMPLATES Templates,
00219                      CLASS_ID ClassId,
00220                      int ConfigId,
00221                      const DENORM& denorm,
00222                      TBLOB *Blob);
00223   void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
00224   void RemoveExtraPuncs(ADAPT_RESULTS *Results);
00225   void RemoveBadMatches(ADAPT_RESULTS *Results);
00226   void SetAdaptiveThreshold(FLOAT32 Threshold);
00227   void ShowBestMatchFor(TBLOB *Blob,
00228                         const DENORM& denorm,
00229                         CLASS_ID ClassId,
00230                         int shape_id,
00231                         BOOL8 AdaptiveOn,
00232                         BOOL8 PreTrainedOn,
00233                         ADAPT_RESULTS *Results);
00234   // Returns a string for the classifier class_id: either the corresponding
00235   // unicharset debug_str or the shape_table_ debug str.
00236   STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
00237                            int class_id, int config_id) const;
00238   // Converts a classifier class_id index with a config ID to:
00239   // shape_table_ present: a shape_table_ index OR
00240   // No shape_table_: a font ID.
00241   // Without shape training, each class_id, config pair represents a single
00242   // unichar id/font combination, so this function looks up the corresponding
00243   // font id.
00244   // With shape training, each class_id, config pair represents a single
00245   // shape table index, so the fontset_table stores the shape table index,
00246   // and the shape_table_ must be consulted to obtain the actual unichar_id/
00247   // font combinations that the shape represents.
00248   int ClassAndConfigIDToFontOrShapeID(int class_id,
00249                                       int int_result_config) const;
00250   // Converts a shape_table_ index to a classifier class_id index (not a
00251   // unichar-id!). Uses a search, so not fast.
00252   int ShapeIDToClassID(int shape_id) const;
00253   UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
00254                                  const DENORM& denorm,
00255                                  ADAPT_TEMPLATES Templates,
00256                                  ADAPT_RESULTS *Results);
00257   int CharNormClassifier(TBLOB *Blob,
00258                          const DENORM& denorm,
00259                          INT_TEMPLATES Templates,
00260                          ADAPT_RESULTS *Results);
00261 
00262   // As CharNormClassifier, but operates on a TrainingSample and outputs to
00263   // a GenericVector of ShapeRating without conversion to classes.
00264   int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample,
00265                              GenericVector<ShapeRating>* results);
00266   UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
00267                              const DENORM& denorm,
00268                              CLASS_ID CorrectClass);
00269   void DoAdaptiveMatch(TBLOB *Blob,
00270                        const DENORM& denorm,
00271                        ADAPT_RESULTS *Results);
00272   void AdaptToChar(TBLOB *Blob,
00273                    const DENORM& denorm,
00274                    CLASS_ID ClassId,
00275                    int FontinfoId,
00276                    FLOAT32 Threshold);
00277   void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm,
00278                           INT_CLASS_STRUCT* int_class);
00279   int AdaptableWord(TWERD *Word,
00280                   const WERD_CHOICE &BestChoiceWord,
00281                   const WERD_CHOICE &RawChoiceWord);
00282   void EndAdaptiveClassifier();
00283   void PrintAdaptiveStatistics(FILE *File);
00284   void SettupPass1();
00285   void SettupPass2();
00286   void AdaptiveClassifier(TBLOB *Blob,
00287                           const DENORM& denorm,
00288                           BLOB_CHOICE_LIST *Choices,
00289                           CLASS_PRUNER_RESULTS cp_results);
00290   void ClassifyAsNoise(ADAPT_RESULTS *Results);
00291   void ResetAdaptiveClassifierInternal();
00292 
00293   int GetBaselineFeatures(TBLOB *Blob,
00294                           const DENORM& denorm,
00295                           INT_TEMPLATES Templates,
00296                           INT_FEATURE_ARRAY IntFeatures,
00297                           uinT8* CharNormArray,
00298                           inT32 *BlobLength);
00299   int GetCharNormFeatures(TBLOB *Blob,
00300                           const DENORM& denorm,
00301                           INT_TEMPLATES Templates,
00302                           INT_FEATURE_ARRAY IntFeatures,
00303                           uinT8* PrunerNormArray,
00304                           uinT8* CharNormArray,
00305                           inT32 *BlobLength,
00306                           inT32 *FeatureOutlineIndex);
00307   // Computes the char_norm_array for the unicharset and, if not NULL, the
00308   // pruner_array as appropriate according to the existence of the shape_table.
00309   // The norm_feature is deleted as it is almost certainly no longer needed.
00310   void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
00311                              INT_TEMPLATES_STRUCT* templates,
00312                              uinT8* char_norm_array,
00313                              uinT8* pruner_array);
00314 
00315   bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
00316   void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob);
00317 
00318   void ResetFeaturesHaveBeenExtracted();
        // Reports the classifier as full as soon as the NumAdaptationsFailed
        // counter is positive, i.e. after at least one recorded failure.
00319   bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
00320   bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob);
00321   void RefreshDebugWindow(ScrollView **win, const char *msg,
00322                           int y_offset, const TBOX &wbox);
00323   /* float2int.cpp ************************************************************/
00324   void ClearCharNormArray(uinT8* char_norm_array);
00325   void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
00326                                uinT8* char_norm_array);
00327   void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
00328   /* intproto.cpp *************************************************************/
00329   INT_TEMPLATES ReadIntTemplates(FILE *File);
00330   void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
00331                          const UNICHARSET& target_unicharset);
00332   CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
00333                            bool* pretrained_on, int* shape_id);
00334   void ShowMatchDisplay();
00335   /* font detection ***********************************************************/
        // Mutable access to the two font tables declared further down in this
        // class (fontinfo_table_ and fontset_table_).
00336   UnicityTable<FontInfo>& get_fontinfo_table() {
00337     return fontinfo_table_;
00338   }
00339   UnicityTable<FontSet>& get_fontset_table() {
00340     return fontset_table_;
00341   }
00342   /* mfoutline.cpp ***********************************************************/
00343   void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
00344   /* outfeat.cpp ***********************************************************/
00345   FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
00346   /* picofeat.cpp ***********************************************************/
00347   FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
00348 
00349 
00350   // Member variables.
00351 
00352   // Parameters.
        // NOTE(review): the BOOL_VAR_H / INT_VAR_H / double_VAR_H / STRING_VAR_H
        // macros are not defined in this file. Each use presumably declares one
        // member parameter from (name, default value, description) -- confirm
        // against the macro definitions.
00353   BOOL_VAR_H(prioritize_division, FALSE,
00354              "Prioritize blob division over chopping");
        // NOTE(review): declared with INT_VAR_H but given the default FALSE.
00355   INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
00356   BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
00357   INT_VAR_H(classify_debug_level, 0, "Classify debug level");
00358 
00359   /* mfoutline.cpp ***********************************************************/
00360   /* control knobs used to control normalization of outlines */
        // NOTE(review): `character` is not declared in this file; presumably a
        // normalization-method constant from an included header -- confirm.
00361   INT_VAR_H(classify_norm_method, character, "Normalization Method   ...");
00362   double_VAR_H(classify_char_norm_range, 0.2,
00363              "Character Normalization Range ...");
00364   double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
00365   double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
00366   double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
00367   double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
00368 
00369   /* adaptmatch.cpp ***********************************************************/
00370   BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
00371   BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
00372   BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
00373   BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
00374              "Use pre-adapted classifier templates");
00375   BOOL_VAR_H(classify_save_adapted_templates, 0,
00376              "Save adapted templates to a file");
00377   BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
00378   INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
00379   INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
00380   INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
00381   double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
00382   double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
00383   double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
00384   double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
00385   double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
00386   double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
00387   INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
00388   INT_VAR_H(matcher_min_examples_for_prototyping, 3,
00389             "Reliable Config Threshold");
00390   INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
00391             "Enable adaption even if the ambiguities have not been seen");
00392   double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
00393                "Maximum angle delta for prototype clustering");
00394   double_VAR_H(classify_misfit_junk_penalty, 0.0,
00395                "Penalty to apply when a non-alnum is vertically out of "
00396                "its expected textline position");
00397   double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
00398   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
00399   double_VAR_H(tessedit_class_miss_scale, 0.00390625,
00400                "Scale factor for features not used");
00401   INT_VAR_H(classify_adapt_proto_threshold, 230,
00402             "Threshold for good protos during adaptive 0-255");
00403   INT_VAR_H(classify_adapt_feature_threshold, 230,
00404             "Threshold for good features during adaptive 0-255");
00405   BOOL_VAR_H(disable_character_fragments, TRUE,
00406              "Do not include character fragments in the"
00407              " results of the classifier");
00408   double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
00409                "Exclude fragments that do not match any whole character"
00410                " with at least this certainty");
00411   BOOL_VAR_H(classify_debug_character_fragments, FALSE,
00412              "Bring up graphical debugging windows for fragments training");
00413   BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
00414              "Use two different windows for debugging the matching: "
00415              "One for the protos and one for the features.");
00416   STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
00417 
00418   /* intmatcher.cpp **********************************************************/
00419   INT_VAR_H(classify_class_pruner_threshold, 229,
00420             "Class Pruner Threshold 0-255");
00421   INT_VAR_H(classify_class_pruner_multiplier, 30,
00422             "Class Pruner Multiplier 0-255:       ");
00423   INT_VAR_H(classify_cp_cutoff_strength, 7,
00424             "Class Pruner CutoffStrength:         ");
00425   INT_VAR_H(classify_integer_matcher_multiplier, 14,
00426             "Integer Matcher Multiplier  0-255:   ");
00427 
00428   // Use class variables to hold onto built-in templates and adapted templates.
00429   INT_TEMPLATES PreTrainedTemplates;
00430   ADAPT_TEMPLATES AdaptedTemplates;
00431 
00432   // Create dummy proto and config masks for use with the built-in templates.
00433   BIT_VECTOR AllProtosOn;
00434   BIT_VECTOR PrunedProtos;
00435   BIT_VECTOR AllConfigsOn;
00436   BIT_VECTOR AllProtosOff;
00437   BIT_VECTOR AllConfigsOff;
00438   BIT_VECTOR TempProtoMask;
00439   bool EnableLearning;
00440   /* normmatch.cpp */
00441   NORM_PROTOS *NormProtos;
00442   /* font detection ***********************************************************/
00443   UnicityTable<FontInfo> fontinfo_table_;
00444   // Without shape training, each class_id, config pair represents a single
00445   // unichar id/font combination, so each fontset_table_ entry holds font ids
00446   // for each config in the class.
00447   // With shape training, each class_id, config pair represents a single
00448   // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
00449   // and the shape_table_ must be consulted to obtain the actual unichar_id/
00450   // font combinations that the shape represents.
00451   UnicityTable<FontSet> fontset_table_;
00452 
00453   INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
00454   BOOL_VAR_H(classify_bln_numeric_mode, 0,
00455              "Assume the input is numbers [0-9].");
00456 
00457  protected:
00458   IntegerMatcher im_;
00459   FEATURE_DEFS_STRUCT feature_defs_;
00460   // If a shape_table_ is present, it is used to remap classifier output in
00461   // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
00462   // mean an index to the shape_table_ and the choices returned are *all* the
00463   // shape_table_ entries at that index.
00464   ShapeTable* shape_table_;
00465 
00466  private:
00467 
        // Dictionary held by value; exposed through getDict().
00468   Dict dict_;
00469 
00470   /* variables used to hold performance statistics */
00471   int AdaptiveMatcherCalls;
00472   int BaselineClassifierCalls;
00473   int CharNormClassifierCalls;
00474   int AmbigClassifierCalls;
00475   int NumWordsAdaptedTo;
00476   int NumCharsAdaptedTo;
00477   int NumBaselineClassesTried;
00478   int NumCharNormClassesTried;
00479   int NumAmbigClassesTried;
00480   int NumClassesOutput;
        // Read by AdaptiveClassifierIsFull(): any positive value means "full".
00481   int NumAdaptationsFailed;
00482 
00483   /* variables used to hold onto extracted features.  This is used
00484   to map from the old scheme in which baseline features and char norm
00485   features are extracted separately, to the new scheme in which they
00486   are extracted at the same time. */
00487   bool FeaturesHaveBeenExtracted;
00488   bool FeaturesOK;
00489   INT_FEATURE_ARRAY BaselineFeatures;
00490   INT_FEATURE_ARRAY CharNormFeatures;
00491   INT_FX_RESULT_STRUCT FXInfo;
00492 
00493   // Expected number of features in the class pruner, used to penalize
00494   // unknowns that have too few features (like a c being classified as e) so
00495   // it doesn't recognize everything as '@' or '#'.
00496   // CharNormCutoffs is for the static classifier (with no shapetable).
00497   // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
00498   // value in the adaptive classifier. Both are indexed by unichar_id.
00499   // shapetable_cutoffs_ provides a similar value for each shape in the
00500   // shape_table_.
00501   uinT16* CharNormCutoffs;
00502   uinT16* BaselineCutoffs;
00503   GenericVector<uinT16> shapetable_cutoffs_;
        // Debug windows; passed by address to RefreshDebugWindow-style helpers.
        // NOTE(review): creation and ownership are not visible in this header.
00504   ScrollView* learn_debug_win_;
00505   ScrollView* learn_fragmented_word_debug_win_;
00506   ScrollView* learn_fragments_debug_win_;
00507 };
00508 }  // namespace tesseract
00509 
00510 #endif  // TESSERACT_CLASSIFY_CLASSIFY_H__