// Tesseract 3.02
00001 00002 // File: classify.h 00003 // Description: classify class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__ 00020 #define TESSERACT_CLASSIFY_CLASSIFY_H__ 00021 00022 #include "adaptive.h" 00023 #include "ccstruct.h" 00024 #include "classify.h" 00025 #include "dict.h" 00026 #include "featdefs.h" 00027 #include "fontinfo.h" 00028 #include "intfx.h" 00029 #include "intmatcher.h" 00030 #include "normalis.h" 00031 #include "ratngs.h" 00032 #include "ocrfeatures.h" 00033 #include "unicity_table.h" 00034 00035 class ScrollView; 00036 class WERD_CHOICE; 00037 class WERD_RES; 00038 struct ADAPT_RESULTS; 00039 struct NORM_PROTOS; 00040 00041 static const int kUnknownFontinfoId = -1; 00042 static const int kBlankFontinfoId = -2; 00043 00044 namespace tesseract { 00045 00046 struct ShapeRating; 00047 class ShapeTable; 00048 00049 // How segmented is a blob. In this enum, character refers to a classifiable 00050 // unit, but that is too long and character is usually easier to understand. 00051 enum CharSegmentationType { 00052 CST_FRAGMENT, // A partial character. 00053 CST_WHOLE, // A correctly segmented character. 00054 CST_IMPROPER, // More than one but less than 2 characters. 00055 CST_NGRAM // Multiple characters. 
00056 }; 00057 00058 class Classify : public CCStruct { 00059 public: 00060 Classify(); 00061 virtual ~Classify(); 00062 Dict& getDict() { 00063 return dict_; 00064 } 00065 00066 const ShapeTable* shape_table() const { 00067 return shape_table_; 00068 } 00069 00070 /* adaptive.cpp ************************************************************/ 00071 ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset); 00072 int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId); 00073 // Runs the class pruner from int_templates on the given features, returning 00074 // the number of classes output in results. 00075 // int_templates Class pruner tables 00076 // num_features Number of features in blob 00077 // features Array of features 00078 // normalization_factors (input) Array of int_templates->NumClasses fudge 00079 // factors from blob normalization process. 00080 // (Indexed by CLASS_INDEX) 00081 // expected_num_features (input) Array of int_templates->NumClasses 00082 // expected number of features for each class. 00083 // (Indexed by CLASS_INDEX) 00084 // results (output) Sorted Array of pruned classes. 00085 // Array must be sized to take the maximum possible 00086 // number of outputs : int_templates->NumClasses. 
00087 int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, 00088 int num_features, 00089 const INT_FEATURE_STRUCT* features, 00090 const uinT8* normalization_factors, 00091 const uinT16* expected_num_features, 00092 CP_RESULT_STRUCT* results); 00093 void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, 00094 CLASS_CUTOFF_ARRAY Cutoffs); 00095 void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 00096 void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 00097 ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File); 00098 /* normmatch.cpp ************************************************************/ 00099 FLOAT32 ComputeNormMatch(CLASS_ID ClassId, 00100 const FEATURE_STRUCT& feature, BOOL8 DebugMatch); 00101 void FreeNormProtos(); 00102 NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset); 00103 /* protos.cpp ***************************************************************/ 00104 void ReadClassFile(); 00105 void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class); 00106 INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, 00107 const UNICHARSET& target_unicharset); 00108 /* adaptmatch.cpp ***********************************************************/ 00109 00110 // Learn the given word using its chopped_word, seam_array, denorm, 00111 // box_word, best_state, and correct_text to learn both correctly and 00112 // incorrectly segmented blobs. If filename is not NULL, then LearnBlob 00113 // is called and the data will be written to a file for static training. 00114 // Otherwise AdaptToBlob is called for adaption within a document. 00115 // If rejmap is not NULL, then only chars with a rejmap entry of '1' will 00116 // be learned, otherwise all chars with good correct_text are learned. 00117 void LearnWord(const char* filename, const char *rejmap, WERD_RES *word); 00118 00119 // Builds a blob of length fragments, from the word, starting at start, 00120 // and then learn it, as having the given correct_text. 
00121 // If filename is not NULL, then LearnBlob 00122 // is called and the data will be written to a file for static training. 00123 // Otherwise AdaptToBlob is called for adaption within a document. 00124 // threshold is a magic number required by AdaptToChar and generated by 00125 // GetAdaptThresholds. 00126 // Although it can be partly inferred from the string, segmentation is 00127 // provided to explicitly clarify the character segmentation. 00128 void LearnPieces(const char* filename, int start, int length, 00129 float threshold, CharSegmentationType segmentation, 00130 const char* correct_text, WERD_RES *word); 00131 void InitAdaptiveClassifier(bool load_pre_trained_templates); 00132 void InitAdaptedClass(TBLOB *Blob, 00133 const DENORM& denorm, 00134 CLASS_ID ClassId, 00135 int FontinfoId, 00136 ADAPT_CLASS Class, 00137 ADAPT_TEMPLATES Templates); 00138 void AdaptToPunc(TBLOB *Blob, 00139 const DENORM& denorm, 00140 CLASS_ID ClassId, 00141 int FontinfoId, 00142 FLOAT32 Threshold); 00143 void AmbigClassifier(TBLOB *Blob, 00144 const DENORM& denorm, 00145 INT_TEMPLATES Templates, 00146 ADAPT_CLASS *Classes, 00147 UNICHAR_ID *Ambiguities, 00148 ADAPT_RESULTS *Results); 00149 void MasterMatcher(INT_TEMPLATES templates, 00150 inT16 num_features, 00151 const INT_FEATURE_STRUCT* features, 00152 const uinT8* norm_factors, 00153 ADAPT_CLASS* classes, 00154 int debug, 00155 int num_classes, 00156 const TBOX& blob_box, 00157 CLASS_PRUNER_RESULTS results, 00158 ADAPT_RESULTS* final_results); 00159 // Converts configs to fonts, and if the result is not adapted, and a 00160 // shape_table_ is present, the shape is expanded to include all 00161 // unichar_ids represented, before applying a set of corrections to the 00162 // distance rating in int_result, (see ComputeCorrectedRating.) 00163 // The results are added to the final_results output. 
00164 void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes, 00165 bool debug, 00166 int class_id, 00167 int bottom, int top, 00168 float cp_rating, 00169 int blob_length, 00170 const uinT8* cn_factors, 00171 INT_RESULT_STRUCT& int_result, 00172 ADAPT_RESULTS* final_results); 00173 // Applies a set of corrections to the distance im_rating, 00174 // including the cn_correction, miss penalty and additional penalty 00175 // for non-alnums being vertical misfits. Returns the corrected distance. 00176 double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, 00177 double im_rating, int feature_misses, 00178 int bottom, int top, 00179 int blob_length, const uinT8* cn_factors); 00180 void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, 00181 ADAPT_RESULTS *Results, 00182 BLOB_CHOICE_LIST *Choices); 00183 void AddNewResult(ADAPT_RESULTS *results, 00184 CLASS_ID class_id, 00185 int shape_id, 00186 FLOAT32 rating, 00187 bool adapted, 00188 int config, 00189 int fontinfo_id, 00190 int fontinfo_id2); 00191 int GetAdaptiveFeatures(TBLOB *Blob, 00192 INT_FEATURE_ARRAY IntFeatures, 00193 FEATURE_SET *FloatFeatures); 00194 00195 #ifndef GRAPHICS_DISABLED 00196 void DebugAdaptiveClassifier(TBLOB *Blob, 00197 const DENORM& denorm, 00198 ADAPT_RESULTS *Results); 00199 #endif 00200 void GetAdaptThresholds (TWERD * Word, 00201 const DENORM& denorm, 00202 const WERD_CHOICE& BestChoice, 00203 const WERD_CHOICE& BestRawChoice, 00204 FLOAT32 Thresholds[]); 00205 00206 PROTO_ID MakeNewTempProtos(FEATURE_SET Features, 00207 int NumBadFeat, 00208 FEATURE_ID BadFeat[], 00209 INT_CLASS IClass, 00210 ADAPT_CLASS Class, 00211 BIT_VECTOR TempProtoMask); 00212 int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, 00213 CLASS_ID ClassId, 00214 int FontinfoId, 00215 int NumFeatures, 00216 INT_FEATURE_ARRAY Features, 00217 FEATURE_SET FloatFeatures); 00218 void MakePermanent(ADAPT_TEMPLATES Templates, 00219 CLASS_ID ClassId, 00220 int ConfigId, 00221 const DENORM& 
denorm, 00222 TBLOB *Blob); 00223 void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results); 00224 void RemoveExtraPuncs(ADAPT_RESULTS *Results); 00225 void RemoveBadMatches(ADAPT_RESULTS *Results); 00226 void SetAdaptiveThreshold(FLOAT32 Threshold); 00227 void ShowBestMatchFor(TBLOB *Blob, 00228 const DENORM& denorm, 00229 CLASS_ID ClassId, 00230 int shape_id, 00231 BOOL8 AdaptiveOn, 00232 BOOL8 PreTrainedOn, 00233 ADAPT_RESULTS *Results); 00234 // Returns a string for the classifier class_id: either the corresponding 00235 // unicharset debug_str or the shape_table_ debug str. 00236 STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates, 00237 int class_id, int config_id) const; 00238 // Converts a classifier class_id index with a config ID to: 00239 // shape_table_ present: a shape_table_ index OR 00240 // No shape_table_: a font ID. 00241 // Without shape training, each class_id, config pair represents a single 00242 // unichar id/font combination, so this function looks up the corresponding 00243 // font id. 00244 // With shape training, each class_id, config pair represents a single 00245 // shape table index, so the fontset_table stores the shape table index, 00246 // and the shape_table_ must be consulted to obtain the actual unichar_id/ 00247 // font combinations that the shape represents. 00248 int ClassAndConfigIDToFontOrShapeID(int class_id, 00249 int int_result_config) const; 00250 // Converts a shape_table_ index to a classifier class_id index (not a 00251 // unichar-id!). Uses a search, so not fast. 
00252 int ShapeIDToClassID(int shape_id) const; 00253 UNICHAR_ID *BaselineClassifier(TBLOB *Blob, 00254 const DENORM& denorm, 00255 ADAPT_TEMPLATES Templates, 00256 ADAPT_RESULTS *Results); 00257 int CharNormClassifier(TBLOB *Blob, 00258 const DENORM& denorm, 00259 INT_TEMPLATES Templates, 00260 ADAPT_RESULTS *Results); 00261 00262 // As CharNormClassifier, but operates on a TrainingSample and outputs to 00263 // a GenericVector of ShapeRating without conversion to classes. 00264 int CharNormTrainingSample(bool pruner_only, const TrainingSample& sample, 00265 GenericVector<ShapeRating>* results); 00266 UNICHAR_ID *GetAmbiguities(TBLOB *Blob, 00267 const DENORM& denorm, 00268 CLASS_ID CorrectClass); 00269 void DoAdaptiveMatch(TBLOB *Blob, 00270 const DENORM& denorm, 00271 ADAPT_RESULTS *Results); 00272 void AdaptToChar(TBLOB *Blob, 00273 const DENORM& denorm, 00274 CLASS_ID ClassId, 00275 int FontinfoId, 00276 FLOAT32 Threshold); 00277 void DisplayAdaptedChar(TBLOB* blob, const DENORM& denorm, 00278 INT_CLASS_STRUCT* int_class); 00279 int AdaptableWord(TWERD *Word, 00280 const WERD_CHOICE &BestChoiceWord, 00281 const WERD_CHOICE &RawChoiceWord); 00282 void EndAdaptiveClassifier(); 00283 void PrintAdaptiveStatistics(FILE *File); 00284 void SettupPass1(); 00285 void SettupPass2(); 00286 void AdaptiveClassifier(TBLOB *Blob, 00287 const DENORM& denorm, 00288 BLOB_CHOICE_LIST *Choices, 00289 CLASS_PRUNER_RESULTS cp_results); 00290 void ClassifyAsNoise(ADAPT_RESULTS *Results); 00291 void ResetAdaptiveClassifierInternal(); 00292 00293 int GetBaselineFeatures(TBLOB *Blob, 00294 const DENORM& denorm, 00295 INT_TEMPLATES Templates, 00296 INT_FEATURE_ARRAY IntFeatures, 00297 uinT8* CharNormArray, 00298 inT32 *BlobLength); 00299 int GetCharNormFeatures(TBLOB *Blob, 00300 const DENORM& denorm, 00301 INT_TEMPLATES Templates, 00302 INT_FEATURE_ARRAY IntFeatures, 00303 uinT8* PrunerNormArray, 00304 uinT8* CharNormArray, 00305 inT32 *BlobLength, 00306 inT32 *FeatureOutlineIndex); 
00307 // Computes the char_norm_array for the unicharset and, if not NULL, the 00308 // pruner_array as appropriate according to the existence of the shape_table. 00309 // The norm_feature is deleted as it is almost certainly no longer needed. 00310 void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature, 00311 INT_TEMPLATES_STRUCT* templates, 00312 uinT8* char_norm_array, 00313 uinT8* pruner_array); 00314 00315 bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config); 00316 void UpdateAmbigsGroup(CLASS_ID class_id, const DENORM& denorm, TBLOB *Blob); 00317 00318 void ResetFeaturesHaveBeenExtracted(); 00319 bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; } 00320 bool LooksLikeGarbage(const DENORM& denorm, TBLOB *blob); 00321 void RefreshDebugWindow(ScrollView **win, const char *msg, 00322 int y_offset, const TBOX &wbox); 00323 /* float2int.cpp ************************************************************/ 00324 void ClearCharNormArray(uinT8* char_norm_array); 00325 void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature, 00326 uinT8* char_norm_array); 00327 void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures); 00328 /* intproto.cpp *************************************************************/ 00329 INT_TEMPLATES ReadIntTemplates(FILE *File); 00330 void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, 00331 const UNICHARSET& target_unicharset); 00332 CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on, 00333 bool* pretrained_on, int* shape_id); 00334 void ShowMatchDisplay(); 00335 /* font detection ***********************************************************/ 00336 UnicityTable<FontInfo>& get_fontinfo_table() { 00337 return fontinfo_table_; 00338 } 00339 UnicityTable<FontSet>& get_fontset_table() { 00340 return fontset_table_; 00341 } 00342 /* mfoutline.cpp ***********************************************************/ 00343 void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 
*YScale); 00344 /* outfeat.cpp ***********************************************************/ 00345 FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob); 00346 /* picofeat.cpp ***********************************************************/ 00347 FEATURE_SET ExtractPicoFeatures(TBLOB *Blob); 00348 00349 00350 // Member variables. 00351 00352 // Parameters. 00353 BOOL_VAR_H(prioritize_division, FALSE, 00354 "Prioritize blob division over chopping"); 00355 INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP"); 00356 BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier"); 00357 INT_VAR_H(classify_debug_level, 0, "Classify debug level"); 00358 00359 /* mfoutline.cpp ***********************************************************/ 00360 /* control knobs used to control normalization of outlines */ 00361 INT_VAR_H(classify_norm_method, character, "Normalization Method ..."); 00362 double_VAR_H(classify_char_norm_range, 0.2, 00363 "Character Normalization Range ..."); 00364 double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ..."); 00365 double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ..."); 00366 double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ..."); 00367 double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ..."); 00368 00369 /* adaptmatch.cpp ***********************************************************/ 00370 BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching"); 00371 BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching"); 00372 BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier"); 00373 BOOL_VAR_H(classify_use_pre_adapted_templates, 0, 00374 "Use pre-adapted classifier templates"); 00375 BOOL_VAR_H(classify_save_adapted_templates, 0, 00376 "Save adapted templates to a file"); 00377 BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger"); 00378 INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level"); 00379 
INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags"); 00380 INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: "); 00381 double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)"); 00382 double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)"); 00383 double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)"); 00384 double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)"); 00385 double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)"); 00386 double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: "); 00387 INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes"); 00388 INT_VAR_H(matcher_min_examples_for_prototyping, 3, 00389 "Reliable Config Threshold"); 00390 INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5, 00391 "Enable adaption even if the ambiguities have not been seen"); 00392 double_VAR_H(matcher_clustering_max_angle_delta, 0.015, 00393 "Maximum angle delta for prototype clustering"); 00394 double_VAR_H(classify_misfit_junk_penalty, 0.0, 00395 "Penalty to apply when a non-alnum is vertically out of " 00396 "its expected textline position"); 00397 double_VAR_H(rating_scale, 1.5, "Rating scaling factor"); 00398 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); 00399 double_VAR_H(tessedit_class_miss_scale, 0.00390625, 00400 "Scale factor for features not used"); 00401 INT_VAR_H(classify_adapt_proto_threshold, 230, 00402 "Threshold for good protos during adaptive 0-255"); 00403 INT_VAR_H(classify_adapt_feature_threshold, 230, 00404 "Threshold for good features during adaptive 0-255"); 00405 BOOL_VAR_H(disable_character_fragments, TRUE, 00406 "Do not include character fragments in the" 00407 " results of the classifier"); 00408 double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0, 00409 "Exclude fragments that do not match any whole character" 00410 " with at least this certainty"); 00411 
BOOL_VAR_H(classify_debug_character_fragments, FALSE, 00412 "Bring up graphical debugging windows for fragments training"); 00413 BOOL_VAR_H(matcher_debug_separate_windows, FALSE, 00414 "Use two different windows for debugging the matching: " 00415 "One for the protos and one for the features."); 00416 STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning"); 00417 00418 /* intmatcher.cpp **********************************************************/ 00419 INT_VAR_H(classify_class_pruner_threshold, 229, 00420 "Class Pruner Threshold 0-255"); 00421 INT_VAR_H(classify_class_pruner_multiplier, 30, 00422 "Class Pruner Multiplier 0-255: "); 00423 INT_VAR_H(classify_cp_cutoff_strength, 7, 00424 "Class Pruner CutoffStrength: "); 00425 INT_VAR_H(classify_integer_matcher_multiplier, 14, 00426 "Integer Matcher Multiplier 0-255: "); 00427 00428 // Use class variables to hold onto built-in templates and adapted templates. 00429 INT_TEMPLATES PreTrainedTemplates; 00430 ADAPT_TEMPLATES AdaptedTemplates; 00431 00432 // Create dummy proto and config masks for use with the built-in templates. 00433 BIT_VECTOR AllProtosOn; 00434 BIT_VECTOR PrunedProtos; 00435 BIT_VECTOR AllConfigsOn; 00436 BIT_VECTOR AllProtosOff; 00437 BIT_VECTOR AllConfigsOff; 00438 BIT_VECTOR TempProtoMask; 00439 bool EnableLearning; 00440 /* normmatch.cpp */ 00441 NORM_PROTOS *NormProtos; 00442 /* font detection ***********************************************************/ 00443 UnicityTable<FontInfo> fontinfo_table_; 00444 // Without shape training, each class_id, config pair represents a single 00445 // unichar id/font combination, so each fontset_table_ entry holds font ids 00446 // for each config in the class. 
00447 // With shape training, each class_id, config pair represents a single 00448 // shape_table_ index, so the fontset_table_ stores the shape_table_ index, 00449 // and the shape_table_ must be consulted to obtain the actual unichar_id/ 00450 // font combinations that the shape represents. 00451 UnicityTable<FontSet> fontset_table_; 00452 00453 INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word"); 00454 BOOL_VAR_H(classify_bln_numeric_mode, 0, 00455 "Assume the input is numbers [0-9]."); 00456 00457 protected: 00458 IntegerMatcher im_; 00459 FEATURE_DEFS_STRUCT feature_defs_; 00460 // If a shape_table_ is present, it is used to remap classifier output in 00461 // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually 00462 // mean an index to the shape_table_ and the choices returned are *all* the 00463 // shape_table_ entries at that index. 00464 ShapeTable* shape_table_; 00465 00466 private: 00467 00468 Dict dict_; 00469 00470 /* variables used to hold performance statistics */ 00471 int AdaptiveMatcherCalls; 00472 int BaselineClassifierCalls; 00473 int CharNormClassifierCalls; 00474 int AmbigClassifierCalls; 00475 int NumWordsAdaptedTo; 00476 int NumCharsAdaptedTo; 00477 int NumBaselineClassesTried; 00478 int NumCharNormClassesTried; 00479 int NumAmbigClassesTried; 00480 int NumClassesOutput; 00481 int NumAdaptationsFailed; 00482 00483 /* variables used to hold onto extracted features. This is used 00484 to map from the old scheme in which baseline features and char norm 00485 features are extracted separately, to the new scheme in which they 00486 are extracted at the same time. 
*/ 00487 bool FeaturesHaveBeenExtracted; 00488 bool FeaturesOK; 00489 INT_FEATURE_ARRAY BaselineFeatures; 00490 INT_FEATURE_ARRAY CharNormFeatures; 00491 INT_FX_RESULT_STRUCT FXInfo; 00492 00493 // Expected number of features in the class pruner, used to penalize 00494 // unknowns that have too few features (like a c being classified as e) so 00495 // it doesn't recognize everything as '@' or '#'. 00496 // CharNormCutoffs is for the static classifier (with no shapetable). 00497 // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real 00498 // value in the adaptive classifier. Both are indexed by unichar_id. 00499 // shapetable_cutoffs_ provides a similar value for each shape in the 00500 // shape_table_ 00501 uinT16* CharNormCutoffs; 00502 uinT16* BaselineCutoffs; 00503 GenericVector<uinT16> shapetable_cutoffs_; 00504 ScrollView* learn_debug_win_; 00505 ScrollView* learn_fragmented_word_debug_win_; 00506 ScrollView* learn_fragments_debug_win_; 00507 }; 00508 } // namespace tesseract 00509 00510 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__