Tesseract
3.02
|
00001 00002 // File: classify.cpp 00003 // Description: classify class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include "classify.h" 00020 #include "fontinfo.h" 00021 #include "intproto.h" 00022 #include "mfoutline.h" 00023 #include "scrollview.h" 00024 #include "shapetable.h" 00025 #include "unicity_table.h" 00026 #include <string.h> 00027 00028 namespace tesseract { 00029 Classify::Classify() 00030 : BOOL_MEMBER(prioritize_division, FALSE, 00031 "Prioritize blob division over chopping", this->params()), 00032 INT_MEMBER(tessedit_single_match, FALSE, 00033 "Top choice only from CP", this->params()), 00034 BOOL_MEMBER(classify_enable_learning, true, 00035 "Enable adaptive classifier", this->params()), 00036 INT_MEMBER(classify_debug_level, 0, "Classify debug level", 00037 this->params()), 00038 INT_MEMBER(classify_norm_method, character, "Normalization Method ...", 00039 this->params()), 00040 double_MEMBER(classify_char_norm_range, 0.2, 00041 "Character Normalization Range ...", this->params()), 00042 double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...", 00043 this->params()), /* PREV DEFAULT 0.1 */ 00044 double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...", 00045 this->params()), /* PREV DEFAULT 0.3 */ 00046 double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...", 00047 this->params()), /* PREV DEFAULT 0.1 */ 00048 double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...", 00049 this->params()), /* PREV DEFAULT 0.3 */ 00050 BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", 00051 this->params()), 00052 BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", 00053 this->params()), 00054 BOOL_MEMBER(classify_enable_adaptive_matcher, 1, 00055 "Enable adaptive classifier", 00056 this->params()), 00057 BOOL_MEMBER(classify_use_pre_adapted_templates, 0, 00058 "Use pre-adapted classifier templates", this->params()), 00059 BOOL_MEMBER(classify_save_adapted_templates, 0, 00060 "Save adapted templates to a file", this->params()), 00061 BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", 00062 this->params()), 00063 INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()), 00064 INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()), 00065 INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", 00066 this->params()), 00067 double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", 00068 this->params()), 00069 double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)", 00070 this->params()), 00071 double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", 00072 this->params()), 00073 double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", 00074 this->params()), 00075 double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", 00076 this->params()), 00077 double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", 00078 this->params()), 00079 INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", 00080 this->params()), 00081 INT_MEMBER(matcher_min_examples_for_prototyping, 3, 00082 "Reliable Config Threshold", this->params()), 00083 INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5, 00084 "Enable adaption even if the ambiguities have not been seen", 00085 this->params()), 00086 double_MEMBER(matcher_clustering_max_angle_delta, 0.015, 00087 "Maximum angle delta for prototype clustering", 00088 this->params()), 00089 double_MEMBER(classify_misfit_junk_penalty, 0.0, 00090 "Penalty to apply when a non-alnum is vertically out of " 00091 "its expected textline position", 00092 this->params()), 00093 double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()), 00094 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", 00095 this->params()), 00096 double_MEMBER(tessedit_class_miss_scale, 0.00390625, 00097 "Scale factor for features not used", this->params()), 00098 INT_MEMBER(classify_adapt_proto_threshold, 230, 00099 "Threshold for good protos during adaptive 0-255", 00100 this->params()), 00101 INT_MEMBER(classify_adapt_feature_threshold, 230, 00102 "Threshold for good features during adaptive 0-255", 00103 this->params()), 00104 BOOL_MEMBER(disable_character_fragments, TRUE, 00105 "Do not include character fragments in the" 00106 " results of the classifier", this->params()), 00107 double_MEMBER(classify_character_fragments_garbage_certainty_threshold, 00108 -3.0, "Exclude fragments that do not look like whole" 00109 " characters from training and adaption", this->params()), 00110 BOOL_MEMBER(classify_debug_character_fragments, FALSE, 00111 "Bring up graphical debugging windows for fragments training", 00112 this->params()), 00113 BOOL_MEMBER(matcher_debug_separate_windows, FALSE, 00114 "Use two different windows for debugging the matching: " 00115 "One for the protos and one for the features.", this->params()), 00116 STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", 00117 this->params()), 00118 INT_MEMBER(classify_class_pruner_threshold, 229, 00119 "Class Pruner Threshold 0-255", this->params()), 00120 INT_MEMBER(classify_class_pruner_multiplier, 30, 00121 "Class Pruner Multiplier 0-255: ", this->params()), 00122 INT_MEMBER(classify_cp_cutoff_strength, 7, 00123 "Class Pruner CutoffStrength: ", this->params()), 00124 INT_MEMBER(classify_integer_matcher_multiplier, 14, 00125 "Integer Matcher Multiplier 0-255: ", this->params()), 00126 EnableLearning(true), 00127 INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", 00128 this->params()), 00129 BOOL_MEMBER(classify_bln_numeric_mode, 0, 00130 "Assume the input is numbers [0-9].", this->params()), 00131 shape_table_(NULL), 00132 dict_(&image_) { 00133 fontinfo_table_.set_compare_callback( 00134 NewPermanentTessCallback(CompareFontInfo)); 00135 fontinfo_table_.set_clear_callback( 00136 NewPermanentTessCallback(FontInfoDeleteCallback)); 00137 fontset_table_.set_compare_callback( 00138 NewPermanentTessCallback(CompareFontSet)); 00139 fontset_table_.set_clear_callback( 00140 NewPermanentTessCallback(FontSetDeleteCallback)); 00141 AdaptedTemplates = NULL; 00142 PreTrainedTemplates = NULL; 00143 AllProtosOn = NULL; 00144 PrunedProtos = NULL; 00145 AllConfigsOn = NULL; 00146 AllProtosOff = NULL; 00147 AllConfigsOff = NULL; 00148 TempProtoMask = NULL; 00149 NormProtos = NULL; 00150 00151 AdaptiveMatcherCalls = 0; 00152 BaselineClassifierCalls = 0; 00153 CharNormClassifierCalls = 0; 00154 AmbigClassifierCalls = 0; 00155 NumWordsAdaptedTo = 0; 00156 NumCharsAdaptedTo = 0; 00157 NumBaselineClassesTried = 0; 00158 NumCharNormClassesTried = 0; 00159 NumAmbigClassesTried = 0; 00160 NumClassesOutput = 0; 00161 NumAdaptationsFailed = 0; 00162 00163 FeaturesHaveBeenExtracted = false; 00164 FeaturesOK = true; 00165 learn_debug_win_ = NULL; 00166 learn_fragmented_word_debug_win_ = NULL; 00167 learn_fragments_debug_win_ = NULL; 00168 00169 CharNormCutoffs = new uinT16[MAX_NUM_CLASSES]; 00170 BaselineCutoffs = new uinT16[MAX_NUM_CLASSES]; 00171 } 00172 00173 Classify::~Classify() { 00174 EndAdaptiveClassifier(); 00175 delete learn_debug_win_; 00176 delete learn_fragmented_word_debug_win_; 00177 delete learn_fragments_debug_win_; 00178 delete[] CharNormCutoffs; 00179 delete[] BaselineCutoffs; 00180 } 00181 00182 } // namespace tesseract