Tesseract  3.02
tesseract-ocr/classify/classify.cpp
Go to the documentation of this file.
00001 
// File:        classify.cpp
// Description: classify class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
00018 
00019 #include "classify.h"
00020 #include "fontinfo.h"
00021 #include "intproto.h"
00022 #include "mfoutline.h"
00023 #include "scrollview.h"
00024 #include "shapetable.h"
00025 #include "unicity_table.h"
00026 #include <string.h>
00027 
00028 namespace tesseract {
00029 Classify::Classify()
00030   : BOOL_MEMBER(prioritize_division, FALSE,
00031                 "Prioritize blob division over chopping", this->params()),
00032     INT_MEMBER(tessedit_single_match, FALSE,
00033                "Top choice only from CP", this->params()),
00034     BOOL_MEMBER(classify_enable_learning, true,
00035                 "Enable adaptive classifier", this->params()),
00036     INT_MEMBER(classify_debug_level, 0, "Classify debug level",
00037                this->params()),
00038     INT_MEMBER(classify_norm_method, character, "Normalization Method   ...",
00039                this->params()),
00040     double_MEMBER(classify_char_norm_range, 0.2,
00041                   "Character Normalization Range ...", this->params()),
00042     double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
00043                   this->params()),  /* PREV DEFAULT 0.1 */
00044     double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...",
00045                   this->params()),  /* PREV DEFAULT 0.3 */
00046     double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
00047                   this->params()),  /* PREV DEFAULT 0.1 */
00048     double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
00049                   this->params()),  /* PREV DEFAULT 0.3 */
00050     BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
00051                 this->params()),
00052     BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
00053                 this->params()),
00054     BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
00055                 "Enable adaptive classifier",
00056                 this->params()),
00057     BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
00058                 "Use pre-adapted classifier templates", this->params()),
00059     BOOL_MEMBER(classify_save_adapted_templates, 0,
00060                "Save adapted templates to a file", this->params()),
00061     BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
00062                 this->params()),
00063     INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
00064     INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
00065     INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
00066                this->params()),
00067     double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
00068                   this->params()),
00069     double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)",
00070                   this->params()),
00071     double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
00072                   this->params()),
00073     double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
00074                   this->params()),
00075     double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
00076                   this->params()),
00077     double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
00078                   this->params()),
00079     INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
00080                this->params()),
00081     INT_MEMBER(matcher_min_examples_for_prototyping, 3,
00082                "Reliable Config Threshold", this->params()),
00083     INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
00084                "Enable adaption even if the ambiguities have not been seen",
00085                this->params()),
00086     double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
00087                   "Maximum angle delta for prototype clustering",
00088                   this->params()),
00089     double_MEMBER(classify_misfit_junk_penalty, 0.0,
00090                   "Penalty to apply when a non-alnum is vertically out of "
00091                   "its expected textline position",
00092                   this->params()),
00093     double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
00094     double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
00095                   this->params()),
00096     double_MEMBER(tessedit_class_miss_scale, 0.00390625,
00097                   "Scale factor for features not used", this->params()),
00098     INT_MEMBER(classify_adapt_proto_threshold, 230,
00099                "Threshold for good protos during adaptive 0-255",
00100                this->params()),
00101     INT_MEMBER(classify_adapt_feature_threshold, 230,
00102                "Threshold for good features during adaptive 0-255",
00103                this->params()),
00104     BOOL_MEMBER(disable_character_fragments, TRUE,
00105                 "Do not include character fragments in the"
00106                 " results of the classifier", this->params()),
00107     double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
00108                   -3.0, "Exclude fragments that do not look like whole"
00109                   " characters from training and adaption", this->params()),
00110     BOOL_MEMBER(classify_debug_character_fragments, FALSE,
00111                 "Bring up graphical debugging windows for fragments training",
00112                 this->params()),
00113     BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
00114                 "Use two different windows for debugging the matching: "
00115                 "One for the protos and one for the features.", this->params()),
00116     STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
00117                   this->params()),
00118     INT_MEMBER(classify_class_pruner_threshold, 229,
00119                "Class Pruner Threshold 0-255", this->params()),
00120     INT_MEMBER(classify_class_pruner_multiplier, 30,
00121                "Class Pruner Multiplier 0-255:       ", this->params()),
00122     INT_MEMBER(classify_cp_cutoff_strength, 7,
00123                "Class Pruner CutoffStrength:         ", this->params()),
00124     INT_MEMBER(classify_integer_matcher_multiplier, 14,
00125                "Integer Matcher Multiplier  0-255:   ", this->params()),
00126     EnableLearning(true),
00127     INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
00128                this->params()),
00129     BOOL_MEMBER(classify_bln_numeric_mode, 0,
00130                 "Assume the input is numbers [0-9].", this->params()),
00131     shape_table_(NULL),
00132     dict_(&image_) {
00133   fontinfo_table_.set_compare_callback(
00134       NewPermanentTessCallback(CompareFontInfo));
00135   fontinfo_table_.set_clear_callback(
00136       NewPermanentTessCallback(FontInfoDeleteCallback));
00137   fontset_table_.set_compare_callback(
00138       NewPermanentTessCallback(CompareFontSet));
00139   fontset_table_.set_clear_callback(
00140       NewPermanentTessCallback(FontSetDeleteCallback));
00141   AdaptedTemplates = NULL;
00142   PreTrainedTemplates = NULL;
00143   AllProtosOn = NULL;
00144   PrunedProtos = NULL;
00145   AllConfigsOn = NULL;
00146   AllProtosOff = NULL;
00147   AllConfigsOff = NULL;
00148   TempProtoMask = NULL;
00149   NormProtos = NULL;
00150 
00151   AdaptiveMatcherCalls = 0;
00152   BaselineClassifierCalls = 0;
00153   CharNormClassifierCalls = 0;
00154   AmbigClassifierCalls = 0;
00155   NumWordsAdaptedTo = 0;
00156   NumCharsAdaptedTo = 0;
00157   NumBaselineClassesTried = 0;
00158   NumCharNormClassesTried = 0;
00159   NumAmbigClassesTried = 0;
00160   NumClassesOutput = 0;
00161   NumAdaptationsFailed = 0;
00162 
00163   FeaturesHaveBeenExtracted = false;
00164   FeaturesOK = true;
00165   learn_debug_win_ = NULL;
00166   learn_fragmented_word_debug_win_ = NULL;
00167   learn_fragments_debug_win_ = NULL;
00168 
00169   CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
00170   BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
00171 }
00172 
00173 Classify::~Classify() {
00174   EndAdaptiveClassifier();
00175   delete learn_debug_win_;
00176   delete learn_fragmented_word_debug_win_;
00177   delete learn_fragments_debug_win_;
00178   delete[] CharNormCutoffs;
00179   delete[] BaselineCutoffs;
00180 }
00181 
00182 }  // namespace tesseract