// Tesseract 3.02
00001 // Copyright 2011 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 // 00015 00016 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ 00017 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ 00018 00019 #include "genericvector.h" 00020 #include "matrix.h" 00021 00022 struct Pix; 00023 template <typename T> class UnicityTable; 00024 00025 namespace tesseract { 00026 00027 struct FontInfo; 00028 class SampleIterator; 00029 class ShapeClassifier; 00030 class ShapeRating; 00031 class ShapeTable; 00032 class TrainingSample; 00033 00034 // Enumeration of the different types of error count. 00035 // Error counts work as follows: 00036 // 00037 // Ground truth is a valid unichar-id / font-id pair: 00038 // Number of classifier answers? 00039 // 0 >0 00040 // CT_REJECT BOTH unichar-id and font-id match top shape? 00041 // __________ yes! no 00042 // CT_SHAPE_TOP_CORRECT CT_SHAPE_TOP_ERR 00043 // | Font attributes match? 00044 // | yes! no 00045 // | | CT_FONT_ATTR_ERROR 00046 // | Top unichar-id matches? 00047 // | yes! no 00048 // Top shape-id has multiple unichars? CT_UNICHAR_TOP1_ERR 00049 // yes! no 2nd shape unichar id matches? 00050 // CT_OK_MULTI_UNICHAR ________ yes! no 00051 // ___________________ _____ CT_UNICHAR_TOP2_ERR 00052 // Any unichar-id matches? 00053 // yes! 
no 00054 // ______ CT_UNICHAR_TOPN_ERR 00055 // _________________ 00056 // Note that multiple counts may be activated for a single sample! 00057 // 00058 // Ground truth is for a fragment/n-gram that is NOT in the unicharset. 00059 // This is called junk and is expected to be rejected: 00060 // Number of classifier answers? 00061 // 0 >0 00062 // CT_REJECTED_JUNK CT_ACCEPTED_JUNK 00063 // 00064 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores 00065 // the mean rank of the correct result, counting from 0, and with an error 00066 // receiving the number of answers as the correct rank. 00067 // 00068 // Keep in sync with the ReportString function. 00069 enum CountTypes { 00070 CT_SHAPE_TOP_CORRECT, // Top shape id is actually correct. 00071 CT_SHAPE_TOP_ERR, // Top shape id is not correct. 00072 CT_FONT_ATTR_ERR, // Font attributes incorrect, ignoring unichar. 00073 CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id. 00074 CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id. 00075 CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id. 00076 CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others. 00077 CT_REJECT, // Classifier hates this. 00078 CT_NUM_RESULTS, // Number of answers produced. 00079 CT_RANK, // Rank of correct answer. 00080 CT_REJECTED_JUNK, // Junk that was correctly rejected. 00081 CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise. 00082 00083 CT_SIZE // Number of types for array sizing. 00084 }; 00085 00086 // Class to encapsulate all the functionality and sub-structures required 00087 // to count errors for an isolated character classifier (ShapeClassifier). 00088 class ErrorCounter { 00089 public: 00090 // Computes and returns the unweighted boosting_mode error rate of the given 00091 // classifier. Can be used for testing, or inside an iterative training 00092 // system, including one that uses boosting. 
00093 // report_levels: 00094 // 0 = no output. 00095 // 1 = bottom-line error rate. 00096 // 2 = bottom-line error rate + time. 00097 // 3 = font-level error rate + time. 00098 // 4 = list of all errors + short classifier debug output on 16 errors. 00099 // 5 = list of all errors + short classifier debug output on 25 errors. 00100 // * The boosting_mode determines which error type is used for computing the 00101 // scaled_error output, and setting the is_error flag in the samples. 00102 // * The fontinfo_table is used to get string font names for the debug 00103 // output, and also to count font attributes errors. 00104 // * The page_images vector may contain a Pix* (which may be NULL) for each 00105 // page index assigned to the samples. 00106 // * The it provides encapsulated iteration over some sample set. 00107 // * The outputs unichar_error, scaled_error and totals_report are all 00108 // optional. 00109 // * If not NULL, unichar error gets the top1 unichar error rate. 00110 // * Scaled_error gets the error chosen by boosting_mode weighted by the 00111 // weights on the samples. 00112 // * Fonts_report gets a string summarizing the error rates for each font in 00113 // both human-readable form and as a tab-separated list of error counts. 00114 // The human-readable form is all before the first tab. 00115 // * The return value is the un-weighted version of the scaled_error. 00116 static double ComputeErrorRate(ShapeClassifier* classifier, 00117 int report_level, CountTypes boosting_mode, 00118 const UnicityTable<FontInfo>& fontinfo_table, 00119 const GenericVector<Pix*>& page_images, 00120 SampleIterator* it, 00121 double* unichar_error, 00122 double* scaled_error, 00123 STRING* fonts_report); 00124 00125 private: 00126 // Simple struct to hold an array of counts. 00127 struct Counts { 00128 Counts(); 00129 // Adds other into this for computing totals. 
00130 void operator+=(const Counts& other); 00131 00132 int n[CT_SIZE]; 00133 }; 00134 00135 // Constructor is private. Only anticipated use of ErrorCounter is via 00136 // the static ComputeErrorRate. 00137 ErrorCounter(int charsetsize, int shapesize, int fontsize); 00138 ~ErrorCounter(); 00139 00140 // Accumulates the errors from the classifier results on a single sample. 00141 // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred. 00142 // boosting_mode selects the type of error to be used for boosting and the 00143 // is_error_ member of sample is set according to whether the required type 00144 // of error occurred. The font_table provides access to font properties 00145 // for error counting and shape_table is used to understand the relationship 00146 // between unichar_ids and shape_ids in the results 00147 bool AccumulateErrors(bool debug, CountTypes boosting_mode, 00148 const UnicityTable<FontInfo>& font_table, 00149 const ShapeTable& shape_table, 00150 const GenericVector<ShapeRating>& results, 00151 TrainingSample* sample); 00152 00153 // Accumulates counts for junk. Counts only whether the junk was correctly 00154 // rejected or not. 00155 void AccumulateJunk(const ShapeTable& shape_table, 00156 const GenericVector<ShapeRating>& results, 00157 TrainingSample* sample); 00158 00159 // Creates a report of the error rate. The report_level controls the detail 00160 // that is reported to stderr via tprintf: 00161 // 0 -> no output. 00162 // >=1 -> bottom-line error rate. 00163 // >=3 -> font-level error rate. 00164 // boosting_mode determines the return value. It selects which (un-weighted) 00165 // error rate to return. 00166 // The fontinfo_table from MasterTrainer provides the names of fonts. 00167 // The it determines the current subset of the training samples. 00168 // If not NULL, the top-choice unichar error rate is saved in unichar_error. 00169 // If not NULL, the report string is saved in fonts_report. 
00170 // (Ignoring report_level). 00171 double ReportErrors(int report_level, CountTypes boosting_mode, 00172 const UnicityTable<FontInfo>& fontinfo_table, 00173 const SampleIterator& it, 00174 double* unichar_error, 00175 STRING* fonts_report); 00176 00177 // Sets the report string to a combined human and machine-readable report 00178 // string of the error rates. 00179 // Returns false if there is no data, leaving report unchanged. 00180 static bool ReportString(const Counts& counts, STRING* report); 00181 00182 // Computes the error rates and returns in rates which is an array of size 00183 // CT_SIZE. Returns false if there is no data, leaving rates unchanged. 00184 static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]); 00185 00186 00187 // Total scaled error used by boosting algorithms. 00188 double scaled_error_; 00189 // Vector indexed by font_id from the samples of error accumulators. 00190 GenericVector<Counts> font_counts_; 00191 // Counts of the results that map each unichar_id (from samples) to an 00192 // incorrect shape_id. 00193 GENERIC_2D_ARRAY<int> unichar_counts_; 00194 }; 00195 00196 } // namespace tesseract. 00197 00198 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */