Tesseract  3.02
tesseract-ocr/classify/errorcounter.h
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 //
00015 
00016 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
00017 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
00018 
00019 #include "genericvector.h"
00020 #include "matrix.h"
00021 
00022 struct Pix;
00023 template <typename T> class UnicityTable;
00024 
00025 namespace tesseract {
00026 
00027 struct FontInfo;
00028 class SampleIterator;
00029 class ShapeClassifier;
00030 class ShapeRating;
00031 class ShapeTable;
00032 class TrainingSample;
00033 
00034 // Enumeration of the different types of error count.
00035 // Error counts work as follows:
00036 //
00037 // Ground truth is a valid unichar-id / font-id pair:
00038 //        Number of classifier answers?
00039 //          0                       >0
00040 //     CT_REJECT     BOTH unichar-id and font-id match top shape?
00041 //     __________             yes!              no
00042 //                   CT_SHAPE_TOP_CORRECT  CT_SHAPE_TOP_ERR
00043 //                           |            Font attributes match?
00044 //                           |               yes!        no
00045 //                           |                 |     CT_FONT_ATTR_ERR
00046 //                           |         Top unichar-id matches?
00047 //                           |         yes!          no
00048 //       Top shape-id has multiple unichars?    CT_UNICHAR_TOP1_ERR
00049 //               yes!            no           2nd shape unichar id matches?
00050 //        CT_OK_MULTI_UNICHAR   ________        yes!              no
00051 //        ___________________                  _____  CT_UNICHAR_TOP2_ERR
00052 //                                                    Any unichar-id matches?
00053 //                                                    yes!        no
00054 //                                                   ______ CT_UNICHAR_TOPN_ERR
00055 //                                                           _________________
00056 // Note that multiple counts may be activated for a single sample!
00057 //
00058 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
00059 // This is called junk and is expected to be rejected:
00060 //        Number of classifier answers?
00061 //          0                       >0
00062 //     CT_REJECTED_JUNK     CT_ACCEPTED_JUNK
00063 //
00064 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
00065 // the mean rank of the correct result, counting from 0, and with an error
00066 // receiving the number of answers as the correct rank.
00067 //
00068 // Keep in sync with the ReportString function.
00069 enum CountTypes {
00070   CT_SHAPE_TOP_CORRECT,  // Top shape matches both unichar-id and font-id.
00071   CT_SHAPE_TOP_ERR,      // Top shape fails to match unichar-id or font-id.
00072   CT_FONT_ATTR_ERR,      // Font attributes incorrect, ignoring unichar.
00073   CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
00074   CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
00075   CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
00076   CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
00077   CT_REJECT,             // Classifier gave no answers for a valid sample.
00078   CT_NUM_RESULTS,        // Number of answers produced (reported as a mean).
00079   CT_RANK,               // Rank of correct answer, counting from 0.
00080   CT_REJECTED_JUNK,      // Junk that was correctly rejected (no answers).
00081   CT_ACCEPTED_JUNK,      // Junk that incorrectly received some answer.
00082 
00083   CT_SIZE                // Number of count types, used for array sizing.
00084 };
00085 
00086 // Class to encapsulate all the functionality and sub-structures required
00087 // to count errors for an isolated character classifier (ShapeClassifier).
00088 class ErrorCounter {
00089  public:
00090   // Computes and returns the unweighted boosting_mode error rate of the given
00091   // classifier. Can be used for testing, or inside an iterative training
00092   // system, including one that uses boosting.
00093   // report_level values:
00094   // 0 = no output.
00095   // 1 = bottom-line error rate.
00096   // 2 = bottom-line error rate + time.
00097   // 3 = font-level error rate + time.
00098   // 4 = list of all errors + short classifier debug output on 16 errors.
00099   // 5 = list of all errors + short classifier debug output on 25 errors.
00100   // * The boosting_mode determines which error type is used for computing the
00101   //   scaled_error output, and setting the is_error flag in the samples.
00102   // * The fontinfo_table is used to get string font names for the debug
00103   //   output, and also to count font attributes errors.
00104   // * The page_images vector may contain a Pix* (which may be NULL) for each
00105   //   page index assigned to the samples.
00106   // * The iterator it provides encapsulated iteration over some sample set.
00107   // * The outputs unichar_error, scaled_error and fonts_report are all
00108   //   optional (each may be NULL).
00109   // * If not NULL, unichar_error gets the top1 unichar error rate.
00110   // * If not NULL, scaled_error gets the error chosen by boosting_mode
00111   //   weighted by the weights on the samples.
00112   // * If not NULL, fonts_report gets a string summarizing the error rates for
00113   //   each font in both human-readable form and as a tab-separated list of
00114   //   error counts. The human-readable form is all before the first tab.
00115   // * The return value is the un-weighted version of the scaled_error.
00116   static double ComputeErrorRate(ShapeClassifier* classifier,
00117                                  int report_level, CountTypes boosting_mode,
00118                                  const UnicityTable<FontInfo>& fontinfo_table,
00119                                  const GenericVector<Pix*>& page_images,
00120                                  SampleIterator* it,
00121                                  double* unichar_error,
00122                                  double* scaled_error,
00123                                  STRING* fonts_report);
00124 
00125  private:
00126   // Simple struct to hold an array of counts.
00127   struct Counts {
00128     Counts();
00129     // Adds other into this for computing totals.
00130     void operator+=(const Counts& other);
00131 
00132     int n[CT_SIZE];  // One entry per CountTypes value.
00133   };
00134 
00135   // Constructor is private. Only anticipated use of ErrorCounter is via
00136   // the static ComputeErrorRate.
00137   ErrorCounter(int charsetsize, int shapesize, int fontsize);
00138   ~ErrorCounter();
00139 
00140   // Accumulates the errors from the classifier results on a single sample.
00141   // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
00142   // boosting_mode selects the type of error to be used for boosting and the
00143   // is_error_ member of sample is set according to whether the required type
00144   // of error occurred. The font_table provides access to font properties
00145   // for error counting and shape_table is used to understand the relationship
00146   // between unichar_ids and shape_ids in the results.
00147   bool AccumulateErrors(bool debug, CountTypes boosting_mode,
00148                         const UnicityTable<FontInfo>& font_table,
00149                         const ShapeTable& shape_table,
00150                         const GenericVector<ShapeRating>& results,
00151                         TrainingSample* sample);
00152 
00153   // Accumulates counts for junk. Counts only whether the junk was correctly
00154   // rejected or not.
00155   void AccumulateJunk(const ShapeTable& shape_table,
00156                       const GenericVector<ShapeRating>& results,
00157                       TrainingSample* sample);
00158 
00159   // Creates a report of the error rate. The report_level controls the detail
00160   // that is reported to stderr via tprintf:
00161   // 0   -> no output.
00162   // >=1 -> bottom-line error rate.
00163   // >=3 -> font-level error rate.
00164   // boosting_mode determines the return value. It selects which (un-weighted)
00165   // error rate to return.
00166   // The fontinfo_table from MasterTrainer provides the names of fonts.
00167   // The iterator it determines the current subset of the training samples.
00168   // If not NULL, the top-choice unichar error rate is saved in unichar_error.
00169   // If not NULL, the report string is saved in fonts_report.
00170   // (Ignoring report_level).
00171   double ReportErrors(int report_level, CountTypes boosting_mode,
00172                       const UnicityTable<FontInfo>& fontinfo_table,
00173                       const SampleIterator& it,
00174                       double* unichar_error,
00175                       STRING* fonts_report);
00176 
00177   // Sets the report string to a combined human and machine-readable report
00178   // string of the error rates.
00179   // Returns false if there is no data, leaving report unchanged.
00180   static bool ReportString(const Counts& counts, STRING* report);
00181 
00182   // Computes the error rates and returns in rates which is an array of size
00183   // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
00184   static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);
00185 
00186 
00187   // Total scaled error used by boosting algorithms.
00188   double scaled_error_;
00189   // Vector indexed by font_id from the samples of error accumulators.
00190   GenericVector<Counts> font_counts_;
00191   // Counts of the results that map each unichar_id (from samples) to an
00192   // incorrect shape_id.
00193   GENERIC_2D_ARRAY<int> unichar_counts_;
00194 };
00195 
00196 }  // namespace tesseract.
00197 
00198 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */