// Tesseract 3.02
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
#define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_

namespace tesseract {

// Only pointers and references to these types appear in this header, so
// forward declarations are sufficient and no other header is pulled in.
class IndexMapBiDi;
class IntFeatureMap;
class ShapeTable;
class TrainingSample;
class TrainingSampleSet;
class UnicharAndFonts;

// Iterator class to encapsulate the complex iteration involved in getting
// all samples of all shapes needed for a classification problem.
//
// =====INPUTS TO Init FUNCTION=====
// The charset_map defines a subset of the sample_set classes (with a NULL
// shape_table, or the shape_table classes if not NULL.)
//
// The shape_table (if not NULL) defines the mapping from shapes to
// font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
//
// The sample_set holds the samples and provides indexed access to samples
// of font_id/class_id pairs.
//
// If randomize is true, the samples are perturbed slightly, but the
// perturbation is guaranteed to be the same for multiple identical
// iterations.
//
// =====DIFFERENT COMBINATIONS OF INPUTS=====
// NULL shape_table:
// Without a shape_table, everything works in UNICHAR_IDs.
//
// NULL shape_table, NULL charset_map:
// Iterations simply run over the samples in the order the samples occur in the
// input files.
// GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
//
// NULL shape_table, non-NULL charset_map:
// When shape_table is NULL, the charset_map indexes unichar_ids directly,
// and an iteration returns all samples of all chars in the charset_map, which
// is a subset of the full unicharset.
// The iteration will be in groups of the same unichar_id, in the order
// defined by the charset_map.
// GetCompactClassID returns the charset_map index of a sample, and
// GetSparseClassID returns the sample UNICHAR_ID.
//
// Non-NULL shape_table:
// With a shape_table, samples are grouped according to the shape_table, so
// multiple UNICHAR_IDs and fonts may be grouped together, and everything
// works in shape_ids.
//
// Non-NULL shape_table, NULL charset_map.
// Iterations simply run over the samples in the order of shape_id.
// GetCompactClassID and GetSparseClassID both return the shape_id.
// (If you want the unichar_id or font_id, the sample still has them.)
//
// Non-NULL shape_table, non-NULL charset_map.
// When shape_table is not NULL, the charset_map indexes and subsets shapes in
// the shape_table, and iterations will be in shape_table order, not
// charset_map order.
// GetCompactClassID returns the charset_map index of a shape, and
// GetSparseClassID returns the shape_id.
//
// =====What is SampleIterator good for?=====
// Inside a classifier training module, the SampleIterator has abstracted away
// all the different modes above.
// Use the following iteration to train your classifier:
//   for (it.Begin(); !it.AtEnd(); it.Next()) {
//     const TrainingSample& sample = it.GetSample();
//     int class_id = it.GetCompactClassID();
//     ...
//   }
// Your classifier may or may not be dealing with a shape_table, and may be
// dealing with some subset of the character/shape set. It doesn't need to
// know and shouldn't care. It is just learning shapes with compact class ids
// in the range [0, it.CompactCharsetSize()).
//
// NOTE(review): the class declares a destructor and holds the raw pointer
// owned_shape_table_, but declares no copy constructor or assignment operator.
// If the destructor frees that pointer, copying a SampleIterator would be
// unsafe - confirm against the implementation file before copying instances.
class SampleIterator {
 public:
  SampleIterator();
  ~SampleIterator();

  void Clear();

  // See class comment for arguments.
  void Init(const IndexMapBiDi* charset_map,
            const ShapeTable* shape_table,
            bool randomize,
            TrainingSampleSet* sample_set);

  // Iterator functions designed for use with a simple for loop:
  //   for (it.Begin(); !it.AtEnd(); it.Next()) {
  //     const TrainingSample& sample = it.GetSample();
  //     int class_id = it.GetCompactClassID();
  //     ...
  //   }
  void Begin();
  bool AtEnd() const;
  const TrainingSample& GetSample() const;
  // Non-const access to the current sample. Note that this is callable on a
  // const iterator even though it hands out a mutable sample pointer.
  TrainingSample* MutableSample() const;
  // Returns the total index (from the original set of samples) of the current
  // sample.
  int GlobalSampleIndex() const;
  // Returns the index of the current sample in compact charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // 0 or 1, and have nothing to do with the unichar_ids.
  // If the charset_map_ is NULL, then this is equal to GetSparseClassID().
  int GetCompactClassID() const;
  // Returns the index of the current sample in sparse charset space, so
  // in a 2-class problem between x and y, the returned indices will all be
  // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
  // with a shape_table_.
  int GetSparseClassID() const;
  // Moves on to the next indexable sample. If the end is reached, leaves
  // the state such that AtEnd() is true.
  void Next();

  // Returns the size of the compact charset space.
  int CompactCharsetSize() const;
  // Returns the size of the sparse charset space.
  int SparseCharsetSize() const;

  // Returns the charset map by reference.
  // WARNING: this dereferences charset_map_ unconditionally, and the class
  // comment allows a NULL charset_map. Only call this when a non-NULL
  // charset_map is in use.
  const IndexMapBiDi& charset_map() const {
    return *charset_map_;
  }
  // Returns the shape table pointer as stored; may be NULL (see class
  // comment).
  const ShapeTable* shape_table() const {
    return shape_table_;
  }
  // Sample set operations.
  const TrainingSampleSet* sample_set() const {
    return sample_set_;
  }

  // A set of functions that do something to all the samples accessed by the
  // iterator, as it is currently setup.

  // Apply the supplied feature_space/feature_map transform to all samples
  // accessed by this iterator.
  void MapSampleFeatures(const IntFeatureMap& feature_map);

  // Adjust the weights of all the samples to be uniform in the given charset.
  // Returns the number of samples in the iterator.
  int UniformSamples();

  // Normalize the weights of all the samples defined by the iterator so they
  // sum to 1. Returns the minimum assigned sample weight.
  double NormalizeSamples();

 private:
  // Helper returns the current UnicharAndFont shape_entry.
  const UnicharAndFonts* GetShapeEntry() const;

  // Map to subset the actual charset space.
  const IndexMapBiDi* charset_map_;
  // Shape table to recombine character classes into shapes
  const ShapeTable* shape_table_;
  // The samples to iterate over.
  TrainingSampleSet* sample_set_;
  // Flag to control randomizing the sample features.
  bool randomize_;
  // Shape table owned by this used to iterate character classes.
  ShapeTable* owned_shape_table_;

  // Top-level iteration. Shape index in sparse charset_map space.
  int shape_index_;
  int num_shapes_;
  // Index to the character class within a shape.
  int shape_char_index_;
  int num_shape_chars_;
  // Index to the font within a shape/class pair.
  int shape_font_index_;
  int num_shape_fonts_;
  // The lowest level iteration. sample_index_/num_samples_ counts samples
  // in the current shape/class/font combination.
  int sample_index_;
  int num_samples_;
};

}  // namespace tesseract.

#endif  // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_