// Tesseract 3.02
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 // 00015 00016 #ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__ 00017 #define TESSERACT_TRAINING_TRAININGSAMPLESET_H__ 00018 00019 #include "bitvector.h" 00020 #include "genericvector.h" 00021 #include "indexmapbidi.h" 00022 #include "matrix.h" 00023 #include "shapetable.h" 00024 #include "trainingsample.h" 00025 00026 class UNICHARSET; 00027 template <typename T> class UnicityTable; 00028 00029 namespace tesseract { 00030 00031 struct FontInfo; 00032 class IntFeatureMap; 00033 class IntFeatureSpace; 00034 class TrainingSample; 00035 class UnicharAndFonts; 00036 00037 // Collection of TrainingSample used for training or testing a classifier. 00038 // Provides several useful methods to operate on the collection as a whole, 00039 // including outlier detection and deletion, providing access by font and 00040 // class, finding the canonical sample, finding the "cloud" features (OR of 00041 // all features in all samples), replication of samples, caching of distance 00042 // metrics. 00043 class TrainingSampleSet { 00044 public: 00045 explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table); 00046 ~TrainingSampleSet(); 00047 00048 // Writes to the given file. Returns false in case of error. 
00049 bool Serialize(FILE* fp) const; 00050 // Reads from the given file. Returns false in case of error. 00051 // If swap is true, assumes a big/little-endian swap is needed. 00052 bool DeSerialize(bool swap, FILE* fp); 00053 00054 // Accessors 00055 int num_samples() const { 00056 return samples_.size(); 00057 } 00058 int num_raw_samples() const { 00059 return num_raw_samples_; 00060 } 00061 int NumFonts() const { 00062 return font_id_map_.SparseSize(); 00063 } 00064 const UNICHARSET& unicharset() const { 00065 return unicharset_; 00066 } 00067 int charsetsize() const { 00068 return unicharset_size_; 00069 } 00070 00071 // Loads an initial unicharset, or sets one up if the file cannot be read. 00072 void LoadUnicharset(const char* filename); 00073 00074 // Adds a character sample to this sample set. 00075 // If the unichar is not already in the local unicharset, it is added. 00076 // Returns the unichar_id of the added sample, from the local unicharset. 00077 int AddSample(const char* unichar, TrainingSample* sample); 00078 // Adds a character sample to this sample set with the given unichar_id, 00079 // which must correspond to the local unicharset (in this). 00080 void AddSample(int unichar_id, TrainingSample* sample); 00081 00082 // Returns the number of samples for the given font,class pair. 00083 // If randomize is true, returns the number of samples accessible 00084 // with randomizing on. (Increases the number of samples if small.) 00085 // OrganizeByFontAndClass must have been already called. 00086 int NumClassSamples(int font_id, int class_id, bool randomize) const; 00087 00088 // Gets a sample by its index. 00089 const TrainingSample* GetSample(int index) const; 00090 00091 // Gets a sample by its font, class, index. 00092 // OrganizeByFontAndClass must have been already called. 00093 const TrainingSample* GetSample(int font_id, int class_id, int index) const; 00094 00095 // Get a sample by its font, class, index. Does not randomize. 
00096 // OrganizeByFontAndClass must have been already called. 00097 TrainingSample* MutableSample(int font_id, int class_id, int index); 00098 00099 // Returns a string debug representation of the given sample: 00100 // font, unichar_str, bounding box, page. 00101 STRING SampleToString(const TrainingSample& sample) const; 00102 00103 // Gets the combined set of features used by all the samples of the given 00104 // font/class combination. 00105 const BitVector& GetCloudFeatures(int font_id, int class_id) const; 00106 // Gets the indexed features of the canonical sample of the given 00107 // font/class combination. 00108 const GenericVector<int>& GetCanonicalFeatures(int font_id, 00109 int class_id) const; 00110 00111 // Returns the distance between the given UniCharAndFonts pair. 00112 // If matched_fonts, only matching fonts, are considered, unless that yields 00113 // the empty set. 00114 // OrganizeByFontAndClass must have been already called. 00115 float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2, 00116 bool matched_fonts, const IntFeatureMap& feature_map); 00117 00118 // Returns the distance between the given pair of font/class pairs. 00119 // Finds in cache or computes and caches. 00120 // OrganizeByFontAndClass must have been already called. 00121 float ClusterDistance(int font_id1, int class_id1, 00122 int font_id2, int class_id2, 00123 const IntFeatureMap& feature_map); 00124 00125 // Computes the distance between the given pair of font/class pairs. 00126 float ComputeClusterDistance(int font_id1, int class_id1, 00127 int font_id2, int class_id2, 00128 const IntFeatureMap& feature_map) const; 00129 00130 // Returns the number of canonical features of font/class 2 for which 00131 // neither the feature nor any of its near neighbors occurs in the cloud 00132 // of font/class 1. 
Each such feature is a reliable separation between 00133 // the classes, ASSUMING that the canonical sample is sufficiently 00134 // representative that every sample has a feature near that particular 00135 // feature. To check that this is so on the fly would be prohibitively 00136 // expensive, but it might be possible to pre-qualify the canonical features 00137 // to include only those for which this assumption is true. 00138 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called 00139 // first, or the results will be nonsense. 00140 int ReliablySeparable(int font_id1, int class_id1, 00141 int font_id2, int class_id2, 00142 const IntFeatureMap& feature_map, 00143 bool thorough) const; 00144 00145 00146 // Returns the total index of the requested sample. 00147 // OrganizeByFontAndClass must have been already called. 00148 int GlobalSampleIndex(int font_id, int class_id, int index) const; 00149 00150 // Gets the canonical sample for the given font, class pair. 00151 // ComputeCanonicalSamples must have been called first. 00152 const TrainingSample* GetCanonicalSample(int font_id, int class_id) const; 00153 // Gets the max distance for the given canonical sample. 00154 // ComputeCanonicalSamples must have been called first. 00155 float GetCanonicalDist(int font_id, int class_id) const; 00156 00157 // Returns a mutable pointer to the sample with the given index. 00158 TrainingSample* mutable_sample(int index) { 00159 return samples_[index]; 00160 } 00161 // Gets ownership of the sample with the given index, removing it from this. 00162 TrainingSample* extract_sample(int index) { 00163 TrainingSample* sample = samples_[index]; 00164 samples_[index] = NULL; 00165 return sample; 00166 } 00167 00168 // Generates indexed features for all samples with the supplied feature_space. 00169 void IndexFeatures(const IntFeatureSpace& feature_space); 00170 00171 // Delete outlier samples with few features that are shared with others. 
00172 // IndexFeatures must have been called already. 00173 void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug); 00174 00175 // Marks the given sample for deletion. 00176 // Deletion is actually completed by DeleteDeadSamples. 00177 void KillSample(TrainingSample* sample); 00178 00179 // Deletes all samples with a negative sample index marked by KillSample. 00180 // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass 00181 // must be called after as the samples have been renumbered. 00182 void DeleteDeadSamples(); 00183 00184 // Callback function returns true if the given sample is to be deleted, due 00185 // to having a negative classid. 00186 bool DeleteableSample(const TrainingSample* sample); 00187 00188 // Construct an array to access the samples by font,class pair. 00189 void OrganizeByFontAndClass(); 00190 00191 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact 00192 // index for the font_class_array_. 00193 void SetupFontIdMap(); 00194 00195 // Finds the sample for each font, class pair that has least maximum 00196 // distance to all the other samples of the same font, class. 00197 // OrganizeByFontAndClass must have been already called. 00198 void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug); 00199 00200 // Replicates the samples to a minimum frequency defined by 00201 // 2 * kSampleRandomSize, or for larger counts duplicates all samples. 00202 // After replication, the replicated samples are perturbed slightly, but 00203 // in a predictable and repeatable way. 00204 // Use after OrganizeByFontAndClass(). 00205 void ReplicateAndRandomizeSamples(); 00206 00207 // Caches the indexed features of the canonical samples. 00208 // ComputeCanonicalSamples must have been already called. 00209 void ComputeCanonicalFeatures(); 00210 // Computes the combined set of features used by all the samples of each 00211 // font/class combination. Use after ReplicateAndRandomizeSamples. 
00212 void ComputeCloudFeatures(int feature_space_size); 00213 00214 // Adds all fonts of the given class to the shape. 00215 void AddAllFontsForClass(int class_id, Shape* shape) const; 00216 00217 // Display the samples with the given indexed feature that also match 00218 // the given shape. 00219 void DisplaySamplesWithFeature(int f_index, const Shape& shape, 00220 const IntFeatureSpace& feature_space, 00221 ScrollView::Color color, 00222 ScrollView* window) const; 00223 00224 private: 00225 // Struct to store a triplet of unichar, font, distance in the distance cache. 00226 struct FontClassDistance { 00227 int unichar_id; 00228 int font_id; // Real font id. 00229 float distance; 00230 }; 00231 // Simple struct to store information related to each font/class combination. 00232 struct FontClassInfo { 00233 FontClassInfo(); 00234 00235 // Writes to the given file. Returns false in case of error. 00236 bool Serialize(FILE* fp) const; 00237 // Reads from the given file. Returns false in case of error. 00238 // If swap is true, assumes a big/little-endian swap is needed. 00239 bool DeSerialize(bool swap, FILE* fp); 00240 00241 // Number of raw samples. 00242 inT32 num_raw_samples; 00243 // Index of the canonical sample. 00244 inT32 canonical_sample; 00245 // Max distance of the canonical sample from any other. 00246 float canonical_dist; 00247 // Sample indices for the samples, including replicated. 00248 GenericVector<inT32> samples; 00249 00250 // Non-serialized cache data. 00251 // Indexed features of the canonical sample. 00252 GenericVector<int> canonical_features; 00253 // The mapped features of all the samples. 00254 BitVector cloud_features; 00255 00256 // Caches for ClusterDistance. 00257 // Caches for other fonts but matching this unichar. -1 indicates not set. 00258 // Indexed by compact font index from font_id_map_. 00259 GenericVector<float> font_distance_cache; 00260 // Caches for other unichars but matching this font. -1 indicates not set. 
00261 GenericVector<float> unichar_distance_cache; 00262 // Cache for the rest (non matching font and unichar.) 00263 // A cache of distances computed by ReliablySeparable. 00264 GenericVector<FontClassDistance> distance_cache; 00265 }; 00266 00267 PointerVector<TrainingSample> samples_; 00268 // Number of samples before replication/randomization. 00269 int num_raw_samples_; 00270 // Character set we are training for. 00271 UNICHARSET unicharset_; 00272 // Character set size to which the 2-d arrays below refer. 00273 int unicharset_size_; 00274 // Map to allow the font_class_array_ below to be compact. 00275 // The sparse space is the real font_id, used in samples_ . 00276 // The compact space is an index to font_class_array_ 00277 IndexMapBiDi font_id_map_; 00278 // A 2-d array of FontClassInfo holding information related to each 00279 // (font_id, class_id) pair. 00280 GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_; 00281 00282 // Reference to the fontinfo_table_ in MasterTrainer. Provides names 00283 // for font_ids in the samples. Not serialized! 00284 const UnicityTable<FontInfo>& fontinfo_table_; 00285 }; 00286 00287 } // namespace tesseract. 00288 00289 00290 #endif // TRAININGSAMPLESETSET_H_