Tesseract
3.02
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 // 00015 00016 #include "trainingsampleset.h" 00017 #include "allheaders.h" 00018 #include "boxread.h" 00019 #include "fontinfo.h" 00020 #include "indexmapbidi.h" 00021 #include "intfeaturedist.h" 00022 #include "intfeaturemap.h" 00023 #include "intfeaturespace.h" 00024 #include "shapetable.h" 00025 #include "trainingsample.h" 00026 #include "unicity_table.h" 00027 00028 namespace tesseract { 00029 00030 const int kTestChar = -1; // 37; 00031 // Max number of distances to compute the squared way 00032 const int kSquareLimit = 25; 00033 // Prime numbers for subsampling distances. 00034 const int kPrime1 = 17; 00035 const int kPrime2 = 13; 00036 // Min samples from which to start discarding outliers. 00037 const int kMinOutlierSamples = 5; 00038 00039 TrainingSampleSet::FontClassInfo::FontClassInfo() 00040 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) { 00041 } 00042 00043 // Writes to the given file. Returns false in case of error. 00044 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const { 00045 if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) 00046 return false; 00047 if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) 00048 return false; 00049 if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false; 00050 if (!samples.Serialize(fp)) return false; 00051 return true; 00052 } 00053 // Reads from the given file. Returns false in case of error. 00054 // If swap is true, assumes a big/little-endian swap is needed. 00055 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) { 00056 if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1) 00057 return false; 00058 if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1) 00059 return false; 00060 if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false; 00061 if (!samples.DeSerialize(swap, fp)) return false; 00062 if (swap) { 00063 ReverseN(&num_raw_samples, sizeof(num_raw_samples)); 00064 ReverseN(&canonical_sample, sizeof(canonical_sample)); 00065 ReverseN(&canonical_dist, sizeof(canonical_dist)); 00066 } 00067 return true; 00068 } 00069 00070 TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table) 00071 : num_raw_samples_(0), unicharset_size_(0), 00072 font_class_array_(NULL), fontinfo_table_(font_table) { 00073 } 00074 00075 TrainingSampleSet::~TrainingSampleSet() { 00076 delete font_class_array_; 00077 } 00078 00079 // Writes to the given file. Returns false in case of error. 00080 bool TrainingSampleSet::Serialize(FILE* fp) const { 00081 if (!samples_.Serialize(fp)) return false; 00082 if (!unicharset_.save_to_file(fp)) return false; 00083 if (!font_id_map_.Serialize(fp)) return false; 00084 inT8 not_null = font_class_array_ != NULL; 00085 if (fwrite(¬_null, sizeof(not_null), 1, fp) != 1) return false; 00086 if (not_null) { 00087 if (!font_class_array_->SerializeClasses(fp)) return false; 00088 } 00089 return true; 00090 } 00091 00092 // Reads from the given file. Returns false in case of error. 00093 // If swap is true, assumes a big/little-endian swap is needed. 00094 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) { 00095 if (!samples_.DeSerialize(swap, fp)) return false; 00096 num_raw_samples_ = samples_.size(); 00097 if (!unicharset_.load_from_file(fp)) return false; 00098 if (!font_id_map_.DeSerialize(swap, fp)) return false; 00099 if (font_class_array_ != NULL) { 00100 delete font_class_array_; 00101 font_class_array_ = NULL; 00102 } 00103 inT8 not_null; 00104 if (fread(¬_null, sizeof(not_null), 1, fp) != 1) return false; 00105 if (not_null) { 00106 FontClassInfo empty; 00107 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty); 00108 if (!font_class_array_->DeSerializeClasses(swap, fp)) return false; 00109 } 00110 unicharset_size_ = unicharset_.size(); 00111 return true; 00112 } 00113 00114 // Load an initial unicharset, or set one up if the file cannot be read. 00115 void TrainingSampleSet::LoadUnicharset(const char* filename) { 00116 if (!unicharset_.load_from_file(filename)) { 00117 tprintf("Failed to load unicharset from file %s\n" 00118 "Building unicharset for boosting from scratch...\n", 00119 filename); 00120 unicharset_.clear(); 00121 // Space character needed to represent NIL_LIST classification. 00122 unicharset_.unichar_insert(" "); 00123 } 00124 unicharset_size_ = unicharset_.size(); 00125 } 00126 00127 // Adds a character sample to this sample set. 00128 // If the unichar is not already in the local unicharset, it is added. 00129 // Returns the unichar_id of the added sample, from the local unicharset. 00130 int TrainingSampleSet::AddSample(const char* unichar, TrainingSample* sample) { 00131 if (!unicharset_.contains_unichar(unichar)) { 00132 unicharset_.unichar_insert(unichar); 00133 if (unicharset_.size() > MAX_NUM_CLASSES) { 00134 tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is " 00135 "greater than MAX_NUM_CLASSES\n"); 00136 return -1; 00137 } 00138 } 00139 UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar); 00140 AddSample(char_id, sample); 00141 return char_id; 00142 } 00143 00144 // Adds a character sample to this sample set with the given unichar_id, 00145 // which must correspond to the local unicharset (in this). 00146 void TrainingSampleSet::AddSample(int unichar_id, TrainingSample* sample) { 00147 sample->set_class_id(unichar_id); 00148 samples_.push_back(sample); 00149 num_raw_samples_ = samples_.size(); 00150 unicharset_size_ = unicharset_.size(); 00151 } 00152 00153 // Returns the number of samples for the given font,class pair. 00154 // If randomize is true, returns the number of samples accessible 00155 // with randomizing on. (Increases the number of samples if small.) 00156 // OrganizeByFontAndClass must have been already called. 00157 int TrainingSampleSet::NumClassSamples(int font_id, int class_id, 00158 bool randomize) const { 00159 ASSERT_HOST(font_class_array_ != NULL); 00160 if (font_id < 0 || class_id < 0 || 00161 font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) { 00162 // There are no samples because the font or class doesn't exist. 00163 return 0; 00164 } 00165 int font_index = font_id_map_.SparseToCompact(font_id); 00166 if (font_index < 0) 00167 return 0; // The font has no samples. 00168 if (randomize) 00169 return (*font_class_array_)(font_index, class_id).samples.size(); 00170 else 00171 return (*font_class_array_)(font_index, class_id).num_raw_samples; 00172 } 00173 00174 // Gets a sample by its index. 00175 const TrainingSample* TrainingSampleSet::GetSample(int index) const { 00176 return samples_[index]; 00177 } 00178 00179 // Gets a sample by its font, class, index. 00180 // OrganizeByFontAndClass must have been already called. 00181 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id, 00182 int index) const { 00183 ASSERT_HOST(font_class_array_ != NULL); 00184 int font_index = font_id_map_.SparseToCompact(font_id); 00185 if (font_index < 0) return NULL; 00186 int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; 00187 return samples_[sample_index]; 00188 } 00189 00190 // Get a sample by its font, class, index. Does not randomize. 00191 // OrganizeByFontAndClass must have been already called. 00192 TrainingSample* TrainingSampleSet::MutableSample(int font_id, int class_id, 00193 int index) { 00194 ASSERT_HOST(font_class_array_ != NULL); 00195 int font_index = font_id_map_.SparseToCompact(font_id); 00196 if (font_index < 0) return NULL; 00197 int sample_index = (*font_class_array_)(font_index, class_id).samples[index]; 00198 return samples_[sample_index]; 00199 } 00200 00201 // Returns a string debug representation of the given sample: 00202 // font, unichar_str, bounding box, page. 00203 STRING TrainingSampleSet::SampleToString(const TrainingSample& sample) const { 00204 STRING boxfile_str; 00205 MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()), 00206 sample.bounding_box(), sample.page_num(), &boxfile_str); 00207 return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str; 00208 } 00209 00210 // Gets the combined set of features used by all the samples of the given 00211 // font/class combination. 00212 const BitVector& TrainingSampleSet::GetCloudFeatures( 00213 int font_id, int class_id) const { 00214 int font_index = font_id_map_.SparseToCompact(font_id); 00215 ASSERT_HOST(font_index >= 0); 00216 return (*font_class_array_)(font_index, class_id).cloud_features; 00217 } 00218 // Gets the indexed features of the canonical sample of the given 00219 // font/class combination. 00220 const GenericVector<int>& TrainingSampleSet::GetCanonicalFeatures( 00221 int font_id, int class_id) const { 00222 int font_index = font_id_map_.SparseToCompact(font_id); 00223 ASSERT_HOST(font_index >= 0); 00224 return (*font_class_array_)(font_index, class_id).canonical_features; 00225 } 00226 00227 // Returns the distance between the given UniCharAndFonts pair. 00228 // If matched_fonts, only matching fonts, are considered, unless that yields 00229 // the empty set. 00230 // OrganizeByFontAndClass must have been already called. 00231 float TrainingSampleSet::UnicharDistance(const UnicharAndFonts& uf1, 00232 const UnicharAndFonts& uf2, 00233 bool matched_fonts, 00234 const IntFeatureMap& feature_map) { 00235 int num_fonts1 = uf1.font_ids.size(); 00236 int c1 = uf1.unichar_id; 00237 int num_fonts2 = uf2.font_ids.size(); 00238 int c2 = uf2.unichar_id; 00239 double dist_sum = 0.0; 00240 int dist_count = 0; 00241 bool debug = false; 00242 if (matched_fonts) { 00243 // Compute distances only where fonts match. 00244 for (int i = 0; i < num_fonts1; ++i) { 00245 int f1 = uf1.font_ids[i]; 00246 for (int j = 0; j < num_fonts2; ++j) { 00247 int f2 = uf2.font_ids[j]; 00248 if (f1 == f2) { 00249 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); 00250 ++dist_count; 00251 } 00252 } 00253 } 00254 } else if (num_fonts1 * num_fonts2 <= kSquareLimit) { 00255 // Small enough sets to compute all the distances. 00256 for (int i = 0; i < num_fonts1; ++i) { 00257 int f1 = uf1.font_ids[i]; 00258 for (int j = 0; j < num_fonts2; ++j) { 00259 int f2 = uf2.font_ids[j]; 00260 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); 00261 if (debug) { 00262 tprintf("Cluster dist %d %d %d %d = %g\n", 00263 f1, c1, f2, c2, 00264 ClusterDistance(f1, c1, f2, c2, feature_map)); 00265 } 00266 ++dist_count; 00267 } 00268 } 00269 } else { 00270 // Subsample distances, using the largest set once, and stepping through 00271 // the smaller set so as to ensure that all the pairs are different. 00272 int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2; 00273 int index = 0; 00274 int num_samples = MAX(num_fonts1, num_fonts2); 00275 for (int i = 0; i < num_samples; ++i, index += increment) { 00276 int f1 = uf1.font_ids[i % num_fonts1]; 00277 int f2 = uf2.font_ids[index % num_fonts2]; 00278 if (debug) { 00279 tprintf("Cluster dist %d %d %d %d = %g\n", 00280 f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map)); 00281 } 00282 dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map); 00283 ++dist_count; 00284 } 00285 } 00286 if (dist_count == 0) { 00287 if (matched_fonts) 00288 return UnicharDistance(uf1, uf2, false, feature_map); 00289 return 0.0f; 00290 } 00291 return dist_sum / dist_count; 00292 } 00293 00294 // Returns the distance between the given pair of font/class pairs. 00295 // Finds in cache or computes and caches. 00296 // OrganizeByFontAndClass must have been already called. 00297 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1, 00298 int font_id2, int class_id2, 00299 const IntFeatureMap& feature_map) { 00300 ASSERT_HOST(font_class_array_ != NULL); 00301 int font_index1 = font_id_map_.SparseToCompact(font_id1); 00302 int font_index2 = font_id_map_.SparseToCompact(font_id2); 00303 if (font_index1 < 0 || font_index2 < 0) 00304 return 0.0f; 00305 FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1); 00306 if (font_id1 == font_id2) { 00307 // Special case cache for speed. 00308 if (fc_info.unichar_distance_cache.size() == 0) 00309 fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f); 00310 if (fc_info.unichar_distance_cache[class_id2] < 0) { 00311 // Distance has to be calculated. 00312 float result = ComputeClusterDistance(font_id1, class_id1, 00313 font_id2, class_id2, 00314 feature_map); 00315 fc_info.unichar_distance_cache[class_id2] = result; 00316 // Copy to the symmetric cache entry. 00317 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2); 00318 if (fc_info2.unichar_distance_cache.size() == 0) 00319 fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f); 00320 fc_info2.unichar_distance_cache[class_id1] = result; 00321 } 00322 return fc_info.unichar_distance_cache[class_id2]; 00323 } else if (class_id1 == class_id2) { 00324 // Another special-case cache for equal class-id. 00325 if (fc_info.font_distance_cache.size() == 0) 00326 fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(), 00327 -1.0f); 00328 if (fc_info.font_distance_cache[font_index2] < 0) { 00329 // Distance has to be calculated. 00330 float result = ComputeClusterDistance(font_id1, class_id1, 00331 font_id2, class_id2, 00332 feature_map); 00333 fc_info.font_distance_cache[font_index2] = result; 00334 // Copy to the symmetric cache entry. 00335 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2); 00336 if (fc_info2.font_distance_cache.size() == 0) 00337 fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(), 00338 -1.0f); 00339 fc_info2.font_distance_cache[font_index1] = result; 00340 } 00341 return fc_info.font_distance_cache[font_index2]; 00342 } 00343 // Both font and class are different. Linear search for class_id2/font_id2 00344 // in what is a hopefully short list of distances. 00345 int cache_index = 0; 00346 while (cache_index < fc_info.distance_cache.size() && 00347 (fc_info.distance_cache[cache_index].unichar_id != class_id2 || 00348 fc_info.distance_cache[cache_index].font_id != font_id2)) 00349 ++cache_index; 00350 if (cache_index == fc_info.distance_cache.size()) { 00351 // Distance has to be calculated. 00352 float result = ComputeClusterDistance(font_id1, class_id1, 00353 font_id2, class_id2, 00354 feature_map); 00355 FontClassDistance fc_dist = { class_id2, font_id2, result }; 00356 fc_info.distance_cache.push_back(fc_dist); 00357 // Copy to the symmetric cache entry. We know it isn't there already, as 00358 // we always copy to the symmetric entry. 00359 FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2); 00360 fc_dist.unichar_id = class_id1; 00361 fc_dist.font_id = font_id1; 00362 fc_info2.distance_cache.push_back(fc_dist); 00363 } 00364 return fc_info.distance_cache[cache_index].distance; 00365 } 00366 00367 // Computes the distance between the given pair of font/class pairs. 00368 float TrainingSampleSet::ComputeClusterDistance( 00369 int font_id1, int class_id1, int font_id2, int class_id2, 00370 const IntFeatureMap& feature_map) const { 00371 int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2, 00372 feature_map, false); 00373 dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1, 00374 feature_map, false); 00375 int denominator = GetCanonicalFeatures(font_id1, class_id1).size(); 00376 denominator += GetCanonicalFeatures(font_id2, class_id2).size(); 00377 return static_cast<float>(dist) / denominator; 00378 } 00379 00380 // Helper to add a feature and its near neighbors to the good_features. 00381 // levels indicates how many times to compute the offset features of what is 00382 // already there. This is done by iteration rather than recursion. 00383 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels, 00384 GenericVector<int>* good_features) { 00385 int prev_num_features = 0; 00386 good_features->push_back(f); 00387 int num_features = 1; 00388 for (int level = 0; level < levels; ++level) { 00389 for (int i = prev_num_features; i < num_features; ++i) { 00390 int feature = (*good_features)[i]; 00391 for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) { 00392 if (dir == 0) continue; 00393 int f1 = feature_map.OffsetFeature(feature, dir); 00394 if (f1 >= 0) { 00395 good_features->push_back(f1); 00396 } 00397 } 00398 } 00399 prev_num_features = num_features; 00400 num_features = good_features->size(); 00401 } 00402 } 00403 00404 // Returns the number of canonical features of font/class 2 for which 00405 // neither the feature nor any of its near neighbors occurs in the cloud 00406 // of font/class 1. Each such feature is a reliable separation between 00407 // the classes, ASSUMING that the canonical sample is sufficiently 00408 // representative that every sample has a feature near that particular 00409 // feature. To check that this is so on the fly would be prohibitively 00410 // expensive, but it might be possible to pre-qualify the canonical features 00411 // to include only those for which this assumption is true. 00412 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called 00413 // first, or the results will be nonsense. 00414 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1, 00415 int font_id2, int class_id2, 00416 const IntFeatureMap& feature_map, 00417 bool thorough) const { 00418 int result = 0; 00419 const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2); 00420 if (sample2 == NULL) 00421 return 0; // There are no canonical features. 00422 const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2, 00423 class_id2); 00424 const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1); 00425 if (cloud1.size() == 0) 00426 return canonical2.size(); // There are no cloud features. 00427 00428 // Find a canonical2 feature that is not in cloud1. 00429 for (int f = 0; f < canonical2.size(); ++f) { 00430 int feature = canonical2[f]; 00431 if (cloud1[feature]) 00432 continue; 00433 // Gather the near neighbours of f. 00434 GenericVector<int> good_features; 00435 AddNearFeatures(feature_map, feature, 1, &good_features); 00436 // Check that none of the good_features are in the cloud. 00437 int i; 00438 for (i = 0; i < good_features.size(); ++i) { 00439 int good_f = good_features[i]; 00440 if (cloud1[good_f]) { 00441 break; 00442 } 00443 } 00444 if (i < good_features.size()) 00445 continue; // Found one in the cloud. 00446 ++result; 00447 } 00448 return result; 00449 } 00450 00451 // Returns the total index of the requested sample. 00452 // OrganizeByFontAndClass must have been already called. 00453 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id, 00454 int index) const { 00455 ASSERT_HOST(font_class_array_ != NULL); 00456 int font_index = font_id_map_.SparseToCompact(font_id); 00457 if (font_index < 0) return -1; 00458 return (*font_class_array_)(font_index, class_id).samples[index]; 00459 } 00460 00461 // Gets the canonical sample for the given font, class pair. 00462 // ComputeCanonicalSamples must have been called first. 00463 const TrainingSample* TrainingSampleSet::GetCanonicalSample( 00464 int font_id, int class_id) const { 00465 ASSERT_HOST(font_class_array_ != NULL); 00466 int font_index = font_id_map_.SparseToCompact(font_id); 00467 if (font_index < 0) return NULL; 00468 int sample_index = (*font_class_array_)(font_index, 00469 class_id).canonical_sample; 00470 return sample_index >= 0 ? samples_[sample_index] : NULL; 00471 } 00472 00473 // Gets the max distance for the given canonical sample. 00474 // ComputeCanonicalSamples must have been called first. 00475 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const { 00476 ASSERT_HOST(font_class_array_ != NULL); 00477 int font_index = font_id_map_.SparseToCompact(font_id); 00478 if (font_index < 0) return 0.0f; 00479 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) 00480 return (*font_class_array_)(font_index, class_id).canonical_dist; 00481 else 00482 return 0.0f; 00483 } 00484 00485 // Generates indexed features for all samples with the supplied feature_space. 00486 void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) { 00487 for (int s = 0; s < samples_.size(); ++s) 00488 samples_[s]->IndexFeatures(feature_space); 00489 } 00490 00491 // Delete outlier samples with few features that are shared with others. 00492 // IndexFeatures must have been called already. 00493 void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space, 00494 bool debug) { 00495 if (font_class_array_ == NULL) 00496 OrganizeByFontAndClass(); 00497 Pixa* pixa = NULL; 00498 if (debug) 00499 pixa = pixaCreate(0); 00500 GenericVector<int> feature_counts; 00501 int fs_size = feature_space.Size(); 00502 int font_size = font_id_map_.CompactSize(); 00503 for (int font_index = 0; font_index < font_size; ++font_index) { 00504 for (int c = 0; c < unicharset_size_; ++c) { 00505 // Create a histogram of the features used by all samples of this 00506 // font/class combination. 00507 feature_counts.init_to_size(fs_size, 0); 00508 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); 00509 int sample_count = fcinfo.samples.size(); 00510 if (sample_count < kMinOutlierSamples) 00511 continue; 00512 for (int i = 0; i < sample_count; ++i) { 00513 int s = fcinfo.samples[i]; 00514 const GenericVector<int>& features = samples_[s]->indexed_features(); 00515 for (int f = 0; f < features.size(); ++f) { 00516 ++feature_counts[features[f]]; 00517 } 00518 } 00519 for (int i = 0; i < sample_count; ++i) { 00520 int s = fcinfo.samples[i]; 00521 const TrainingSample& sample = *samples_[s]; 00522 const GenericVector<int>& features = sample.indexed_features(); 00523 // A feature that has a histogram count of 1 is only used by this 00524 // sample, making it 'bad'. All others are 'good'. 00525 int good_features = 0; 00526 int bad_features = 0; 00527 for (int f = 0; f < features.size(); ++f) { 00528 if (feature_counts[features[f]] > 1) 00529 ++good_features; 00530 else 00531 ++bad_features; 00532 } 00533 // If more than 1/3 features are bad, then this is an outlier. 00534 if (bad_features * 2 > good_features) { 00535 tprintf("Deleting outlier sample of %s, %d good, %d bad\n", 00536 SampleToString(sample).string(), 00537 good_features, bad_features); 00538 if (debug) { 00539 pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT); 00540 // Add the previous sample as well, so it is easier to see in 00541 // the output what is wrong with this sample. 00542 int t; 00543 if (i == 0) 00544 t = fcinfo.samples[1]; 00545 else 00546 t = fcinfo.samples[i - 1]; 00547 const TrainingSample &csample = *samples_[t]; 00548 pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT); 00549 } 00550 // Mark the sample for deletion. 00551 KillSample(samples_[s]); 00552 } 00553 } 00554 } 00555 } 00556 // Truly delete all bad samples and renumber everything. 00557 DeleteDeadSamples(); 00558 if (pixa != NULL) { 00559 Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10); 00560 pixaDestroy(&pixa); 00561 pixWrite("outliers.png", pix, IFF_PNG); 00562 pixDestroy(&pix); 00563 } 00564 } 00565 00566 // Marks the given sample index for deletion. 00567 // Deletion is actually completed by DeleteDeadSamples. 00568 void TrainingSampleSet::KillSample(TrainingSample* sample) { 00569 sample->set_sample_index(-1); 00570 } 00571 00572 // Deletes all samples with zero features marked by KillSample. 00573 void TrainingSampleSet::DeleteDeadSamples() { 00574 samples_.compact( 00575 NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample)); 00576 num_raw_samples_ = samples_.size(); 00577 // Samples must be re-organized now we have deleted a few. 00578 } 00579 00580 // Callback function returns true if the given sample is to be deleted, due 00581 // to having a negative classid. 00582 bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) { 00583 return sample == NULL || sample->class_id() < 0; 00584 } 00585 00586 static Pix* DebugSample(const UNICHARSET& unicharset, 00587 TrainingSample* sample) { 00588 tprintf("\nOriginal features:\n"); 00589 for (int i = 0; i < sample->num_features(); ++i) { 00590 sample->features()[i].print(); 00591 } 00592 if (sample->features_are_mapped()) { 00593 tprintf("\nMapped features:\n"); 00594 for (int i = 0; i < sample->mapped_features().size(); ++i) { 00595 tprintf("%d ", sample->mapped_features()[i]); 00596 } 00597 tprintf("\n"); 00598 } 00599 return sample->RenderToPix(&unicharset); 00600 } 00601 00602 // Construct an array to access the samples by font,class pair. 00603 void TrainingSampleSet::OrganizeByFontAndClass() { 00604 // Font indexes are sparse, so we used a map to compact them, so we can 00605 // have an efficient 2-d array of fonts and character classes. 00606 SetupFontIdMap(); 00607 int compact_font_size = font_id_map_.CompactSize(); 00608 // Get a 2-d array of generic vectors. 00609 if (font_class_array_ != NULL) 00610 delete font_class_array_; 00611 FontClassInfo empty; 00612 font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>( 00613 compact_font_size, unicharset_size_, empty); 00614 for (int s = 0; s < samples_.size(); ++s) { 00615 int font_id = samples_[s]->font_id(); 00616 int class_id = samples_[s]->class_id(); 00617 if (font_id < 0 || font_id >= font_id_map_.SparseSize()) { 00618 tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n", 00619 font_id, font_id_map_.SparseSize(), class_id, unicharset_size_, 00620 s); 00621 } 00622 ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize()); 00623 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_); 00624 int font_index = font_id_map_.SparseToCompact(font_id); 00625 (*font_class_array_)(font_index, class_id).samples.push_back(s); 00626 } 00627 // Set the num_raw_samples member of the FontClassInfo, to set the boundary 00628 // between the raw samples and the replicated ones. 00629 for (int f = 0; f < compact_font_size; ++f) { 00630 for (int c = 0; c < unicharset_size_; ++c) 00631 (*font_class_array_)(f, c).num_raw_samples = 00632 (*font_class_array_)(f, c).samples.size(); 00633 } 00634 // This is the global number of samples and also marks the boundary between 00635 // real and replicated samples. 00636 num_raw_samples_ = samples_.size(); 00637 } 00638 00639 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact 00640 // index for the font_class_array_. 00641 void TrainingSampleSet::SetupFontIdMap() { 00642 // Number of samples for each font_id. 00643 GenericVector<int> font_counts; 00644 for (int s = 0; s < samples_.size(); ++s) { 00645 int font_id = samples_[s]->font_id(); 00646 while (font_id >= font_counts.size()) 00647 font_counts.push_back(0); 00648 ++font_counts[font_id]; 00649 } 00650 font_id_map_.Init(font_counts.size(), false); 00651 for (int f = 0; f < font_counts.size(); ++f) { 00652 font_id_map_.SetMap(f, font_counts[f] > 0); 00653 } 00654 font_id_map_.Setup(); 00655 } 00656 00657 00658 // Finds the sample for each font, class pair that has least maximum 00659 // distance to all the other samples of the same font, class. 00660 // OrganizeByFontAndClass must have been already called. 00661 void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map, 00662 bool debug) { 00663 ASSERT_HOST(font_class_array_ != NULL); 00664 IntFeatureDist f_table; 00665 if (debug) tprintf("feature table size %d\n", map.sparse_size()); 00666 f_table.Init(&map); 00667 int worst_s1 = 0; 00668 int worst_s2 = 0; 00669 double global_worst_dist = 0.0; 00670 // Compute distances independently for each font and char index. 00671 int font_size = font_id_map_.CompactSize(); 00672 for (int font_index = 0; font_index < font_size; ++font_index) { 00673 int font_id = font_id_map_.CompactToSparse(font_index); 00674 for (int c = 0; c < unicharset_size_; ++c) { 00675 int samples_found = 0; 00676 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); 00677 if (fcinfo.samples.size() == 0 || 00678 (kTestChar >= 0 && c != kTestChar)) { 00679 fcinfo.canonical_sample = -1; 00680 fcinfo.canonical_dist = 0.0f; 00681 if (debug) tprintf("Skipping class %d\n", c); 00682 continue; 00683 } 00684 // The canonical sample will be the one with the min_max_dist, which 00685 // is the sample with the lowest maximum distance to all other samples. 00686 double min_max_dist = 2.0; 00687 // We keep track of the farthest apart pair (max_s1, max_s2) which 00688 // are max_max_dist apart, so we can see how bad the variability is. 00689 double max_max_dist = 0.0; 00690 int max_s1 = 0; 00691 int max_s2 = 0; 00692 fcinfo.canonical_sample = fcinfo.samples[0]; 00693 fcinfo.canonical_dist = 0.0f; 00694 for (int i = 0; i < fcinfo.samples.size(); ++i) { 00695 int s1 = fcinfo.samples[i]; 00696 const GenericVector<int>& features1 = samples_[s1]->indexed_features(); 00697 f_table.Set(features1, features1.size(), true); 00698 double max_dist = 0.0; 00699 // Run the full squared-order search for similar samples. It is still 00700 // reasonably fast because f_table.FeatureDistance is fast, but we 00701 // may have to reconsider if we start playing with too many samples 00702 // of a single char/font. 00703 for (int j = 0; j < fcinfo.samples.size(); ++j) { 00704 int s2 = fcinfo.samples[j]; 00705 if (samples_[s2]->class_id() != c || 00706 samples_[s2]->font_id() != font_id || 00707 s2 == s1) 00708 continue; 00709 GenericVector<int> features2 = samples_[s2]->indexed_features(); 00710 double dist = f_table.FeatureDistance(features2); 00711 int height = samples_[s2]->geo_feature(GeoTop) - 00712 samples_[s2]->geo_feature(GeoBottom); 00713 if (dist == 1.0 && height > 64) { 00714 // TODO(rays) rethink this when the polygonal approximation goes. 00715 // Currently it is possible for dots and other small characters 00716 // to be completely different, even within the same class. 00717 f_table.DebugFeatureDistance(features2); 00718 } 00719 if (dist > max_dist) { 00720 max_dist = dist; 00721 if (dist > max_max_dist) { 00722 max_s1 = s1; 00723 max_s2 = s2; 00724 } 00725 } 00726 } 00727 // Using Set(..., false) is far faster than re initializing, due to 00728 // the sparseness of the feature space. 00729 f_table.Set(features1, features1.size(), false); 00730 samples_[s1]->set_max_dist(max_dist); 00731 ++samples_found; 00732 if (max_dist < min_max_dist) { 00733 fcinfo.canonical_sample = s1; 00734 fcinfo.canonical_dist = max_dist; 00735 } 00736 UpdateRange(max_dist, &min_max_dist, &max_max_dist); 00737 } 00738 if (max_max_dist > global_worst_dist) { 00739 // Keep a record of the worst pair over all characters/fonts too. 00740 global_worst_dist = max_max_dist; 00741 worst_s1 = max_s1; 00742 worst_s2 = max_s2; 00743 } 00744 if (debug) { 00745 tprintf("Found %d samples of class %d=%s, font %d, " 00746 "dist range [%g, %g], worst pair= %s, %s\n", 00747 samples_found, c, unicharset_.debug_str(c).string(), 00748 font_index, min_max_dist, max_max_dist, 00749 SampleToString(*samples_[max_s1]).string(), 00750 SampleToString(*samples_[max_s2]).string()); 00751 } 00752 } 00753 } 00754 if (debug) { 00755 tprintf("Global worst dist = %g, between sample %d and %d\n", 00756 global_worst_dist, worst_s1, worst_s2); 00757 Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]); 00758 Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]); 00759 pixOr(pix1, pix1, pix2); 00760 pixWrite("worstpair.png", pix1, IFF_PNG); 00761 pixDestroy(&pix1); 00762 pixDestroy(&pix2); 00763 } 00764 } 00765 00766 // Replicates the samples to a minimum frequency defined by 00767 // 2 * kSampleRandomSize, or for larger counts duplicates all samples. 00768 // After replication, the replicated samples are perturbed slightly, but 00769 // in a predictable and repeatable way. 00770 // Use after OrganizeByFontAndClass(). 00771 void TrainingSampleSet::ReplicateAndRandomizeSamples() { 00772 ASSERT_HOST(font_class_array_ != NULL); 00773 int font_size = font_id_map_.CompactSize(); 00774 for (int font_index = 0; font_index < font_size; ++font_index) { 00775 for (int c = 0; c < unicharset_size_; ++c) { 00776 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); 00777 int sample_count = fcinfo.samples.size(); 00778 int min_samples = 2 * MAX(kSampleRandomSize, sample_count); 00779 if (sample_count > 0 && sample_count < min_samples) { 00780 int base_count = sample_count; 00781 for (int base_index = 0; sample_count < min_samples; ++sample_count) { 00782 int src_index = fcinfo.samples[base_index++]; 00783 if (base_index >= base_count) base_index = 0; 00784 TrainingSample* sample = samples_[src_index]->RandomizedCopy( 00785 sample_count % kSampleRandomSize); 00786 int sample_index = samples_.size(); 00787 sample->set_sample_index(sample_index); 00788 samples_.push_back(sample); 00789 fcinfo.samples.push_back(sample_index); 00790 } 00791 } 00792 } 00793 } 00794 } 00795 00796 // Caches the indexed features of the canonical samples. 00797 // ComputeCanonicalSamples must have been already called. 00798 // TODO(rays) see note on ReliablySeparable and try restricting the 00799 // canonical features to those that truly represent all samples. 00800 void TrainingSampleSet::ComputeCanonicalFeatures() { 00801 ASSERT_HOST(font_class_array_ != NULL); 00802 int font_size = font_id_map_.CompactSize(); 00803 for (int font_index = 0; font_index < font_size; ++font_index) { 00804 int font_id = font_id_map_.CompactToSparse(font_index); 00805 for (int c = 0; c < unicharset_size_; ++c) { 00806 int num_samples = NumClassSamples(font_id, c, false); 00807 if (num_samples == 0) 00808 continue; 00809 const TrainingSample* sample = GetCanonicalSample(font_id, c); 00810 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); 00811 fcinfo.canonical_features = sample->indexed_features(); 00812 } 00813 } 00814 } 00815 00816 // Computes the combined set of features used by all the samples of each 00817 // font/class combination. Use after ReplicateAndRandomizeSamples. 00818 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) { 00819 ASSERT_HOST(font_class_array_ != NULL); 00820 int font_size = font_id_map_.CompactSize(); 00821 for (int font_index = 0; font_index < font_size; ++font_index) { 00822 int font_id = font_id_map_.CompactToSparse(font_index); 00823 for (int c = 0; c < unicharset_size_; ++c) { 00824 int num_samples = NumClassSamples(font_id, c, false); 00825 if (num_samples == 0) 00826 continue; 00827 FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); 00828 fcinfo.cloud_features.Init(feature_space_size); 00829 for (int s = 0; s < num_samples; ++s) { 00830 const TrainingSample* sample = GetSample(font_id, c, s); 00831 const GenericVector<int>& sample_features = sample->indexed_features(); 00832 for (int i = 0; i < sample_features.size(); ++i) 00833 fcinfo.cloud_features.SetBit(sample_features[i]); 00834 } 00835 } 00836 } 00837 } 00838 00839 // Adds all fonts of the given class to the shape. 00840 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const { 00841 for (int f = 0; f < font_id_map_.CompactSize(); ++f) { 00842 int font_id = font_id_map_.CompactToSparse(f); 00843 shape->AddToShape(class_id, font_id); 00844 } 00845 } 00846 00847 // Display the samples with the given indexed feature that also match 00848 // the given shape. 00849 void TrainingSampleSet::DisplaySamplesWithFeature(int f_index, 00850 const Shape& shape, 00851 const IntFeatureSpace& space, 00852 ScrollView::Color color, 00853 ScrollView* window) const { 00854 for (int s = 0; s < num_raw_samples(); ++s) { 00855 const TrainingSample* sample = GetSample(s); 00856 if (shape.ContainsUnichar(sample->class_id())) { 00857 GenericVector<int> indexed_features; 00858 space.IndexAndSortFeatures(sample->features(), sample->num_features(), 00859 &indexed_features); 00860 for (int f = 0; f < indexed_features.size(); ++f) { 00861 if (indexed_features[f] == f_index) { 00862 sample->DisplayFeatures(color, window); 00863 } 00864 } 00865 } 00866 } 00867 } 00868 00869 00870 } // namespace tesseract.