tesseract-doc/trainingsampleset_8cpp_source.html

00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 //
00015
00016 #include "trainingsampleset.h"
00017 #include "allheaders.h"
00018 #include "boxread.h"
00019 #include "fontinfo.h"
00020 #include "indexmapbidi.h"
00021 #include "intfeaturedist.h"
00022 #include "intfeaturemap.h"
00023 #include "intfeaturespace.h"
00024 #include "shapetable.h"
00025 #include "trainingsample.h"
00026 #include "unicity_table.h"
00027
00028 namespace tesseract {
00029
00030 const int kTestChar = -1;  // 37;
00031 // Max number of distances to compute the squared way
00032 const int kSquareLimit = 25;
00033 // Prime numbers for subsampling distances.
00034 const int kPrime1 = 17;
00035 const int kPrime2 = 13;
00036 // Min samples from which to start discarding outliers.
00037 const int kMinOutlierSamples = 5;
00038
00039 TrainingSampleSet::FontClassInfo::FontClassInfo()
00040   : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
00041 }
00042
00043 // Writes to the given file. Returns false in case of error.
00044 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
00045   if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
00046     return false;
00047   if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
00048     return false;
00049   if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
00050   if (!samples.Serialize(fp)) return false;
00051   return true;
00052 }
00053 // Reads from the given file. Returns false in case of error.
00054 // If swap is true, assumes a big/little-endian swap is needed.
00055 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
00056   if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
00057     return false;
00058   if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
00059     return false;
00060   if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
00061   if (!samples.DeSerialize(swap, fp)) return false;
00062   if (swap) {
00063     ReverseN(&num_raw_samples, sizeof(num_raw_samples));
00064     ReverseN(&canonical_sample, sizeof(canonical_sample));
00065     ReverseN(&canonical_dist, sizeof(canonical_dist));
00066   }
00067   return true;
00068 }
00069
00070 TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table)
00071   : num_raw_samples_(0), unicharset_size_(0),
00072     font_class_array_(NULL), fontinfo_table_(font_table) {
00073 }
00074
00075 TrainingSampleSet::~TrainingSampleSet() {
00076   delete font_class_array_;
00077 }
00078
00079 // Writes to the given file. Returns false in case of error.
00080 bool TrainingSampleSet::Serialize(FILE* fp) const {
00081   if (!samples_.Serialize(fp)) return false;
00082   if (!unicharset_.save_to_file(fp)) return false;
00083   if (!font_id_map_.Serialize(fp)) return false;
00084   inT8 not_null = font_class_array_ != NULL;
00085   if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
00086   if (not_null) {
00087     if (!font_class_array_->SerializeClasses(fp)) return false;
00088   }
00089   return true;
00090 }
00091
00092 // Reads from the given file. Returns false in case of error.
00093 // If swap is true, assumes a big/little-endian swap is needed.
00094 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
00095   if (!samples_.DeSerialize(swap, fp)) return false;
00096   num_raw_samples_ = samples_.size();
00097   if (!unicharset_.load_from_file(fp)) return false;
00098   if (!font_id_map_.DeSerialize(swap, fp)) return false;
00099   if (font_class_array_ != NULL) {
00100     delete font_class_array_;
00101     font_class_array_ = NULL;
00102   }
00103   inT8 not_null;
00104   if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
00105   if (not_null) {
00106     FontClassInfo empty;
00107     font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
00108     if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
00109   }
00110   unicharset_size_ = unicharset_.size();
00111   return true;
00112 }
00113
00114 // Load an initial unicharset, or set one up if the file cannot be read.
00115 void TrainingSampleSet::LoadUnicharset(const char* filename) {
00116   if (!unicharset_.load_from_file(filename)) {
00117     tprintf("Failed to load unicharset from file %s\n"
00118             "Building unicharset for boosting from scratch...\n",
00119             filename);
00120     unicharset_.clear();
00121     // Space character needed to represent NIL_LIST classification.
00122     unicharset_.unichar_insert(" ");
00123   }
00124   unicharset_size_ = unicharset_.size();
00125 }
00126
00127 // Adds a character sample to this sample set.
00128 // If the unichar is not already in the local unicharset, it is added.
00129 // Returns the unichar_id of the added sample, from the local unicharset.
00130 int TrainingSampleSet::AddSample(const char* unichar, TrainingSample* sample) {
00131   if (!unicharset_.contains_unichar(unichar)) {
00132     unicharset_.unichar_insert(unichar);
00133     if (unicharset_.size() > MAX_NUM_CLASSES) {
00134       tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
00135               "greater than MAX_NUM_CLASSES\n");
00136       return -1;
00137     }
00138   }
00139   UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
00140   AddSample(char_id, sample);
00141   return char_id;
00142 }
00143
00144 // Adds a character sample to this sample set with the given unichar_id,
00145 // which must correspond to the local unicharset (in this).
00146 void TrainingSampleSet::AddSample(int unichar_id, TrainingSample* sample) {
00147   sample->set_class_id(unichar_id);
00148   samples_.push_back(sample);
00149   num_raw_samples_ = samples_.size();
00150   unicharset_size_ = unicharset_.size();
00151 }
00152
00153 // Returns the number of samples for the given font,class pair.
00154 // If randomize is true, returns the number of samples accessible
00155 // with randomizing on. (Increases the number of samples if small.)
00156 // OrganizeByFontAndClass must have been already called.
00157 int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
00158                                        bool randomize) const {
00159   ASSERT_HOST(font_class_array_ != NULL);
00160   if (font_id < 0 || class_id < 0 ||
00161       font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
00162     // There are no samples because the font or class doesn't exist.
00163     return 0;
00164   }
00165   int font_index = font_id_map_.SparseToCompact(font_id);
00166   if (font_index < 0)
00167     return 0;  // The font has no samples.
00168   if (randomize)
00169     return (*font_class_array_)(font_index, class_id).samples.size();
00170   else
00171     return (*font_class_array_)(font_index, class_id).num_raw_samples;
00172 }
00173
00174 // Gets a sample by its index.
00175 const TrainingSample* TrainingSampleSet::GetSample(int index) const {
00176   return samples_[index];
00177 }
00178
00179 // Gets a sample by its font, class, index.
00180 // OrganizeByFontAndClass must have been already called.
00181 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
00182                                                    int index) const {
00183   ASSERT_HOST(font_class_array_ != NULL);
00184   int font_index = font_id_map_.SparseToCompact(font_id);
00185   if (font_index < 0) return NULL;
00186   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
00187   return samples_[sample_index];
00188 }
00189
00190 // Get a sample by its font, class, index. Does not randomize.
00191 // OrganizeByFontAndClass must have been already called.
00192 TrainingSample* TrainingSampleSet::MutableSample(int font_id, int class_id,
00193                                                  int index) {
00194   ASSERT_HOST(font_class_array_ != NULL);
00195   int font_index = font_id_map_.SparseToCompact(font_id);
00196   if (font_index < 0) return NULL;
00197   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
00198   return samples_[sample_index];
00199 }
00200
00201 // Returns a string debug representation of the given sample:
00202 // font, unichar_str, bounding box, page.
00203 STRING TrainingSampleSet::SampleToString(const TrainingSample& sample) const {
00204   STRING boxfile_str;
00205   MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
00206                  sample.bounding_box(), sample.page_num(), &boxfile_str);
00207   return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
00208 }
00209
00210 // Gets the combined set of features used by all the samples of the given
00211 // font/class combination.
00212 const BitVector& TrainingSampleSet::GetCloudFeatures(
00213     int font_id, int class_id) const {
00214   int font_index = font_id_map_.SparseToCompact(font_id);
00215   ASSERT_HOST(font_index >= 0);
00216   return (*font_class_array_)(font_index, class_id).cloud_features;
00217 }
00218 // Gets the indexed features of the canonical sample of the given
00219 // font/class combination.
00220 const GenericVector<int>& TrainingSampleSet::GetCanonicalFeatures(
00221     int font_id, int class_id) const {
00222   int font_index = font_id_map_.SparseToCompact(font_id);
00223   ASSERT_HOST(font_index >= 0);
00224   return (*font_class_array_)(font_index, class_id).canonical_features;
00225 }
00226
00227 // Returns the distance between the given UniCharAndFonts pair.
00228 // If matched_fonts, only matching fonts, are considered, unless that yields
00229 // the empty set.
00230 // OrganizeByFontAndClass must have been already called.
00231 float TrainingSampleSet::UnicharDistance(const UnicharAndFonts& uf1,
00232                                          const UnicharAndFonts& uf2,
00233                                          bool matched_fonts,
00234                                          const IntFeatureMap& feature_map) {
00235   int num_fonts1 = uf1.font_ids.size();
00236   int c1 = uf1.unichar_id;
00237   int num_fonts2 = uf2.font_ids.size();
00238   int c2 = uf2.unichar_id;
00239   double dist_sum = 0.0;
00240   int dist_count = 0;
00241   bool debug = false;
00242   if (matched_fonts) {
00243     // Compute distances only where fonts match.
00244     for (int i = 0; i < num_fonts1; ++i) {
00245       int f1 = uf1.font_ids[i];
00246       for (int j = 0; j < num_fonts2; ++j) {
00247         int f2 = uf2.font_ids[j];
00248         if (f1 == f2) {
00249           dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00250           ++dist_count;
00251         }
00252       }
00253     }
00254   } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
00255     // Small enough sets to compute all the distances.
00256     for (int i = 0; i < num_fonts1; ++i) {
00257       int f1 = uf1.font_ids[i];
00258       for (int j = 0; j < num_fonts2; ++j) {
00259         int f2 = uf2.font_ids[j];
00260         dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00261         if (debug) {
00262             tprintf("Cluster dist %d %d %d %d = %g\n",
00263                     f1, c1, f2, c2,
00264                     ClusterDistance(f1, c1, f2, c2, feature_map));
00265         }
00266         ++dist_count;
00267       }
00268     }
00269   } else {
00270     // Subsample distances, using the largest set once, and stepping through
00271     // the smaller set so as to ensure that all the pairs are different.
00272     int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
00273     int index = 0;
00274     int num_samples = MAX(num_fonts1, num_fonts2);
00275     for (int i = 0; i < num_samples; ++i, index += increment) {
00276       int f1 = uf1.font_ids[i % num_fonts1];
00277       int f2 = uf2.font_ids[index % num_fonts2];
00278       if (debug) {
00279           tprintf("Cluster dist %d %d %d %d = %g\n",
00280                   f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
00281       }
00282       dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00283       ++dist_count;
00284     }
00285   }
00286   if (dist_count == 0) {
00287     if (matched_fonts)
00288       return UnicharDistance(uf1, uf2, false, feature_map);
00289     return 0.0f;
00290   }
00291   return dist_sum / dist_count;
00292 }
00293
00294 // Returns the distance between the given pair of font/class pairs.
00295 // Finds in cache or computes and caches.
00296 // OrganizeByFontAndClass must have been already called.
00297 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
00298                                          int font_id2, int class_id2,
00299                                          const IntFeatureMap& feature_map) {
00300   ASSERT_HOST(font_class_array_ != NULL);
00301   int font_index1 = font_id_map_.SparseToCompact(font_id1);
00302   int font_index2 = font_id_map_.SparseToCompact(font_id2);
00303   if (font_index1 < 0 || font_index2 < 0)
00304     return 0.0f;
00305   FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
00306   if (font_id1 == font_id2) {
00307     // Special case cache for speed.
00308     if (fc_info.unichar_distance_cache.size() == 0)
00309       fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
00310     if (fc_info.unichar_distance_cache[class_id2] < 0) {
00311       // Distance has to be calculated.
00312       float result = ComputeClusterDistance(font_id1, class_id1,
00313                                             font_id2, class_id2,
00314                                             feature_map);
00315       fc_info.unichar_distance_cache[class_id2] = result;
00316       // Copy to the symmetric cache entry.
00317       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00318       if (fc_info2.unichar_distance_cache.size() == 0)
00319         fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
00320       fc_info2.unichar_distance_cache[class_id1] = result;
00321     }
00322     return fc_info.unichar_distance_cache[class_id2];
00323   } else if (class_id1 == class_id2) {
00324     // Another special-case cache for equal class-id.
00325     if (fc_info.font_distance_cache.size() == 0)
00326       fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
00327                                                -1.0f);
00328     if (fc_info.font_distance_cache[font_index2] < 0) {
00329       // Distance has to be calculated.
00330       float result = ComputeClusterDistance(font_id1, class_id1,
00331                                             font_id2, class_id2,
00332                                             feature_map);
00333       fc_info.font_distance_cache[font_index2] = result;
00334       // Copy to the symmetric cache entry.
00335       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00336       if (fc_info2.font_distance_cache.size() == 0)
00337         fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
00338                                                   -1.0f);
00339       fc_info2.font_distance_cache[font_index1] = result;
00340     }
00341     return fc_info.font_distance_cache[font_index2];
00342   }
00343   // Both font and class are different. Linear search for class_id2/font_id2
00344   // in what is a hopefully short list of distances.
00345   int cache_index = 0;
00346   while (cache_index < fc_info.distance_cache.size() &&
00347          (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
00348           fc_info.distance_cache[cache_index].font_id != font_id2))
00349     ++cache_index;
00350   if (cache_index == fc_info.distance_cache.size()) {
00351     // Distance has to be calculated.
00352     float result = ComputeClusterDistance(font_id1, class_id1,
00353                                           font_id2, class_id2,
00354                                           feature_map);
00355     FontClassDistance fc_dist = { class_id2, font_id2, result };
00356     fc_info.distance_cache.push_back(fc_dist);
00357     // Copy to the symmetric cache entry. We know it isn't there already, as
00358     // we always copy to the symmetric entry.
00359     FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00360     fc_dist.unichar_id = class_id1;
00361     fc_dist.font_id = font_id1;
00362     fc_info2.distance_cache.push_back(fc_dist);
00363   }
00364   return fc_info.distance_cache[cache_index].distance;
00365 }
00366
00367 // Computes the distance between the given pair of font/class pairs.
00368 float TrainingSampleSet::ComputeClusterDistance(
00369     int font_id1, int class_id1, int font_id2, int class_id2,
00370     const IntFeatureMap& feature_map) const {
00371   int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
00372                                feature_map, false);
00373   dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
00374                             feature_map, false);
00375   int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
00376   denominator += GetCanonicalFeatures(font_id2, class_id2).size();
00377   return static_cast<float>(dist) / denominator;
00378 }
00379
00380 // Helper to add a feature and its near neighbors to the good_features.
00381 // levels indicates how many times to compute the offset features of what is
00382 // already there. This is done by iteration rather than recursion.
00383 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
00384                             GenericVector<int>* good_features) {
00385   int prev_num_features = 0;
00386   good_features->push_back(f);
00387   int num_features = 1;
00388   for (int level = 0; level < levels; ++level) {
00389     for (int i = prev_num_features; i < num_features; ++i) {
00390       int feature = (*good_features)[i];
00391       for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
00392         if (dir == 0) continue;
00393         int f1 = feature_map.OffsetFeature(feature, dir);
00394         if (f1 >= 0) {
00395           good_features->push_back(f1);
00396         }
00397       }
00398     }
00399     prev_num_features = num_features;
00400     num_features = good_features->size();
00401   }
00402 }
00403
00404 // Returns the number of canonical features of font/class 2 for which
00405 // neither the feature nor any of its near neighbors occurs in the cloud
00406 // of font/class 1. Each such feature is a reliable separation between
00407 // the classes, ASSUMING that the canonical sample is sufficiently
00408 // representative that every sample has a feature near that particular
00409 // feature. To check that this is so on the fly would be prohibitively
00410 // expensive, but it might be possible to pre-qualify the canonical features
00411 // to include only those for which this assumption is true.
00412 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
00413 // first, or the results will be nonsense.
00414 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
00415                                          int font_id2, int class_id2,
00416                                          const IntFeatureMap& feature_map,
00417                                          bool thorough) const {
00418   int result = 0;
00419   const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
00420   if (sample2 == NULL)
00421     return 0;  // There are no canonical features.
00422   const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
00423                                                               class_id2);
00424   const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
00425   if (cloud1.size() == 0)
00426     return canonical2.size();  // There are no cloud features.
00427
00428   // Find a canonical2 feature that is not in cloud1.
00429   for (int f = 0; f < canonical2.size(); ++f) {
00430     int feature = canonical2[f];
00431     if (cloud1[feature])
00432       continue;
00433     // Gather the near neighbours of f.
00434     GenericVector<int> good_features;
00435     AddNearFeatures(feature_map, feature, 1, &good_features);
00436     // Check that none of the good_features are in the cloud.
00437     int i;
00438     for (i = 0; i < good_features.size(); ++i) {
00439       int good_f = good_features[i];
00440       if (cloud1[good_f]) {
00441         break;
00442       }
00443     }
00444     if (i < good_features.size())
00445        continue;  // Found one in the cloud.
00446     ++result;
00447   }
00448   return result;
00449 }
00450
00451 // Returns the total index of the requested sample.
00452 // OrganizeByFontAndClass must have been already called.
00453 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
00454                                          int index) const {
00455   ASSERT_HOST(font_class_array_ != NULL);
00456   int font_index = font_id_map_.SparseToCompact(font_id);
00457   if (font_index < 0) return -1;
00458   return (*font_class_array_)(font_index, class_id).samples[index];
00459 }
00460
00461 // Gets the canonical sample for the given font, class pair.
00462 // ComputeCanonicalSamples must have been called first.
00463 const TrainingSample* TrainingSampleSet::GetCanonicalSample(
00464     int font_id, int class_id) const {
00465   ASSERT_HOST(font_class_array_ != NULL);
00466   int font_index = font_id_map_.SparseToCompact(font_id);
00467   if (font_index < 0) return NULL;
00468   int sample_index = (*font_class_array_)(font_index,
00469                                           class_id).canonical_sample;
00470   return sample_index >= 0 ? samples_[sample_index] : NULL;
00471 }
00472
00473 // Gets the max distance for the given canonical sample.
00474 // ComputeCanonicalSamples must have been called first.
00475 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
00476   ASSERT_HOST(font_class_array_ != NULL);
00477   int font_index = font_id_map_.SparseToCompact(font_id);
00478   if (font_index < 0) return 0.0f;
00479   if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
00480     return (*font_class_array_)(font_index, class_id).canonical_dist;
00481   else
00482     return 0.0f;
00483 }
00484
00485 // Generates indexed features for all samples with the supplied feature_space.
00486 void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) {
00487   for (int s = 0; s < samples_.size(); ++s)
00488     samples_[s]->IndexFeatures(feature_space);
00489 }
00490
00491 // Delete outlier samples with few features that are shared with others.
00492 // IndexFeatures must have been called already.
00493 void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space,
00494                                        bool debug) {
00495   if (font_class_array_ == NULL)
00496     OrganizeByFontAndClass();
00497   Pixa* pixa = NULL;
00498   if (debug)
00499     pixa = pixaCreate(0);
00500   GenericVector<int> feature_counts;
00501   int fs_size = feature_space.Size();
00502   int font_size = font_id_map_.CompactSize();
00503   for (int font_index = 0; font_index < font_size; ++font_index) {
00504     for (int c = 0; c < unicharset_size_; ++c) {
00505       // Create a histogram of the features used by all samples of this
00506       // font/class combination.
00507       feature_counts.init_to_size(fs_size, 0);
00508       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00509       int sample_count = fcinfo.samples.size();
00510       if (sample_count < kMinOutlierSamples)
00511         continue;
00512       for (int i = 0; i < sample_count; ++i) {
00513         int s = fcinfo.samples[i];
00514         const GenericVector<int>& features = samples_[s]->indexed_features();
00515         for (int f = 0; f < features.size(); ++f) {
00516           ++feature_counts[features[f]];
00517         }
00518       }
00519       for (int i = 0; i < sample_count; ++i) {
00520         int s = fcinfo.samples[i];
00521         const TrainingSample& sample = *samples_[s];
00522         const GenericVector<int>& features = sample.indexed_features();
00523         // A feature that has a histogram count of 1 is only used by this
00524         // sample, making it 'bad'. All others are 'good'.
00525         int good_features = 0;
00526         int bad_features = 0;
00527         for (int f = 0; f < features.size(); ++f) {
00528           if (feature_counts[features[f]] > 1)
00529             ++good_features;
00530           else
00531             ++bad_features;
00532         }
00533         // If more than 1/3 features are bad, then this is an outlier.
00534         if (bad_features * 2 > good_features) {
00535           tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
00536                   SampleToString(sample).string(),
00537                   good_features, bad_features);
00538           if (debug) {
00539             pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
00540             // Add the previous sample as well, so it is easier to see in
00541             // the output what is wrong with this sample.
00542             int t;
00543             if (i == 0)
00544               t = fcinfo.samples[1];
00545             else
00546               t = fcinfo.samples[i - 1];
00547             const TrainingSample &csample = *samples_[t];
00548             pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
00549           }
00550           // Mark the sample for deletion.
00551           KillSample(samples_[s]);
00552         }
00553       }
00554     }
00555   }
00556   // Truly delete all bad samples and renumber everything.
00557   DeleteDeadSamples();
00558   if (pixa != NULL) {
00559     Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
00560     pixaDestroy(&pixa);
00561     pixWrite("outliers.png", pix, IFF_PNG);
00562     pixDestroy(&pix);
00563   }
00564 }
00565
00566 // Marks the given sample index for deletion.
00567 // Deletion is actually completed by DeleteDeadSamples.
00568 void TrainingSampleSet::KillSample(TrainingSample* sample) {
00569   sample->set_sample_index(-1);
00570 }
00571
00572 // Deletes all samples with zero features marked by KillSample.
00573 void TrainingSampleSet::DeleteDeadSamples() {
00574   samples_.compact(
00575       NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
00576   num_raw_samples_ = samples_.size();
00577   // Samples must be re-organized now we have deleted a few.
00578 }
00579
00580 // Callback function returns true if the given sample is to be deleted, due
00581 // to having a negative classid.
00582 bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) {
00583   return sample == NULL || sample->class_id() < 0;
00584 }
00585
00586 static Pix* DebugSample(const UNICHARSET& unicharset,
00587                         TrainingSample* sample) {
00588   tprintf("\nOriginal features:\n");
00589   for (int i = 0; i < sample->num_features(); ++i) {
00590     sample->features()[i].print();
00591   }
00592   if (sample->features_are_mapped()) {
00593     tprintf("\nMapped features:\n");
00594     for (int i = 0; i < sample->mapped_features().size(); ++i) {
00595       tprintf("%d ", sample->mapped_features()[i]);
00596     }
00597     tprintf("\n");
00598   }
00599   return sample->RenderToPix(&unicharset);
00600 }
00601
00602 // Construct an array to access the samples by font,class pair.
00603 void TrainingSampleSet::OrganizeByFontAndClass() {
00604   // Font indexes are sparse, so we used a map to compact them, so we can
00605   // have an efficient 2-d array of fonts and character classes.
00606   SetupFontIdMap();
00607   int compact_font_size = font_id_map_.CompactSize();
00608   // Get a 2-d array of generic vectors.
00609   if (font_class_array_ != NULL)
00610     delete font_class_array_;
00611   FontClassInfo empty;
00612   font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
00613       compact_font_size, unicharset_size_, empty);
00614   for (int s = 0; s < samples_.size(); ++s) {
00615     int font_id = samples_[s]->font_id();
00616     int class_id = samples_[s]->class_id();
00617     if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
00618       tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
00619               font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
00620               s);
00621     }
00622     ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
00623     ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
00624     int font_index = font_id_map_.SparseToCompact(font_id);
00625     (*font_class_array_)(font_index, class_id).samples.push_back(s);
00626   }
00627   // Set the num_raw_samples member of the FontClassInfo, to set the boundary
00628   // between the raw samples and the replicated ones.
00629   for (int f = 0; f < compact_font_size; ++f) {
00630     for (int c = 0; c < unicharset_size_; ++c)
00631       (*font_class_array_)(f, c).num_raw_samples =
00632           (*font_class_array_)(f, c).samples.size();
00633   }
00634   // This is the global number of samples and also marks the boundary between
00635   // real and replicated samples.
00636   num_raw_samples_ = samples_.size();
00637 }
00638
00639 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
00640 // index for the font_class_array_.
00641 void TrainingSampleSet::SetupFontIdMap() {
00642   // Number of samples for each font_id.
00643   GenericVector<int> font_counts;
00644   for (int s = 0; s < samples_.size(); ++s) {
00645     int font_id = samples_[s]->font_id();
00646     while (font_id >= font_counts.size())
00647       font_counts.push_back(0);
00648     ++font_counts[font_id];
00649   }
00650   font_id_map_.Init(font_counts.size(), false);
00651   for (int f = 0; f < font_counts.size(); ++f) {
00652     font_id_map_.SetMap(f, font_counts[f] > 0);
00653   }
00654   font_id_map_.Setup();
00655 }
00656
00657
00658 // Finds the sample for each font, class pair that has least maximum
00659 // distance to all the other samples of the same font, class.
00660 // OrganizeByFontAndClass must have been already called.
00661 void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map,
00662                                                 bool debug) {
00663   ASSERT_HOST(font_class_array_ != NULL);
00664   IntFeatureDist f_table;
00665   if (debug) tprintf("feature table size %d\n", map.sparse_size());
00666   f_table.Init(&map);
00667   int worst_s1 = 0;
00668   int worst_s2 = 0;
00669   double global_worst_dist = 0.0;
00670   // Compute distances independently for each font and char index.
00671   int font_size = font_id_map_.CompactSize();
00672   for (int font_index = 0; font_index < font_size; ++font_index) {
00673     int font_id = font_id_map_.CompactToSparse(font_index);
00674     for (int c = 0; c < unicharset_size_; ++c) {
00675       int samples_found = 0;
00676       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00677       if (fcinfo.samples.size() == 0 ||
00678           (kTestChar >= 0 && c != kTestChar)) {
00679         fcinfo.canonical_sample = -1;
00680         fcinfo.canonical_dist = 0.0f;
00681         if (debug) tprintf("Skipping class %d\n", c);
00682         continue;
00683       }
00684       // The canonical sample will be the one with the min_max_dist, which
00685       // is the sample with the lowest maximum distance to all other samples.
00686       double min_max_dist = 2.0;
00687       // We keep track of the farthest apart pair (max_s1, max_s2) which
00688       // are max_max_dist apart, so we can see how bad the variability is.
00689       double max_max_dist = 0.0;
00690       int max_s1 = 0;
00691       int max_s2 = 0;
00692       fcinfo.canonical_sample = fcinfo.samples[0];
00693       fcinfo.canonical_dist = 0.0f;
00694       for (int i = 0; i < fcinfo.samples.size(); ++i) {
00695         int s1 = fcinfo.samples[i];
00696         const GenericVector<int>& features1 = samples_[s1]->indexed_features();
00697         f_table.Set(features1, features1.size(), true);
00698         double max_dist = 0.0;
00699         // Run the full squared-order search for similar samples. It is still
00700         // reasonably fast because f_table.FeatureDistance is fast, but we
00701         // may have to reconsider if we start playing with too many samples
00702         // of a single char/font.
00703         for (int j = 0; j < fcinfo.samples.size(); ++j) {
00704           int s2 = fcinfo.samples[j];
00705           if (samples_[s2]->class_id() != c  ||
00706               samples_[s2]->font_id() != font_id ||
00707               s2 == s1)
00708             continue;
00709           GenericVector<int> features2 = samples_[s2]->indexed_features();
00710           double dist = f_table.FeatureDistance(features2);
00711           int height = samples_[s2]->geo_feature(GeoTop) -
00712               samples_[s2]->geo_feature(GeoBottom);
00713           if (dist == 1.0 && height > 64) {
00714             // TODO(rays) rethink this when the polygonal approximation goes.
00715             // Currently it is possible for dots and other small characters
00716             // to be completely different, even within the same class.
00717             f_table.DebugFeatureDistance(features2);
00718           }
00719           if (dist > max_dist) {
00720             max_dist = dist;
00721             if (dist > max_max_dist) {
00722               max_s1 = s1;
00723               max_s2 = s2;
00724             }
00725           }
00726         }
00727         // Using Set(..., false) is far faster than re initializing, due to
00728         // the sparseness of the feature space.
00729         f_table.Set(features1, features1.size(), false);
00730         samples_[s1]->set_max_dist(max_dist);
00731         ++samples_found;
00732         if (max_dist < min_max_dist) {
00733           fcinfo.canonical_sample = s1;
00734           fcinfo.canonical_dist = max_dist;
00735         }
00736         UpdateRange(max_dist, &min_max_dist, &max_max_dist);
00737       }
00738       if (max_max_dist > global_worst_dist) {
00739         // Keep a record of the worst pair over all characters/fonts too.
00740         global_worst_dist = max_max_dist;
00741         worst_s1 = max_s1;
00742         worst_s2 = max_s2;
00743       }
00744       if (debug) {
00745         tprintf("Found %d samples of class %d=%s, font %d, "
00746                 "dist range [%g, %g], worst pair= %s, %s\n",
00747                 samples_found, c, unicharset_.debug_str(c).string(),
00748                 font_index, min_max_dist, max_max_dist,
00749                 SampleToString(*samples_[max_s1]).string(),
00750                 SampleToString(*samples_[max_s2]).string());
00751       }
00752     }
00753   }
00754   if (debug) {
00755     tprintf("Global worst dist = %g, between sample %d and %d\n",
00756             global_worst_dist, worst_s1, worst_s2);
00757     Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
00758     Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
00759     pixOr(pix1, pix1, pix2);
00760     pixWrite("worstpair.png", pix1, IFF_PNG);
00761     pixDestroy(&pix1);
00762     pixDestroy(&pix2);
00763   }
00764 }
00765
00766 // Replicates the samples to a minimum frequency defined by
00767 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
00768 // After replication, the replicated samples are perturbed slightly, but
00769 // in a predictable and repeatable way.
00770 // Use after OrganizeByFontAndClass().
00771 void TrainingSampleSet::ReplicateAndRandomizeSamples() {
00772   ASSERT_HOST(font_class_array_ != NULL);
00773   int font_size = font_id_map_.CompactSize();
00774   for (int font_index = 0; font_index < font_size; ++font_index) {
00775     for (int c = 0; c < unicharset_size_; ++c) {
00776       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00777       int sample_count = fcinfo.samples.size();
00778       int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
00779       if (sample_count > 0 && sample_count < min_samples) {
00780         int base_count = sample_count;
00781         for (int base_index = 0; sample_count < min_samples; ++sample_count) {
00782           int src_index = fcinfo.samples[base_index++];
00783           if (base_index >= base_count) base_index = 0;
00784           TrainingSample* sample = samples_[src_index]->RandomizedCopy(
00785               sample_count % kSampleRandomSize);
00786           int sample_index = samples_.size();
00787           sample->set_sample_index(sample_index);
00788           samples_.push_back(sample);
00789           fcinfo.samples.push_back(sample_index);
00790         }
00791       }
00792     }
00793   }
00794 }
00795
00796 // Caches the indexed features of the canonical samples.
00797 // ComputeCanonicalSamples must have been already called.
00798 // TODO(rays) see note on ReliablySeparable and try restricting the
00799 // canonical features to those that truly represent all samples.
00800 void TrainingSampleSet::ComputeCanonicalFeatures() {
00801   ASSERT_HOST(font_class_array_ != NULL);
00802   int font_size = font_id_map_.CompactSize();
00803   for (int font_index = 0; font_index < font_size; ++font_index) {
00804     int font_id = font_id_map_.CompactToSparse(font_index);
00805     for (int c = 0; c < unicharset_size_; ++c) {
00806       int num_samples = NumClassSamples(font_id, c, false);
00807       if (num_samples == 0)
00808         continue;
00809       const TrainingSample* sample = GetCanonicalSample(font_id, c);
00810       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00811       fcinfo.canonical_features = sample->indexed_features();
00812     }
00813   }
00814 }
00815
00816 // Computes the combined set of features used by all the samples of each
00817 // font/class combination. Use after ReplicateAndRandomizeSamples.
00818 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
00819   ASSERT_HOST(font_class_array_ != NULL);
00820   int font_size = font_id_map_.CompactSize();
00821   for (int font_index = 0; font_index < font_size; ++font_index) {
00822     int font_id = font_id_map_.CompactToSparse(font_index);
00823     for (int c = 0; c < unicharset_size_; ++c) {
00824       int num_samples = NumClassSamples(font_id, c, false);
00825       if (num_samples == 0)
00826         continue;
00827       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00828       fcinfo.cloud_features.Init(feature_space_size);
00829       for (int s = 0; s < num_samples; ++s) {
00830         const TrainingSample* sample = GetSample(font_id, c, s);
00831         const GenericVector<int>& sample_features = sample->indexed_features();
00832         for (int i = 0; i < sample_features.size(); ++i)
00833           fcinfo.cloud_features.SetBit(sample_features[i]);
00834       }
00835     }
00836   }
00837 }
00838
00839 // Adds all fonts of the given class to the shape.
00840 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
00841   for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
00842     int font_id = font_id_map_.CompactToSparse(f);
00843     shape->AddToShape(class_id, font_id);
00844   }
00845 }
00846
00847 // Display the samples with the given indexed feature that also match
00848 // the given shape.
00849 void TrainingSampleSet::DisplaySamplesWithFeature(int f_index,
00850                                                   const Shape& shape,
00851                                                   const IntFeatureSpace& space,
00852                                                   ScrollView::Color color,
00853                                                   ScrollView* window) const {
00854   for (int s = 0; s < num_raw_samples(); ++s) {
00855     const TrainingSample* sample = GetSample(s);
00856     if (shape.ContainsUnichar(sample->class_id())) {
00857       GenericVector<int> indexed_features;
00858       space.IndexAndSortFeatures(sample->features(), sample->num_features(),
00859                                  &indexed_features);
00860       for (int f = 0; f < indexed_features.size(); ++f) {
00861         if (indexed_features[f] == f_index) {
00862           sample->DisplayFeatures(color, window);
00863         }
00864       }
00865     }
00866   }
00867 }
00868
00869
00870 }  // namespace tesseract.