Tesseract  3.02
tesseract-ocr/classify/intfeaturedist.cpp
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        intfeaturedist.cpp
00005 // Description: Fast set-difference-based feature distance calculator.
00006 // Created:     Thu Sep 01 13:07:30 PDT 2011
00007 //
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "intfeaturedist.h"
00021 #include "intfeaturemap.h"
00022 
00023 namespace tesseract {
00024 
00025 IntFeatureDist::IntFeatureDist()
00026   : size_(0), total_feature_weight_(0.0),
00027     feature_map_(NULL), features_(NULL),
00028     features_delta_one_(NULL), features_delta_two_(NULL) {
00029 }
00030 
00031 IntFeatureDist::~IntFeatureDist() {
00032   Clear();
00033 }
00034 
00035 // Initialize the table to the given size of feature space.
00036 void IntFeatureDist::Init(const IntFeatureMap* feature_map) {
00037   size_ = feature_map->sparse_size();
00038   Clear();
00039   feature_map_ = feature_map;
00040   features_ = new bool[size_];
00041   features_delta_one_ = new bool[size_];
00042   features_delta_two_ = new bool[size_];
00043   memset(features_, false, size_ * sizeof(features_[0]));
00044   memset(features_delta_one_, false, size_ * sizeof(features_delta_one_[0]));
00045   memset(features_delta_two_, false, size_ * sizeof(features_delta_two_[0]));
00046   total_feature_weight_ = 0.0;
00047 }
00048 
00049 // Setup the map for the given indexed_features that have been indexed by
00050 // feature_map.
00051 void IntFeatureDist::Set(const GenericVector<int>& indexed_features,
00052                           int canonical_count, bool value) {
00053   total_feature_weight_ = canonical_count;
00054   for (int i = 0; i < indexed_features.size(); ++i) {
00055     int f = indexed_features[i];
00056     features_[f] = value;
00057     for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
00058       if (dir == 0) continue;
00059       int mapped_f = feature_map_->OffsetFeature(f, dir);
00060       if (mapped_f >= 0) {
00061         features_delta_one_[mapped_f] = value;
00062         for (int dir2 = -kNumOffsetMaps; dir2 <= kNumOffsetMaps; ++dir2) {
00063           if (dir2 == 0) continue;
00064           int mapped_f2 = feature_map_->OffsetFeature(mapped_f, dir2);
00065           if (mapped_f2 >= 0)
00066             features_delta_two_[mapped_f2] = value;
00067         }
00068       }
00069     }
00070   }
00071 }
00072 
00073 // Compute the distance between the given feature vector and the last
00074 // Set feature vector.
00075 double IntFeatureDist::FeatureDistance(
00076     const GenericVector<int>& features) const {
00077   int num_test_features = features.size();
00078   double denominator = total_feature_weight_ + num_test_features;
00079   double misses = denominator;
00080   for (int i = 0; i < num_test_features; ++i) {
00081     int index = features[i];
00082     double weight = 1.0;
00083     if (features_[index]) {
00084       // A perfect match.
00085       misses -= 2.0 * weight;
00086     } else if (features_delta_one_[index]) {
00087       misses -= 1.5 * weight;
00088     } else if (features_delta_two_[index]) {
00089       // A near miss.
00090       misses -= 1.0 * weight;
00091     }
00092   }
00093   return misses / denominator;
00094 }
00095 
00096 // Compute the distance between the given feature vector and the last
00097 // Set feature vector.
00098 double IntFeatureDist::DebugFeatureDistance(
00099     const GenericVector<int>& features) const {
00100   int num_test_features = features.size();
00101   double denominator = total_feature_weight_ + num_test_features;
00102   double misses = denominator;
00103   for (int i = 0; i < num_test_features; ++i) {
00104     int index = features[i];
00105     double weight = 1.0;
00106     INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(features[i]);
00107     tprintf("Testing feature weight %g:", weight);
00108     f.print();
00109     if (features_[index]) {
00110       // A perfect match.
00111       misses -= 2.0 * weight;
00112       tprintf("Perfect hit\n");
00113     } else if (features_delta_one_[index]) {
00114       misses -= 1.5 * weight;
00115       tprintf("-1 hit\n");
00116     } else if (features_delta_two_[index]) {
00117       // A near miss.
00118       misses -= 1.0 * weight;
00119       tprintf("-2 hit\n");
00120     } else {
00121       tprintf("Total miss\n");
00122     }
00123   }
00124   tprintf("Features present:");
00125   for (int i = 0; i < size_; ++i) {
00126     if (features_[i]) {
00127       INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
00128       f.print();
00129     }
00130   }
00131   tprintf("\nMinus one features:");
00132   for (int i = 0; i < size_; ++i) {
00133     if (features_delta_one_[i]) {
00134       INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
00135       f.print();
00136     }
00137   }
00138   tprintf("\nMinus two features:");
00139   for (int i = 0; i < size_; ++i) {
00140     if (features_delta_two_[i]) {
00141       INT_FEATURE_STRUCT f = feature_map_->InverseMapFeature(i);
00142       f.print();
00143     }
00144   }
00145   tprintf("\n");
00146   return misses / denominator;
00147 }
00148 
00149 // Clear all data.
00150 void IntFeatureDist::Clear() {
00151   delete [] features_;
00152   features_ = NULL;
00153   delete [] features_delta_one_;
00154   features_delta_one_ = NULL;
00155   delete [] features_delta_two_;
00156   features_delta_two_ = NULL;
00157 }
00158 
00159 }  // namespace tesseract