Tesseract  3.02
tesseract-ocr/classify/intfeaturemap.h
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        intfeaturemap.h
00005 // Description: Encapsulation of IntFeatureSpace with IndexMapBiDi
00006 //              to provide a subspace mapping and fast feature lookup.
00007 // Created:     Tue Oct 26 08:58:30 PDT 2010
00008 //
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H__
00022 #define TESSERACT_CLASSIFY_INTFEATUREMAP_H__
00023 
00024 #include "intfeaturespace.h"
00025 #include "indexmapbidi.h"
00026 #include "intproto.h"
00027 
00028 namespace tesseract {
00029 
00030 class SampleIterator;
00031 
00032 // Number of positive and negative offset maps.
00033 static const int kNumOffsetMaps = 2;
00034 
00035 // Class to map a feature space defined by INT_FEATURE_STRUCT to a compact
00036 // down-sampled subspace of actually used features.
00037 // The IntFeatureMap copes with 2 stages of transformation:
00038 // The first step is down-sampling (re-quantization) and converting to a
00039 // single index value from the 3-D input:
00040 //   INT_FEATURE_STRUCT <-> index feature (via IntFeatureSpace) and
00041 // the second is a feature-space compaction to map only the feature indices
00042 // that are actually used. This saves space in classifiers that are built
00043 // using the mapped feature space.
00044 //   index (sparse) feature <-> map (compact) feature via IndexMapBiDi.
00045 // Although the transformations are reversible, the inverses are lossy and do
00046 // not return the exact input INT_FEATURE_STRUCT, due to the many->one nature
00047 // of both transformations.
00048 class IntFeatureMap {
00049  public:
00050   IntFeatureMap();
00051   ~IntFeatureMap();
00052 
00053   // Accessors.
00054   int sparse_size() const {
00055     return feature_space_.Size();
00056   }
00057   int compact_size() const {
00058     return compact_size_;
00059   }
00060   const IntFeatureSpace& feature_space() const {
00061     return feature_space_;
00062   }
00063   const IndexMapBiDi& feature_map() const {
00064     return feature_map_;
00065   }
00066 
00067   // Pseudo-accessors.
00068   int IndexFeature(const INT_FEATURE_STRUCT& f) const;
00069   int MapFeature(const INT_FEATURE_STRUCT& f) const;
00070   int MapIndexFeature(int index_feature) const;
00071   INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const;
00072   INT_FEATURE_STRUCT InverseMapFeature(int map_feature) const;
00073   void DeleteMapFeature(int map_feature);
00074   bool IsMapFeatureDeleted(int map_feature) const;
00075 
00076   // Copies the given feature_space and uses it as the index feature map
00077   // from INT_FEATURE_STRUCT.
00078   void Init(const IntFeatureSpace& feature_space);
00079 
00080   // Helper to return an offset index feature. In this context an offset
00081   // feature with a dir of +/-1 is a feature of a similar direction,
00082   // but shifted perpendicular to the direction of the feature. An offset
00083   // feature with a dir of +/-2 is feature at the same position, but rotated
00084   // by +/- one [compact] quantum. Returns the index of the generated offset
00085   // feature, or -1 if it doesn't exist. Dir should be in
00086   // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
00087   // A dir of 0 is an identity transformation.
00088   // Both input and output are from the index(sparse) feature space, not
00089   // the mapped/compact feature space, but the offset feature is the minimum
00090   // distance moved from the input to guarantee that it maps to the next
00091   // available quantum in the mapped/compact space.
00092   int OffsetFeature(int index_feature, int dir) const;
00093 
00094   // Computes the features used by the subset of samples defined by
00095   // the iterator and sets up the feature mapping.
00096   // Returns the size of the compacted feature space.
00097   int FindNZFeatureMapping(SampleIterator* it);
00098 
00099   // After deleting some features, finish setting up the mapping, and map
00100   // all the samples. Returns the size of the compacted feature space.
00101   int FinalizeMapping(SampleIterator* it);
00102 
00103   // Indexes the given array of features to a vector of sorted indices.
00104   void IndexAndSortFeatures(const INT_FEATURE_STRUCT* features,
00105                             int num_features,
00106                             GenericVector<int>* sorted_features) const {
00107     feature_space_.IndexAndSortFeatures(features, num_features,
00108                                         sorted_features);
00109   }
00110   // Maps the given array of index/sparse features to an array of map/compact
00111   // features.
00112   // Assumes the input is sorted. The output indices are sorted and uniqued.
00113   // Returns the number of "missed" features, being features that
00114   // don't map to the compact feature space.
00115   int MapIndexedFeatures(const GenericVector<int>& index_features,
00116                          GenericVector<int>* map_features) const {
00117     return feature_map_.MapFeatures(index_features, map_features);
00118   }
00119 
00120   // Prints the map features from the set in human-readable form.
00121   void DebugMapFeatures(const GenericVector<int>& map_features) const;
00122 
00123  private:
00124   void Clear();
00125 
00126   // Helper to compute an offset index feature. In this context an offset
00127   // feature with a dir of +/-1 is a feature of a similar direction,
00128   // but shifted perpendicular to the direction of the feature. An offset
00129   // feature with a dir of +/-2 is feature at the same position, but rotated
00130   // by +/- one [compact] quantum. Returns the index of the generated offset
00131   // feature, or -1 if it doesn't exist. Dir should be in
00132   // [-kNumOffsetMaps, kNumOffsetMaps] to indicate the relative direction.
00133   // A dir of 0 is an identity transformation.
00134   // Both input and output are from the index(sparse) feature space, not
00135   // the mapped/compact feature space, but the offset feature is the minimum
00136   // distance moved from the input to guarantee that it maps to the next
00137   // available quantum in the mapped/compact space.
00138   int ComputeOffsetFeature(int index_feature, int dir) const;
00139 
00140   // True if the mapping has changed since it was last finalized.
00141   bool mapping_changed_;
00142   // Size of the compacted feature space, after unused features are removed.
00143   int compact_size_;
00144   // Feature space quantization definition and indexing from INT_FEATURE_STRUCT.
00145   IntFeatureSpace feature_space_;
00146   // Mapping from indexed feature space to the compacted space with unused
00147   // features mapping to -1.
00148   IndexMapBiDi feature_map_;
00149   // Index tables to map a feature index to the corresponding feature after a
00150   // shift perpendicular to the feature direction, or a rotation in place.
00151   // An entry of -1 indicates that there is no corresponding feature.
00152   // Array of arrays of size feature_space_.Size() owned by this class.
00153   int* offset_plus_[kNumOffsetMaps];
00154   int* offset_minus_[kNumOffsetMaps];
00155 
00156   // Don't use default copy and assign!
00157   IntFeatureMap(const IntFeatureMap&);
00158   void operator=(const IntFeatureMap&);
00159 };
00160 
00161 }  // namespace tesseract.
00162 
00163 #endif  // TESSERACT_CLASSIFY_INTFEATUREMAP_H__