Tesseract  3.02
tesseract-ocr/classify/sampleiterator.h
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 //
00015 
00016 
00017 #ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
00018 #define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_
00019 
00020 namespace tesseract {
00021 
00022 class IndexMapBiDi;
00023 class IntFeatureMap;
00024 class ShapeTable;
00025 class TrainingSample;
00026 class TrainingSampleSet;
00027 class UnicharAndFonts;
00028 
00029 // Iterator class to encapsulate the complex iteration involved in getting
00030 // all samples of all shapes needed for a classification problem.
00031 //
00032 // =====INPUTS TO Init FUNCTION=====
00033 // The charset_map defines a subset of the sample_set classes (when the
00034 // shape_table is NULL), or of the shape_table classes (when it is not NULL).
00035 //
00036 // The shape_table (if not NULL) defines the mapping from shapes to
00037 // font_id/class_id pairs. Each shape is a list of unichar_id and font lists.
00038 //
00039 // The sample_set holds the samples and provides indexed access to samples
00040 // of font_id/class_id pairs.
00041 //
00042 // If randomize is true, the samples are perturbed slightly, but the
00043 // perturbation is guaranteed to be the same for multiple identical
00044 // iterations.
00045 //
00046 // =====DIFFERENT COMBINATIONS OF INPUTS=====
00047 // NULL shape_table:
00048 // Without a shape_table, everything works in UNICHAR_IDs.
00049 //
00050 // NULL shape_table, NULL charset_map:
00051 // Iterations simply run over the samples in the order the samples occur in the
00052 // input files.
00053 // GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID.
00054 //
00055 // NULL shape_table, non-NULL charset_map:
00056 // When shape_table is NULL, the charset_map indexes unichar_ids directly,
00057 // and an iteration returns all samples of all chars in the charset_map, which
00058 // is a subset of the full unicharset.
00059 // The iteration will be in groups of the same unichar_id, in the order
00060 // defined by the charset_map.
00061 // GetCompactClassID returns the charset_map index of a sample, and
00062 // GetSparseClassID returns the sample UNICHAR_ID.
00063 //
00064 // Non-NULL shape_table:
00065 // With a shape_table, samples are grouped according to the shape_table, so
00066 // multiple UNICHAR_IDs and fonts may be grouped together, and everything
00067 // works in shape_ids.
00068 //
00069 // Non-NULL shape_table, NULL charset_map:
00070 // Iterations simply run over the samples in the order of shape_id.
00071 // GetCompactClassID and GetSparseClassID both return the shape_id.
00072 // (If you want the unichar_id or font_id, the sample still has them.)
00073 //
00074 // Non-NULL shape_table, non-NULL charset_map:
00075 // When shape_table is not NULL, the charset_map indexes and subsets shapes in
00076 // the shape_table, and iterations will be in shape_table order, not
00077 // charset_map order.
00078 // GetCompactClassID returns the charset_map index of a shape, and
00079 // GetSparseClassID returns the shape_id.
00080 //
00081 // =====What is SampleIterator good for?=====
00082 // Inside a classifier training module, the SampleIterator has abstracted away
00083 // all the different modes above.
00084 // Use the following iteration to train your classifier:
00085 // for (it.Begin(); !it.AtEnd(); it.Next()) {
00086 //   const TrainingSample& sample = it.GetSample();
00087 //   int class_id = it.GetCompactClassID();
00088 // Your classifier may or may not be dealing with a shape_table, and may be
00089 // dealing with some subset of the character/shape set. It doesn't need to
00090 // know and shouldn't care. It is just learning shapes with compact class ids
00091 // in the range [0, it.CompactCharsetSize()).
00092 class SampleIterator {
00093  public:
00094   SampleIterator();
00095   ~SampleIterator();
00096 
00097   void Clear();  // NOTE(review): undocumented here; presumably resets state and frees owned_shape_table_ -- confirm in the .cpp.
00098 
00099   // See the class comment for the arguments; charset_map and shape_table may each be NULL.
00100   void Init(const IndexMapBiDi* charset_map,
00101             const ShapeTable* shape_table,
00102             bool randomize,
00103             TrainingSampleSet* sample_set);
00104 
00105   // Iterator functions designed for use with a simple for loop:
00106   // for (it.Begin(); !it.AtEnd(); it.Next()) {
00107   //   const TrainingSample& sample = it.GetSample();
00108   //   int class_id = it.GetCompactClassID();
00109   //   ...
00110   // }
00111   void Begin();
00112   bool AtEnd() const;
00113   const TrainingSample& GetSample() const;
00114   TrainingSample* MutableSample() const;  // Const method returning a non-const pointer; presumably the same sample as GetSample() -- confirm.
00115   // Returns the total index (from the original set of samples) of the current
00116   // sample.
00117   int GlobalSampleIndex() const;
00118   // Returns the index of the current sample in compact charset space, so
00119   // in a 2-class problem between x and y, the returned indices will all be
00120   // 0 or 1, and have nothing to do with the unichar_ids.
00121   // If the charset_map_ is NULL, then this is equal to GetSparseClassID().
00122   int GetCompactClassID() const;
00123   // Returns the index of the current sample in sparse charset space, so
00124   // in a 2-class problem between x and y, the returned indices will all be
00125   // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids
00126   // with a shape_table_.
00127   int GetSparseClassID() const;
00128   // Moves on to the next indexable sample. If the end is reached, leaves
00129   // the state such that AtEnd() is true.
00130   void Next();
00131 
00132   // Returns the size of the compact charset space.
00133   int CompactCharsetSize() const;
00134   // Returns the size of the sparse charset space.
00135   int SparseCharsetSize() const;
00136 
00137   const IndexMapBiDi& charset_map() const {
00138     return *charset_map_;  // NOTE(review): unchecked dereference; charset_map_ may be NULL, so only call when a map was supplied.
00139   }
00140   const ShapeTable* shape_table() const {
00141     return shape_table_;
00142   }
00143   // Sample set operations.
00144   const TrainingSampleSet* sample_set() const {
00145     return sample_set_;
00146   }
00147 
00148   // A set of functions that do something to all the samples accessed by the
00149   // iterator, as it is currently setup.
00150 
00151   // Apply the supplied feature_space/feature_map transform to all samples
00152   // accessed by this iterator.
00153   void MapSampleFeatures(const IntFeatureMap& feature_map);
00154 
00155   // Adjust the weights of all the samples to be uniform in the given charset.
00156   // Returns the number of samples in the iterator.
00157   int UniformSamples();
00158 
00159   // Normalize the weights of all the samples defined by the iterator so they
00160   // sum to 1. Returns the minimum assigned sample weight.
00161   double NormalizeSamples();
00162 
00163  private:
00164   // Helper returns the current UnicharAndFonts shape_entry.
00165   const UnicharAndFonts* GetShapeEntry() const;
00166 
00167   // Map to subset the actual charset space.
00168   const IndexMapBiDi* charset_map_;
00169   // Shape table to recombine character classes into shapes.
00170   const ShapeTable* shape_table_;
00171   // The samples to iterate over.
00172   TrainingSampleSet* sample_set_;
00173   // Flag to control randomizing the sample features.
00174   bool randomize_;
00175   // Shape table owned by this iterator, used to iterate character classes.
00176   ShapeTable* owned_shape_table_;
00177 
00178   // Top-level iteration. Shape index in sparse charset_map space.
00179   int shape_index_;
00180   int num_shapes_;
00181   // Index to the character class within a shape.
00182   int shape_char_index_;
00183   int num_shape_chars_;
00184   // Index to the font within a shape/class pair.
00185   int shape_font_index_;
00186   int num_shape_fonts_;
00187   // The lowest level iteration. sample_index_/num_samples_ counts samples
00188   // in the current shape/class/font combination.
00189   int sample_index_;
00190   int num_samples_;
00191 };
00192 
00193 }  // namespace tesseract.
00194 
00195 #endif  // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_