Tesseract  3.02
tesseract-ocr/ccutil/indexmapbidi.h
Go to the documentation of this file.
00001 
00002 // File:        indexmapbidi.h
00003 // Description: Bi-directional mapping between a sparse and compact space.
00004 // Author:      rays@google.com (Ray Smith)
00005 // Created:     Tue Apr 06 11:33:59 PDT 2010
00006 //
00007 // (C) Copyright 2010, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_INDEXMAPBIDI_H_
00021 #define TESSERACT_CCUTIL_INDEXMAPBIDI_H_
00022 
00023 #include <stdio.h>
00024 #include "genericvector.h"
00025 
00026 namespace tesseract {
00027 
00028 class IndexMapBiDi;
00029 
00030 // Bidirectional one-to-one mapping between a sparse and a compact discrete
00031 // space. Many entries in the sparse space are unmapped, but those that are
00032 // mapped have a 1-1 mapping to (and from) the compact space, where all
00033 // values are used. This is useful for forming subsets of larger collections,
00034 // such as subsets of character sets, or subsets of binary feature spaces.
00035 //
00036 // This base class provides basic functionality with binary search for the
00037 // SparseToCompact mapping to save memory.
00038 // For a faster inverse mapping, or to allow a many-to-one mapping, use
00039 // IndexMapBiDi below.
00040 // NOTE: there are currently no methods to setup an IndexMap on its own!
00041 // It must be initialized by copying from an IndexMapBiDi or by DeSerialize.
00042 class IndexMap {
00043  public:
00044   virtual ~IndexMap() {}
00045 
00046   // SparseToCompact takes a sparse index to an index in the compact space.
00047   // Uses a binary search to find the result. For faster speed use
00048   // IndexMapBiDi, but that takes more memory.
00049   virtual int SparseToCompact(int sparse_index) const;
00050 
00051   // CompactToSparse takes a compact index to the corresponding index in the
00052   // sparse space.
00053   int CompactToSparse(int compact_index) const {
00054     return compact_map_[compact_index];
00055   }
00056   // The size of the sparse space.
00057   virtual int SparseSize() const {
00058     return sparse_size_;
00059   }
00060   // The size of the compact space.
00061   int CompactSize() const {
00062     return compact_map_.size();
00063   }
00064 
00065   // Copy from the input.
00066   void CopyFrom(const IndexMap& src);
00067   void CopyFrom(const IndexMapBiDi& src);
00068 
00069   // Writes to the given file. Returns false in case of error.
00070   bool Serialize(FILE* fp) const;
00071   // Reads from the given file. Returns false in case of error.
00072   // If swap is true, assumes a big/little-endian swap is needed.
00073   bool DeSerialize(bool swap, FILE* fp);
00074 
00075  protected:
00076   // The sparse space covers integers in the range [0, sparse_size_-1].
00077   int sparse_size_;
00078   // The compact space covers integers in the range [0, compact_map_.size()-1].
00079   // Each element contains the corresponding sparse index.
00080   GenericVector<inT32> compact_map_;
00081 };
00082 
00083 // Bidirectional many-to-one mapping between a sparse and a compact discrete
00084 // space. As with IndexMap, many entries may be unmapped, but unlike IndexMap,
00085 // of those that are, many may be mapped to the same compact index.
00086 // If the map is many-to-one, it is not possible to directly obtain all the
00087 // sparse indices that map to a single compact index.
00088 // This map is time- rather than space-efficient. It stores the entire sparse
00089 // space.
00090 // IndexMapBiDi may be initialized in one of 3 ways:
00091 // 1. Init(size, true);
00092 //    Setup();
00093 //    Sets a complete 1:1 mapping with no unmapped elements.
00094 // 2. Init(size, false);
00095 //    for ... SetMap(index, true);
00096 //    Setup();
00097 //    Specifies precisely which sparse indices are mapped. The mapping is 1:1.
00098 // 3. Either of the above, followed by:
00099 //    for ... Merge(index1, index2);
00100 //    CompleteMerges();
00101 //    Allows a many-to-one mapping by merging compact space indices.
00102 class IndexMapBiDi : public IndexMap {
00103  public:
00104   virtual ~IndexMapBiDi() {}
00105 
00106   // Top-level init function in a single call to initialize a map to select
00107   // a single contiguous subrange [start, end) of the sparse space to be mapped
00108   // 1 to 1 to the compact space, with all other elements of the sparse space
00109   // left unmapped.
00110   // No need to call Setup after this.
00111   void InitAndSetupRange(int sparse_size, int start, int end);
00112 
00113   // Initializes just the sparse_map_ to the given size with either all
00114   // forward indices mapped (all_mapped = true) or none (all_mapped = false).
00115   // Call Setup immediately after, or make calls to SetMap first to adjust the
00116   // mapping and then call Setup before using the map.
00117   void Init(int size, bool all_mapped);
00118   // Sets a given index in the sparse_map_ to be mapped or not.
00119   void SetMap(int sparse_index, bool mapped);
00120   // Sets up the sparse_map_ and compact_map_ properly after Init and
00121   // some calls to SetMap. Assumes an ordered 1-1 map from set indices
00122   // in the sparse space to the compact space.
00123   void Setup();
00124 
00125   // Merges the two compact space indices. May be called many times, but
00126   // the merges must be concluded by a call to CompleteMerges.
00127   // Returns true if a merge was actually performed.
00128   bool Merge(int compact_index1, int compact_index2);
00129   // Returns true if the given compact index has been deleted.
00130   bool IsCompactDeleted(int index) const {
00131     return MasterCompactIndex(index) < 0;
00132   }
00133   // Completes one or more Merge operations by further compacting the
00134   // compact space.
00135   void CompleteMerges();
00136 
00137   // SparseToCompact takes a sparse index to an index in the compact space.
00138   virtual int SparseToCompact(int sparse_index) const {
00139     return sparse_map_[sparse_index];
00140   }
00141   // The size of the sparse space.
00142   virtual int SparseSize() const {
00143     return sparse_map_.size();
00144   }
00145 
00146   // Copy from the input.
00147   void CopyFrom(const IndexMapBiDi& src);
00148 
00149   // Writes to the given file. Returns false in case of error.
00150   bool Serialize(FILE* fp) const;
00151   // Reads from the given file. Returns false in case of error.
00152   // If swap is true, assumes a big/little-endian swap is needed.
00153   bool DeSerialize(bool swap, FILE* fp);
00154 
00155   // Bulk calls to SparseToCompact.
00156   // Maps the given array of sparse indices to an array of compact indices.
00157   // Assumes the input is sorted. The output indices are sorted and uniqued.
00158   // Return value is the number of "missed" features, being features that
00159   // don't map to the compact feature space.
00160   int MapFeatures(const GenericVector<int>& sparse,
00161                   GenericVector<int>* compact) const;
00162 
00163  private:
00164   // Returns the master compact index for a given compact index.
00165   // During a multiple merge operation, several compact indices may be
00166   // combined, so we need to be able to find the master of all.
00167   int MasterCompactIndex(int compact_index) const {
00168     while (compact_index >= 0 &&
00169            sparse_map_[compact_map_[compact_index]] != compact_index)
00170       compact_index = sparse_map_[compact_map_[compact_index]];
00171     return compact_index;
00172   }
00173 
00174   // Direct look-up of the compact index for each element in sparse space.
00175   GenericVector<inT32> sparse_map_;
00176 };
00177 
00178 }  // namespace tesseract.
00179 
00180 #endif  // TESSERACT_CCUTIL_INDEXMAPBIDI_H_