Tesseract
3.02
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: shapetable.h 00005 // Description: Class to map a classifier shape index to unicharset 00006 // indices and font indices. 00007 // Author: Ray Smith 00008 // Created: Thu Oct 28 17:46:32 PDT 2010 00009 // 00010 // (C) Copyright 2010, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_ 00024 #define TESSERACT_CLASSIFY_SHAPETABLE_H_ 00025 00026 #include "genericvector.h" 00027 #include "intmatcher.h" 00028 00029 class STRING; 00030 class UNICHARSET; 00031 00032 namespace tesseract { 00033 00034 // Simple struct to hold a set of fonts associated with a single unichar-id. 00035 // A vector of UnicharAndFonts makes a shape. 00036 struct UnicharAndFonts { 00037 UnicharAndFonts() : unichar_id(0) { 00038 } 00039 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) { 00040 font_ids.push_back(font_id); 00041 } 00042 00043 // Writes to the given file. Returns false in case of error. 00044 bool Serialize(FILE* fp); 00045 // Reads from the given file. Returns false in case of error. 00046 // If swap is true, assumes a big/little-endian swap is needed. 00047 bool DeSerialize(bool swap, FILE* fp); 00048 00049 // Sort function to sort a pair of UnicharAndFonts by unichar_id. 00050 static int SortByUnicharId(const void* v1, const void* v2); 00051 00052 GenericVector<int> font_ids; 00053 int unichar_id; 00054 }; 00055 00056 // A Shape is a collection of unichar-ids and a list of fonts associated with 00057 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is 00058 // a classifiable unit, and represents a group of characters or parts of 00059 // characters that have a similar or identical shape. Shapes/ShapeTables may 00060 // be organized hierarchically from identical shapes at the leaves to vaguely 00061 // similar shapes near the root. 00062 class Shape { 00063 public: 00064 Shape() : destination_index_(-1) {} 00065 00066 // Writes to the given file. Returns false in case of error. 00067 bool Serialize(FILE* fp); 00068 // Reads from the given file. Returns false in case of error. 00069 // If swap is true, assumes a big/little-endian swap is needed. 00070 bool DeSerialize(bool swap, FILE* fp); 00071 00072 int destination_index() const { 00073 return destination_index_; 00074 } 00075 void set_destination_index(int index) { 00076 destination_index_ = index; 00077 } 00078 int size() const { 00079 return unichars_.size(); 00080 } 00081 // Returns a UnicharAndFonts entry for the given index, which must be 00082 // in the range [0, size()). 00083 const UnicharAndFonts& operator[](int index) const { 00084 return unichars_[index]; 00085 } 00086 // Adds a font_id for the given unichar_id. If the unichar_id is not 00087 // in the shape, it is added. 00088 void AddToShape(int unichar_id, int font_id); 00089 // Adds everything in other to this. 00090 void AddShape(const Shape& other); 00091 // Returns true if the shape contains the given unichar_id, font_id pair. 00092 bool ContainsUnicharAndFont(int unichar_id, int font_id) const; 00093 // Returns true if the shape contains the given unichar_id, ignoring font. 00094 bool ContainsUnichar(int unichar_id) const; 00095 // Returns true if the shape contains the given font, ignoring unichar_id. 00096 bool ContainsFont(int font_id) const; 00097 // Returns true if this is a subset (including equal) of other. 00098 bool IsSubsetOf(const Shape& other) const; 00099 // Returns true if the lists of unichar ids are the same in this and other, 00100 // ignoring fonts. 00101 // NOT const, as it will sort the unichars on demand. 00102 bool IsEqualUnichars(Shape* other); 00103 00104 private: 00105 // Sorts the unichars_ vector by unichar. 00106 void SortUnichars(); 00107 00108 // Flag indicates that the unichars are sorted, allowing faster set 00109 // operations with another shape. 00110 bool unichars_sorted_; 00111 // If this Shape is part of a ShapeTable the destiation_index_ is the index 00112 // of some other shape in the ShapeTable with which this shape is merged. 00113 int destination_index_; 00114 // Array of unichars, each with a set of fonts. Each unichar has at most 00115 // one entry in the vector. 00116 GenericVector<UnicharAndFonts> unichars_; 00117 }; 00118 00119 // ShapeTable is a class to encapsulate the triple indirection that is 00120 // used here. 00121 // ShapeTable is a vector of shapes. 00122 // Each shape is a vector of UnicharAndFonts representing the set of unichars 00123 // that the shape represents. 00124 // Each UnicharAndFonts also lists the fonts of the unichar_id that were 00125 // mapped to the shape during training. 00126 class ShapeTable { 00127 public: 00128 ShapeTable(); 00129 // The UNICHARSET reference supplied here, or in set_unicharset below must 00130 // exist for the entire life of the ShapeTable. It is used only by DebugStr. 00131 explicit ShapeTable(const UNICHARSET& unicharset); 00132 00133 // Writes to the given file. Returns false in case of error. 00134 bool Serialize(FILE* fp) const; 00135 // Reads from the given file. Returns false in case of error. 00136 // If swap is true, assumes a big/little-endian swap is needed. 00137 bool DeSerialize(bool swap, FILE* fp); 00138 00139 // Accessors. 00140 int NumShapes() const { 00141 return shape_table_.size(); 00142 } 00143 const UNICHARSET& unicharset() const { 00144 return *unicharset_; 00145 } 00146 // Shapetable takes a pointer to the UNICHARSET, so it must persist for the 00147 // entire life of the ShapeTable. 00148 void set_unicharset(const UNICHARSET& unicharset) { 00149 unicharset_ = &unicharset; 00150 } 00151 // Returns a string listing the classes/fonts in a shape. 00152 STRING DebugStr(int shape_id) const; 00153 // Returns a debug string summarizing the table. 00154 STRING SummaryStr() const; 00155 00156 // Adds a new shape starting with the given unichar_id and font_id. 00157 // Returns the assigned index. 00158 int AddShape(int unichar_id, int font_id); 00159 // Adds a copy of the given shape. 00160 // Returns the assigned index. 00161 int AddShape(const Shape& other); 00162 // Removes the shape given by the shape index. All indices above are changed! 00163 void DeleteShape(int shape_id); 00164 // Adds a font_id to the given existing shape index for the given 00165 // unichar_id. If the unichar_id is not in the shape, it is added. 00166 void AddToShape(int shape_id, int unichar_id, int font_id); 00167 // Adds the given shape to the existing shape with the given index. 00168 void AddShapeToShape(int shape_id, const Shape& other); 00169 // Returns the id of the shape that contains the given unichar and font. 00170 // If not found, returns -1. 00171 // If font_id < 0, the font_id is ignored and the first shape that matches 00172 // the unichar_id is returned. 00173 int FindShape(int unichar_id, int font_id) const; 00174 // Returns the first unichar_id and font_id in the given shape. 00175 void GetFirstUnicharAndFont(int shape_id, 00176 int* unichar_id, int* font_id) const; 00177 00178 // Accessors for the Shape with the given shape_id. 00179 const Shape& GetShape(int shape_id) const { 00180 return *shape_table_[shape_id]; 00181 } 00182 Shape* MutableShape(int shape_id) { 00183 return shape_table_[shape_id]; 00184 } 00185 00186 // Expands all the classes/fonts in the shape individually to build 00187 // a ShapeTable. 00188 int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes); 00189 00190 // Returns true if the shapes are already merged. 00191 bool AlreadyMerged(int shape_id1, int shape_id2); 00192 // Returns true if any shape contains multiple unichars. 00193 bool AnyMultipleUnichars(); 00194 // Returns the maximum number of unichars over all shapes. 00195 int MaxNumUnichars() const; 00196 // Merges shapes with a common unichar over the [start, end) interval. 00197 // Assumes single unichar per shape. 00198 void ForceFontMerges(int start, int end); 00199 // Returns the number of unichars in the master shape. 00200 int MasterUnicharCount(int shape_id) const; 00201 // Returns the sum of the font counts in the master shape. 00202 int MasterFontCount(int shape_id) const; 00203 // Returns the number of unichars that would result from merging the shapes. 00204 int MergedUnicharCount(int shape_id1, int shape_id2) const; 00205 // Merges two shape_ids, leaving shape_id2 marked as merged. 00206 void MergeShapes(int shape_id1, int shape_id2); 00207 // Appends the master shapes from other to this. 00208 // Used to create a clean ShapeTable from a merged one, or to create a 00209 // copy of a ShapeTable. 00210 void AppendMasterShapes(const ShapeTable& other); 00211 // Returns the number of master shapes remaining after merging. 00212 int NumMasterShapes() const; 00213 // Returns the destination of this shape, (if merged), taking into account 00214 // the fact that the destination may itself have been merged. 00215 // For a non-merged shape, returns the input shape_id. 00216 int MasterDestinationIndex(int shape_id) const; 00217 00218 private: 00219 // Pointer to a provided unicharset used only by the Debugstr member. 00220 const UNICHARSET* unicharset_; 00221 // Vector of pointers to the Shapes in this ShapeTable. 00222 PointerVector<Shape> shape_table_; 00223 }; 00224 00225 } // namespace tesseract. 00226 00227 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_