Tesseract  3.02
tesseract-ocr/classify/shapetable.h
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        shapetable.h
00005 // Description: Class to map a classifier shape index to unicharset
00006 //              indices and font indices.
00007 // Author:      Ray Smith
00008 // Created:     Thu Oct 28 17:46:32 PDT 2010
00009 //
00010 // (C) Copyright 2010, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
00024 #define TESSERACT_CLASSIFY_SHAPETABLE_H_
00025 
00026 #include "genericvector.h"
00027 #include "intmatcher.h"
00028 
00029 class STRING;
00030 class UNICHARSET;
00031 
00032 namespace tesseract {
00033 
00034 // Simple struct to hold a set of fonts associated with a single unichar-id.
00035 // A vector of UnicharAndFonts makes a shape.
00036 struct UnicharAndFonts {
00037   UnicharAndFonts() : unichar_id(0) {
00038   }
00039   UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
00040     font_ids.push_back(font_id);
00041   }
00042 
00043   // Writes to the given file. Returns false in case of error.
00044   bool Serialize(FILE* fp);
00045   // Reads from the given file. Returns false in case of error.
00046   // If swap is true, assumes a big/little-endian swap is needed.
00047   bool DeSerialize(bool swap, FILE* fp);
00048 
00049   // Sort function to sort a pair of UnicharAndFonts by unichar_id.
00050   static int SortByUnicharId(const void* v1, const void* v2);
00051 
00052   GenericVector<int> font_ids;
00053   int unichar_id;
00054 };
00055 
00056 // A Shape is a collection of unichar-ids and a list of fonts associated with
00057 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
00058 // a classifiable unit, and represents a group of characters or parts of
00059 // characters that have a similar or identical shape. Shapes/ShapeTables may
00060 // be organized hierarchically from identical shapes at the leaves to vaguely
00061 // similar shapes near the root.
00062 class Shape {
00063  public:
00064   Shape() : destination_index_(-1) {}
00065 
00066   // Writes to the given file. Returns false in case of error.
00067   bool Serialize(FILE* fp);
00068   // Reads from the given file. Returns false in case of error.
00069   // If swap is true, assumes a big/little-endian swap is needed.
00070   bool DeSerialize(bool swap, FILE* fp);
00071 
00072   int destination_index() const {
00073     return destination_index_;
00074   }
00075   void set_destination_index(int index) {
00076     destination_index_ = index;
00077   }
00078   int size() const {
00079     return unichars_.size();
00080   }
00081   // Returns a UnicharAndFonts entry for the given index, which must be
00082   // in the range [0, size()).
00083   const UnicharAndFonts& operator[](int index) const {
00084     return unichars_[index];
00085   }
00086   // Adds a font_id for the given unichar_id. If the unichar_id is not
00087   // in the shape, it is added.
00088   void AddToShape(int unichar_id, int font_id);
00089   // Adds everything in other to this.
00090   void AddShape(const Shape& other);
00091   // Returns true if the shape contains the given unichar_id, font_id pair.
00092   bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
00093   // Returns true if the shape contains the given unichar_id, ignoring font.
00094   bool ContainsUnichar(int unichar_id) const;
00095   // Returns true if the shape contains the given font, ignoring unichar_id.
00096   bool ContainsFont(int font_id) const;
00097   // Returns true if this is a subset (including equal) of other.
00098   bool IsSubsetOf(const Shape& other) const;
00099   // Returns true if the lists of unichar ids are the same in this and other,
00100   // ignoring fonts.
00101   // NOT const, as it will sort the unichars on demand.
00102   bool IsEqualUnichars(Shape* other);
00103 
00104  private:
00105   // Sorts the unichars_ vector by unichar.
00106   void SortUnichars();
00107 
00108   // Flag indicates that the unichars are sorted, allowing faster set
00109   // operations with another shape.
00110   bool unichars_sorted_;
00111   // If this Shape is part of a ShapeTable the destiation_index_ is the index
00112   // of some other shape in the ShapeTable with which this shape is merged.
00113   int destination_index_;
00114   // Array of unichars, each with a set of fonts. Each unichar has at most
00115   // one entry in the vector.
00116   GenericVector<UnicharAndFonts> unichars_;
00117 };
00118 
00119 // ShapeTable is a class to encapsulate the triple indirection that is
00120 // used here.
00121 // ShapeTable is a vector of shapes.
00122 // Each shape is a vector of UnicharAndFonts representing the set of unichars
00123 // that the shape represents.
00124 // Each UnicharAndFonts also lists the fonts of the unichar_id that were
00125 // mapped to the shape during training.
00126 class ShapeTable {
00127  public:
00128   ShapeTable();
00129   // The UNICHARSET reference supplied here, or in set_unicharset below must
00130   // exist for the entire life of the ShapeTable. It is used only by DebugStr.
00131   explicit ShapeTable(const UNICHARSET& unicharset);
00132 
00133   // Writes to the given file. Returns false in case of error.
00134   bool Serialize(FILE* fp) const;
00135   // Reads from the given file. Returns false in case of error.
00136   // If swap is true, assumes a big/little-endian swap is needed.
00137   bool DeSerialize(bool swap, FILE* fp);
00138 
00139   // Accessors.
00140   int NumShapes() const {
00141     return shape_table_.size();
00142   }
00143   const UNICHARSET& unicharset() const {
00144     return *unicharset_;
00145   }
00146   // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
00147   // entire life of the ShapeTable.
00148   void set_unicharset(const UNICHARSET& unicharset) {
00149     unicharset_ = &unicharset;
00150   }
00151   // Returns a string listing the classes/fonts in a shape.
00152   STRING DebugStr(int shape_id) const;
00153   // Returns a debug string summarizing the table.
00154   STRING SummaryStr() const;
00155 
00156   // Adds a new shape starting with the given unichar_id and font_id.
00157   // Returns the assigned index.
00158   int AddShape(int unichar_id, int font_id);
00159   // Adds a copy of the given shape.
00160   // Returns the assigned index.
00161   int AddShape(const Shape& other);
00162   // Removes the shape given by the shape index. All indices above are changed!
00163   void DeleteShape(int shape_id);
00164   // Adds a font_id to the given existing shape index for the given
00165   // unichar_id. If the unichar_id is not in the shape, it is added.
00166   void AddToShape(int shape_id, int unichar_id, int font_id);
00167   // Adds the given shape to the existing shape with the given index.
00168   void AddShapeToShape(int shape_id, const Shape& other);
00169   // Returns the id of the shape that contains the given unichar and font.
00170   // If not found, returns -1.
00171   // If font_id < 0, the font_id is ignored and the first shape that matches
00172   // the unichar_id is returned.
00173   int FindShape(int unichar_id, int font_id) const;
00174   // Returns the first unichar_id and font_id in the given shape.
00175   void GetFirstUnicharAndFont(int shape_id,
00176                               int* unichar_id, int* font_id) const;
00177 
00178   // Accessors for the Shape with the given shape_id.
00179   const Shape& GetShape(int shape_id) const {
00180     return *shape_table_[shape_id];
00181   }
00182   Shape* MutableShape(int shape_id) {
00183     return shape_table_[shape_id];
00184   }
00185 
00186   // Expands all the classes/fonts in the shape individually to build
00187   // a ShapeTable.
00188   int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes);
00189 
00190   // Returns true if the shapes are already merged.
00191   bool AlreadyMerged(int shape_id1, int shape_id2);
00192   // Returns true if any shape contains multiple unichars.
00193   bool AnyMultipleUnichars();
00194   // Returns the maximum number of unichars over all shapes.
00195   int MaxNumUnichars() const;
00196   // Merges shapes with a common unichar over the [start, end) interval.
00197   // Assumes single unichar per shape.
00198   void ForceFontMerges(int start, int end);
00199   // Returns the number of unichars in the master shape.
00200   int MasterUnicharCount(int shape_id) const;
00201   // Returns the sum of the font counts in the master shape.
00202   int MasterFontCount(int shape_id) const;
00203   // Returns the number of unichars that would result from merging the shapes.
00204   int MergedUnicharCount(int shape_id1, int shape_id2) const;
00205   // Merges two shape_ids, leaving shape_id2 marked as merged.
00206   void MergeShapes(int shape_id1, int shape_id2);
00207   // Appends the master shapes from other to this.
00208   // Used to create a clean ShapeTable from a merged one, or to create a
00209   // copy of a ShapeTable.
00210   void AppendMasterShapes(const ShapeTable& other);
00211   // Returns the number of master shapes remaining after merging.
00212   int NumMasterShapes() const;
00213   // Returns the destination of this shape, (if merged), taking into account
00214   // the fact that the destination may itself have been merged.
00215   // For a non-merged shape, returns the input shape_id.
00216   int MasterDestinationIndex(int shape_id) const;
00217 
00218  private:
00219   // Pointer to a provided unicharset used only by the Debugstr member.
00220   const UNICHARSET* unicharset_;
00221   // Vector of pointers to the Shapes in this ShapeTable.
00222   PointerVector<Shape> shape_table_;
00223 };
00224 
00225 }  // namespace tesseract.
00226 
00227 #endif  // TESSERACT_CLASSIFY_SHAPETABLE_H_