Tesseract  3.02
tesseract-ocr/classify/shapetable.cpp
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        shapetable.cpp
00005 // Description: Class to map a classifier shape index to unicharset
00006 //              indices and font indices.
00007 // Author:      Ray Smith
00008 // Created:     Tue Nov 02 15:31:32 PDT 2010
00009 //
00010 // (C) Copyright 2010, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #include "shapetable.h"
00024 
00025 #include "intfeaturespace.h"
00026 #include "strngs.h"
00027 #include "unicharset.h"
00028 
00029 namespace tesseract {
00030 
00031 // Writes to the given file. Returns false in case of error.
00032 bool UnicharAndFonts::Serialize(FILE* fp) {
00033   inT32 uni_id = unichar_id;
00034   if (fwrite(&uni_id, sizeof(uni_id), 1, fp) != 1) return false;
00035   if (!font_ids.Serialize(fp)) return false;
00036   return true;
00037 }
00038 // Reads from the given file. Returns false in case of error.
00039 // If swap is true, assumes a big/little-endian swap is needed.
00040 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
00041   inT32 uni_id;
00042   if (fread(&uni_id, sizeof(uni_id), 1, fp) != 1) return false;
00043   if (swap)
00044     ReverseN(&uni_id, sizeof(uni_id));
00045   unichar_id = uni_id;
00046   if (!font_ids.DeSerialize(swap, fp)) return false;
00047   return true;
00048 }
00049 
00050 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
00051 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
00052   const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
00053   const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
00054   return p1->unichar_id - p2->unichar_id;
00055 }
00056 
00057 // Writes to the given file. Returns false in case of error.
00058 bool Shape::Serialize(FILE* fp) {
00059   if (fwrite(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1)
00060     return false;
00061   if (!unichars_.SerializeClasses(fp)) return false;
00062   return true;
00063 }
00064 // Reads from the given file. Returns false in case of error.
00065 // If swap is true, assumes a big/little-endian swap is needed.
00066 bool Shape::DeSerialize(bool swap, FILE* fp) {
00067   if (fread(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1)
00068     return false;
00069   if (!unichars_.DeSerializeClasses(swap, fp)) return false;
00070   return true;
00071 }
00072 
00073 // Adds a font_id for the given unichar_id. If the unichar_id is not
00074 // in the shape, it is added.
00075 void Shape::AddToShape(int unichar_id, int font_id) {
00076   for (int c = 0; c < unichars_.size(); ++c) {
00077     if (unichars_[c].unichar_id == unichar_id) {
00078       // Found the unichar in the shape table.
00079       GenericVector<int>& font_list = unichars_[c].font_ids;
00080       for (int f = 0; f < font_list.size(); ++f) {
00081         if (font_list[f] == font_id)
00082           return;  // Font is already there.
00083       }
00084       font_list.push_back(font_id);
00085       return;
00086     }
00087   }
00088   // Unichar_id is not in shape, so add it to shape.
00089   unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
00090   unichars_sorted_ =  unichars_.size() <= 1;
00091 }
00092 
00093 // Adds everything in other to this.
00094 void Shape::AddShape(const Shape& other) {
00095   for (int c = 0; c < other.unichars_.size(); ++c) {
00096     for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
00097       AddToShape(other.unichars_[c].unichar_id,
00098                  other.unichars_[c].font_ids[f]);
00099     }
00100   }
00101   unichars_sorted_ =  unichars_.size() <= 1;
00102 }
00103 
00104 // Returns true if the shape contains the given unichar_id, font_id pair.
00105 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
00106   for (int c = 0; c < unichars_.size(); ++c) {
00107     if (unichars_[c].unichar_id == unichar_id) {
00108       // Found the unichar, so look for the font.
00109       GenericVector<int>& font_list = unichars_[c].font_ids;
00110       for (int f = 0; f < font_list.size(); ++f) {
00111         if (font_list[f] == font_id)
00112           return true;
00113       }
00114       return false;
00115     }
00116   }
00117   return false;
00118 }
00119 
00120 // Returns true if the shape contains the given unichar_id, ignoring font.
00121 bool Shape::ContainsUnichar(int unichar_id) const {
00122   for (int c = 0; c < unichars_.size(); ++c) {
00123     if (unichars_[c].unichar_id == unichar_id) {
00124       return true;
00125     }
00126   }
00127   return false;
00128 }
00129 
00130 // Returns true if the shape contains the given font, ignoring unichar_id.
00131 bool Shape::ContainsFont(int font_id) const {
00132   for (int c = 0; c < unichars_.size(); ++c) {
00133     GenericVector<int>& font_list = unichars_[c].font_ids;
00134     for (int f = 0; f < font_list.size(); ++f) {
00135       if (font_list[f] == font_id)
00136         return true;
00137     }
00138   }
00139   return false;
00140 }
00141 
00142 // Returns true if this is a subset (including equal) of other.
00143 bool Shape::IsSubsetOf(const Shape& other) const {
00144   for (int c = 0; c < unichars_.size(); ++c) {
00145     int unichar_id = unichars_[c].unichar_id;
00146     const GenericVector<int>& font_list = unichars_[c].font_ids;
00147     for (int f = 0; f < font_list.size(); ++f) {
00148       if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
00149         return false;
00150     }
00151   }
00152   return true;
00153 }
00154 
00155 // Returns true if the lists of unichar ids are the same in this and other,
00156 // ignoring fonts.
00157 // NOT const, as it will sort the unichars on demand.
00158 bool Shape::IsEqualUnichars(Shape* other) {
00159   if (unichars_.size() != other->unichars_.size()) return false;
00160   if (!unichars_sorted_) SortUnichars();
00161   if (!other->unichars_sorted_) other->SortUnichars();
00162   for (int c = 0; c < unichars_.size(); ++c) {
00163     if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
00164       return false;
00165   }
00166   return true;
00167 }
00168 
00169 // Sorts the unichars_ vector by unichar.
00170 void Shape::SortUnichars() {
00171   unichars_.sort(UnicharAndFonts::SortByUnicharId);
00172   unichars_sorted_ = true;
00173 }
00174 
00175 ShapeTable::ShapeTable() : unicharset_(NULL) {
00176 }
00177 ShapeTable::ShapeTable(const UNICHARSET& unicharset)
00178   : unicharset_(&unicharset) {
00179 }
00180 
00181 // Writes to the given file. Returns false in case of error.
00182 bool ShapeTable::Serialize(FILE* fp) const {
00183   if (!shape_table_.Serialize(fp)) return false;
00184   return true;
00185 }
00186 // Reads from the given file. Returns false in case of error.
00187 // If swap is true, assumes a big/little-endian swap is needed.
00188 bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
00189   if (!shape_table_.DeSerialize(swap, fp)) return false;
00190   return true;
00191 }
00192 
00193 // Returns a string listing the classes/fonts in a shape.
00194 STRING ShapeTable::DebugStr(int shape_id) const {
00195   if (shape_id < 0 || shape_id >= shape_table_.size())
00196     return STRING("INVALID_UNICHAR_ID");
00197   const Shape& shape = GetShape(shape_id);
00198   STRING result;
00199   result.add_str_int("Shape", shape_id);
00200   for (int c = 0; c < shape.size(); ++c) {
00201     result.add_str_int(" c_id=", shape[c].unichar_id);
00202     result += "=";
00203     result += unicharset_->id_to_unichar(shape[c].unichar_id);
00204     result.add_str_int(", ", shape[c].font_ids.size());
00205     result += " fonts =";
00206     for (int f = 0; f < shape[c].font_ids.size(); ++f) {
00207       result.add_str_int(" ", shape[c].font_ids[f]);
00208     }
00209   }
00210   return result;
00211 }
00212 
00213 // Returns a debug string summarizing the table.
00214 STRING ShapeTable::SummaryStr() const {
00215   int max_unichars = 0;
00216   int num_multi_shapes = 0;
00217   int num_master_shapes = 0;
00218   for (int s = 0; s < shape_table_.size(); ++s) {
00219     if (MasterDestinationIndex(s) != s) continue;
00220     ++num_master_shapes;
00221     int shape_size = GetShape(s).size();
00222     if (shape_size > 1)
00223       ++num_multi_shapes;
00224     if (shape_size > max_unichars)
00225       max_unichars = shape_size;
00226   }
00227   STRING result;
00228   result.add_str_int("Number of shapes = ", num_master_shapes);
00229   result.add_str_int(" max unichars = ", max_unichars);
00230   result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
00231   return result;
00232 }
00233 
00234 
00235 // Adds a new shape starting with the given unichar_id and font_id.
00236 // Returns the assigned index.
00237 int ShapeTable::AddShape(int unichar_id, int font_id) {
00238   int index = shape_table_.size();
00239   Shape* shape = new Shape;
00240   shape->AddToShape(unichar_id, font_id);
00241   shape_table_.push_back(shape);
00242   return index;
00243 }
00244 
00245 // Adds a copy of the given shape.
00246 // Returns the assigned index.
00247 int ShapeTable::AddShape(const Shape& other) {
00248   int index = shape_table_.size();
00249   Shape* shape = new Shape(other);
00250   shape_table_.push_back(shape);
00251   return index;
00252 }
00253 
00254 // Removes the shape given by the shape index.
00255 void ShapeTable::DeleteShape(int shape_id) {
00256   delete shape_table_[shape_id];
00257   shape_table_[shape_id] = NULL;
00258   shape_table_.remove(shape_id);
00259 }
00260 
00261 // Adds a font_id to the given existing shape index for the given
00262 // unichar_id. If the unichar_id is not in the shape, it is added.
00263 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
00264   Shape& shape = *shape_table_[shape_id];
00265   shape.AddToShape(unichar_id, font_id);
00266 }
00267 
00268 // Adds the given shape to the existing shape with the given index.
00269 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
00270   Shape& shape = *shape_table_[shape_id];
00271   shape.AddShape(other);
00272 }
00273 
00274 // Returns the id of the shape that contains the given unichar and font.
00275 // If not found, returns -1.
00276 // If font_id < 0, the font_id is ignored and the first shape that matches
00277 // the unichar_id is returned.
00278 int ShapeTable::FindShape(int unichar_id, int font_id) const {
00279   for (int s = 0; s < shape_table_.size(); ++s) {
00280     const Shape& shape = GetShape(s);
00281     for (int c = 0; c < shape.size(); ++c) {
00282       if (shape[c].unichar_id == unichar_id) {
00283         if (font_id < 0)
00284           return s;  // We don't care about the font.
00285         for (int f = 0; f < shape[c].font_ids.size(); ++f) {
00286           if (shape[c].font_ids[f] == font_id)
00287             return s;
00288         }
00289       }
00290     }
00291   }
00292   return -1;
00293 }
00294 
00295 // Returns the first unichar_id and font_id in the given shape.
00296 void ShapeTable::GetFirstUnicharAndFont(int shape_id,
00297                                         int* unichar_id, int* font_id) const {
00298   const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
00299   *unichar_id = unichar_and_fonts.unichar_id;
00300   *font_id = unichar_and_fonts.font_ids[0];
00301 }
00302 
00303 // Expands all the classes/fonts in the shape individually to build
00304 // a ShapeTable.
00305 int ShapeTable::BuildFromShape(const Shape& shape,
00306                                const ShapeTable& master_shapes) {
00307   int num_masters = 0;
00308   for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
00309     for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
00310       int c = shape[u_ind].unichar_id;
00311       int f = shape[u_ind].font_ids[f_ind];
00312       if (FindShape(c, f) < 0) {
00313         int shape_id = AddShape(c, f);
00314         int master_id = master_shapes.FindShape(c, f);
00315         if (master_id >= 0 && shape.size() > 1) {
00316           const Shape& master = master_shapes.GetShape(master_id);
00317           if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) {
00318             // Add everything else from the master shape.
00319             shape_table_[shape_id]->AddShape(master);
00320             ++num_masters;
00321           }
00322         }
00323       }
00324     }
00325   }
00326   return num_masters;
00327 }
00328 
00329 // Returns true if the shapes are already merged.
00330 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) {
00331   return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
00332 }
00333 
00334 // Returns true if any shape contains multiple unichars.
00335 bool ShapeTable::AnyMultipleUnichars() {
00336   int num_shapes = NumShapes();
00337   for (int s1 = 0; s1 < num_shapes; ++s1) {
00338     if (MasterDestinationIndex(s1) != s1) continue;
00339     if (GetShape(s1).size() > 1)
00340       return true;
00341   }
00342   return false;
00343 }
00344 
00345 // Returns the maximum number of unichars over all shapes.
00346 int ShapeTable::MaxNumUnichars() const {
00347   int max_num_unichars = 0;
00348   int num_shapes = NumShapes();
00349   for (int s = 0; s < num_shapes; ++s) {
00350     if (GetShape(s).size() > max_num_unichars)
00351       max_num_unichars = GetShape(s).size();
00352   }
00353   return max_num_unichars;
00354 }
00355 
00356 
00357 // Merges shapes with a common unichar over the [start, end) interval.
00358 // Assumes single unichar per shape.
00359 void ShapeTable::ForceFontMerges(int start, int end) {
00360   for (int s1 = start; s1 < end; ++s1) {
00361     if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
00362       int unichar_id = GetShape(s1)[0].unichar_id;
00363       for (int s2 = s1 + 1; s2 < end; ++s2) {
00364         if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
00365             unichar_id == GetShape(s2)[0].unichar_id) {
00366           MergeShapes(s1, s2);
00367         }
00368       }
00369     }
00370   }
00371   ShapeTable compacted(*unicharset_);
00372   compacted.AppendMasterShapes(*this);
00373   *this = compacted;
00374 }
00375 
00376 // Returns the number of unichars in the master shape.
00377 int ShapeTable::MasterUnicharCount(int shape_id) const {
00378   int master_id = MasterDestinationIndex(shape_id);
00379   return GetShape(master_id).size();
00380 }
00381 
00382 // Returns the sum of the font counts in the master shape.
00383 int ShapeTable::MasterFontCount(int shape_id) const {
00384   int master_id = MasterDestinationIndex(shape_id);
00385   const Shape& shape = GetShape(master_id);
00386   int font_count = 0;
00387   for (int c = 0; c < shape.size(); ++c) {
00388     font_count += shape[c].font_ids.size();
00389   }
00390   return font_count;
00391 }
00392 
00393 // Returns the number of unichars that would result from merging the shapes.
00394 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
00395   // Do it the easy way for now.
00396   int master_id1 = MasterDestinationIndex(shape_id1);
00397   int master_id2 = MasterDestinationIndex(shape_id2);
00398   Shape combined_shape(*shape_table_[master_id1]);
00399   combined_shape.AddShape(*shape_table_[master_id2]);
00400   return combined_shape.size();
00401 }
00402 
00403 // Merges two shape_ids, leaving shape_id2 marked as merged.
00404 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
00405   int master_id1 = MasterDestinationIndex(shape_id1);
00406   int master_id2 = MasterDestinationIndex(shape_id2);
00407   // Point master_id2 (and all merged shapes) to master_id1.
00408   shape_table_[master_id2]->set_destination_index(master_id1);
00409   // Add all the shapes of master_id2 to master_id1.
00410   shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
00411   tprintf("Merged shape %d->%d, %d->%d, now with %d unichars: %s\n",
00412           shape_id1, master_id1, shape_id2, master_id2,
00413           shape_table_[master_id1]->size(),
00414           DebugStr(master_id1).string());
00415 }
00416 
00417 // Returns the destination of this shape, (if merged), taking into account
00418 // the fact that the destination may itself have been merged.
00419 int ShapeTable::MasterDestinationIndex(int shape_id) const {
00420   int dest_id = shape_table_[shape_id]->destination_index();
00421   if (dest_id == shape_id || dest_id < 0)
00422     return shape_id;  // Is master already.
00423   int master_id = shape_table_[dest_id]->destination_index();
00424   if (master_id == dest_id || master_id < 0)
00425     return dest_id;  // Dest is the master and shape_id points to it.
00426   master_id = MasterDestinationIndex(master_id);
00427   return master_id;
00428 }
00429 
00430 // Appends the master shapes from other to this.
00431 void ShapeTable::AppendMasterShapes(const ShapeTable& other) {
00432   for (int s = 0; s < other.shape_table_.size(); ++s) {
00433     if (other.shape_table_[s]->destination_index() < 0) {
00434       AddShape(*other.shape_table_[s]);
00435     }
00436   }
00437 }
00438 
00439 // Returns the number of master shapes remaining after merging.
00440 int ShapeTable::NumMasterShapes() const {
00441   int num_shapes = 0;
00442   for (int s = 0; s < shape_table_.size(); ++s) {
00443     if (shape_table_[s]->destination_index() < 0)
00444       ++num_shapes;
00445   }
00446   return num_shapes;
00447 }
00448 
00449 
00450 }  // namespace tesseract
00451 
00452