Tesseract
3.02
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: shapetable.cpp 00005 // Description: Class to map a classifier shape index to unicharset 00006 // indices and font indices. 00007 // Author: Ray Smith 00008 // Created: Tue Nov 02 15:31:32 PDT 2010 00009 // 00010 // (C) Copyright 2010, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #include "shapetable.h" 00024 00025 #include "intfeaturespace.h" 00026 #include "strngs.h" 00027 #include "unicharset.h" 00028 00029 namespace tesseract { 00030 00031 // Writes to the given file. Returns false in case of error. 00032 bool UnicharAndFonts::Serialize(FILE* fp) { 00033 inT32 uni_id = unichar_id; 00034 if (fwrite(&uni_id, sizeof(uni_id), 1, fp) != 1) return false; 00035 if (!font_ids.Serialize(fp)) return false; 00036 return true; 00037 } 00038 // Reads from the given file. Returns false in case of error. 00039 // If swap is true, assumes a big/little-endian swap is needed. 00040 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) { 00041 inT32 uni_id; 00042 if (fread(&uni_id, sizeof(uni_id), 1, fp) != 1) return false; 00043 if (swap) 00044 ReverseN(&uni_id, sizeof(uni_id)); 00045 unichar_id = uni_id; 00046 if (!font_ids.DeSerialize(swap, fp)) return false; 00047 return true; 00048 } 00049 00050 // Sort function to sort a pair of UnicharAndFonts by unichar_id. 00051 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) { 00052 const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1); 00053 const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2); 00054 return p1->unichar_id - p2->unichar_id; 00055 } 00056 00057 // Writes to the given file. Returns false in case of error. 00058 bool Shape::Serialize(FILE* fp) { 00059 if (fwrite(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1) 00060 return false; 00061 if (!unichars_.SerializeClasses(fp)) return false; 00062 return true; 00063 } 00064 // Reads from the given file. Returns false in case of error. 00065 // If swap is true, assumes a big/little-endian swap is needed. 00066 bool Shape::DeSerialize(bool swap, FILE* fp) { 00067 if (fread(&unichars_sorted_, sizeof(unichars_sorted_), 1, fp) != 1) 00068 return false; 00069 if (!unichars_.DeSerializeClasses(swap, fp)) return false; 00070 return true; 00071 } 00072 00073 // Adds a font_id for the given unichar_id. If the unichar_id is not 00074 // in the shape, it is added. 00075 void Shape::AddToShape(int unichar_id, int font_id) { 00076 for (int c = 0; c < unichars_.size(); ++c) { 00077 if (unichars_[c].unichar_id == unichar_id) { 00078 // Found the unichar in the shape table. 00079 GenericVector<int>& font_list = unichars_[c].font_ids; 00080 for (int f = 0; f < font_list.size(); ++f) { 00081 if (font_list[f] == font_id) 00082 return; // Font is already there. 00083 } 00084 font_list.push_back(font_id); 00085 return; 00086 } 00087 } 00088 // Unichar_id is not in shape, so add it to shape. 00089 unichars_.push_back(UnicharAndFonts(unichar_id, font_id)); 00090 unichars_sorted_ = unichars_.size() <= 1; 00091 } 00092 00093 // Adds everything in other to this. 00094 void Shape::AddShape(const Shape& other) { 00095 for (int c = 0; c < other.unichars_.size(); ++c) { 00096 for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) { 00097 AddToShape(other.unichars_[c].unichar_id, 00098 other.unichars_[c].font_ids[f]); 00099 } 00100 } 00101 unichars_sorted_ = unichars_.size() <= 1; 00102 } 00103 00104 // Returns true if the shape contains the given unichar_id, font_id pair. 00105 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const { 00106 for (int c = 0; c < unichars_.size(); ++c) { 00107 if (unichars_[c].unichar_id == unichar_id) { 00108 // Found the unichar, so look for the font. 00109 GenericVector<int>& font_list = unichars_[c].font_ids; 00110 for (int f = 0; f < font_list.size(); ++f) { 00111 if (font_list[f] == font_id) 00112 return true; 00113 } 00114 return false; 00115 } 00116 } 00117 return false; 00118 } 00119 00120 // Returns true if the shape contains the given unichar_id, ignoring font. 00121 bool Shape::ContainsUnichar(int unichar_id) const { 00122 for (int c = 0; c < unichars_.size(); ++c) { 00123 if (unichars_[c].unichar_id == unichar_id) { 00124 return true; 00125 } 00126 } 00127 return false; 00128 } 00129 00130 // Returns true if the shape contains the given font, ignoring unichar_id. 00131 bool Shape::ContainsFont(int font_id) const { 00132 for (int c = 0; c < unichars_.size(); ++c) { 00133 GenericVector<int>& font_list = unichars_[c].font_ids; 00134 for (int f = 0; f < font_list.size(); ++f) { 00135 if (font_list[f] == font_id) 00136 return true; 00137 } 00138 } 00139 return false; 00140 } 00141 00142 // Returns true if this is a subset (including equal) of other. 00143 bool Shape::IsSubsetOf(const Shape& other) const { 00144 for (int c = 0; c < unichars_.size(); ++c) { 00145 int unichar_id = unichars_[c].unichar_id; 00146 const GenericVector<int>& font_list = unichars_[c].font_ids; 00147 for (int f = 0; f < font_list.size(); ++f) { 00148 if (!other.ContainsUnicharAndFont(unichar_id, font_list[f])) 00149 return false; 00150 } 00151 } 00152 return true; 00153 } 00154 00155 // Returns true if the lists of unichar ids are the same in this and other, 00156 // ignoring fonts. 00157 // NOT const, as it will sort the unichars on demand. 00158 bool Shape::IsEqualUnichars(Shape* other) { 00159 if (unichars_.size() != other->unichars_.size()) return false; 00160 if (!unichars_sorted_) SortUnichars(); 00161 if (!other->unichars_sorted_) other->SortUnichars(); 00162 for (int c = 0; c < unichars_.size(); ++c) { 00163 if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) 00164 return false; 00165 } 00166 return true; 00167 } 00168 00169 // Sorts the unichars_ vector by unichar. 00170 void Shape::SortUnichars() { 00171 unichars_.sort(UnicharAndFonts::SortByUnicharId); 00172 unichars_sorted_ = true; 00173 } 00174 00175 ShapeTable::ShapeTable() : unicharset_(NULL) { 00176 } 00177 ShapeTable::ShapeTable(const UNICHARSET& unicharset) 00178 : unicharset_(&unicharset) { 00179 } 00180 00181 // Writes to the given file. Returns false in case of error. 00182 bool ShapeTable::Serialize(FILE* fp) const { 00183 if (!shape_table_.Serialize(fp)) return false; 00184 return true; 00185 } 00186 // Reads from the given file. Returns false in case of error. 00187 // If swap is true, assumes a big/little-endian swap is needed. 00188 bool ShapeTable::DeSerialize(bool swap, FILE* fp) { 00189 if (!shape_table_.DeSerialize(swap, fp)) return false; 00190 return true; 00191 } 00192 00193 // Returns a string listing the classes/fonts in a shape. 00194 STRING ShapeTable::DebugStr(int shape_id) const { 00195 if (shape_id < 0 || shape_id >= shape_table_.size()) 00196 return STRING("INVALID_UNICHAR_ID"); 00197 const Shape& shape = GetShape(shape_id); 00198 STRING result; 00199 result.add_str_int("Shape", shape_id); 00200 for (int c = 0; c < shape.size(); ++c) { 00201 result.add_str_int(" c_id=", shape[c].unichar_id); 00202 result += "="; 00203 result += unicharset_->id_to_unichar(shape[c].unichar_id); 00204 result.add_str_int(", ", shape[c].font_ids.size()); 00205 result += " fonts ="; 00206 for (int f = 0; f < shape[c].font_ids.size(); ++f) { 00207 result.add_str_int(" ", shape[c].font_ids[f]); 00208 } 00209 } 00210 return result; 00211 } 00212 00213 // Returns a debug string summarizing the table. 00214 STRING ShapeTable::SummaryStr() const { 00215 int max_unichars = 0; 00216 int num_multi_shapes = 0; 00217 int num_master_shapes = 0; 00218 for (int s = 0; s < shape_table_.size(); ++s) { 00219 if (MasterDestinationIndex(s) != s) continue; 00220 ++num_master_shapes; 00221 int shape_size = GetShape(s).size(); 00222 if (shape_size > 1) 00223 ++num_multi_shapes; 00224 if (shape_size > max_unichars) 00225 max_unichars = shape_size; 00226 } 00227 STRING result; 00228 result.add_str_int("Number of shapes = ", num_master_shapes); 00229 result.add_str_int(" max unichars = ", max_unichars); 00230 result.add_str_int(" number with multiple unichars = ", num_multi_shapes); 00231 return result; 00232 } 00233 00234 00235 // Adds a new shape starting with the given unichar_id and font_id. 00236 // Returns the assigned index. 00237 int ShapeTable::AddShape(int unichar_id, int font_id) { 00238 int index = shape_table_.size(); 00239 Shape* shape = new Shape; 00240 shape->AddToShape(unichar_id, font_id); 00241 shape_table_.push_back(shape); 00242 return index; 00243 } 00244 00245 // Adds a copy of the given shape. 00246 // Returns the assigned index. 00247 int ShapeTable::AddShape(const Shape& other) { 00248 int index = shape_table_.size(); 00249 Shape* shape = new Shape(other); 00250 shape_table_.push_back(shape); 00251 return index; 00252 } 00253 00254 // Removes the shape given by the shape index. 00255 void ShapeTable::DeleteShape(int shape_id) { 00256 delete shape_table_[shape_id]; 00257 shape_table_[shape_id] = NULL; 00258 shape_table_.remove(shape_id); 00259 } 00260 00261 // Adds a font_id to the given existing shape index for the given 00262 // unichar_id. If the unichar_id is not in the shape, it is added. 00263 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) { 00264 Shape& shape = *shape_table_[shape_id]; 00265 shape.AddToShape(unichar_id, font_id); 00266 } 00267 00268 // Adds the given shape to the existing shape with the given index. 00269 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) { 00270 Shape& shape = *shape_table_[shape_id]; 00271 shape.AddShape(other); 00272 } 00273 00274 // Returns the id of the shape that contains the given unichar and font. 00275 // If not found, returns -1. 00276 // If font_id < 0, the font_id is ignored and the first shape that matches 00277 // the unichar_id is returned. 00278 int ShapeTable::FindShape(int unichar_id, int font_id) const { 00279 for (int s = 0; s < shape_table_.size(); ++s) { 00280 const Shape& shape = GetShape(s); 00281 for (int c = 0; c < shape.size(); ++c) { 00282 if (shape[c].unichar_id == unichar_id) { 00283 if (font_id < 0) 00284 return s; // We don't care about the font. 00285 for (int f = 0; f < shape[c].font_ids.size(); ++f) { 00286 if (shape[c].font_ids[f] == font_id) 00287 return s; 00288 } 00289 } 00290 } 00291 } 00292 return -1; 00293 } 00294 00295 // Returns the first unichar_id and font_id in the given shape. 00296 void ShapeTable::GetFirstUnicharAndFont(int shape_id, 00297 int* unichar_id, int* font_id) const { 00298 const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0]; 00299 *unichar_id = unichar_and_fonts.unichar_id; 00300 *font_id = unichar_and_fonts.font_ids[0]; 00301 } 00302 00303 // Expands all the classes/fonts in the shape individually to build 00304 // a ShapeTable. 00305 int ShapeTable::BuildFromShape(const Shape& shape, 00306 const ShapeTable& master_shapes) { 00307 int num_masters = 0; 00308 for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { 00309 for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { 00310 int c = shape[u_ind].unichar_id; 00311 int f = shape[u_ind].font_ids[f_ind]; 00312 if (FindShape(c, f) < 0) { 00313 int shape_id = AddShape(c, f); 00314 int master_id = master_shapes.FindShape(c, f); 00315 if (master_id >= 0 && shape.size() > 1) { 00316 const Shape& master = master_shapes.GetShape(master_id); 00317 if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) { 00318 // Add everything else from the master shape. 00319 shape_table_[shape_id]->AddShape(master); 00320 ++num_masters; 00321 } 00322 } 00323 } 00324 } 00325 } 00326 return num_masters; 00327 } 00328 00329 // Returns true if the shapes are already merged. 00330 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) { 00331 return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2); 00332 } 00333 00334 // Returns true if any shape contains multiple unichars. 00335 bool ShapeTable::AnyMultipleUnichars() { 00336 int num_shapes = NumShapes(); 00337 for (int s1 = 0; s1 < num_shapes; ++s1) { 00338 if (MasterDestinationIndex(s1) != s1) continue; 00339 if (GetShape(s1).size() > 1) 00340 return true; 00341 } 00342 return false; 00343 } 00344 00345 // Returns the maximum number of unichars over all shapes. 00346 int ShapeTable::MaxNumUnichars() const { 00347 int max_num_unichars = 0; 00348 int num_shapes = NumShapes(); 00349 for (int s = 0; s < num_shapes; ++s) { 00350 if (GetShape(s).size() > max_num_unichars) 00351 max_num_unichars = GetShape(s).size(); 00352 } 00353 return max_num_unichars; 00354 } 00355 00356 00357 // Merges shapes with a common unichar over the [start, end) interval. 00358 // Assumes single unichar per shape. 00359 void ShapeTable::ForceFontMerges(int start, int end) { 00360 for (int s1 = start; s1 < end; ++s1) { 00361 if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) { 00362 int unichar_id = GetShape(s1)[0].unichar_id; 00363 for (int s2 = s1 + 1; s2 < end; ++s2) { 00364 if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 && 00365 unichar_id == GetShape(s2)[0].unichar_id) { 00366 MergeShapes(s1, s2); 00367 } 00368 } 00369 } 00370 } 00371 ShapeTable compacted(*unicharset_); 00372 compacted.AppendMasterShapes(*this); 00373 *this = compacted; 00374 } 00375 00376 // Returns the number of unichars in the master shape. 00377 int ShapeTable::MasterUnicharCount(int shape_id) const { 00378 int master_id = MasterDestinationIndex(shape_id); 00379 return GetShape(master_id).size(); 00380 } 00381 00382 // Returns the sum of the font counts in the master shape. 00383 int ShapeTable::MasterFontCount(int shape_id) const { 00384 int master_id = MasterDestinationIndex(shape_id); 00385 const Shape& shape = GetShape(master_id); 00386 int font_count = 0; 00387 for (int c = 0; c < shape.size(); ++c) { 00388 font_count += shape[c].font_ids.size(); 00389 } 00390 return font_count; 00391 } 00392 00393 // Returns the number of unichars that would result from merging the shapes. 00394 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const { 00395 // Do it the easy way for now. 00396 int master_id1 = MasterDestinationIndex(shape_id1); 00397 int master_id2 = MasterDestinationIndex(shape_id2); 00398 Shape combined_shape(*shape_table_[master_id1]); 00399 combined_shape.AddShape(*shape_table_[master_id2]); 00400 return combined_shape.size(); 00401 } 00402 00403 // Merges two shape_ids, leaving shape_id2 marked as merged. 00404 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) { 00405 int master_id1 = MasterDestinationIndex(shape_id1); 00406 int master_id2 = MasterDestinationIndex(shape_id2); 00407 // Point master_id2 (and all merged shapes) to master_id1. 00408 shape_table_[master_id2]->set_destination_index(master_id1); 00409 // Add all the shapes of master_id2 to master_id1. 00410 shape_table_[master_id1]->AddShape(*shape_table_[master_id2]); 00411 tprintf("Merged shape %d->%d, %d->%d, now with %d unichars: %s\n", 00412 shape_id1, master_id1, shape_id2, master_id2, 00413 shape_table_[master_id1]->size(), 00414 DebugStr(master_id1).string()); 00415 } 00416 00417 // Returns the destination of this shape, (if merged), taking into account 00418 // the fact that the destination may itself have been merged. 00419 int ShapeTable::MasterDestinationIndex(int shape_id) const { 00420 int dest_id = shape_table_[shape_id]->destination_index(); 00421 if (dest_id == shape_id || dest_id < 0) 00422 return shape_id; // Is master already. 00423 int master_id = shape_table_[dest_id]->destination_index(); 00424 if (master_id == dest_id || master_id < 0) 00425 return dest_id; // Dest is the master and shape_id points to it. 00426 master_id = MasterDestinationIndex(master_id); 00427 return master_id; 00428 } 00429 00430 // Appends the master shapes from other to this. 00431 void ShapeTable::AppendMasterShapes(const ShapeTable& other) { 00432 for (int s = 0; s < other.shape_table_.size(); ++s) { 00433 if (other.shape_table_[s]->destination_index() < 0) { 00434 AddShape(*other.shape_table_[s]); 00435 } 00436 } 00437 } 00438 00439 // Returns the number of master shapes remaining after merging. 00440 int ShapeTable::NumMasterShapes() const { 00441 int num_shapes = 0; 00442 for (int s = 0; s < shape_table_.size(); ++s) { 00443 if (shape_table_[s]->destination_index() < 0) 00444 ++num_shapes; 00445 } 00446 return num_shapes; 00447 } 00448 00449 00450 } // namespace tesseract 00451 00452