Tesseract
3.02
|
00001 00002 // File: unicharset.h 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__ 00021 #define TESSERACT_CCUTIL_UNICHARSET_H__ 00022 00023 #include "assert.h" 00024 #include "strngs.h" 00025 #include "unichar.h" 00026 #include "unicharmap.h" 00027 #include "params.h" 00028 00029 enum StrongScriptDirection { 00030 DIR_NEUTRAL = 0, // Text contains only neutral characters. 00031 DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters. 00032 DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters. 00033 DIR_MIX = 3, // Text contains a mixture of left-to-right 00034 // and right-to-left characters. 00035 }; 00036 00037 class CHAR_FRAGMENT { 00038 public: 00039 // Minimum number of characters used for fragment representation. 00040 static const int kMinLen = 6; 00041 // Maximum number of characters used for fragment representation. 00042 static const int kMaxLen = 3 + UNICHAR_LEN + 2; 00043 // Maximum number of fragments per character. 00044 static const int kMaxChunks = 5; 00045 00046 // Setters and Getters. 00047 inline void set_all(const char *unichar, int pos, int total, bool natural) { 00048 set_unichar(unichar); 00049 set_pos(pos); 00050 set_total(total); 00051 set_natural(natural); 00052 } 00053 inline void set_unichar(const char *uch) { 00054 strncpy(this->unichar, uch, UNICHAR_LEN); 00055 this->unichar[UNICHAR_LEN] = '\0'; 00056 } 00057 inline void set_pos(int p) { this->pos = p; } 00058 inline void set_total(int t) { this->total = t; } 00059 inline const char* get_unichar() const { return this->unichar; } 00060 inline int get_pos() const { return this->pos; } 00061 inline int get_total() const { return this->total; } 00062 00063 // Returns the string that represents a fragment 00064 // with the given unichar, pos and total. 00065 static STRING to_string(const char *unichar, int pos, int total, 00066 bool natural); 00067 // Returns the string that represents this fragment. 00068 STRING to_string() const { 00069 return to_string(unichar, pos, total, natural); 00070 } 00071 00072 // Checks whether a fragment has the same unichar, 00073 // position and total as the given inputs. 00074 inline bool equals(const char *other_unichar, 00075 int other_pos, int other_total) const { 00076 return (strcmp(this->unichar, other_unichar) == 0 && 00077 this->pos == other_pos && this->total == other_total); 00078 } 00079 inline bool equals(const CHAR_FRAGMENT *other) const { 00080 return this->equals(other->get_unichar(), 00081 other->get_pos(), 00082 other->get_total()); 00083 } 00084 00085 // Checks whether a given fragment is a continuation of this fragment. 00086 // Assumes that the given fragment pointer is not NULL. 00087 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 00088 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 00089 this->total == fragment->get_total() && 00090 this->pos == fragment->get_pos() + 1); 00091 } 00092 00093 // Returns true if this fragment is a beginning fragment. 00094 inline bool is_beginning() const { return this->pos == 0; } 00095 00096 // Returns true if this fragment is an ending fragment. 00097 inline bool is_ending() const { return this->pos == this->total-1; } 00098 00099 // Returns true if the fragment was a separate component to begin with, 00100 // ie did not need chopping to be isolated, but may have been separated 00101 // out from a multi-outline blob. 00102 inline bool is_natural() const { return natural; } 00103 void set_natural(bool value) { natural = value; } 00104 00105 // Parses the string to see whether it represents a character fragment 00106 // (rather than a regular character). If so, allocates memory for a new 00107 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 00108 // information. Fragments are of the form: 00109 // |m|1|2, meaning chunk 1 of 2 of character m, or 00110 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed 00111 // to divide the parts, as they were already separate connected components. 00112 // 00113 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 00114 // instance, otherwise (if the string does not represent a fragment or it 00115 // looks like it does, but parsing it as a fragment fails) returns NULL. 00116 // 00117 // Note: The caller is responsible for deallocating memory 00118 // associated with the returned pointer. 00119 static CHAR_FRAGMENT *parse_from_string(const char *str); 00120 00121 private: 00122 char unichar[UNICHAR_LEN + 1]; 00123 // True if the fragment was a separate component to begin with, 00124 // ie did not need chopping to be isolated, but may have been separated 00125 // out from a multi-outline blob. 00126 bool natural; 00127 inT16 pos; // fragment position in the character 00128 inT16 total; // total number of fragments in the character 00129 }; 00130 00131 // The UNICHARSET class is an utility class for Tesseract that holds the 00132 // set of characters that are used by the engine. Each character is identified 00133 // by a unique number, from 0 to (size - 1). 00134 class UNICHARSET { 00135 public: 00136 // Custom list of characters and their ligature forms (UTF8) 00137 // These map to unicode values in the private use area (PUC) and are supported 00138 // by only few font families (eg. Wyld, Adobe Caslon Pro). 00139 static const char* kCustomLigatures[][2]; 00140 00141 // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h) 00142 enum Direction { 00143 U_LEFT_TO_RIGHT = 0, 00144 U_RIGHT_TO_LEFT = 1, 00145 U_EUROPEAN_NUMBER = 2, 00146 U_EUROPEAN_NUMBER_SEPARATOR = 3, 00147 U_EUROPEAN_NUMBER_TERMINATOR = 4, 00148 U_ARABIC_NUMBER = 5, 00149 U_COMMON_NUMBER_SEPARATOR = 6, 00150 U_BLOCK_SEPARATOR = 7, 00151 U_SEGMENT_SEPARATOR = 8, 00152 U_WHITE_SPACE_NEUTRAL = 9, 00153 U_OTHER_NEUTRAL = 10, 00154 U_LEFT_TO_RIGHT_EMBEDDING = 11, 00155 U_LEFT_TO_RIGHT_OVERRIDE = 12, 00156 U_RIGHT_TO_LEFT_ARABIC = 13, 00157 U_RIGHT_TO_LEFT_EMBEDDING = 14, 00158 U_RIGHT_TO_LEFT_OVERRIDE = 15, 00159 U_POP_DIRECTIONAL_FORMAT = 16, 00160 U_DIR_NON_SPACING_MARK = 17, 00161 U_BOUNDARY_NEUTRAL = 18, 00162 U_CHAR_DIRECTION_COUNT 00163 }; 00164 00165 // Create an empty UNICHARSET 00166 UNICHARSET(); 00167 00168 ~UNICHARSET(); 00169 00170 // Return the UNICHAR_ID of a given unichar representation within the 00171 // UNICHARSET. 00172 const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; 00173 00174 // Return the UNICHAR_ID of a given unichar representation within the 00175 // UNICHARSET. Only the first length characters from unichar_repr are used. 00176 const UNICHAR_ID unichar_to_id(const char* const unichar_repr, 00177 int length) const; 00178 00179 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00180 // while leaving a legal UNICHAR_ID afterwards. In other words, if there 00181 // is both a short and a long match to the string, return the length that 00182 // ensures there is a legal match after it. 00183 int step(const char* str) const; 00184 00185 // Return whether the given UTF-8 string is encodable with this UNICHARSET. 00186 // If not encodable, write the first byte offset which cannot be converted 00187 // into the second (return) argument. 00188 bool encodable_string(const char *str, int *first_bad_position) const; 00189 00190 // Return the unichar representation corresponding to the given UNICHAR_ID 00191 // within the UNICHARSET. 00192 const char* const id_to_unichar(UNICHAR_ID id) const; 00193 00194 // Return the UTF8 representation corresponding to the given UNICHAR_ID after 00195 // resolving any private encodings internal to Tesseract. This method is 00196 // preferrable to id_to_unichar for outputting text that will be visible to 00197 // external applications. 00198 const char* const id_to_unichar_ext(UNICHAR_ID id) const; 00199 00200 // Return a STRING that reformats the utf8 str into the str followed 00201 // by its hex unicodes. 00202 static STRING debug_utf8_str(const char* str); 00203 00204 // Return a STRING containing debug information on the unichar, including 00205 // the id_to_unichar, its hex unicodes and the properties. 00206 STRING debug_str(UNICHAR_ID id) const; 00207 STRING debug_str(const char * unichar_repr) const { 00208 return debug_str(unichar_to_id(unichar_repr)); 00209 } 00210 00211 // Add a unichar representation to the set. 00212 void unichar_insert(const char* const unichar_repr); 00213 00214 // Return true if the given unichar id exists within the set. 00215 // Relies on the fact that unichar ids are contiguous in the unicharset. 00216 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 00217 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used && 00218 unichar_id >= 0; 00219 } 00220 00221 // Return true if the given unichar representation exists within the set. 00222 bool contains_unichar(const char* const unichar_repr) const; 00223 bool contains_unichar(const char* const unichar_repr, int length) const; 00224 00225 // Return true if the given unichar representation corresponds to the given 00226 // UNICHAR_ID within the set. 00227 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; 00228 00229 // Delete CHAR_FRAGMENTs stored in properties of unichars array. 00230 void delete_pointers_in_unichars() { 00231 for (int i = 0; i < size_used; ++i) { 00232 if (unichars[i].properties.fragment != NULL) { 00233 delete unichars[i].properties.fragment; 00234 unichars[i].properties.fragment = NULL; 00235 } 00236 } 00237 } 00238 00239 // Clear the UNICHARSET (all the previous data is lost). 00240 void clear() { 00241 if (script_table != NULL) { 00242 for (int i = 0; i < script_table_size_used; ++i) 00243 delete[] script_table[i]; 00244 delete[] script_table; 00245 script_table = NULL; 00246 script_table_size_used = 0; 00247 } 00248 if (unichars != NULL) { 00249 delete_pointers_in_unichars(); 00250 delete[] unichars; 00251 unichars = NULL; 00252 } 00253 script_table_size_reserved = 0; 00254 size_reserved = 0; 00255 size_used = 0; 00256 ids.clear(); 00257 top_bottom_set_ = false; 00258 script_has_upper_lower_ = false; 00259 script_has_xheight_ = false; 00260 null_sid_ = 0; 00261 common_sid_ = 0; 00262 latin_sid_ = 0; 00263 cyrillic_sid_ = 0; 00264 greek_sid_ = 0; 00265 han_sid_ = 0; 00266 hiragana_sid_ = 0; 00267 katakana_sid_ = 0; 00268 } 00269 00270 // Return the size of the set (the number of different UNICHAR it holds). 00271 int size() const { 00272 return size_used; 00273 } 00274 00275 // Reserve enough memory space for the given number of UNICHARS 00276 void reserve(int unichars_number); 00277 00278 // Opens the file indicated by filename and saves unicharset to that file. 00279 // Returns true if the operation is successful. 00280 bool save_to_file(const char * const filename) const { 00281 FILE* file = fopen(filename, "w+b"); 00282 if (file == NULL) return false; 00283 bool result = save_to_file(file); 00284 fclose(file); 00285 return result; 00286 } 00287 00288 // Saves the content of the UNICHARSET to the given file. 00289 // Returns true if the operation is successful. 00290 bool save_to_file(FILE *file) const; 00291 00292 // Load a unicharset from a unicharset file that has been loaded into 00293 // the given memory buffer. 00294 // Returns true if the operation is successful. 00295 bool load_from_inmemory_file(const char* const memory, int mem_size, 00296 bool skip_fragments); 00297 // Returns true if the operation is successful. 00298 bool load_from_inmemory_file(const char* const memory, int mem_size) { 00299 return load_from_inmemory_file(memory, mem_size, false); 00300 } 00301 00302 // Opens the file indicated by filename and loads the UNICHARSET 00303 // from the given file. The previous data is lost. 00304 // Returns true if the operation is successful. 00305 bool load_from_file(const char* const filename, bool skip_fragments) { 00306 FILE* file = fopen(filename, "rb"); 00307 if (file == NULL) return false; 00308 bool result = load_from_file(file, skip_fragments); 00309 fclose(file); 00310 return result; 00311 } 00312 // returns true if the operation is successful. 00313 bool load_from_file(const char* const filename) { 00314 return load_from_file(filename, false); 00315 } 00316 00317 // Loads the UNICHARSET from the given file. The previous data is lost. 00318 // Returns true if the operation is successful. 00319 bool load_from_file(FILE *file, bool skip_fragments); 00320 bool load_from_file(FILE *file) { return load_from_file(file, false); } 00321 00322 // Sets up internal data after loading the file, based on the char 00323 // properties. Called from load_from_file, but also needs to be run 00324 // during set_unicharset_properties. 00325 void post_load_setup(); 00326 00327 // Returns true if right_to_left scripts are significant in the unicharset, 00328 // but without being so sensitive that "universal" unicharsets containing 00329 // characters from many scripts, like orientation and script detection, 00330 // look like they are right_to_left. 00331 bool major_right_to_left() const; 00332 00333 // Set a whitelist and/or blacklist of characters to recognize. 00334 // An empty or NULL whitelist enables everything (minus any blacklist). 00335 // An empty or NULL blacklist disables nothing. 00336 // The blacklist overrides the whitelist. 00337 // Each list is a string of utf8 character strings. Boundaries between 00338 // unicharset units are worked out automatically, and characters not in 00339 // the unicharset are silently ignored. 00340 void set_black_and_whitelist(const char* blacklist, const char* whitelist); 00341 00342 // Set the isalpha property of the given unichar to the given value. 00343 void set_isalpha(UNICHAR_ID unichar_id, bool value) { 00344 unichars[unichar_id].properties.isalpha = value; 00345 } 00346 00347 // Set the islower property of the given unichar to the given value. 00348 void set_islower(UNICHAR_ID unichar_id, bool value) { 00349 unichars[unichar_id].properties.islower = value; 00350 } 00351 00352 // Set the isupper property of the given unichar to the given value. 00353 void set_isupper(UNICHAR_ID unichar_id, bool value) { 00354 unichars[unichar_id].properties.isupper = value; 00355 } 00356 00357 // Set the isdigit property of the given unichar to the given value. 00358 void set_isdigit(UNICHAR_ID unichar_id, bool value) { 00359 unichars[unichar_id].properties.isdigit = value; 00360 } 00361 00362 // Set the ispunctuation property of the given unichar to the given value. 00363 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { 00364 unichars[unichar_id].properties.ispunctuation = value; 00365 } 00366 00367 // Set the isngram property of the given unichar to the given value. 00368 void set_isngram(UNICHAR_ID unichar_id, bool value) { 00369 unichars[unichar_id].properties.isngram = value; 00370 } 00371 00372 // Set the script name of the given unichar to the given value. 00373 // Value is copied and thus can be a temporary; 00374 void set_script(UNICHAR_ID unichar_id, const char* value) { 00375 unichars[unichar_id].properties.script_id = add_script(value); 00376 } 00377 00378 // Set other_case unichar id in the properties for the given unichar id. 00379 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 00380 unichars[unichar_id].properties.other_case = other_case; 00381 } 00382 00383 // Set the direction property of the given unichar to the given value. 00384 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { 00385 unichars[unichar_id].properties.direction = value; 00386 } 00387 00388 // Set mirror unichar id in the properties for the given unichar id. 00389 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { 00390 unichars[unichar_id].properties.mirror = mirror; 00391 } 00392 00393 // Record normalized version of unichar with the given unichar_id. 00394 void set_normed(UNICHAR_ID unichar_id, const char* normed) { 00395 unichars[unichar_id].properties.normed = normed; 00396 } 00397 00398 // Return the isalpha property of the given unichar. 00399 bool get_isalpha(UNICHAR_ID unichar_id) const { 00400 if (INVALID_UNICHAR_ID == unichar_id) return false; 00401 ASSERT_HOST(contains_unichar_id(unichar_id)); 00402 return unichars[unichar_id].properties.isalpha; 00403 } 00404 00405 // Return the islower property of the given unichar. 00406 bool get_islower(UNICHAR_ID unichar_id) const { 00407 if (INVALID_UNICHAR_ID == unichar_id) return false; 00408 ASSERT_HOST(contains_unichar_id(unichar_id)); 00409 return unichars[unichar_id].properties.islower; 00410 } 00411 00412 // Return the isupper property of the given unichar. 00413 bool get_isupper(UNICHAR_ID unichar_id) const { 00414 if (INVALID_UNICHAR_ID == unichar_id) return false; 00415 ASSERT_HOST(contains_unichar_id(unichar_id)); 00416 return unichars[unichar_id].properties.isupper; 00417 } 00418 00419 // Return the isdigit property of the given unichar. 00420 bool get_isdigit(UNICHAR_ID unichar_id) const { 00421 if (INVALID_UNICHAR_ID == unichar_id) return false; 00422 ASSERT_HOST(contains_unichar_id(unichar_id)); 00423 return unichars[unichar_id].properties.isdigit; 00424 } 00425 00426 // Return the ispunctuation property of the given unichar. 00427 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 00428 if (INVALID_UNICHAR_ID == unichar_id) return false; 00429 ASSERT_HOST(contains_unichar_id(unichar_id)); 00430 return unichars[unichar_id].properties.ispunctuation; 00431 } 00432 00433 // Return the isngram property of the given unichar. 00434 bool get_isngram(UNICHAR_ID unichar_id) const { 00435 if (INVALID_UNICHAR_ID == unichar_id) return false; 00436 ASSERT_HOST(contains_unichar_id(unichar_id)); 00437 return unichars[unichar_id].properties.isngram; 00438 } 00439 00440 // Returns whether the unichar id represents a unicode value in the private 00441 // use area. 00442 bool get_isprivate(UNICHAR_ID unichar_id) const; 00443 00444 // Returns true if the ids have useful min/max top/bottom values. 00445 bool top_bottom_useful() const { 00446 return top_bottom_set_; 00447 } 00448 // Sets all ranges to empty, so they can be expanded to set the values. 00449 void set_ranges_empty(); 00450 // Sets all the properties for this unicharset given a src_unicharset with 00451 // everything set. The unicharsets don't have to be the same, and graphemes 00452 // are correctly accounted for. 00453 void SetPropertiesFromOther(const UNICHARSET& src); 00454 // Expands the tops and bottoms and widths for this unicharset given a 00455 // src_unicharset with ranges in it. The unicharsets don't have to be the 00456 // same, and graphemes are correctly accounted for. 00457 void ExpandRangesFromOther(const UNICHARSET& src); 00458 // For each id in src, if it does not occur in this, add it, as in 00459 // SetPropertiesFromOther, otherwise expand the ranges, as in 00460 // ExpandRangesFromOther. 00461 void AppendOtherUnicharset(const UNICHARSET& src); 00462 // Returns the min and max bottom and top of the given unichar in 00463 // baseline-normalized coordinates, ie, where the baseline is 00464 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00465 // (See normalis.h for the definitions). 00466 void get_top_bottom(UNICHAR_ID unichar_id, 00467 int* min_bottom, int* max_bottom, 00468 int* min_top, int* max_top) const { 00469 if (INVALID_UNICHAR_ID == unichar_id) { 00470 *min_bottom = *min_top = 0; 00471 *max_bottom = *max_top = 256; // kBlnCellHeight 00472 return; 00473 } 00474 ASSERT_HOST(contains_unichar_id(unichar_id)); 00475 *min_bottom = unichars[unichar_id].properties.min_bottom; 00476 *max_bottom = unichars[unichar_id].properties.max_bottom; 00477 *min_top = unichars[unichar_id].properties.min_top; 00478 *max_top = unichars[unichar_id].properties.max_top; 00479 } 00480 void set_top_bottom(UNICHAR_ID unichar_id, 00481 int min_bottom, int max_bottom, 00482 int min_top, int max_top) { 00483 unichars[unichar_id].properties.min_bottom = 00484 static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8)); 00485 unichars[unichar_id].properties.max_bottom = 00486 static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8)); 00487 unichars[unichar_id].properties.min_top = 00488 static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8)); 00489 unichars[unichar_id].properties.max_top = 00490 static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8)); 00491 } 00492 // Returns the width range of the given unichar in baseline-normalized 00493 // coordinates, ie, where the baseline is kBlnBaselineOffset and the 00494 // meanline is kBlnBaselineOffset + kBlnXHeight. 00495 // (See normalis.h for the definitions). 00496 void get_width_range(UNICHAR_ID unichar_id, 00497 int* min_width, int* max_width) const { 00498 if (INVALID_UNICHAR_ID == unichar_id) { 00499 *min_width = 0; 00500 *max_width = 256; // kBlnCellHeight; 00501 return; 00502 } 00503 ASSERT_HOST(contains_unichar_id(unichar_id)); 00504 *min_width = unichars[unichar_id].properties.min_width; 00505 *max_width = unichars[unichar_id].properties.max_width; 00506 } 00507 void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) { 00508 unichars[unichar_id].properties.min_width = 00509 static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16)); 00510 unichars[unichar_id].properties.max_width = 00511 static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16)); 00512 } 00513 // Returns the range of the x-bearing of the given unichar in 00514 // baseline-normalized coordinates, ie, where the baseline is 00515 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight. 00516 // (See normalis.h for the definitions). 00517 void get_bearing_range(UNICHAR_ID unichar_id, 00518 int* min_bearing, int* max_bearing) const { 00519 if (INVALID_UNICHAR_ID == unichar_id) { 00520 *min_bearing = *max_bearing = 0; 00521 return; 00522 } 00523 ASSERT_HOST(contains_unichar_id(unichar_id)); 00524 *min_bearing = unichars[unichar_id].properties.min_bearing; 00525 *max_bearing = unichars[unichar_id].properties.max_bearing; 00526 } 00527 void set_bearing_range(UNICHAR_ID unichar_id, 00528 int min_bearing, int max_bearing) { 00529 unichars[unichar_id].properties.min_bearing = 00530 static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16)); 00531 unichars[unichar_id].properties.max_bearing = 00532 static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16)); 00533 } 00534 // Returns the range of the x-advance of the given unichar in 00535 // baseline-normalized coordinates, ie, where the baseline is 00536 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight. 00537 // (See normalis.h for the definitions). 00538 void get_advance_range(UNICHAR_ID unichar_id, 00539 int* min_advance, int* max_advance) const { 00540 if (INVALID_UNICHAR_ID == unichar_id) { 00541 *min_advance = *max_advance = 0; 00542 return; 00543 } 00544 ASSERT_HOST(contains_unichar_id(unichar_id)); 00545 *min_advance = unichars[unichar_id].properties.min_advance; 00546 *max_advance = unichars[unichar_id].properties.max_advance; 00547 } 00548 void set_advance_range(UNICHAR_ID unichar_id, 00549 int min_advance, int max_advance) { 00550 unichars[unichar_id].properties.min_advance = 00551 static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16)); 00552 unichars[unichar_id].properties.max_advance = 00553 static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16)); 00554 } 00555 00556 // Return the script name of the given unichar. 00557 // The returned pointer will always be the same for the same script, it's 00558 // managed by unicharset and thus MUST NOT be deleted 00559 int get_script(UNICHAR_ID unichar_id) const { 00560 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_; 00561 ASSERT_HOST(contains_unichar_id(unichar_id)); 00562 return unichars[unichar_id].properties.script_id; 00563 } 00564 00565 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00566 // as a bit field of unsigned int. 00567 unsigned int get_properties(UNICHAR_ID unichar_id) const; 00568 00569 // Return the character property as a single char. If a character has 00570 // multiple attributes, the main property is defined by the following order: 00571 // upper_case : 'A' 00572 // lower_case : 'a' 00573 // alpha : 'x' 00574 // digit : '0' 00575 // punctuation: 'p' 00576 char get_chartype(UNICHAR_ID unichar_id) const; 00577 00578 // Get other_case unichar id in the properties for the given unichar id. 00579 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 00580 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00581 ASSERT_HOST(contains_unichar_id(unichar_id)); 00582 return unichars[unichar_id].properties.other_case; 00583 } 00584 00585 // Returns the direction property of the given unichar. 00586 Direction get_direction(UNICHAR_ID unichar_id) const { 00587 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL; 00588 ASSERT_HOST(contains_unichar_id(unichar_id)); 00589 return unichars[unichar_id].properties.direction; 00590 } 00591 00592 // Get mirror unichar id in the properties for the given unichar id. 00593 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { 00594 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00595 ASSERT_HOST(contains_unichar_id(unichar_id)); 00596 return unichars[unichar_id].properties.mirror; 00597 } 00598 00599 // Returns UNICHAR_ID of the corresponding lower-case unichar. 00600 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 00601 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00602 ASSERT_HOST(contains_unichar_id(unichar_id)); 00603 if (unichars[unichar_id].properties.islower) return unichar_id; 00604 return unichars[unichar_id].properties.other_case; 00605 } 00606 00607 // Returns UNICHAR_ID of the corresponding upper-case unichar. 00608 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 00609 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00610 ASSERT_HOST(contains_unichar_id(unichar_id)); 00611 if (unichars[unichar_id].properties.isupper) return unichar_id; 00612 return unichars[unichar_id].properties.other_case; 00613 } 00614 00615 // Return a pointer to the CHAR_FRAGMENT class if the given 00616 // unichar id represents a character fragment. 00617 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 00618 if (INVALID_UNICHAR_ID == unichar_id) return NULL; 00619 ASSERT_HOST(contains_unichar_id(unichar_id)); 00620 return unichars[unichar_id].properties.fragment; 00621 } 00622 00623 // Return the isalpha property of the given unichar representation. 00624 bool get_isalpha(const char* const unichar_repr) const { 00625 return get_isalpha(unichar_to_id(unichar_repr)); 00626 } 00627 00628 // Return the islower property of the given unichar representation. 00629 bool get_islower(const char* const unichar_repr) const { 00630 return get_islower(unichar_to_id(unichar_repr)); 00631 } 00632 00633 // Return the isupper property of the given unichar representation. 00634 bool get_isupper(const char* const unichar_repr) const { 00635 return get_isupper(unichar_to_id(unichar_repr)); 00636 } 00637 00638 // Return the isdigit property of the given unichar representation. 00639 bool get_isdigit(const char* const unichar_repr) const { 00640 return get_isdigit(unichar_to_id(unichar_repr)); 00641 } 00642 00643 // Return the ispunctuation property of the given unichar representation. 00644 bool get_ispunctuation(const char* const unichar_repr) const { 00645 return get_ispunctuation(unichar_to_id(unichar_repr)); 00646 } 00647 00648 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00649 // of the given unichar representation 00650 unsigned int get_properties(const char* const unichar_repr) const { 00651 return get_properties(unichar_to_id(unichar_repr)); 00652 } 00653 00654 char get_chartype(const char* const unichar_repr) const { 00655 return get_chartype(unichar_to_id(unichar_repr)); 00656 } 00657 00658 // Return the script name of the given unichar representation. 00659 // The returned pointer will always be the same for the same script, it's 00660 // managed by unicharset and thus MUST NOT be deleted 00661 int get_script(const char* const unichar_repr) const { 00662 return get_script(unichar_to_id(unichar_repr)); 00663 } 00664 00665 // Return a pointer to the CHAR_FRAGMENT class struct if the given 00666 // unichar representation represents a character fragment. 00667 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { 00668 if (unichar_repr == NULL || unichar_repr[0] == '\0' || 00669 !ids.contains(unichar_repr)) { 00670 return NULL; 00671 } 00672 return get_fragment(unichar_to_id(unichar_repr)); 00673 } 00674 00675 // Return the isalpha property of the given unichar representation. 00676 // Only the first length characters from unichar_repr are used. 00677 bool get_isalpha(const char* const unichar_repr, 00678 int length) const { 00679 return get_isalpha(unichar_to_id(unichar_repr, length)); 00680 } 00681 00682 // Return the islower property of the given unichar representation. 00683 // Only the first length characters from unichar_repr are used. 00684 bool get_islower(const char* const unichar_repr, 00685 int length) const { 00686 return get_islower(unichar_to_id(unichar_repr, length)); 00687 } 00688 00689 // Return the isupper property of the given unichar representation. 00690 // Only the first length characters from unichar_repr are used. 00691 bool get_isupper(const char* const unichar_repr, 00692 int length) const { 00693 return get_isupper(unichar_to_id(unichar_repr, length)); 00694 } 00695 00696 // Return the isdigit property of the given unichar representation. 00697 // Only the first length characters from unichar_repr are used. 00698 bool get_isdigit(const char* const unichar_repr, 00699 int length) const { 00700 return get_isdigit(unichar_to_id(unichar_repr, length)); 00701 } 00702 00703 // Return the ispunctuation property of the given unichar representation. 00704 // Only the first length characters from unichar_repr are used. 00705 bool get_ispunctuation(const char* const unichar_repr, 00706 int length) const { 00707 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 00708 } 00709 00710 // Returns normalized version of unichar with the given unichar_id. 00711 const char *get_normed_unichar(UNICHAR_ID unichar_id) const { 00712 return unichars[unichar_id].properties.normed.string(); 00713 } 00714 00715 // Return the script name of the given unichar representation. 00716 // Only the first length characters from unichar_repr are used. 00717 // The returned pointer will always be the same for the same script, it's 00718 // managed by unicharset and thus MUST NOT be deleted 00719 int get_script(const char* const unichar_repr, 00720 int length) const { 00721 return get_script(unichar_to_id(unichar_repr, length)); 00722 } 00723 00724 // Return the (current) number of scripts in the script table 00725 int get_script_table_size() const { 00726 return script_table_size_used; 00727 } 00728 00729 // Return the script string from its id 00730 const char* get_script_from_script_id(int id) const { 00731 if (id >= script_table_size_used || id < 0) 00732 return null_script; 00733 return script_table[id]; 00734 } 00735 00736 // Returns the id from the name of the script, or 0 if script is not found. 00737 // Note that this is an expensive operation since it involves iteratively 00738 // comparing strings in the script table. To avoid dependency on STL, we 00739 // won't use a hash. Instead, the calling function can use this to lookup 00740 // and save the ID for relevant scripts for fast comparisons later. 00741 int get_script_id_from_name(const char* script_name) const; 00742 00743 // Return true if the given script is the null script 00744 bool is_null_script(const char* script) const { 00745 return script == null_script; 00746 } 00747 00748 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 00749 // then the returned pointer will be the same. 00750 // The script parameter is copied and thus can be a temporary. 00751 int add_script(const char* script); 00752 00753 // Return the enabled property of the given unichar. 00754 bool get_enabled(UNICHAR_ID unichar_id) const { 00755 return unichars[unichar_id].properties.enabled; 00756 } 00757 00758 00759 int null_sid() const { return null_sid_; } 00760 int common_sid() const { return common_sid_; } 00761 int latin_sid() const { return latin_sid_; } 00762 int cyrillic_sid() const { return cyrillic_sid_; } 00763 int greek_sid() const { return greek_sid_; } 00764 int han_sid() const { return han_sid_; } 00765 int hiragana_sid() const { return hiragana_sid_; } 00766 int katakana_sid() const { return katakana_sid_; } 00767 int default_sid() const { return default_sid_; } 00768 00769 // Returns true if the unicharset has the concept of upper/lower case. 00770 bool script_has_upper_lower() const { 00771 return script_has_upper_lower_; 00772 } 00773 // Returns true if the unicharset has the concept of x-height. 00774 // script_has_xheight can be true even if script_has_upper_lower is not, 00775 // when the script has a sufficiently predominant top line with ascenders, 00776 // such as Devanagari and Thai. 00777 bool script_has_xheight() const { 00778 return script_has_xheight_; 00779 } 00780 00781 private: 00782 00783 struct UNICHAR_PROPERTIES { 00784 UNICHAR_PROPERTIES(); 00785 // Initializes all properties to sensible default values. 00786 void Init(); 00787 // Sets all ranges wide open. Initialization default in case there are 00788 // no useful values available. 00789 void SetRangesOpen(); 00790 // Sets all ranges to empty. Used before expanding with font-based data. 00791 void SetRangesEmpty(); 00792 // Returns true if any of the top/bottom/width/bearing/advance ranges is 00793 // emtpy. 00794 bool AnyRangeEmpty() const; 00795 // Expands the ranges with the ranges from the src properties. 00796 void ExpandRangesFrom(const UNICHAR_PROPERTIES& src); 00797 // Copies the properties from src into this. 00798 void CopyFrom(const UNICHAR_PROPERTIES& src); 00799 00800 bool isalpha; 00801 bool islower; 00802 bool isupper; 00803 bool isdigit; 00804 bool ispunctuation; 00805 bool isngram; 00806 bool enabled; 00807 // Possible limits of the top and bottom of the bounding box in 00808 // baseline-normalized coordinates, ie, where the baseline is 00809 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00810 // (See normalis.h for the definitions). 00811 uinT8 min_bottom; 00812 uinT8 max_bottom; 00813 uinT8 min_top; 00814 uinT8 max_top; 00815 // Limits on the widths of bounding box, also in baseline-normalized coords. 00816 inT16 min_width; 00817 inT16 max_width; 00818 // Limits on the x-bearing and advance, also in baseline-normalized coords. 00819 inT16 min_bearing; 00820 inT16 max_bearing; 00821 inT16 min_advance; 00822 inT16 max_advance; 00823 int script_id; 00824 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 00825 Direction direction; // direction of this unichar 00826 // Mirror property is useful for reverse DAWG lookup for words in 00827 // right-to-left languages (e.g. "(word)" would be in 00828 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. 00829 // However, what we want in our DAWG is 00830 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not 00831 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. 00832 UNICHAR_ID mirror; 00833 STRING normed; // normalized version of this unichar 00834 // Contains meta information about the fragment if a unichar represents 00835 // a fragment of a character, otherwise should be set to NULL. 00836 // It is assumed that character fragments are added to the unicharset 00837 // after the corresponding 'base' characters. 00838 CHAR_FRAGMENT *fragment; 00839 }; 00840 00841 struct UNICHAR_SLOT { 00842 char representation[UNICHAR_LEN + 1]; 00843 UNICHAR_PROPERTIES properties; 00844 }; 00845 00846 // Gets the properties for a grapheme string, combining properties for 00847 // multiple characters in a meaningful way where possible. 00848 // Returns false if no valid match was found in the unicharset. 00849 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00850 // return and will need redirecting if the target unicharset is different. 00851 bool GetStrProperties(const char* utf8_str, 00852 UNICHAR_PROPERTIES* props) const; 00853 00854 // Load ourselves from a "file" where our only interface to the file is 00855 // an implementation of fgets(). This is the parsing primitive accessed by 00856 // the public routines load_from_file() and load_from_inmemory_file(). 00857 bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb, 00858 bool skip_fragments); 00859 00860 UNICHAR_SLOT* unichars; 00861 UNICHARMAP ids; 00862 int size_used; 00863 int size_reserved; 00864 char** script_table; 00865 int script_table_size_used; 00866 int script_table_size_reserved; 00867 const char* null_script; 00868 // True if the unichars have their tops/bottoms set. 00869 bool top_bottom_set_; 00870 // True if the unicharset has significant upper/lower case chars. 00871 bool script_has_upper_lower_; 00872 // True if the unicharset has a significant mean-line with significant 00873 // ascenders above that. 00874 bool script_has_xheight_; 00875 00876 // A few convenient script name-to-id mapping without using hash. 00877 // These are initialized when unicharset file is loaded. Anything 00878 // missing from this list can be looked up using get_script_id_from_name. 00879 int null_sid_; 00880 int common_sid_; 00881 int latin_sid_; 00882 int cyrillic_sid_; 00883 int greek_sid_; 00884 int han_sid_; 00885 int hiragana_sid_; 00886 int katakana_sid_; 00887 // The most frequently occurring script in the charset. 00888 int default_sid_; 00889 }; 00890 00891 #endif // TESSERACT_CCUTIL_UNICHARSET_H__