Tesseract
3.02
|
00001 00002 // File: unicharset.cpp 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include <assert.h> 00021 #include <stdio.h> 00022 #include <string.h> 00023 00024 #include "tesscallback.h" 00025 #include "tprintf.h" 00026 #include "unichar.h" 00027 #include "unicharset.h" 00028 #include "params.h" 00029 00030 // Special character used in representing character fragments. 00031 static const char kSeparator = '|'; 00032 // Special character used in representing 'natural' character fragments. 00033 static const char kNaturalFlag = 'n'; 00034 00035 static const int ISALPHA_MASK = 0x1; 00036 static const int ISLOWER_MASK = 0x2; 00037 static const int ISUPPER_MASK = 0x4; 00038 static const int ISDIGIT_MASK = 0x8; 00039 static const int ISPUNCTUATION_MASK = 0x10; 00040 00041 // Y coordinate threshold for determining cap-height vs x-height. 00042 // TODO(rays) Bring the global definition down to the ccutil library level, 00043 // so this constant is relative to some other constants. 00044 static const int kMeanlineThreshold = 220; 00045 // Let C be the number of alpha chars for which all tops exceed 00046 // kMeanlineThreshold, and X the number of alpha chars for which all 00047 // tops are below kMeanlineThreshold, then if X > C * 00048 // kMinXHeightFraction and C > X * kMinCapHeightFraction or more than 00049 // half the alpha characters have upper or lower case, then the 00050 // unicharset "has x-height". 00051 const double kMinXHeightFraction = 0.25; 00052 const double kMinCapHeightFraction = 0.05; 00053 00054 /*static */ 00055 const char* UNICHARSET::kCustomLigatures[][2] = { 00056 {"ct", "\uE003"}, // c + t -> U+E003 00057 {"ſh", "\uE006"}, // long-s + h -> U+E006 00058 {"ſi", "\uE007"}, // long-s + i -> U+E007 00059 {"ſl", "\uE008"}, // long-s + l -> U+E008 00060 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009 00061 {NULL, NULL} 00062 }; 00063 00064 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() { 00065 Init(); 00066 } 00067 00068 // Initialize all properties to sensible default values. 00069 void UNICHARSET::UNICHAR_PROPERTIES::Init() { 00070 isalpha = false; 00071 islower = false; 00072 isupper = false; 00073 isdigit = false; 00074 ispunctuation = false; 00075 isngram = false; 00076 enabled = false; 00077 SetRangesOpen(); 00078 script_id = 0; 00079 other_case = 0; 00080 mirror = 0; 00081 normed = ""; 00082 direction = UNICHARSET::U_LEFT_TO_RIGHT; 00083 fragment = NULL; 00084 } 00085 00086 // Sets all ranges wide open. Initialization default in case there are 00087 // no useful values available. 00088 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() { 00089 min_bottom = 0; 00090 max_bottom = MAX_UINT8; 00091 min_top = 0; 00092 max_top = MAX_UINT8; 00093 min_width = 0; 00094 max_width = MAX_INT16; 00095 min_bearing = 0; 00096 max_bearing = MAX_INT16; 00097 min_advance = 0; 00098 max_advance = MAX_INT16; 00099 } 00100 00101 // Sets all ranges to empty. Used before expanding with font-based data. 00102 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() { 00103 min_bottom = MAX_UINT8; 00104 max_bottom = 0; 00105 min_top = MAX_UINT8; 00106 max_top = 0; 00107 min_width = MAX_INT16; 00108 max_width = 0; 00109 min_bearing = MAX_INT16; 00110 max_bearing = 0; 00111 min_advance = MAX_INT16; 00112 max_advance = 0; 00113 } 00114 00115 // Returns true if any of the top/bottom/width/bearing/advance ranges is 00116 // emtpy. 00117 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const { 00118 return min_bottom > max_bottom || min_top > max_top || 00119 min_width > max_width || min_bearing > max_bearing || 00120 min_advance > max_advance; 00121 } 00122 00123 // Expands the ranges with the ranges from the src properties. 00124 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom( 00125 const UNICHAR_PROPERTIES& src) { 00126 UpdateRange(src.min_bottom, &min_bottom, &max_bottom); 00127 UpdateRange(src.max_bottom, &min_bottom, &max_bottom); 00128 UpdateRange(src.min_top, &min_top, &max_top); 00129 UpdateRange(src.max_top, &min_top, &max_top); 00130 UpdateRange(src.min_width, &min_width, &max_width); 00131 UpdateRange(src.max_width, &min_width, &max_width); 00132 UpdateRange(src.min_bearing, &min_bearing, &max_bearing); 00133 UpdateRange(src.max_bearing, &min_bearing, &max_bearing); 00134 UpdateRange(src.min_advance, &min_advance, &max_advance); 00135 UpdateRange(src.max_advance, &min_advance, &max_advance); 00136 } 00137 00138 // Copies the properties from src into this. 00139 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) { 00140 // Apart from the fragment, everything else can be done with a default copy. 00141 CHAR_FRAGMENT* saved_fragment = fragment; 00142 *this = src; // Bitwise copy. 00143 fragment = saved_fragment; 00144 } 00145 00146 UNICHARSET::UNICHARSET() : 00147 unichars(NULL), 00148 ids(), 00149 size_used(0), 00150 size_reserved(0), 00151 script_table(NULL), 00152 script_table_size_used(0), 00153 null_script("NULL") { 00154 clear(); 00155 } 00156 00157 UNICHARSET::~UNICHARSET() { 00158 clear(); 00159 } 00160 00161 void UNICHARSET::reserve(int unichars_number) { 00162 if (unichars_number > size_reserved) { 00163 UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number]; 00164 for (int i = 0; i < size_used; ++i) 00165 unichars_new[i] = unichars[i]; 00166 for (int j = size_used; j < unichars_number; ++j) { 00167 unichars_new[j].properties.script_id = add_script(null_script); 00168 } 00169 delete[] unichars; 00170 unichars = unichars_new; 00171 size_reserved = unichars_number; 00172 } 00173 } 00174 00175 const UNICHAR_ID 00176 UNICHARSET::unichar_to_id(const char* const unichar_repr) const { 00177 return ids.contains(unichar_repr) ? 00178 ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID; 00179 } 00180 00181 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr, 00182 int length) const { 00183 assert(length > 0 && length <= UNICHAR_LEN); 00184 return ids.contains(unichar_repr, length) ? 00185 ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID; 00186 } 00187 00188 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00189 // while leaving a legal UNICHAR_ID afterwards. In other words, if there 00190 // is both a short and a long match to the string, return the length that 00191 // ensures there is a legal match after it. 00192 int UNICHARSET::step(const char* str) const { 00193 // Find the length of the first matching unicharset member. 00194 int minlength = ids.minmatch(str); 00195 if (minlength == 0) 00196 return 0; // Empty string or illegal char. 00197 00198 int goodlength = minlength; 00199 while (goodlength <= UNICHAR_LEN) { 00200 if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0) 00201 return goodlength; // This length works! 00202 00203 // The next char is illegal so find the next usable length. 00204 do { 00205 ++goodlength; 00206 } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN && 00207 !ids.contains(str, goodlength)); 00208 if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) { 00209 // This does not constitute a good length! 00210 return minlength; 00211 } 00212 } 00213 // Search to find a subsequent legal char failed so return the minlength. 00214 return minlength; 00215 } 00216 00217 // Return whether the given UTF-8 string is encodable with this UNICHARSET. 00218 // If not encodable, write the first byte offset which cannot be converted 00219 // into the second (return) argument. 00220 bool UNICHARSET::encodable_string(const char *str, 00221 int *first_bad_position) const { 00222 for (int i = 0, len = strlen(str); i < len; ) { 00223 int increment = step(str + i); 00224 if (increment == 0) { 00225 if (first_bad_position) *first_bad_position = i; 00226 return false; 00227 } 00228 i += increment; 00229 } 00230 return true; 00231 } 00232 00233 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const { 00234 if (id == INVALID_UNICHAR_ID) { 00235 return INVALID_UNICHAR; 00236 } 00237 ASSERT_HOST(id < this->size()); 00238 return unichars[id].representation; 00239 } 00240 00241 const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const { 00242 if (id == INVALID_UNICHAR_ID) { 00243 return INVALID_UNICHAR; 00244 } 00245 ASSERT_HOST(id < this->size()); 00246 // Resolve from the kCustomLigatures table if this is a private encoding. 00247 if (get_isprivate(id)) { 00248 const char* ch = id_to_unichar(id); 00249 for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) { 00250 if (!strcmp(ch, kCustomLigatures[i][1])) { 00251 return kCustomLigatures[i][0]; 00252 } 00253 } 00254 } 00255 // Otherwise return the stored representation. 00256 return unichars[id].representation; 00257 } 00258 00259 // Return a STRING that reformats the utf8 str into the str followed 00260 // by its hex unicodes. 00261 STRING UNICHARSET::debug_utf8_str(const char* str) { 00262 STRING result = str; 00263 result += " ["; 00264 int step = 1; 00265 // Chop into unicodes and code each as hex. 00266 for (int i = 0; str[i] != '\0'; i += step) { 00267 char hex[sizeof(int) * 2 + 1]; 00268 step = UNICHAR::utf8_step(str + i); 00269 if (step == 0) { 00270 step = 1; 00271 sprintf(hex, "%x", str[i]); 00272 } else { 00273 UNICHAR ch(str + i, step); 00274 sprintf(hex, "%x", ch.first_uni()); 00275 } 00276 result += hex; 00277 result += " "; 00278 } 00279 result += "]"; 00280 return result; 00281 } 00282 00283 // Return a STRING containing debug information on the unichar, including 00284 // the id_to_unichar, its hex unicodes and the properties. 00285 STRING UNICHARSET::debug_str(UNICHAR_ID id) const { 00286 if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id)); 00287 const CHAR_FRAGMENT *fragment = this->get_fragment(id); 00288 if (fragment) { 00289 return fragment->to_string(); 00290 } 00291 const char* str = id_to_unichar(id); 00292 STRING result = debug_utf8_str(str); 00293 // Append a for lower alpha, A for upper alpha, and x if alpha but neither. 00294 if (get_isalpha(id)) { 00295 if (get_islower(id)) 00296 result += "a"; 00297 else if (get_isupper(id)) 00298 result += "A"; 00299 else 00300 result += "x"; 00301 } 00302 // Append 0 if a digit. 00303 if (get_isdigit(id)) { 00304 result += "0"; 00305 } 00306 // Append p is a punctuation symbol. 00307 if (get_ispunctuation(id)) { 00308 result += "p"; 00309 } 00310 return result; 00311 } 00312 00313 // Returns whether the unichar id represents a unicode value in the private use 00314 // area. We use this range only internally to represent uncommon ligatures 00315 // (eg. 'ct') that do not have regular unicode values. 00316 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const { 00317 UNICHAR uc(id_to_unichar(unichar_id), -1); 00318 int uni = uc.first_uni(); 00319 return (uni >= 0xE000 && uni <= 0xF8FF); 00320 } 00321 00322 00323 // Sets all ranges to empty, so they can be expanded to set the values. 00324 void UNICHARSET::set_ranges_empty() { 00325 for (int id = 0; id < size_used; ++id) { 00326 unichars[id].properties.SetRangesEmpty(); 00327 } 00328 } 00329 00330 // Sets all the properties for this unicharset given a src unicharset with 00331 // everything set. The unicharsets don't have to be the same, and graphemes 00332 // are correctly accounted for. 00333 void UNICHARSET::SetPropertiesFromOther(const UNICHARSET& src) { 00334 for (int ch = 0; ch < size_used; ++ch) { 00335 const char* utf8 = id_to_unichar(ch); 00336 UNICHAR_PROPERTIES properties; 00337 if (src.GetStrProperties(utf8, &properties)) { 00338 // Setup the script_id, other_case, and mirror properly. 00339 const char* script = src.get_script_from_script_id(properties.script_id); 00340 properties.script_id = add_script(script); 00341 const char* other_case = src.id_to_unichar(properties.other_case); 00342 if (contains_unichar(other_case)) { 00343 properties.other_case = unichar_to_id(other_case); 00344 } else { 00345 properties.other_case = ch; 00346 } 00347 const char* mirror_str = src.id_to_unichar(properties.mirror); 00348 if (contains_unichar(mirror_str)) { 00349 properties.mirror = unichar_to_id(mirror_str); 00350 } else { 00351 properties.mirror = ch; 00352 } 00353 unichars[ch].properties.CopyFrom(properties); 00354 } 00355 } 00356 } 00357 00358 // Expands the tops and bottoms and widths for this unicharset given a 00359 // src unicharset with ranges in it. The unicharsets don't have to be the 00360 // same, and graphemes are correctly accounted for. 00361 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) { 00362 for (int ch = 0; ch < size_used; ++ch) { 00363 const char* utf8 = id_to_unichar(ch); 00364 UNICHAR_PROPERTIES properties; 00365 if (src.GetStrProperties(utf8, &properties)) { 00366 // Expand just the ranges from properties. 00367 unichars[ch].properties.ExpandRangesFrom(properties); 00368 } 00369 } 00370 } 00371 00372 // For each id in src, if it does not occur in this, add it, as in 00373 // SetPropertiesFromOther, otherwise expand the ranges, as in 00374 // ExpandRangesFromOther. 00375 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { 00376 for (int ch = 0; ch < src.size_used; ++ch) { 00377 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; 00378 const char* utf8 = src.id_to_unichar(ch); 00379 if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) { 00380 // Only use fully valid entries. 00381 tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n", 00382 utf8, src_props.min_bottom, src_props.max_bottom, 00383 src_props.min_top, src_props.max_top, 00384 src_props.min_width, src_props.max_width, 00385 src_props.min_bearing, src_props.max_bearing, 00386 src_props.min_advance, src_props.max_advance); 00387 continue; 00388 } 00389 int id = size_used; 00390 if (contains_unichar(utf8)) { 00391 id = unichar_to_id(utf8); 00392 } else { 00393 unichar_insert(utf8); 00394 unichars[id].properties.SetRangesEmpty(); 00395 } 00396 if (!unichars[id].properties.AnyRangeEmpty()) { 00397 // Just expand current ranges. 00398 unichars[id].properties.ExpandRangesFrom(src_props); 00399 } else { 00400 // Copy properties from src_props. 00401 unichars[id].properties.CopyFrom(src_props); 00402 // Setup the script_id, other_case and mirror properly. 00403 const char* script = src.get_script_from_script_id(src_props.script_id); 00404 unichars[id].properties.script_id = add_script(script); 00405 const char* other_case = src.id_to_unichar(src_props.other_case); 00406 if (!contains_unichar(other_case)) { 00407 unichar_insert(other_case); 00408 unichars[size_used - 1].properties.SetRangesEmpty(); 00409 // Other_case will have its ranges set later as it is contained in src. 00410 } 00411 unichars[id].properties.other_case = unichar_to_id(other_case); 00412 const char* mirror_str = src.id_to_unichar(src_props.mirror); 00413 if (!contains_unichar(mirror_str)) { 00414 unichar_insert(mirror_str); 00415 unichars[size_used - 1].properties.SetRangesEmpty(); 00416 // Mirror will have its ranges set later as it is contained in src. 00417 } 00418 unichars[id].properties.mirror = unichar_to_id(mirror_str); 00419 } 00420 } 00421 } 00422 00423 // Gets the properties for a grapheme string, combining properties for 00424 // multiple characters in a meaningful way where possible. 00425 // Returns false if no valid match was found in the unicharset. 00426 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00427 // return and will need translation if the target unicharset is different. 00428 bool UNICHARSET::GetStrProperties(const char* utf8_str, 00429 UNICHAR_PROPERTIES* props) const { 00430 props->Init(); 00431 props->SetRangesEmpty(); 00432 props->min_advance = 0; 00433 props->max_advance = 0; 00434 int utf8_step = 0; 00435 int total_unicodes = 0; 00436 for (int offset = 0; utf8_str[offset] != '\0'; offset += utf8_step) { 00437 utf8_step = step(utf8_str + offset); 00438 if (utf8_step == 0) return false; 00439 int id = unichar_to_id(utf8_str + offset, utf8_step); 00440 if (id < 0) return false; 00441 const UNICHAR_PROPERTIES& src_props = unichars[id].properties; 00442 // Logical OR all the bools. 00443 if (src_props.isalpha) props->isalpha = true; 00444 if (src_props.islower) props->islower = true; 00445 if (src_props.isupper) props->isupper = true; 00446 if (src_props.isdigit) props->isdigit = true; 00447 if (src_props.ispunctuation) props->ispunctuation = true; 00448 if (src_props.isngram) props->isngram = true; 00449 if (src_props.enabled) props->enabled = true; 00450 // Min/max the tops/bottoms. 00451 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom); 00452 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom); 00453 UpdateRange(src_props.min_top, &props->min_top, &props->max_top); 00454 UpdateRange(src_props.max_top, &props->min_top, &props->max_top); 00455 int bearing = props->min_advance + src_props.min_bearing; 00456 if (total_unicodes == 0 || bearing < props->min_bearing) 00457 props->min_bearing = bearing; 00458 bearing = props->max_advance + src_props.max_bearing; 00459 if (total_unicodes == 0 || bearing < props->max_bearing) 00460 props->max_bearing = bearing; 00461 props->min_advance += src_props.min_advance; 00462 props->max_advance += src_props.max_advance; 00463 // With a single width, just use the widths stored in the unicharset. 00464 props->min_width = src_props.min_width; 00465 props->max_width = src_props.max_width; 00466 // Use the first script id, other_case, mirror, direction. 00467 // Note that these will need translation, except direction. 00468 if (total_unicodes == 0) { 00469 props->script_id = src_props.script_id; 00470 props->other_case = src_props.other_case; 00471 props->mirror = src_props.mirror; 00472 props->direction = src_props.direction; 00473 } 00474 // The normed string for the compound character is the concatenation of 00475 // the normed versions of the individual characters. 00476 props->normed += src_props.normed; 00477 ++total_unicodes; 00478 } 00479 if (total_unicodes > 1) { 00480 // Estimate the total widths from the advance - bearing. 00481 props->min_width = props->min_advance - props->max_bearing; 00482 props->max_width = props->max_advance - props->min_bearing; 00483 } 00484 return total_unicodes > 0; 00485 } 00486 00487 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const { 00488 unsigned int properties = 0; 00489 if (this->get_isalpha(id)) 00490 properties |= ISALPHA_MASK; 00491 if (this->get_islower(id)) 00492 properties |= ISLOWER_MASK; 00493 if (this->get_isupper(id)) 00494 properties |= ISUPPER_MASK; 00495 if (this->get_isdigit(id)) 00496 properties |= ISDIGIT_MASK; 00497 if (this->get_ispunctuation(id)) 00498 properties |= ISPUNCTUATION_MASK; 00499 return properties; 00500 } 00501 00502 char UNICHARSET::get_chartype(UNICHAR_ID id) const { 00503 if (this->get_isupper(id)) return 'A'; 00504 if (this->get_islower(id)) return 'a'; 00505 if (this->get_isalpha(id)) return 'x'; 00506 if (this->get_isdigit(id)) return '0'; 00507 if (this->get_ispunctuation(id)) return 'p'; 00508 return 0; 00509 } 00510 00511 void UNICHARSET::unichar_insert(const char* const unichar_repr) { 00512 if (!ids.contains(unichar_repr)) { 00513 if (strlen(unichar_repr) > UNICHAR_LEN) { 00514 fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n", 00515 int(strlen(unichar_repr)), unichar_repr); 00516 return; 00517 } 00518 if (size_used == size_reserved) { 00519 if (size_used == 0) 00520 reserve(8); 00521 else 00522 reserve(2 * size_used); 00523 } 00524 00525 strcpy(unichars[size_used].representation, unichar_repr); 00526 this->set_script(size_used, null_script); 00527 // If the given unichar_repr represents a fragmented character, set 00528 // fragment property to a pointer to CHAR_FRAGMENT class instance with 00529 // information parsed from the unichar representation. Use the script 00530 // of the base unichar for the fragmented character if possible. 00531 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr); 00532 this->unichars[size_used].properties.fragment = frag; 00533 if (frag != NULL && this->contains_unichar(frag->get_unichar())) { 00534 this->unichars[size_used].properties.script_id = 00535 this->get_script(frag->get_unichar()); 00536 } 00537 this->unichars[size_used].properties.enabled = true; 00538 ids.insert(unichar_repr, size_used); 00539 ++size_used; 00540 } 00541 } 00542 00543 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const { 00544 return ids.contains(unichar_repr); 00545 } 00546 00547 bool UNICHARSET::contains_unichar(const char* const unichar_repr, 00548 int length) const { 00549 if (length == 0) { 00550 return false; 00551 } 00552 return ids.contains(unichar_repr, length); 00553 } 00554 00555 bool UNICHARSET::eq(UNICHAR_ID unichar_id, 00556 const char* const unichar_repr) const { 00557 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; 00558 } 00559 00560 bool UNICHARSET::save_to_file(FILE *file) const { 00561 fprintf(file, "%d\n", this->size()); 00562 for (UNICHAR_ID id = 0; id < this->size(); ++id) { 00563 int min_bottom, max_bottom, min_top, max_top; 00564 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00565 int min_width, max_width; 00566 get_width_range(id, &min_width, &max_width); 00567 int min_bearing, max_bearing; 00568 get_bearing_range(id, &min_bearing, &max_bearing); 00569 int min_advance, max_advance; 00570 get_advance_range(id, &min_advance, &max_advance); 00571 unsigned int properties = this->get_properties(id); 00572 if (strcmp(this->id_to_unichar(id), " ") == 0) { 00573 fprintf(file, "%s %x %s %d\n", "NULL", properties, 00574 this->get_script_from_script_id(this->get_script(id)), 00575 this->get_other_case(id)); 00576 } else { 00577 fprintf(file, 00578 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n", 00579 this->id_to_unichar(id), properties, 00580 min_bottom, max_bottom, min_top, max_top, min_width, max_width, 00581 min_bearing, max_bearing, min_advance, max_advance, 00582 this->get_script_from_script_id(this->get_script(id)), 00583 this->get_other_case(id), this->get_direction(id), 00584 this->get_mirror(id), this->get_normed_unichar(id), 00585 this->debug_str(id).string()); 00586 } 00587 } 00588 return true; 00589 } 00590 00591 class InMemoryFilePointer { 00592 public: 00593 InMemoryFilePointer(const char *memory, int mem_size) 00594 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { } 00595 00596 char *fgets(char *orig_dst, int size) { 00597 const char *src_end = memory_ + mem_size_; 00598 char *dst_end = orig_dst + size - 1; 00599 if (size < 1) { 00600 return fgets_ptr_ < src_end ? orig_dst : NULL; 00601 } 00602 00603 char *dst = orig_dst; 00604 char ch = '^'; 00605 while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') { 00606 ch = *dst++ = *fgets_ptr_++; 00607 } 00608 *dst = 0; 00609 return (dst == orig_dst) ? NULL : orig_dst; 00610 } 00611 00612 private: 00613 const char *memory_; 00614 const char *fgets_ptr_; 00615 const int mem_size_; 00616 }; 00617 00618 bool UNICHARSET::load_from_inmemory_file( 00619 const char *memory, int mem_size, bool skip_fragments) { 00620 InMemoryFilePointer mem_fp(memory, mem_size); 00621 TessResultCallback2<char *, char *, int> *fgets_cb = 00622 NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets); 00623 bool success = load_via_fgets(fgets_cb, skip_fragments); 00624 delete fgets_cb; 00625 return success; 00626 } 00627 00628 class LocalFilePointer { 00629 public: 00630 LocalFilePointer(FILE *stream) : fp_(stream) {} 00631 char *fgets(char *dst, int size) { 00632 return ::fgets(dst, size, fp_); 00633 } 00634 private: 00635 FILE *fp_; 00636 }; 00637 00638 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) { 00639 LocalFilePointer lfp(file); 00640 TessResultCallback2<char *, char *, int> *fgets_cb = 00641 NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets); 00642 bool success = load_via_fgets(fgets_cb, skip_fragments); 00643 delete fgets_cb; 00644 return success; 00645 } 00646 00647 bool UNICHARSET::load_via_fgets( 00648 TessResultCallback2<char *, char *, int> *fgets_cb, 00649 bool skip_fragments) { 00650 int unicharset_size; 00651 char buffer[256]; 00652 00653 this->clear(); 00654 if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL || 00655 sscanf(buffer, "%d", &unicharset_size) != 1) { 00656 return false; 00657 } 00658 this->reserve(unicharset_size); 00659 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) { 00660 char unichar[256]; 00661 unsigned int properties; 00662 char script[64]; 00663 00664 strcpy(script, null_script); 00665 int min_bottom = 0; 00666 int max_bottom = MAX_UINT8; 00667 int min_top = 0; 00668 int max_top = MAX_UINT8; 00669 int min_width = 0; 00670 int max_width = MAX_INT16; 00671 int min_bearing = 0; 00672 int max_bearing = MAX_INT16; 00673 int min_advance = 0; 00674 int max_advance = MAX_INT16; 00675 // TODO(eger): check that this default it ok 00676 // after enabling BiDi iterator for Arabic+Cube. 00677 int direction = UNICHARSET::U_LEFT_TO_RIGHT; 00678 UNICHAR_ID other_case = id; 00679 UNICHAR_ID mirror = id; 00680 char normed[64]; 00681 int v = -1; 00682 if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL || 00683 ((v = sscanf(buffer, 00684 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s", 00685 unichar, &properties, 00686 &min_bottom, &max_bottom, &min_top, &max_top, 00687 &min_width, &max_width, &min_bearing, &max_bearing, 00688 &min_advance, &max_advance, script, &other_case, 00689 &direction, &mirror, normed)) != 17 && 00690 (v = sscanf(buffer, 00691 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d", 00692 unichar, &properties, 00693 &min_bottom, &max_bottom, &min_top, &max_top, 00694 &min_width, &max_width, &min_bearing, &max_bearing, 00695 &min_advance, &max_advance, 00696 script, &other_case, &direction, &mirror)) != 16 && 00697 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d", 00698 unichar, &properties, 00699 &min_bottom, &max_bottom, &min_top, &max_top, 00700 script, &other_case, &direction, &mirror)) != 10 && 00701 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties, 00702 &min_bottom, &max_bottom, &min_top, &max_top, 00703 script, &other_case)) != 8 && 00704 (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties, 00705 script, &other_case)) != 4 && 00706 (v = sscanf(buffer, "%s %x %63s", 00707 unichar, &properties, script)) != 3 && 00708 (v = sscanf(buffer, "%s %x", unichar, &properties) != 2))) { 00709 return false; 00710 } 00711 00712 // Skip fragments if needed. 00713 CHAR_FRAGMENT *frag = NULL; 00714 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) { 00715 delete frag; 00716 continue; 00717 } 00718 // Insert unichar into unicharset and set its properties. 00719 if (strcmp(unichar, "NULL") == 0) 00720 this->unichar_insert(" "); 00721 else 00722 this->unichar_insert(unichar); 00723 00724 this->set_isalpha(id, properties & ISALPHA_MASK); 00725 this->set_islower(id, properties & ISLOWER_MASK); 00726 this->set_isupper(id, properties & ISUPPER_MASK); 00727 this->set_isdigit(id, properties & ISDIGIT_MASK); 00728 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK); 00729 this->set_isngram(id, false); 00730 this->set_script(id, script); 00731 this->unichars[id].properties.enabled = true; 00732 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top); 00733 this->set_width_range(id, min_width, max_width); 00734 this->set_bearing_range(id, min_bearing, max_bearing); 00735 this->set_advance_range(id, min_advance, max_advance); 00736 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction)); 00737 ASSERT_HOST(other_case < unicharset_size); 00738 this->set_other_case(id, (v>3) ? other_case : id); 00739 ASSERT_HOST(mirror < unicharset_size); 00740 this->set_mirror(id, (v>8) ? mirror : id); 00741 this->set_normed(id, (v>16) ? normed : unichar); 00742 } 00743 post_load_setup(); 00744 return true; 00745 } 00746 00747 // Sets up internal data after loading the file, based on the char 00748 // properties. Called from load_from_file, but also needs to be run 00749 // during set_unicharset_properties. 00750 void UNICHARSET::post_load_setup() { 00751 // Number of alpha chars with the case property minus those without, 00752 // in order to determine that half the alpha chars have case. 00753 int net_case_alphas = 0; 00754 int x_height_alphas = 0; 00755 int cap_height_alphas = 0; 00756 top_bottom_set_ = false; 00757 for (UNICHAR_ID id = 0; id < size_used; ++id) { 00758 int min_bottom = 0; 00759 int max_bottom = MAX_UINT8; 00760 int min_top = 0; 00761 int max_top = MAX_UINT8; 00762 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00763 if (min_top > 0) 00764 top_bottom_set_ = true; 00765 if (get_isalpha(id)) { 00766 if (get_islower(id) || get_isupper(id)) 00767 ++net_case_alphas; 00768 else 00769 --net_case_alphas; 00770 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) 00771 ++x_height_alphas; 00772 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) 00773 ++cap_height_alphas; 00774 } 00775 } 00776 00777 script_has_upper_lower_ = net_case_alphas > 0; 00778 script_has_xheight_ = script_has_upper_lower_ || 00779 (x_height_alphas > cap_height_alphas * kMinXHeightFraction && 00780 cap_height_alphas > x_height_alphas * kMinCapHeightFraction); 00781 00782 null_sid_ = get_script_id_from_name(null_script); 00783 ASSERT_HOST(null_sid_ == 0); 00784 common_sid_ = get_script_id_from_name("Common"); 00785 latin_sid_ = get_script_id_from_name("Latin"); 00786 cyrillic_sid_ = get_script_id_from_name("Cyrillic"); 00787 greek_sid_ = get_script_id_from_name("Greek"); 00788 han_sid_ = get_script_id_from_name("Han"); 00789 hiragana_sid_ = get_script_id_from_name("Hiragana"); 00790 katakana_sid_ = get_script_id_from_name("Katakana"); 00791 00792 // Compute default script. Use the highest-counting alpha script, that is 00793 // not the common script, as that still contains some "alphas". 00794 int* script_counts = new int[script_table_size_used]; 00795 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used); 00796 for (int id = 0; id < size_used; ++id) { 00797 if (get_isalpha(id)) { 00798 ++script_counts[get_script(id)]; 00799 } 00800 } 00801 default_sid_ = 0; 00802 for (int s = 1; s < script_table_size_used; ++s) { 00803 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) 00804 default_sid_ = s; 00805 } 00806 delete [] script_counts; 00807 } 00808 00809 // Returns true if right_to_left scripts are significant in the unicharset, 00810 // but without being so sensitive that "universal" unicharsets containing 00811 // characters from many scripts, like orientation and script detection, 00812 // look like they are right_to_left. 00813 bool UNICHARSET::major_right_to_left() const { 00814 int ltr_count = 0; 00815 int rtl_count = 0; 00816 for (int id = 0; id < size_used; ++id) { 00817 int dir = get_direction(id); 00818 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++; 00819 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00820 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || 00821 dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++; 00822 } 00823 return rtl_count > ltr_count; 00824 } 00825 00826 // Set a whitelist and/or blacklist of characters to recognize. 00827 // An empty or NULL whitelist enables everything (minus any blacklist). 00828 // An empty or NULL blacklist disables nothing. 00829 void UNICHARSET::set_black_and_whitelist(const char* blacklist, 00830 const char* whitelist) { 00831 bool def_enabled = whitelist == NULL || whitelist[0] == '\0'; 00832 // Set everything to default 00833 for (int ch = 0; ch < size_used; ++ch) 00834 unichars[ch].properties.enabled = def_enabled; 00835 int ch_step; 00836 if (!def_enabled) { 00837 // Enable the whitelist. 00838 for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) { 00839 ch_step = step(whitelist + w_ind); 00840 if (ch_step > 0) { 00841 UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step); 00842 if (u_id != INVALID_UNICHAR_ID) { 00843 unichars[u_id].properties.enabled = true; 00844 } 00845 } else { 00846 ch_step = 1; 00847 } 00848 } 00849 } 00850 if (blacklist != NULL && blacklist[0] != '\0') { 00851 // Disable the blacklist. 00852 for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) { 00853 ch_step = step(blacklist + b_ind); 00854 if (ch_step > 0) { 00855 UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step); 00856 if (u_id != INVALID_UNICHAR_ID) { 00857 unichars[u_id].properties.enabled = false; 00858 } 00859 } else { 00860 ch_step = 1; 00861 } 00862 } 00863 } 00864 } 00865 00866 int UNICHARSET::add_script(const char* script) { 00867 for (int i = 0; i < script_table_size_used; ++i) { 00868 if (strcmp(script, script_table[i]) == 0) 00869 return i; 00870 } 00871 if (script_table_size_reserved == 0) { 00872 script_table_size_reserved = 8; 00873 script_table = new char*[script_table_size_reserved]; 00874 } 00875 if (script_table_size_used + 1 >= script_table_size_reserved) { 00876 char** new_script_table = new char*[script_table_size_reserved * 2]; 00877 memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*)); 00878 delete[] script_table; 00879 script_table = new_script_table; 00880 script_table_size_reserved = 2 * script_table_size_reserved; 00881 } 00882 script_table[script_table_size_used] = new char[strlen(script) + 1]; 00883 strcpy(script_table[script_table_size_used], script); 00884 return script_table_size_used++; 00885 } 00886 00887 // Returns the string that represents a fragment 00888 // with the given unichar, pos and total. 00889 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total, 00890 bool natural) { 00891 if (total == 1) return STRING(unichar); 00892 STRING result = ""; 00893 result += kSeparator; 00894 result += unichar; 00895 char buffer[kMaxLen]; 00896 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, 00897 natural ? kNaturalFlag : kSeparator, total); 00898 result += buffer; 00899 return result; 00900 } 00901 00902 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) { 00903 const char *ptr = string; 00904 int len = strlen(string); 00905 if (len < kMinLen || *ptr != kSeparator) { 00906 return NULL; // this string can not represent a fragment 00907 } 00908 ptr++; // move to the next character 00909 int step = 0; 00910 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) { 00911 step += UNICHAR::utf8_step(ptr + step); 00912 } 00913 if (step == 0 || step > UNICHAR_LEN) { 00914 return NULL; // no character for unichar or the character is too long 00915 } 00916 char unichar[UNICHAR_LEN + 1]; 00917 strncpy(unichar, ptr, step); 00918 unichar[step] = '\0'; // null terminate unichar 00919 ptr += step; // move to the next fragment separator 00920 int pos = 0; 00921 int total = 0; 00922 bool natural = false; 00923 char *end_ptr = NULL; 00924 for (int i = 0; i < 2; i++) { 00925 if (ptr > string + len || *ptr != kSeparator) { 00926 if (i == 1 && *ptr == kNaturalFlag) 00927 natural = true; 00928 else 00929 return NULL; // Failed to parse fragment representation. 00930 } 00931 ptr++; // move to the next character 00932 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10)) 00933 : total = static_cast<int>(strtol(ptr, &end_ptr, 10)); 00934 ptr = end_ptr; 00935 } 00936 if (ptr != string + len) { 00937 return NULL; // malformed fragment representation 00938 } 00939 CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT(); 00940 fragment->set_all(unichar, pos, total, natural); 00941 return fragment; 00942 } 00943 00944 int UNICHARSET::get_script_id_from_name(const char* script_name) const { 00945 for (int i = 0; i < script_table_size_used; ++i) { 00946 if (strcmp(script_name, script_table[i]) == 0) 00947 return i; 00948 } 00949 return 0; // 0 is always the null_script 00950 }