tesseract-doc/unicharset_8cpp_source.html

00001
00002 // File:        unicharset.cpp
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019
00020 #include <assert.h>
00021 #include <stdio.h>
00022 #include <string.h>
00023
00024 #include "tesscallback.h"
00025 #include "tprintf.h"
00026 #include "unichar.h"
00027 #include "unicharset.h"
00028 #include "params.h"
00029
// Character reserved for the textual representation of character fragments.
static const char kSeparator = '|';
// Character reserved for the textual representation of 'natural' character
// fragments.
static const char kNaturalFlag = 'n';

// Bit flags that pack the boolean character classes into one integer, as
// produced by get_properties() and decoded again by load_via_fgets().
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold
// and X = number of alpha chars whose tops are all below it, the unicharset
// "has x-height" when either
//   X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction,
// or more than half of the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
00053
00054 /*static */
00055 const char* UNICHARSET::kCustomLigatures[][2] = {
00056   {"ct", "\uE003"},  // c + t -> U+E003
00057   {"ſh", "\uE006"},  // long-s + h -> U+E006
00058   {"ſi", "\uE007"},  // long-s + i -> U+E007
00059   {"ſl", "\uE008"},  // long-s + l -> U+E008
00060   {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
00061   {NULL, NULL}
00062 };
00063
00064 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
00065   Init();
00066 }
00067
00068 // Initialize all properties to sensible default values.
00069 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
00070   isalpha = false;
00071   islower = false;
00072   isupper = false;
00073   isdigit = false;
00074   ispunctuation = false;
00075   isngram = false;
00076   enabled = false;
00077   SetRangesOpen();
00078   script_id = 0;
00079   other_case = 0;
00080   mirror = 0;
00081   normed = "";
00082   direction = UNICHARSET::U_LEFT_TO_RIGHT;
00083   fragment = NULL;
00084 }
00085
00086 // Sets all ranges wide open. Initialization default in case there are
00087 // no useful values available.
00088 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
00089   min_bottom = 0;
00090   max_bottom = MAX_UINT8;
00091   min_top = 0;
00092   max_top = MAX_UINT8;
00093   min_width = 0;
00094   max_width = MAX_INT16;
00095   min_bearing = 0;
00096   max_bearing = MAX_INT16;
00097   min_advance = 0;
00098   max_advance = MAX_INT16;
00099 }
00100
00101 // Sets all ranges to empty. Used before expanding with font-based data.
00102 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
00103   min_bottom = MAX_UINT8;
00104   max_bottom = 0;
00105   min_top = MAX_UINT8;
00106   max_top = 0;
00107   min_width = MAX_INT16;
00108   max_width = 0;
00109   min_bearing = MAX_INT16;
00110   max_bearing = 0;
00111   min_advance = MAX_INT16;
00112   max_advance = 0;
00113 }
00114
00115 // Returns true if any of the top/bottom/width/bearing/advance ranges is
00116 // emtpy.
00117 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
00118   return min_bottom > max_bottom || min_top > max_top ||
00119       min_width > max_width || min_bearing > max_bearing ||
00120       min_advance > max_advance;
00121 }
00122
00123 // Expands the ranges with the ranges from the src properties.
00124 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
00125     const UNICHAR_PROPERTIES& src) {
00126   UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
00127   UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
00128   UpdateRange(src.min_top, &min_top, &max_top);
00129   UpdateRange(src.max_top, &min_top, &max_top);
00130   UpdateRange(src.min_width, &min_width, &max_width);
00131   UpdateRange(src.max_width, &min_width, &max_width);
00132   UpdateRange(src.min_bearing, &min_bearing, &max_bearing);
00133   UpdateRange(src.max_bearing, &min_bearing, &max_bearing);
00134   UpdateRange(src.min_advance, &min_advance, &max_advance);
00135   UpdateRange(src.max_advance, &min_advance, &max_advance);
00136 }
00137
00138 // Copies the properties from src into this.
00139 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
00140   // Apart from the fragment, everything else can be done with a default copy.
00141   CHAR_FRAGMENT* saved_fragment = fragment;
00142   *this = src;  // Bitwise copy.
00143   fragment = saved_fragment;
00144 }
00145
00146 UNICHARSET::UNICHARSET() :
00147     unichars(NULL),
00148     ids(),
00149     size_used(0),
00150     size_reserved(0),
00151     script_table(NULL),
00152     script_table_size_used(0),
00153     null_script("NULL") {
00154   clear();
00155 }
00156
00157 UNICHARSET::~UNICHARSET() {
00158   clear();
00159 }
00160
00161 void UNICHARSET::reserve(int unichars_number) {
00162   if (unichars_number > size_reserved) {
00163     UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
00164     for (int i = 0; i < size_used; ++i)
00165       unichars_new[i] = unichars[i];
00166     for (int j = size_used; j < unichars_number; ++j) {
00167       unichars_new[j].properties.script_id = add_script(null_script);
00168     }
00169     delete[] unichars;
00170     unichars = unichars_new;
00171     size_reserved = unichars_number;
00172   }
00173 }
00174
00175 const UNICHAR_ID
00176 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
00177   return ids.contains(unichar_repr) ?
00178     ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
00179 }
00180
00181 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
00182                                            int length) const {
00183   assert(length > 0 && length <= UNICHAR_LEN);
00184   return ids.contains(unichar_repr, length) ?
00185     ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
00186 }
00187
00188 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00189 // while leaving a legal UNICHAR_ID afterwards. In other words, if there
00190 // is both a short and a long match to the string, return the length that
00191 // ensures there is a legal match after it.
00192 int UNICHARSET::step(const char* str) const {
00193   // Find the length of the first matching unicharset member.
00194   int minlength = ids.minmatch(str);
00195   if (minlength == 0)
00196     return 0;  // Empty string or illegal char.
00197
00198   int goodlength = minlength;
00199   while (goodlength <= UNICHAR_LEN) {
00200     if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
00201       return goodlength;  // This length works!
00202
00203     // The next char is illegal so find the next usable length.
00204     do {
00205       ++goodlength;
00206     } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
00207              !ids.contains(str, goodlength));
00208     if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
00209       // This does not constitute a good length!
00210       return minlength;
00211     }
00212   }
00213   // Search to find a subsequent legal char failed so return the minlength.
00214   return minlength;
00215 }
00216
00217 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00218 // If not encodable, write the first byte offset which cannot be converted
00219 // into the second (return) argument.
00220 bool UNICHARSET::encodable_string(const char *str,
00221                                   int *first_bad_position) const {
00222   for (int i = 0, len = strlen(str); i < len; ) {
00223     int increment = step(str + i);
00224     if (increment == 0) {
00225       if (first_bad_position) *first_bad_position = i;
00226       return false;
00227     }
00228     i += increment;
00229   }
00230   return true;
00231 }
00232
00233 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
00234   if (id == INVALID_UNICHAR_ID) {
00235     return INVALID_UNICHAR;
00236   }
00237   ASSERT_HOST(id < this->size());
00238   return unichars[id].representation;
00239 }
00240
00241 const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
00242   if (id == INVALID_UNICHAR_ID) {
00243     return INVALID_UNICHAR;
00244   }
00245   ASSERT_HOST(id < this->size());
00246   // Resolve from the kCustomLigatures table if this is a private encoding.
00247   if (get_isprivate(id)) {
00248     const char* ch = id_to_unichar(id);
00249     for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
00250       if (!strcmp(ch, kCustomLigatures[i][1])) {
00251         return kCustomLigatures[i][0];
00252       }
00253     }
00254   }
00255   // Otherwise return the stored representation.
00256   return unichars[id].representation;
00257 }
00258
00259 // Return a STRING that reformats the utf8 str into the str followed
00260 // by its hex unicodes.
00261 STRING UNICHARSET::debug_utf8_str(const char* str) {
00262   STRING result = str;
00263   result += " [";
00264   int step = 1;
00265   // Chop into unicodes and code each as hex.
00266   for (int i = 0; str[i] != '\0'; i += step) {
00267     char hex[sizeof(int) * 2 + 1];
00268     step = UNICHAR::utf8_step(str + i);
00269     if (step == 0) {
00270       step = 1;
00271       sprintf(hex, "%x", str[i]);
00272     } else {
00273       UNICHAR ch(str + i, step);
00274       sprintf(hex, "%x", ch.first_uni());
00275     }
00276     result += hex;
00277     result += " ";
00278   }
00279   result += "]";
00280   return result;
00281 }
00282
00283 // Return a STRING containing debug information on the unichar, including
00284 // the id_to_unichar, its hex unicodes and the properties.
00285 STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
00286   if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
00287   const CHAR_FRAGMENT *fragment = this->get_fragment(id);
00288   if (fragment) {
00289     return fragment->to_string();
00290   }
00291   const char* str = id_to_unichar(id);
00292   STRING result = debug_utf8_str(str);
00293   // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
00294   if (get_isalpha(id)) {
00295     if (get_islower(id))
00296       result += "a";
00297     else if (get_isupper(id))
00298       result += "A";
00299     else
00300       result += "x";
00301   }
00302   // Append 0 if a digit.
00303   if (get_isdigit(id)) {
00304     result += "0";
00305   }
00306   // Append p is a punctuation symbol.
00307   if (get_ispunctuation(id)) {
00308     result += "p";
00309   }
00310   return result;
00311 }
00312
00313 // Returns whether the unichar id represents a unicode value in the private use
00314 // area. We use this range only internally to represent uncommon ligatures
00315 // (eg. 'ct') that do not have regular unicode values.
00316 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
00317   UNICHAR uc(id_to_unichar(unichar_id), -1);
00318   int uni = uc.first_uni();
00319   return (uni >= 0xE000 && uni <= 0xF8FF);
00320 }
00321
00322
00323 // Sets all ranges to empty, so they can be expanded to set the values.
00324 void UNICHARSET::set_ranges_empty() {
00325   for (int id = 0; id < size_used; ++id) {
00326     unichars[id].properties.SetRangesEmpty();
00327   }
00328 }
00329
00330 // Sets all the properties for this unicharset given a src unicharset with
00331 // everything set. The unicharsets don't have to be the same, and graphemes
00332 // are correctly accounted for.
00333 void UNICHARSET::SetPropertiesFromOther(const UNICHARSET& src) {
00334   for (int ch = 0; ch < size_used; ++ch) {
00335     const char* utf8 = id_to_unichar(ch);
00336     UNICHAR_PROPERTIES properties;
00337     if (src.GetStrProperties(utf8, &properties)) {
00338       // Setup the script_id, other_case, and mirror properly.
00339       const char* script = src.get_script_from_script_id(properties.script_id);
00340       properties.script_id = add_script(script);
00341       const char* other_case = src.id_to_unichar(properties.other_case);
00342       if (contains_unichar(other_case)) {
00343         properties.other_case = unichar_to_id(other_case);
00344       } else {
00345         properties.other_case = ch;
00346       }
00347       const char* mirror_str = src.id_to_unichar(properties.mirror);
00348       if (contains_unichar(mirror_str)) {
00349         properties.mirror = unichar_to_id(mirror_str);
00350       } else {
00351         properties.mirror = ch;
00352       }
00353       unichars[ch].properties.CopyFrom(properties);
00354     }
00355   }
00356 }
00357
00358 // Expands the tops and bottoms and widths for this unicharset given a
00359 // src unicharset with ranges in it. The unicharsets don't have to be the
00360 // same, and graphemes are correctly accounted for.
00361 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
00362   for (int ch = 0; ch < size_used; ++ch) {
00363     const char* utf8 = id_to_unichar(ch);
00364     UNICHAR_PROPERTIES properties;
00365     if (src.GetStrProperties(utf8, &properties)) {
00366       // Expand just the ranges from properties.
00367       unichars[ch].properties.ExpandRangesFrom(properties);
00368     }
00369   }
00370 }
00371
00372 // For each id in src, if it does not occur in this, add it, as in
00373 // SetPropertiesFromOther, otherwise expand the ranges, as in
00374 // ExpandRangesFromOther.
00375 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
00376   for (int ch = 0; ch < src.size_used; ++ch) {
00377     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
00378     const char* utf8 = src.id_to_unichar(ch);
00379     if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
00380       // Only use fully valid entries.
00381       tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
00382               utf8, src_props.min_bottom, src_props.max_bottom,
00383               src_props.min_top, src_props.max_top,
00384               src_props.min_width, src_props.max_width,
00385               src_props.min_bearing, src_props.max_bearing,
00386               src_props.min_advance, src_props.max_advance);
00387       continue;
00388     }
00389     int id = size_used;
00390     if (contains_unichar(utf8)) {
00391       id = unichar_to_id(utf8);
00392     } else {
00393       unichar_insert(utf8);
00394       unichars[id].properties.SetRangesEmpty();
00395     }
00396     if (!unichars[id].properties.AnyRangeEmpty()) {
00397       // Just expand current ranges.
00398       unichars[id].properties.ExpandRangesFrom(src_props);
00399     } else {
00400       // Copy properties from src_props.
00401       unichars[id].properties.CopyFrom(src_props);
00402       // Setup the script_id, other_case and mirror properly.
00403       const char* script = src.get_script_from_script_id(src_props.script_id);
00404       unichars[id].properties.script_id = add_script(script);
00405       const char* other_case = src.id_to_unichar(src_props.other_case);
00406       if (!contains_unichar(other_case)) {
00407         unichar_insert(other_case);
00408         unichars[size_used - 1].properties.SetRangesEmpty();
00409         // Other_case will have its ranges set later as it is contained in src.
00410       }
00411       unichars[id].properties.other_case = unichar_to_id(other_case);
00412       const char* mirror_str = src.id_to_unichar(src_props.mirror);
00413       if (!contains_unichar(mirror_str)) {
00414         unichar_insert(mirror_str);
00415         unichars[size_used - 1].properties.SetRangesEmpty();
00416         // Mirror will have its ranges set later as it is contained in src.
00417       }
00418       unichars[id].properties.mirror = unichar_to_id(mirror_str);
00419     }
00420   }
00421 }
00422
00423 // Gets the properties for a grapheme string, combining properties for
00424 // multiple characters in a meaningful way where possible.
00425 // Returns false if no valid match was found in the unicharset.
00426 // NOTE that script_id, mirror, and other_case refer to this unicharset on
00427 // return and will need translation if the target unicharset is different.
00428 bool UNICHARSET::GetStrProperties(const char* utf8_str,
00429                                   UNICHAR_PROPERTIES* props) const {
00430   props->Init();
00431   props->SetRangesEmpty();
00432   props->min_advance = 0;
00433   props->max_advance = 0;
00434   int utf8_step = 0;
00435   int total_unicodes = 0;
00436   for (int offset = 0; utf8_str[offset] != '\0'; offset += utf8_step) {
00437     utf8_step = step(utf8_str + offset);
00438     if (utf8_step == 0) return false;
00439     int id = unichar_to_id(utf8_str + offset, utf8_step);
00440     if (id < 0) return false;
00441     const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
00442     // Logical OR all the bools.
00443     if (src_props.isalpha) props->isalpha = true;
00444     if (src_props.islower) props->islower = true;
00445     if (src_props.isupper) props->isupper = true;
00446     if (src_props.isdigit) props->isdigit = true;
00447     if (src_props.ispunctuation) props->ispunctuation = true;
00448     if (src_props.isngram) props->isngram = true;
00449     if (src_props.enabled) props->enabled = true;
00450     // Min/max the tops/bottoms.
00451     UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
00452     UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
00453     UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
00454     UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
00455     int bearing = props->min_advance + src_props.min_bearing;
00456     if (total_unicodes == 0 || bearing < props->min_bearing)
00457       props->min_bearing = bearing;
00458     bearing = props->max_advance + src_props.max_bearing;
00459     if (total_unicodes == 0 || bearing < props->max_bearing)
00460       props->max_bearing = bearing;
00461     props->min_advance += src_props.min_advance;
00462     props->max_advance += src_props.max_advance;
00463     // With a single width, just use the widths stored in the unicharset.
00464     props->min_width = src_props.min_width;
00465     props->max_width = src_props.max_width;
00466     // Use the first script id, other_case, mirror, direction.
00467     // Note that these will need translation, except direction.
00468     if (total_unicodes == 0) {
00469       props->script_id = src_props.script_id;
00470       props->other_case = src_props.other_case;
00471       props->mirror = src_props.mirror;
00472       props->direction = src_props.direction;
00473     }
00474     // The normed string for the compound character is the concatenation of
00475     // the normed versions of the individual characters.
00476     props->normed += src_props.normed;
00477     ++total_unicodes;
00478   }
00479   if (total_unicodes > 1) {
00480     // Estimate the total widths from the advance - bearing.
00481     props->min_width = props->min_advance - props->max_bearing;
00482     props->max_width = props->max_advance - props->min_bearing;
00483   }
00484   return total_unicodes > 0;
00485 }
00486
00487 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
00488   unsigned int properties = 0;
00489   if (this->get_isalpha(id))
00490     properties |= ISALPHA_MASK;
00491   if (this->get_islower(id))
00492     properties |= ISLOWER_MASK;
00493   if (this->get_isupper(id))
00494     properties |= ISUPPER_MASK;
00495   if (this->get_isdigit(id))
00496     properties |= ISDIGIT_MASK;
00497   if (this->get_ispunctuation(id))
00498     properties |= ISPUNCTUATION_MASK;
00499   return properties;
00500 }
00501
00502 char UNICHARSET::get_chartype(UNICHAR_ID id) const {
00503   if (this->get_isupper(id)) return 'A';
00504   if (this->get_islower(id)) return 'a';
00505   if (this->get_isalpha(id)) return 'x';
00506   if (this->get_isdigit(id)) return '0';
00507   if (this->get_ispunctuation(id)) return 'p';
00508   return 0;
00509 }
00510
00511 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
00512   if (!ids.contains(unichar_repr)) {
00513     if (strlen(unichar_repr) > UNICHAR_LEN) {
00514       fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
00515               int(strlen(unichar_repr)), unichar_repr);
00516       return;
00517     }
00518     if (size_used == size_reserved) {
00519       if (size_used == 0)
00520         reserve(8);
00521       else
00522         reserve(2 * size_used);
00523     }
00524
00525     strcpy(unichars[size_used].representation, unichar_repr);
00526     this->set_script(size_used, null_script);
00527     // If the given unichar_repr represents a fragmented character, set
00528     // fragment property to a pointer to CHAR_FRAGMENT class instance with
00529     // information parsed from the unichar representation. Use the script
00530     // of the base unichar for the fragmented character if possible.
00531     CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
00532     this->unichars[size_used].properties.fragment = frag;
00533     if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
00534       this->unichars[size_used].properties.script_id =
00535         this->get_script(frag->get_unichar());
00536     }
00537     this->unichars[size_used].properties.enabled = true;
00538     ids.insert(unichar_repr, size_used);
00539     ++size_used;
00540   }
00541 }
00542
00543 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
00544   return ids.contains(unichar_repr);
00545 }
00546
00547 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
00548                                   int length) const {
00549   if (length == 0) {
00550     return false;
00551   }
00552   return ids.contains(unichar_repr, length);
00553 }
00554
00555 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
00556                     const char* const unichar_repr) const {
00557   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
00558 }
00559
00560 bool UNICHARSET::save_to_file(FILE *file) const {
00561   fprintf(file, "%d\n", this->size());
00562   for (UNICHAR_ID id = 0; id < this->size(); ++id) {
00563     int min_bottom, max_bottom, min_top, max_top;
00564     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00565     int min_width, max_width;
00566     get_width_range(id, &min_width, &max_width);
00567     int min_bearing, max_bearing;
00568     get_bearing_range(id, &min_bearing, &max_bearing);
00569     int min_advance, max_advance;
00570     get_advance_range(id, &min_advance, &max_advance);
00571     unsigned int properties = this->get_properties(id);
00572     if (strcmp(this->id_to_unichar(id), " ") == 0) {
00573       fprintf(file, "%s %x %s %d\n", "NULL", properties,
00574               this->get_script_from_script_id(this->get_script(id)),
00575               this->get_other_case(id));
00576     } else {
00577       fprintf(file,
00578               "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
00579               this->id_to_unichar(id), properties,
00580               min_bottom, max_bottom, min_top, max_top, min_width, max_width,
00581               min_bearing, max_bearing, min_advance, max_advance,
00582               this->get_script_from_script_id(this->get_script(id)),
00583               this->get_other_case(id), this->get_direction(id),
00584               this->get_mirror(id), this->get_normed_unichar(id),
00585               this->debug_str(id).string());
00586     }
00587   }
00588   return true;
00589 }
00590
// Reads a memory buffer line by line through an fgets()-like member, so that
// the buffer can be fed to code written against a line reader. The buffer
// is not copied; only a read position into it is kept.
class InMemoryFilePointer {
 public:
  // memory points at mem_size bytes of data; reading starts at its front.
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes into orig_dst until a newline has been copied, size - 1
  // bytes have been copied, or the data runs out, then nul-terminates.
  // Returns orig_dst if at least one byte was copied, otherwise NULL.
  // With size < 1 nothing is written; the result then only tells whether
  // unread data remains (orig_dst) or not (NULL).
  char *fgets(char *orig_dst, int size) {
    const char *const src_end = memory_ + mem_size_;
    if (size < 1) {
      return fgets_ptr_ < src_end ? orig_dst : NULL;
    }

    // One byte of the destination is kept back for the terminator.
    char *const dst_limit = orig_dst + (size - 1);
    char *out = orig_dst;
    while (fgets_ptr_ < src_end && out < dst_limit) {
      const char copied = *fgets_ptr_++;
      *out++ = copied;
      if (copied == '\n')
        break;  // The newline is kept and ends the line.
    }
    *out = 0;
    return (out != orig_dst) ? orig_dst : NULL;
  }

 private:
  const char *memory_;     // Start of the data.
  const char *fgets_ptr_;  // Next byte to be read.
  const int mem_size_;     // Number of bytes of data at memory_.
};
00617
00618 bool UNICHARSET::load_from_inmemory_file(
00619     const char *memory, int mem_size, bool skip_fragments) {
00620   InMemoryFilePointer mem_fp(memory, mem_size);
00621   TessResultCallback2<char *, char *, int> *fgets_cb =
00622       NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
00623   bool success = load_via_fgets(fgets_cb, skip_fragments);
00624   delete fgets_cb;
00625   return success;
00626 }
00627
// Gives a FILE stream the same fgets() member as InMemoryFilePointer, so
// that either can be bound to the callback type load_via_fgets expects.
// The stream is only borrowed; it is not closed by this class.
class LocalFilePointer {
 public:
  LocalFilePointer(FILE *stream) : fp_(stream) {}

  // Forwards to the C library fgets on the wrapped stream.
  char *fgets(char *dst, int size) {
    char *const line = ::fgets(dst, size, fp_);
    return line;
  }

 private:
  FILE *fp_;  // Stream to read from.
};
00637
00638 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
00639   LocalFilePointer lfp(file);
00640   TessResultCallback2<char *, char *, int> *fgets_cb =
00641       NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
00642   bool success = load_via_fgets(fgets_cb, skip_fragments);
00643   delete fgets_cb;
00644   return success;
00645 }
00646
00647 bool UNICHARSET::load_via_fgets(
00648     TessResultCallback2<char *, char *, int> *fgets_cb,
00649     bool skip_fragments) {
00650   int unicharset_size;
00651   char buffer[256];
00652
00653   this->clear();
00654   if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
00655       sscanf(buffer, "%d", &unicharset_size) != 1) {
00656     return false;
00657   }
00658   this->reserve(unicharset_size);
00659   for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
00660     char unichar[256];
00661     unsigned int properties;
00662     char script[64];
00663
00664     strcpy(script, null_script);
00665     int min_bottom = 0;
00666     int max_bottom = MAX_UINT8;
00667     int min_top = 0;
00668     int max_top = MAX_UINT8;
00669     int min_width = 0;
00670     int max_width = MAX_INT16;
00671     int min_bearing = 0;
00672     int max_bearing = MAX_INT16;
00673     int min_advance = 0;
00674     int max_advance = MAX_INT16;
00675     // TODO(eger): check that this default it ok
00676     // after enabling BiDi iterator for Arabic+Cube.
00677     int direction = UNICHARSET::U_LEFT_TO_RIGHT;
00678     UNICHAR_ID other_case = id;
00679     UNICHAR_ID mirror = id;
00680     char normed[64];
00681     int v = -1;
00682     if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
00683         ((v = sscanf(buffer,
00684                      "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
00685                      unichar, &properties,
00686                      &min_bottom, &max_bottom, &min_top, &max_top,
00687                      &min_width, &max_width, &min_bearing, &max_bearing,
00688                      &min_advance, &max_advance, script, &other_case,
00689                      &direction, &mirror, normed)) != 17 &&
00690          (v = sscanf(buffer,
00691                      "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
00692                      unichar, &properties,
00693                      &min_bottom, &max_bottom, &min_top, &max_top,
00694                      &min_width, &max_width, &min_bearing, &max_bearing,
00695                      &min_advance, &max_advance,
00696                      script, &other_case, &direction, &mirror)) != 16 &&
00697           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
00698                       unichar, &properties,
00699                       &min_bottom, &max_bottom, &min_top, &max_top,
00700                       script, &other_case, &direction, &mirror)) != 10 &&
00701           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
00702                       &min_bottom, &max_bottom, &min_top, &max_top,
00703                       script, &other_case)) != 8 &&
00704           (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
00705                       script, &other_case)) != 4 &&
00706           (v = sscanf(buffer, "%s %x %63s",
00707                       unichar, &properties, script)) != 3 &&
00708           (v = sscanf(buffer, "%s %x", unichar, &properties) != 2))) {
00709       return false;
00710     }
00711
00712     // Skip fragments if needed.
00713     CHAR_FRAGMENT *frag = NULL;
00714     if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
00715       delete frag;
00716       continue;
00717     }
00718     // Insert unichar into unicharset and set its properties.
00719     if (strcmp(unichar, "NULL") == 0)
00720       this->unichar_insert(" ");
00721     else
00722       this->unichar_insert(unichar);
00723
00724     this->set_isalpha(id, properties & ISALPHA_MASK);
00725     this->set_islower(id, properties & ISLOWER_MASK);
00726     this->set_isupper(id, properties & ISUPPER_MASK);
00727     this->set_isdigit(id, properties & ISDIGIT_MASK);
00728     this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
00729     this->set_isngram(id, false);
00730     this->set_script(id, script);
00731     this->unichars[id].properties.enabled = true;
00732     this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
00733     this->set_width_range(id, min_width, max_width);
00734     this->set_bearing_range(id, min_bearing, max_bearing);
00735     this->set_advance_range(id, min_advance, max_advance);
00736     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
00737     ASSERT_HOST(other_case < unicharset_size);
00738     this->set_other_case(id, (v>3) ? other_case : id);
00739     ASSERT_HOST(mirror < unicharset_size);
00740     this->set_mirror(id, (v>8) ? mirror : id);
00741     this->set_normed(id, (v>16) ? normed : unichar);
00742   }
00743   post_load_setup();
00744   return true;
00745 }
00746
00747 // Sets up internal data after loading the file, based on the char
00748 // properties. Called from load_from_file, but also needs to be run
00749 // during set_unicharset_properties.
00750 void UNICHARSET::post_load_setup() {
00751   // Number of alpha chars with the case property minus those without,
00752   // in order to determine that half the alpha chars have case.
00753   int net_case_alphas = 0;
00754   int x_height_alphas = 0;
00755   int cap_height_alphas = 0;
00756   top_bottom_set_ = false;
00757   for (UNICHAR_ID id = 0; id < size_used; ++id) {
00758     int min_bottom = 0;
00759     int max_bottom = MAX_UINT8;
00760     int min_top = 0;
00761     int max_top = MAX_UINT8;
00762     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00763     if (min_top > 0)
00764       top_bottom_set_ = true;
00765     if (get_isalpha(id)) {
00766       if (get_islower(id) || get_isupper(id))
00767         ++net_case_alphas;
00768       else
00769         --net_case_alphas;
00770       if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
00771         ++x_height_alphas;
00772       else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
00773         ++cap_height_alphas;
00774     }
00775   }
00776
00777   script_has_upper_lower_ = net_case_alphas > 0;
00778   script_has_xheight_ = script_has_upper_lower_ ||
00779       (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
00780        cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
00781
00782   null_sid_ = get_script_id_from_name(null_script);
00783   ASSERT_HOST(null_sid_ == 0);
00784   common_sid_ = get_script_id_from_name("Common");
00785   latin_sid_ = get_script_id_from_name("Latin");
00786   cyrillic_sid_ = get_script_id_from_name("Cyrillic");
00787   greek_sid_ = get_script_id_from_name("Greek");
00788   han_sid_ = get_script_id_from_name("Han");
00789   hiragana_sid_ = get_script_id_from_name("Hiragana");
00790   katakana_sid_ = get_script_id_from_name("Katakana");
00791
00792   // Compute default script. Use the highest-counting alpha script, that is
00793   // not the common script, as that still contains some "alphas".
00794   int* script_counts = new int[script_table_size_used];
00795   memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
00796   for (int id = 0; id < size_used; ++id) {
00797     if (get_isalpha(id)) {
00798       ++script_counts[get_script(id)];
00799     }
00800   }
00801   default_sid_ = 0;
00802   for (int s = 1; s < script_table_size_used; ++s) {
00803     if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
00804       default_sid_ = s;
00805   }
00806   delete [] script_counts;
00807 }
00808
00809 // Returns true if right_to_left scripts are significant in the unicharset,
00810 // but without being so sensitive that "universal" unicharsets containing
00811 // characters from many scripts, like orientation and script detection,
00812 // look like they are right_to_left.
00813 bool UNICHARSET::major_right_to_left() const {
00814   int ltr_count = 0;
00815   int rtl_count = 0;
00816   for (int id = 0; id < size_used; ++id) {
00817     int dir = get_direction(id);
00818     if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
00819     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00820         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
00821         dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
00822   }
00823   return rtl_count > ltr_count;
00824 }
00825
00826 // Set a whitelist and/or blacklist of characters to recognize.
00827 // An empty or NULL whitelist enables everything (minus any blacklist).
00828 // An empty or NULL blacklist disables nothing.
00829 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
00830                                          const char* whitelist) {
00831   bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
00832   // Set everything to default
00833   for (int ch = 0; ch < size_used; ++ch)
00834     unichars[ch].properties.enabled = def_enabled;
00835   int ch_step;
00836   if (!def_enabled) {
00837     // Enable the whitelist.
00838     for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
00839       ch_step = step(whitelist + w_ind);
00840       if (ch_step > 0) {
00841         UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
00842         if (u_id != INVALID_UNICHAR_ID) {
00843           unichars[u_id].properties.enabled = true;
00844         }
00845       } else {
00846         ch_step = 1;
00847       }
00848     }
00849   }
00850   if (blacklist != NULL && blacklist[0] != '\0') {
00851     // Disable the blacklist.
00852     for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
00853       ch_step = step(blacklist + b_ind);
00854       if (ch_step > 0) {
00855         UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
00856         if (u_id != INVALID_UNICHAR_ID) {
00857           unichars[u_id].properties.enabled = false;
00858         }
00859       } else {
00860         ch_step = 1;
00861       }
00862     }
00863   }
00864 }
00865
00866 int UNICHARSET::add_script(const char* script) {
00867   for (int i = 0; i < script_table_size_used; ++i) {
00868     if (strcmp(script, script_table[i]) == 0)
00869       return i;
00870   }
00871   if (script_table_size_reserved == 0) {
00872     script_table_size_reserved = 8;
00873     script_table = new char*[script_table_size_reserved];
00874   }
00875   if (script_table_size_used + 1 >= script_table_size_reserved) {
00876     char** new_script_table = new char*[script_table_size_reserved * 2];
00877     memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
00878     delete[] script_table;
00879     script_table = new_script_table;
00880       script_table_size_reserved = 2 * script_table_size_reserved;
00881   }
00882   script_table[script_table_size_used] = new char[strlen(script) + 1];
00883   strcpy(script_table[script_table_size_used], script);
00884   return script_table_size_used++;
00885 }
00886
00887 // Returns the string that represents a fragment
00888 // with the given unichar, pos and total.
00889 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
00890                                 bool natural) {
00891   if (total == 1) return STRING(unichar);
00892   STRING result = "";
00893   result += kSeparator;
00894   result += unichar;
00895   char buffer[kMaxLen];
00896   snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
00897            natural ? kNaturalFlag : kSeparator, total);
00898   result += buffer;
00899   return result;
00900 }
00901
00902 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
00903   const char *ptr = string;
00904   int len = strlen(string);
00905   if (len < kMinLen || *ptr != kSeparator) {
00906     return NULL;  // this string can not represent a fragment
00907   }
00908   ptr++;  // move to the next character
00909   int step = 0;
00910   while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
00911     step += UNICHAR::utf8_step(ptr + step);
00912   }
00913   if (step == 0 || step > UNICHAR_LEN) {
00914     return NULL;  // no character for unichar or the character is too long
00915   }
00916   char unichar[UNICHAR_LEN + 1];
00917   strncpy(unichar, ptr, step);
00918   unichar[step] = '\0';  // null terminate unichar
00919   ptr += step;  // move to the next fragment separator
00920   int pos = 0;
00921   int total = 0;
00922   bool natural = false;
00923   char *end_ptr = NULL;
00924   for (int i = 0; i < 2; i++) {
00925     if (ptr > string + len || *ptr != kSeparator) {
00926       if (i == 1 && *ptr == kNaturalFlag)
00927         natural = true;
00928       else
00929         return NULL;  // Failed to parse fragment representation.
00930     }
00931     ptr++;  // move to the next character
00932     i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
00933       : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
00934     ptr = end_ptr;
00935   }
00936   if (ptr != string + len) {
00937     return NULL;  // malformed fragment representation
00938   }
00939   CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
00940   fragment->set_all(unichar, pos, total, natural);
00941   return fragment;
00942 }
00943
00944 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
00945   for (int i = 0; i < script_table_size_used; ++i) {
00946     if (strcmp(script_name, script_table[i]) == 0)
00947       return i;
00948   }
00949   return 0;  // 0 is always the null_script
00950 }