tesseract-doc/unicharset_8h_source.html

00001
00002 // File:        unicharset.h
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022
00023 #include "assert.h"
00024 #include "strngs.h"
00025 #include "unichar.h"
00026 #include "unicharmap.h"
00027 #include "params.h"
00028
00029 enum StrongScriptDirection {  // Direction classes for a piece of text.
00030   DIR_NEUTRAL = 0,        // Text contains only neutral characters.
00031   DIR_LEFT_TO_RIGHT = 1,  // Text contains no Right-to-Left characters.
00032   DIR_RIGHT_TO_LEFT = 2,  // Text contains no Left-to-Right characters.
00033   DIR_MIX = 3,            // Text contains both left-to-right
00034                           // and right-to-left characters.
00035 };
00036
00037 class CHAR_FRAGMENT {
00038  public:
00039   // Shortest length of the string form of a fragment.
00040   static const int kMinLen = 6;
00041   // Longest length of the string form of a fragment.
00042   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00043   // Upper bound on the number of fragments per character.
00044   static const int kMaxChunks = 5;
00045
00046   // Field setters and getters.
00047   void set_all(const char *unichar, int pos, int total, bool natural) {
00048     set_unichar(unichar);
00049     set_pos(pos);
00050     set_total(total);
00051     set_natural(natural);
00052   }
00053   void set_unichar(const char *uch) {
00054     strncpy(unichar, uch, UNICHAR_LEN);
00055     unichar[UNICHAR_LEN] = '\0';  // strncpy may leave the copy unterminated.
00056   }
00057   void set_pos(int p) { pos = p; }
00058   void set_total(int t) { total = t; }
00059   const char* get_unichar() const { return unichar; }
00060   int get_pos() const { return pos; }
00061   int get_total() const { return total; }
00062
00063   // Builds the string that represents a fragment
00064   // described by the given unichar, pos, total and natural flag.
00065   static STRING to_string(const char *unichar, int pos, int total,
00066                           bool natural);
00067   // Builds the string that represents this fragment.
00068   STRING to_string() const {
00069     return to_string(unichar, pos, total, natural);
00070   }
00071
00072   // True when this fragment's unichar, position and total
00073   // all match the given values.
00074   bool equals(const char *other_unichar,
00075               int other_pos, int other_total) const {
00076     if (strcmp(unichar, other_unichar) != 0) return false;
00077     return pos == other_pos && total == other_total;
00078   }
00079   bool equals(const CHAR_FRAGMENT *other) const {
00080     return equals(other->get_unichar(),
00081                   other->get_pos(),
00082                   other->get_total());
00083   }
00084
00085   // True when this fragment directly follows the given fragment: same
00086   // unichar, same total, next position. The pointer must not be NULL.
00087   bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00088     if (strcmp(unichar, fragment->get_unichar()) != 0) return false;
00089     if (total != fragment->get_total()) return false;
00090     return pos == fragment->get_pos() + 1;
00091   }
00092
00093   // True when this is the first fragment of its character.
00094   bool is_beginning() const { return pos == 0; }
00095
00096   // True when this is the last fragment of its character.
00097   bool is_ending() const { return pos + 1 == total; }
00098
00099   // True when the fragment was already a separate component, ie it
00100   // did not need chopping to be isolated, although it may have been
00101   // separated out from a multi-outline blob.
00102   bool is_natural() const { return natural; }
00103   void set_natural(bool value) { natural = value; }
00104
00105   // Decides whether the string represents a character fragment (as
00106   // opposed to a regular character). When it does, a new CHAR_FRAGMENT
00107   // is allocated and filled in with the parsed fragment information.
00108   // Fragment strings have the form:
00109   // |m|1|2, meaning chunk 1 of 2 of character m, or
00110   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
00111   // to divide the parts, as they were already separate connected components.
00112   //
00113   // On success the pointer to the allocated CHAR_FRAGMENT is returned.
00114   // NULL is returned when the string is not a fragment, or when it looks
00115   // like one but cannot be parsed as one.
00116   //
00117   // Note: ownership of the returned object passes to the caller,
00118   // who must deallocate it.
00119   static CHAR_FRAGMENT *parse_from_string(const char *str);
00120
00121  private:
00122   char unichar[UNICHAR_LEN + 1];
00123   // Set when the fragment was already a separate component, ie it
00124   // did not need chopping to be isolated, although it may have been
00125   // separated out from a multi-outline blob.
00126   bool natural;
00127   inT16 pos;    // position of this fragment within the character
00128   inT16 total;  // number of fragments the character is split into
00129 };
00130
00131 // The UNICHARSET class is a utility class for Tesseract that holds the
00132 // set of characters that are used by the engine. Each character is identified
00133 // by a unique number, from 0 to (size - 1).
00134 class UNICHARSET {
00135  public:
00136   // Custom list of characters and their ligature forms (UTF8)
00137   // These map to unicode values in the private use area (PUC) and are supported
00138   // by only few font families (eg. Wyld, Adobe Caslon Pro).
00139   static const char* kCustomLigatures[][2];
00140
00141   // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
00142   enum Direction {  // Values are numbered explicitly from 0 to 18.
00143       U_LEFT_TO_RIGHT               = 0,
00144       U_RIGHT_TO_LEFT               = 1,
00145       U_EUROPEAN_NUMBER             = 2,
00146       U_EUROPEAN_NUMBER_SEPARATOR   = 3,
00147       U_EUROPEAN_NUMBER_TERMINATOR  = 4,
00148       U_ARABIC_NUMBER               = 5,
00149       U_COMMON_NUMBER_SEPARATOR     = 6,
00150       U_BLOCK_SEPARATOR             = 7,
00151       U_SEGMENT_SEPARATOR           = 8,
00152       U_WHITE_SPACE_NEUTRAL         = 9,
00153       U_OTHER_NEUTRAL               = 10,
00154       U_LEFT_TO_RIGHT_EMBEDDING     = 11,
00155       U_LEFT_TO_RIGHT_OVERRIDE      = 12,
00156       U_RIGHT_TO_LEFT_ARABIC        = 13,
00157       U_RIGHT_TO_LEFT_EMBEDDING     = 14,
00158       U_RIGHT_TO_LEFT_OVERRIDE      = 15,
00159       U_POP_DIRECTIONAL_FORMAT      = 16,
00160       U_DIR_NON_SPACING_MARK        = 17,
00161       U_BOUNDARY_NEUTRAL            = 18,
00162       U_CHAR_DIRECTION_COUNT  // One past the last value (19); a count.
00163   };
00164
00165   // Create an empty UNICHARSET
00166   UNICHARSET();
00167
00168   ~UNICHARSET();
00169
00170   // Return the UNICHAR_ID of a given unichar representation within the
00171   // UNICHARSET.
00172   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00173
00174   // Return the UNICHAR_ID of a given unichar representation within the
00175   // UNICHARSET. Only the first length characters from unichar_repr are used.
00176   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00177                                  int length) const;
00178
00179   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00180   // while leaving a legal UNICHAR_ID afterwards. In other words, if there
00181   // is both a short and a long match to the string, return the length that
00182   // ensures there is a legal match after it.
00183   int step(const char* str) const;
00184
00185   // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00186   // If not encodable, write the first byte offset which cannot be converted
00187   // into the second (return) argument.
00188   bool encodable_string(const char *str, int *first_bad_position) const;
00189
00190   // Return the unichar representation corresponding to the given UNICHAR_ID
00191   // within the UNICHARSET.
00192   const char* const id_to_unichar(UNICHAR_ID id) const;
00193
00194   // Return the UTF8 representation corresponding to the given UNICHAR_ID after
00195   // resolving any private encodings internal to Tesseract. This method is
00196   // preferable to id_to_unichar for outputting text that will be visible to
00197   // external applications.
00198   const char* const id_to_unichar_ext(UNICHAR_ID id) const;
00199
00200   // Return a STRING that reformats the utf8 str into the str followed
00201   // by its hex unicodes.
00202   static STRING debug_utf8_str(const char* str);
00203
00204   // Return a STRING containing debug information on the unichar, including
00205   // the id_to_unichar, its hex unicodes and the properties.
00206   STRING debug_str(UNICHAR_ID id) const;
00207   STRING debug_str(const char * unichar_repr) const {
00208     return debug_str(unichar_to_id(unichar_repr));  // resolve id, delegate
00209   }
00210
00211   // Add a unichar representation to the set.
00212   void unichar_insert(const char* const unichar_repr);
00213
00214   // Return true if the given unichar id exists within the set.
00215   // Relies on the fact that unichar ids are contiguous in the unicharset.
00216   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00217     if (unichar_id == INVALID_UNICHAR_ID) return false;  // never a member
00218     return 0 <= unichar_id && unichar_id < size_used;  // ids are contiguous
00219   }
00220
00221   // Return true if the given unichar representation exists within the set.
00222   bool contains_unichar(const char* const unichar_repr) const;
00223   bool contains_unichar(const char* const unichar_repr, int length) const;
00224
00225   // Return true if the given unichar representation corresponds to the given
00226   // UNICHAR_ID within the set.
00227   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00228
00229   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
00230   void delete_pointers_in_unichars() {
00231     for (int id = 0; id < size_used; ++id) {
00232       // Deleting a NULL pointer is a no-op, so every slot can be
00233       // released and reset without testing it first.
00234       delete unichars[id].properties.fragment;
00235       unichars[id].properties.fragment = NULL;
00236     }
00237   }
00238
00239   // Clear the UNICHARSET (all the previous data is lost).
00240   void clear() {
00241     // Free every script table row, then the table itself.
00242     if (script_table != NULL) {
00243       for (int row = 0; row < script_table_size_used; ++row)
00244         delete[] script_table[row];
00245       delete[] script_table;
00246       script_table = NULL;
00247       script_table_size_used = 0;
00248     }
00249     // Fragments hang off the unichar slots, so release them first.
00250     if (unichars != NULL) {
00251       delete_pointers_in_unichars();
00252       delete[] unichars;
00253       unichars = NULL;
00254     }
00255     script_table_size_reserved = size_reserved = size_used = 0;
00256     ids.clear();
00257     // Flags revert to false.
00258     top_bottom_set_ = false;
00259     script_has_upper_lower_ = false;
00260     script_has_xheight_ = false;
00261     // Stored script ids all revert to 0.
00262     null_sid_ = 0;
00263     common_sid_ = 0;
00264     latin_sid_ = 0;
00265     cyrillic_sid_ = 0;
00266     greek_sid_ = han_sid_ = 0;
00267     hiragana_sid_ = katakana_sid_ = 0;
00268   }
00269
00270   // Return the size of the set (the number of different UNICHAR it holds).
00271   int size() const {
00272     return size_used;  // Valid ids are those below this value.
00273   }
00274
00275   // Reserve enough memory space for the given number of UNICHARS
00276   void reserve(int unichars_number);
00277
00278   // Opens the file indicated by filename and saves unicharset to that file.
00279   // Returns true if the operation is successful.
00280   bool save_to_file(const char * const filename) const {
00281     FILE* file = fopen(filename, "w+b");
00282     if (file == NULL) return false;  // The file could not be opened.
00283     bool result = save_to_file(file);  // Delegate to the FILE* overload.
00284     // fclose flushes buffered output, so a failed close is a failed save.
00285     return (fclose(file) == 0) && result;
00286   }
00287
00288   // Saves the content of the UNICHARSET to the given file.
00289   // Returns true if the operation is successful.
00290   bool save_to_file(FILE *file) const;
00291
00292   // Load a unicharset from a unicharset file that has been loaded into
00293   // the given memory buffer.
00294   // Returns true if the operation is successful.
00295   bool load_from_inmemory_file(const char* const memory, int mem_size,
00296                                bool skip_fragments);
00297   // Returns true if the operation is successful.
00298   bool load_from_inmemory_file(const char* const memory, int mem_size) {
00299     return load_from_inmemory_file(memory, mem_size, false);  // keep fragments
00300   }
00301
00302   // Opens the file indicated by filename and loads the UNICHARSET
00303   // from the given file. The previous data is lost.
00304   // Returns true if the operation is successful.
00305   bool load_from_file(const char* const filename, bool skip_fragments) {
00306     FILE* fp = fopen(filename, "rb");
00307     if (!fp) return false;  // The file could not be opened.
00308     const bool loaded = load_from_file(fp, skip_fragments);
00309     fclose(fp);
00310     return loaded;
00311   }
00312   // returns true if the operation is successful.
00313   bool load_from_file(const char* const filename) {
00314     return load_from_file(filename, false);  // skip_fragments = false
00315   }
00316
00317   // Loads the UNICHARSET from the given file. The previous data is lost.
00318   // Returns true if the operation is successful.
00319   bool load_from_file(FILE *file, bool skip_fragments);
00320   bool load_from_file(FILE *file) { return load_from_file(file, false); }  // skip_fragments = false
00321
00322   // Sets up internal data after loading the file, based on the char
00323   // properties. Called from load_from_file, but also needs to be run
00324   // during set_unicharset_properties.
00325   void post_load_setup();
00326
00327   // Returns true if right_to_left scripts are significant in the unicharset,
00328   // but without being so sensitive that "universal" unicharsets containing
00329   // characters from many scripts, like orientation and script detection,
00330   // look like they are right_to_left.
00331   bool major_right_to_left() const;
00332
00333   // Set a whitelist and/or blacklist of characters to recognize.
00334   // An empty or NULL whitelist enables everything (minus any blacklist).
00335   // An empty or NULL blacklist disables nothing.
00336   // The blacklist overrides the whitelist.
00337   // Each list is a string of utf8 character strings. Boundaries between
00338   // unicharset units are worked out automatically, and characters not in
00339   // the unicharset are silently ignored.
00340   void set_black_and_whitelist(const char* blacklist, const char* whitelist);
00341
00342   // Set the isalpha property of the given unichar to the given value.
00343   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00344     unichars[unichar_id].properties.isalpha = value;  // id not range-checked
00345   }
00346
00347   // Set the islower property of the given unichar to the given value.
00348   void set_islower(UNICHAR_ID unichar_id, bool value) {
00349     unichars[unichar_id].properties.islower = value;  // id not range-checked
00350   }
00351
00352   // Set the isupper property of the given unichar to the given value.
00353   void set_isupper(UNICHAR_ID unichar_id, bool value) {
00354     unichars[unichar_id].properties.isupper = value;  // id not range-checked
00355   }
00356
00357   // Set the isdigit property of the given unichar to the given value.
00358   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00359     unichars[unichar_id].properties.isdigit = value;  // id not range-checked
00360   }
00361
00362   // Set the ispunctuation property of the given unichar to the given value.
00363   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00364     unichars[unichar_id].properties.ispunctuation = value;  // id not checked
00365   }
00366
00367   // Set the isngram property of the given unichar to the given value.
00368   void set_isngram(UNICHAR_ID unichar_id, bool value) {
00369     unichars[unichar_id].properties.isngram = value;  // id not range-checked
00370   }
00371
00372   // Set the script name of the given unichar to the given value.
00373   // Value is copied and thus can be a temporary.
00374   void set_script(UNICHAR_ID unichar_id, const char* value) {
00375     unichars[unichar_id].properties.script_id = add_script(value);  // stores the id add_script returns
00376   }
00377
00378   // Set other_case unichar id in the properties for the given unichar id.
00379   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00380     unichars[unichar_id].properties.other_case = other_case;  // neither id is checked
00381   }
00382
00383   // Set the direction property of the given unichar to the given value.
00384   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
00385     unichars[unichar_id].properties.direction = value;  // id not range-checked
00386   }
00387
00388   // Set mirror unichar id in the properties for the given unichar id.
00389   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
00390     unichars[unichar_id].properties.mirror = mirror;  // neither id is checked
00391   }
00392
00393   // Record normalized version of unichar with the given unichar_id.
00394   void set_normed(UNICHAR_ID unichar_id, const char* normed) {
00395     unichars[unichar_id].properties.normed = normed;  // presumably copies the text; confirm against the member's type
00396   }
00397
00398   // Return the isalpha property of the given unichar.
00399   bool get_isalpha(UNICHAR_ID unichar_id) const {
00400     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00401     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00402     return unichars[unichar_id].properties.isalpha;
00403   }
00404
00405   // Return the islower property of the given unichar.
00406   bool get_islower(UNICHAR_ID unichar_id) const {
00407     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00408     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00409     return unichars[unichar_id].properties.islower;
00410   }
00411
00412   // Return the isupper property of the given unichar.
00413   bool get_isupper(UNICHAR_ID unichar_id) const {
00414     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00415     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00416     return unichars[unichar_id].properties.isupper;
00417   }
00418
00419   // Return the isdigit property of the given unichar.
00420   bool get_isdigit(UNICHAR_ID unichar_id) const {
00421     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00422     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00423     return unichars[unichar_id].properties.isdigit;
00424   }
00425
00426   // Return the ispunctuation property of the given unichar.
00427   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00428     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00429     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00430     return unichars[unichar_id].properties.ispunctuation;
00431   }
00432
00433   // Return the isngram property of the given unichar.
00434   bool get_isngram(UNICHAR_ID unichar_id) const {
00435     if (INVALID_UNICHAR_ID == unichar_id) return false;  // invalid id: false
00436     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00437     return unichars[unichar_id].properties.isngram;
00438   }
00439
00440   // Returns whether the unichar id represents a unicode value in the private
00441   // use area.
00442   bool get_isprivate(UNICHAR_ID unichar_id) const;
00443
00444   // Returns true if the ids have useful min/max top/bottom values.
00445   bool top_bottom_useful() const {
00446     return top_bottom_set_;  // clear() resets this flag to false.
00447   }
00448   // Sets all ranges to empty, so they can be expanded to set the values.
00449   void set_ranges_empty();
00450   // Sets all the properties for this unicharset given a src_unicharset with
00451   // everything set. The unicharsets don't have to be the same, and graphemes
00452   // are correctly accounted for.
00453   void SetPropertiesFromOther(const UNICHARSET& src);
00454   // Expands the tops and bottoms and widths for this unicharset given a
00455   // src_unicharset with ranges in it. The unicharsets don't have to be the
00456   // same, and graphemes are correctly accounted for.
00457   void ExpandRangesFromOther(const UNICHARSET& src);
00458   // For each id in src, if it does not occur in this, add it, as in
00459   // SetPropertiesFromOther, otherwise expand the ranges, as in
00460   // ExpandRangesFromOther.
00461   void AppendOtherUnicharset(const UNICHARSET& src);
00462   // Returns the min and max bottom and top of the given unichar in
00463   // baseline-normalized coordinates, ie, where the baseline is
00464   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00465   // (See normalis.h for the definitions).
00466   void get_top_bottom(UNICHAR_ID unichar_id,
00467                       int* min_bottom, int* max_bottom,
00468                       int* min_top, int* max_top) const {
00469     if (INVALID_UNICHAR_ID == unichar_id) {  // Invalid id: widest range.
00470       *min_bottom = *min_top = 0;
00471       *max_bottom = *max_top = 256;  // kBlnCellHeight
00472       return;
00473     }
00474     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00475     *min_bottom = unichars[unichar_id].properties.min_bottom;
00476     *max_bottom = unichars[unichar_id].properties.max_bottom;
00477     *min_top = unichars[unichar_id].properties.min_top;
00478     *max_top = unichars[unichar_id].properties.max_top;
00479   }
00480   void set_top_bottom(UNICHAR_ID unichar_id,
00481                       int min_bottom, int max_bottom,
00482                       int min_top, int max_top) {  // unichar_id is not range-checked
00483     unichars[unichar_id].properties.min_bottom =
00484         static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));  // clip to fit uinT8
00485     unichars[unichar_id].properties.max_bottom =
00486         static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
00487     unichars[unichar_id].properties.min_top =
00488         static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
00489     unichars[unichar_id].properties.max_top =
00490         static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
00491   }
00492   // Returns the width range of the given unichar in baseline-normalized
00493   // coordinates, ie, where the baseline is kBlnBaselineOffset and the
00494   // meanline is kBlnBaselineOffset + kBlnXHeight.
00495   // (See normalis.h for the definitions).
00496   void get_width_range(UNICHAR_ID unichar_id,
00497                        int* min_width, int* max_width) const {
00498     if (INVALID_UNICHAR_ID == unichar_id) {  // Invalid id: report [0, 256].
00499       *min_width = 0;
00500       *max_width = 256;  // kBlnCellHeight;
00501       return;
00502     }
00503     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00504     *min_width = unichars[unichar_id].properties.min_width;
00505     *max_width = unichars[unichar_id].properties.max_width;
00506   }
00507   void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) {
00508     unichars[unichar_id].properties.min_width =
00509         static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));  // clip to [0, MAX_INT16]
00510     unichars[unichar_id].properties.max_width =
00511         static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
00512   }
00513   // Returns the range of the x-bearing of the given unichar in
00514   // baseline-normalized coordinates, ie, where the baseline is
00515   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
00516   // (See normalis.h for the definitions).
00517   void get_bearing_range(UNICHAR_ID unichar_id,
00518                          int* min_bearing, int* max_bearing) const {
00519     if (INVALID_UNICHAR_ID == unichar_id) {  // Invalid id: report [0, 0].
00520       *min_bearing = *max_bearing = 0;
00521       return;
00522     }
00523     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00524     *min_bearing = unichars[unichar_id].properties.min_bearing;
00525     *max_bearing = unichars[unichar_id].properties.max_bearing;
00526   }
00527   void set_bearing_range(UNICHAR_ID unichar_id,
00528                          int min_bearing, int max_bearing) {  // NOTE(review): negative bearings clip to 0 - confirm intended
00529     unichars[unichar_id].properties.min_bearing =
00530         static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));  // clip to [0, MAX_INT16]
00531     unichars[unichar_id].properties.max_bearing =
00532         static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
00533   }
00534   // Returns the range of the x-advance of the given unichar in
00535   // baseline-normalized coordinates, ie, where the baseline is
00536   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
00537   // (See normalis.h for the definitions).
00538   void get_advance_range(UNICHAR_ID unichar_id,
00539                          int* min_advance, int* max_advance) const {
00540     if (INVALID_UNICHAR_ID == unichar_id) {  // Invalid id: report [0, 0].
00541       *min_advance = *max_advance = 0;
00542       return;
00543     }
00544     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00545     *min_advance = unichars[unichar_id].properties.min_advance;
00546     *max_advance = unichars[unichar_id].properties.max_advance;
00547   }
00548   void set_advance_range(UNICHAR_ID unichar_id,
00549                          int min_advance, int max_advance) {  // unichar_id is not range-checked
00550     unichars[unichar_id].properties.min_advance =
00551         static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));  // clip to [0, MAX_INT16]
00552     unichars[unichar_id].properties.max_advance =
00553         static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
00554   }
00555
00556   // Return the script id of the given unichar (an int, not the script name).
00557   // Ids are assigned by the unicharset itself (see set_script), and
00558   // null_sid_ is returned for INVALID_UNICHAR_ID.
00559   int get_script(UNICHAR_ID unichar_id) const {
00560     if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;  // invalid id
00561     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00562     return unichars[unichar_id].properties.script_id;  // an int id, not a name
00563   }
00564
00565   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00566   // as a bit field of unsigned int.
00567   unsigned int get_properties(UNICHAR_ID unichar_id) const;
00568
00569   // Return the character property as a single char.  If a character has
00570   // multiple attributes, the main property is defined by the following order:
00571   //   upper_case : 'A'
00572   //   lower_case : 'a'
00573   //   alpha      : 'x'
00574   //   digit      : '0'
00575   //   punctuation: 'p'
00576   char get_chartype(UNICHAR_ID unichar_id) const;
00577
00578   // Get other_case unichar id in the properties for the given unichar id.
00579   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00580     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;  // pass through
00581     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00582     return unichars[unichar_id].properties.other_case;
00583   }
00584
00585   // Returns the direction property of the given unichar.
00586   Direction get_direction(UNICHAR_ID unichar_id) const {
00587      if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;  // invalid id: neutral
00588      ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00589      return unichars[unichar_id].properties.direction;
00590    }
00591
00592   // Get mirror unichar id in the properties for the given unichar id.
00593   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
00594     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;  // pass through
00595     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00596     return unichars[unichar_id].properties.mirror;
00597   }
00598
00599   // Returns UNICHAR_ID of the corresponding lower-case unichar.
00600   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00601     if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
00602     ASSERT_HOST(contains_unichar_id(unichar_id));
00603     return unichars[unichar_id].properties.islower  // already lower-case?
00604         ? unichar_id : unichars[unichar_id].properties.other_case;
00605   }
00606
00607   // Returns UNICHAR_ID of the corresponding upper-case unichar.
00608   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00609     if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
00610     ASSERT_HOST(contains_unichar_id(unichar_id));
00611     return unichars[unichar_id].properties.isupper  // already upper-case?
00612         ? unichar_id : unichars[unichar_id].properties.other_case;
00613   }
00614
00615   // Return a pointer to the CHAR_FRAGMENT class if the given
00616   // unichar id represents a character fragment.
00617   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
00618     if (INVALID_UNICHAR_ID == unichar_id) return NULL;  // invalid id: no fragment
00619     ASSERT_HOST(contains_unichar_id(unichar_id));  // other ids must be in the set
00620     return unichars[unichar_id].properties.fragment;  // still owned by the unicharset
00621   }
00622
00623   // Return the isalpha property of the given unichar representation.
00624   bool get_isalpha(const char* const unichar_repr) const {
00625     return get_isalpha(unichar_to_id(unichar_repr));  // resolve id, delegate
00626   }
00627
00628   // Return the islower property of the given unichar representation.
00629   bool get_islower(const char* const unichar_repr) const {
00630     return get_islower(unichar_to_id(unichar_repr));  // resolve id, delegate
00631   }
00632
00633   // Return the isupper property of the given unichar representation.
00634   bool get_isupper(const char* const unichar_repr) const {
00635     return get_isupper(unichar_to_id(unichar_repr));  // resolve id, delegate
00636   }
00637
00638   // Return the isdigit property of the given unichar representation.
00639   bool get_isdigit(const char* const unichar_repr) const {
00640     return get_isdigit(unichar_to_id(unichar_repr));  // resolve id, delegate
00641   }
00642
00643   // Return the ispunctuation property of the given unichar representation.
00644   bool get_ispunctuation(const char* const unichar_repr) const {
00645     return get_ispunctuation(unichar_to_id(unichar_repr));  // resolve id, delegate
00646   }
00647
00648   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00649   // of the given unichar representation
00650   unsigned int get_properties(const char* const unichar_repr) const {
00651     return get_properties(unichar_to_id(unichar_repr));  // resolve id, delegate
00652   }
00653
00654   char get_chartype(const char* const unichar_repr) const {
00655     return get_chartype(unichar_to_id(unichar_repr));  // resolve id, delegate
00656   }
00657
00658   // Return the script id of the given unichar representation (an int,
00659   // not the script name). The string is first resolved with unichar_to_id,
00660   // then looked up through the id-based get_script.
00661   int get_script(const char* const unichar_repr) const {
00662     return get_script(unichar_to_id(unichar_repr));  // resolve id, delegate
00663   }
00664
00665   // Return a pointer to the CHAR_FRAGMENT class struct if the given
00666   // unichar representation represents a character fragment.
00667   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
00668     // A NULL, empty or unknown string yields no fragment.
00669     if (unichar_repr == NULL || unichar_repr[0] == '\0') return NULL;
00670     if (!ids.contains(unichar_repr)) return NULL;
00671     // Known unichar: defer to the id-based lookup.
00672     return get_fragment(unichar_to_id(unichar_repr));
00673   }
00674
00675   // Return the isalpha property of the given unichar representation.
00676   // Only the first length characters from unichar_repr are used.
00677   bool get_isalpha(const char* const unichar_repr,
00678                int length) const {
00679     return get_isalpha(unichar_to_id(unichar_repr, length));  // resolve id, delegate
00680   }
00681
00682   // Return the islower property of the given unichar representation.
00683   // Only the first length chars of unichar_repr are used (see unichar_to_id()).
00684   bool get_islower(const char* const unichar_repr,
00685                int length) const {
00686     return get_islower(unichar_to_id(unichar_repr, length));
00687   }
00688
00689   // Return the isupper property of the given unichar representation.
00690   // Only the first length chars of unichar_repr are used (see unichar_to_id()).
00691   bool get_isupper(const char* const unichar_repr,
00692                int length) const {
00693     return get_isupper(unichar_to_id(unichar_repr, length));
00694   }
00695
00696   // Return the isdigit property of the given unichar representation.
00697   // Only the first length chars of unichar_repr are used (see unichar_to_id()).
00698   bool get_isdigit(const char* const unichar_repr,
00699                int length) const {
00700     return get_isdigit(unichar_to_id(unichar_repr, length));
00701   }
00702
00703   // Return the ispunctuation property of the given unichar representation.
00704   // Only the first length chars of unichar_repr are used (see unichar_to_id()).
00705   bool get_ispunctuation(const char* const unichar_repr,
00706                           int length) const {
00707     return get_ispunctuation(unichar_to_id(unichar_repr, length));
00708   }
00709
00710   // Returns the normalized form (properties.normed) of the given unichar_id; the id is not range-checked here.
00711   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
00712     return unichars[unichar_id].properties.normed.string();
00713   }
00714
00715   // Return the script of the given unichar representation as an int.
00716   // Only the first length chars of unichar_repr are used (see unichar_to_id()).
00717   // NOTE(review): presumably the result is a script id accepted by
00718   // get_script_from_script_id() - confirm against the id-based overload.
00719   int get_script(const char* const unichar_repr,
00720                  int length) const {
00721     return get_script(unichar_to_id(unichar_repr, length));
00722   }
00723
00724   // Return the number of script table entries currently in use (script_table_size_used).
00725   int get_script_table_size() const {
00726     return script_table_size_used;
00727   }
00728
00729   // Return the script string stored in the script table for the given id.
00730   const char* get_script_from_script_id(int id) const {
00731     const bool in_table = 0 <= id && id < script_table_size_used;
00732     // Ids outside the used part of the table yield null_script instead.
00733     return in_table ? script_table[id] : null_script;
00734   }
00735
00736   // Returns the id from the name of the script, or 0 if script is not found.
00737   // Note that this is an expensive operation since it involves iteratively
00738   // comparing strings in the script table.  To avoid dependency on STL, we
00739   // won't use a hash.  Instead, the calling function can use this to look up
00740   // and save the ID for relevant scripts, for fast comparisons later.
00741   int get_script_id_from_name(const char* script_name) const;
00742
00743   // Return true if script is the null_script pointer itself (pointer comparison, not strcmp).
00744   bool is_null_script(const char* script) const {
00745     return script == null_script;
00746   }
00747
00748   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
00749   // then the returned int will be the same.
00750   // The script parameter is copied and thus can be a temporary.
00751   int add_script(const char* script);
00752
00753   // Return the enabled property of the given unichar; unichar_id indexes unichars without a range check.
00754   bool get_enabled(UNICHAR_ID unichar_id) const {
00755     return unichars[unichar_id].properties.enabled;
00756   }
00757
00758   // Accessors for the script ids stored in the corresponding *_sid_ members.
00759   int null_sid() const { return null_sid_; }
00760   int common_sid() const { return common_sid_; }
00761   int latin_sid() const { return latin_sid_; }
00762   int cyrillic_sid() const { return cyrillic_sid_; }
00763   int greek_sid() const { return greek_sid_; }
00764   int han_sid() const { return han_sid_; }
00765   int hiragana_sid() const { return hiragana_sid_; }
00766   int katakana_sid() const { return katakana_sid_; }
00767   int default_sid() const { return default_sid_; }
00768
00769   // Returns true if the unicharset has the concept of upper/lower case (the stored script_has_upper_lower_ flag).
00770   bool script_has_upper_lower() const {
00771     return script_has_upper_lower_;
00772   }
00773   // Returns true if the unicharset has the concept of x-height (the stored
00774   // script_has_xheight_ flag). It can be true even if script_has_upper_lower
00775   // is not, when the script has a sufficiently predominant top line with
00776   // ascenders, such as Devanagari and Thai.
00777   bool script_has_xheight() const {
00778     return script_has_xheight_;
00779   }
00780
00781  private:
00782   // Per-unichar properties: character-class flags, size ranges, script and related ids.
00783   struct UNICHAR_PROPERTIES {
00784     UNICHAR_PROPERTIES();
00785     // Initializes all properties to sensible default values.
00786     void Init();
00787     // Sets all ranges wide open. Initialization default in case there are
00788     // no useful values available.
00789     void SetRangesOpen();
00790     // Sets all ranges to empty. Used before expanding with font-based data.
00791     void SetRangesEmpty();
00792     // Returns true if any of the top/bottom/width/bearing/advance ranges is
00793     // empty.
00794     bool AnyRangeEmpty() const;
00795     // Expands the ranges with the ranges from the src properties.
00796     void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
00797     // Copies the properties from src into this.
00798     void CopyFrom(const UNICHAR_PROPERTIES& src);
00799
00800     bool  isalpha;
00801     bool  islower;
00802     bool  isupper;
00803     bool  isdigit;
00804     bool  ispunctuation;
00805     bool  isngram;
00806     bool  enabled;
00807     // Possible limits of the top and bottom of the bounding box in
00808     // baseline-normalized coordinates, i.e., where the baseline is
00809     // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00810     // (See normalis.h for the definitions).
00811     uinT8 min_bottom;
00812     uinT8 max_bottom;
00813     uinT8 min_top;
00814     uinT8 max_top;
00815     // Limits on the widths of bounding box, also in baseline-normalized coords.
00816     inT16 min_width;
00817     inT16 max_width;
00818     // Limits on the x-bearing and advance, also in baseline-normalized coords.
00819     inT16 min_bearing;
00820     inT16 max_bearing;
00821     inT16 min_advance;
00822     inT16 max_advance;
00823     int   script_id;
00824     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
00825     Direction direction;  // direction of this unichar
00826     // Mirror property is useful for reverse DAWG lookup for words in
00827     // right-to-left languages (e.g. "(word)" would be
00828     // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
00829     // However, what we want in our DAWG is
00830     // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
00831     // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
00832     UNICHAR_ID mirror;
00833     STRING normed;  // normalized version of this unichar
00834     // Contains meta information about the fragment if a unichar represents
00835     // a fragment of a character, otherwise should be set to NULL.
00836     // It is assumed that character fragments are added to the unicharset
00837     // after the corresponding 'base' characters.
00838     CHAR_FRAGMENT *fragment;
00839   };
00840   // One unichar entry: its string representation together with its properties.
00841   struct UNICHAR_SLOT {
00842     char representation[UNICHAR_LEN + 1];  // NOTE(review): the +1 is presumably for the terminating NUL - confirm
00843     UNICHAR_PROPERTIES properties;
00844   };
00845
00846   // Gets the properties for a grapheme string (presumably written to *props),
00847   // combining properties for multiple characters in a meaningful way where possible.
00848   // Returns false if no valid match was found in the unicharset.
00849   // NOTE that script_id, mirror, and other_case refer to this unicharset on
00850   // return and will need redirecting if the target unicharset is different.
00851   bool GetStrProperties(const char* utf8_str,
00852                         UNICHAR_PROPERTIES* props) const;
00853
00854   // Load ourselves from a "file" where our only interface to the file is
00855   // fgets_cb, an implementation of fgets().  This is the parsing primitive used
00856   // by the public routines load_from_file() and load_from_inmemory_file().
00857   bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
00858                       bool skip_fragments);
00859
00860   UNICHAR_SLOT* unichars;  // slots indexed by UNICHAR_ID
00861   UNICHARMAP ids;  // queried with ids.contains(repr) to test whether a representation is known
00862   int size_used;  // NOTE(review): presumably the number of unichars slots in use - confirm
00863   int size_reserved;  // NOTE(review): presumably the allocated capacity of unichars - confirm
00864   char** script_table;  // script strings, indexed by script id
00865   int script_table_size_used;  // number of valid entries in script_table
00866   int script_table_size_reserved;  // NOTE(review): presumably the allocated capacity of script_table - confirm
00867   const char* null_script;  // returned by get_script_from_script_id() for out-of-range ids
00868   // True if the unichars have their tops/bottoms set.
00869   bool top_bottom_set_;
00870   // True if the unicharset has significant upper/lower case chars.
00871   bool script_has_upper_lower_;
00872   // True if the unicharset has a significant mean-line with significant
00873   // ascenders above that.
00874   bool script_has_xheight_;
00875
00876   // A few convenient script name-to-id mappings without using a hash.
00877   // These are initialized when the unicharset file is loaded.  Anything
00878   // missing from this list can be looked up using get_script_id_from_name.
00879   int null_sid_;
00880   int common_sid_;
00881   int latin_sid_;
00882   int cyrillic_sid_;
00883   int greek_sid_;
00884   int han_sid_;
00885   int hiragana_sid_;
00886   int katakana_sid_;
00887   // The most frequently occurring script in the charset.
00888   int default_sid_;
00889 };
00890
00891 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__