Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: char_samp_enum.h 00003 * Description: Declaration of a Character Set Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2007 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The CharSet class encapsulates the list of 32-bit strings/characters that 00021 // Cube supports for a specific language. The char set is loaded from the 00022 // .unicharset file corresponding to a specific language 00023 // Each string has a corresponding int class-id that gets used throughout Cube 00024 // The class provides pass back and forth conversion between the class-id 00025 // and its corresponding 32-bit string. This is done using a hash table that 00026 // maps the string to the class id. 00027 00028 #ifndef CHAR_SET_H 00029 #define CHAR_SET_H 00030 00031 #include <string.h> 00032 #include <string> 00033 #include <algorithm> 00034 00035 #include "string_32.h" 00036 #include "tessdatamanager.h" 00037 #include "unicharset.h" 00038 #include "cube_const.h" 00039 00040 namespace tesseract { 00041 00042 class CharSet { 00043 public: 00044 CharSet(); 00045 ~CharSet(); 00046 00047 // Returns true if Cube is sharing Tesseract's unicharset. 00048 inline bool SharedUnicharset() { return (unicharset_map_ == NULL); } 00049 00050 // Returns the class id corresponding to a 32-bit string. Returns -1 00051 // if the string is not supported. This is done by hashing the 00052 // string and then looking up the string in the hash-bin if there 00053 // are collisions. 00054 inline int ClassID(const char_32 *str) const { 00055 int hash_val = Hash(str); 00056 if (hash_bin_size_[hash_val] == 0) 00057 return -1; 00058 for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { 00059 if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0) 00060 return hash_bins_[hash_val][bin]; 00061 } 00062 return -1; 00063 } 00064 // Same as above but using a 32-bit char instead of a string 00065 inline int ClassID(char_32 ch) const { 00066 int hash_val = Hash(ch); 00067 if (hash_bin_size_[hash_val] == 0) 00068 return -1; 00069 for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { 00070 if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch && 00071 class_strings_[hash_bins_[hash_val][bin]]->length() == 1) { 00072 return hash_bins_[hash_val][bin]; 00073 } 00074 } 00075 return -1; 00076 } 00077 // Retrieve the unicharid in Tesseract's unicharset corresponding 00078 // to a 32-bit string. When Tesseract and Cube share the same 00079 // unicharset, this will just be the class id. 00080 inline int UnicharID(const char_32 *str) const { 00081 int class_id = ClassID(str); 00082 if (class_id == INVALID_UNICHAR_ID) 00083 return INVALID_UNICHAR_ID; 00084 int unichar_id; 00085 if (unicharset_map_) 00086 unichar_id = unicharset_map_[class_id]; 00087 else 00088 unichar_id = class_id; 00089 return unichar_id; 00090 } 00091 // Same as above but using a 32-bit char instead of a string 00092 inline int UnicharID(char_32 ch) const { 00093 int class_id = ClassID(ch); 00094 if (class_id == INVALID_UNICHAR_ID) 00095 return INVALID_UNICHAR_ID; 00096 int unichar_id; 00097 if (unicharset_map_) 00098 unichar_id = unicharset_map_[class_id]; 00099 else 00100 unichar_id = class_id; 00101 return unichar_id; 00102 } 00103 // Returns the 32-bit string corresponding to a class id 00104 inline const char_32 * ClassString(int class_id) const { 00105 if (class_id < 0 || class_id >= class_cnt_) { 00106 return NULL; 00107 } 00108 return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str()); 00109 } 00110 // Returns the count of supported strings 00111 inline int ClassCount() const { return class_cnt_; } 00112 00113 // Creates CharSet object by reading the unicharset from the 00114 // TessDatamanager, and mapping Cube's unicharset to Tesseract's if 00115 // they differ. 00116 static CharSet *Create(TessdataManager *tessdata_manager, 00117 UNICHARSET *tess_unicharset); 00118 00119 // Return the UNICHARSET cube is using for recognition internally -- 00120 // ClassId() returns unichar_id's in this unicharset. 00121 UNICHARSET *InternalUnicharset() { return unicharset_; } 00122 00123 private: 00124 // Hash table configuration params. Determined emperically on 00125 // the supported languages so far (Eng, Ara, Hin). Might need to be 00126 // tuned for speed when more languages are supported 00127 static const int kHashBins = 3001; 00128 static const int kMaxHashSize = 16; 00129 00130 // Using djb2 hashing function to hash a 32-bit string 00131 // introduced in http://www.cse.yorku.ca/~oz/hash.html 00132 static inline int Hash(const char_32 *str) { 00133 unsigned long hash = 5381; 00134 int c; 00135 while ((c = *str++)) 00136 hash = ((hash << 5) + hash) + c; 00137 return (hash%kHashBins); 00138 } 00139 // Same as above but for a single char 00140 static inline int Hash(char_32 ch) { 00141 char_32 b[2]; 00142 b[0] = ch; 00143 b[1] = 0; 00144 return Hash(b); 00145 } 00146 00147 // Load the list of supported chars from the given data file 00148 // pointer. If tess_unicharset is non-NULL, mapping each Cube class 00149 // id to a tesseract unicharid. 00150 bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset); 00151 00152 // class count 00153 int class_cnt_; 00154 // hash-bin sizes array 00155 int hash_bin_size_[kHashBins]; 00156 // hash bins 00157 int hash_bins_[kHashBins][kMaxHashSize]; 00158 // supported strings array 00159 string_32 **class_strings_; 00160 // map from class id to secondary (tesseract's) unicharset's ids 00161 int *unicharset_map_; 00162 // A unicharset which is filled in with a Tesseract-style UNICHARSET for 00163 // cube's data if our unicharset is different from tesseract's. 00164 UNICHARSET cube_unicharset_; 00165 // This points to either the tess_unicharset we're passed or cube_unicharset_, 00166 // depending upon whether we just have one unicharset or one for each 00167 // tesseract and cube, respectively. 00168 UNICHARSET *unicharset_; 00169 // has the char set been initialized flag 00170 bool init_; 00171 }; 00172 } 00173 00174 #endif // CHAR_SET_H