tesseract-doc/char__set_8h_source.html

00001 /**********************************************************************
00002  * File:        char_set.h
00003  * Description: Declaration of a Character Set Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 // The CharSet class encapsulates the list of 32-bit strings/characters that
00021 // Cube supports for a specific language. The char set is loaded from the
00022 // .unicharset file corresponding to a specific language.
00023 // Each string has a corresponding int class-id that gets used throughout Cube.
00024 // The class provides back-and-forth conversion between the class-id
00025 // and its corresponding 32-bit string. This is done using a hash table that
00026 // maps the string to the class id.
00027
00028 #ifndef CHAR_SET_H
00029 #define CHAR_SET_H
00030
00031 #include <string.h>
00032 #include <string>
00033 #include <algorithm>
00034
00035 #include "string_32.h"
00036 #include "tessdatamanager.h"
00037 #include "unicharset.h"
00038 #include "cube_const.h"
00039
00040 namespace tesseract {
00041
00042 class CharSet {
00043  public:
00044   CharSet();
00045   ~CharSet();
00046
00047   // Returns true if Cube is sharing Tesseract's unicharset.
00048   inline bool SharedUnicharset() { return (unicharset_map_ == NULL); }
00049
00050   // Returns the class id corresponding to a 32-bit string. Returns -1
00051   // if the string is not supported. This is done by hashing the
00052   // string and then looking up the string in the hash-bin if there
00053   // are collisions.
00054   inline int ClassID(const char_32 *str) const {
00055     int hash_val = Hash(str);
00056     if (hash_bin_size_[hash_val] == 0)
00057       return -1;
00058     for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
00059       if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0)
00060         return hash_bins_[hash_val][bin];
00061     }
00062     return -1;
00063   }
00064   // Same as above but using a 32-bit char instead of a string
00065   inline int ClassID(char_32 ch) const {
00066     int hash_val = Hash(ch);
00067     if (hash_bin_size_[hash_val] == 0)
00068       return -1;
00069     for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) {
00070       if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch &&
00071           class_strings_[hash_bins_[hash_val][bin]]->length() == 1) {
00072         return hash_bins_[hash_val][bin];
00073       }
00074     }
00075     return -1;
00076   }
00077   // Retrieve the unicharid in Tesseract's unicharset corresponding
00078   // to a 32-bit string. When Tesseract and Cube share the same
00079   // unicharset, this will just be the class id.
00080   inline int UnicharID(const char_32 *str) const {
00081     int class_id = ClassID(str);
00082     if (class_id == INVALID_UNICHAR_ID)
00083       return INVALID_UNICHAR_ID;
00084     int unichar_id;
00085     if (unicharset_map_)
00086       unichar_id = unicharset_map_[class_id];
00087     else
00088       unichar_id = class_id;
00089     return unichar_id;
00090   }
00091   // Same as above but using a 32-bit char instead of a string
00092   inline int UnicharID(char_32 ch) const {
00093     int class_id = ClassID(ch);
00094     if (class_id == INVALID_UNICHAR_ID)
00095       return INVALID_UNICHAR_ID;
00096     int unichar_id;
00097     if (unicharset_map_)
00098       unichar_id = unicharset_map_[class_id];
00099     else
00100       unichar_id = class_id;
00101     return unichar_id;
00102   }
00103   // Returns the 32-bit string corresponding to a class id
00104   inline const char_32 * ClassString(int class_id) const {
00105     if (class_id < 0 || class_id >= class_cnt_) {
00106       return NULL;
00107     }
00108     return reinterpret_cast<const char_32 *>(class_strings_[class_id]->c_str());
00109   }
00110   // Returns the count of supported strings
00111   inline int ClassCount() const { return class_cnt_; }
00112
00113   // Creates CharSet object by reading the unicharset from the
00114   // TessDatamanager, and mapping Cube's unicharset to Tesseract's if
00115   // they differ.
00116   static CharSet *Create(TessdataManager *tessdata_manager,
00117                          UNICHARSET *tess_unicharset);
00118
00119   // Return the UNICHARSET cube is using for recognition internally --
00120   // ClassId() returns unichar_id's in this unicharset.
00121   UNICHARSET *InternalUnicharset() { return unicharset_; }
00122
00123  private:
00124   // Hash table configuration params. Determined emperically on
00125   // the supported languages so far (Eng, Ara, Hin). Might need to be
00126   // tuned for speed when more languages are supported
00127   static const int kHashBins = 3001;
00128   static const int kMaxHashSize = 16;
00129
00130   // Using djb2 hashing function to hash a 32-bit string
00131   // introduced in http://www.cse.yorku.ca/~oz/hash.html
00132   static inline int Hash(const char_32 *str) {
00133     unsigned long hash = 5381;
00134     int c;
00135     while ((c = *str++))
00136       hash = ((hash << 5) + hash) + c;
00137     return (hash%kHashBins);
00138   }
00139   // Same as above but for a single char
00140   static inline int Hash(char_32 ch) {
00141     char_32 b[2];
00142     b[0] = ch;
00143     b[1] = 0;
00144     return Hash(b);
00145   }
00146
00147   // Load the list of supported chars from the given data file
00148   // pointer. If tess_unicharset is non-NULL, mapping each Cube class
00149   // id to a tesseract unicharid.
00150   bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset);
00151
00152   // class count
00153   int class_cnt_;
00154   // hash-bin sizes array
00155   int hash_bin_size_[kHashBins];
00156   // hash bins
00157   int hash_bins_[kHashBins][kMaxHashSize];
00158   // supported strings array
00159   string_32  **class_strings_;
00160   // map from class id to secondary (tesseract's) unicharset's ids
00161   int *unicharset_map_;
00162   // A unicharset which is filled in with a Tesseract-style UNICHARSET for
00163   // cube's data if our unicharset is different from tesseract's.
00164   UNICHARSET cube_unicharset_;
00165   // This points to either the tess_unicharset we're passed or cube_unicharset_,
00166   // depending upon whether we just have one unicharset or one for each
00167   // tesseract and cube, respectively.
00168   UNICHARSET *unicharset_;
00169   // has the char set been initialized flag
00170   bool init_;
00171 };
00172 }
00173
00174 #endif  // CHAR_SET_H