Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: char_samp_enum.cpp 00003 * Description: Implementation of a Character Set Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2007 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string> 00021 00022 #include "char_set.h" 00023 #include "cube_utils.h" 00024 #include "tessdatamanager.h" 00025 00026 namespace tesseract { 00027 00028 CharSet::CharSet() { 00029 class_cnt_ = 0; 00030 class_strings_ = NULL; 00031 unicharset_map_ = NULL; 00032 init_ = false; 00033 00034 // init hash table 00035 memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); 00036 } 00037 00038 CharSet::~CharSet() { 00039 if (class_strings_ != NULL) { 00040 for (int cls = 0; cls < class_cnt_; cls++) { 00041 if (class_strings_[cls] != NULL) { 00042 delete class_strings_[cls]; 00043 } 00044 } 00045 delete []class_strings_; 00046 class_strings_ = NULL; 00047 } 00048 delete []unicharset_map_; 00049 } 00050 00051 // Creates CharSet object by reading the unicharset from the 00052 // TessDatamanager, and mapping Cube's unicharset to Tesseract's if 00053 // they differ. 00054 CharSet *CharSet::Create(TessdataManager *tessdata_manager, 00055 UNICHARSET *tess_unicharset) { 00056 CharSet *char_set = new CharSet(); 00057 if (char_set == NULL) { 00058 return NULL; 00059 } 00060 00061 // First look for Cube's unicharset; if not there, use tesseract's 00062 bool cube_unicharset_exists; 00063 if (!(cube_unicharset_exists = 00064 tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) && 00065 !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) { 00066 fprintf(stderr, "Cube ERROR (CharSet::Create): could not find " 00067 "either cube or tesseract unicharset\n"); 00068 return false; 00069 } 00070 FILE *charset_fp = tessdata_manager->GetDataFilePtr(); 00071 if (!charset_fp) { 00072 fprintf(stderr, "Cube ERROR (CharSet::Create): could not load " 00073 "a unicharset\n"); 00074 return false; 00075 } 00076 00077 // If we found a cube unicharset separate from tesseract's, load it and 00078 // map its unichars to tesseract's; if only one unicharset exists, 00079 // just load it. 00080 bool loaded; 00081 if (cube_unicharset_exists) { 00082 char_set->cube_unicharset_.load_from_file(charset_fp); 00083 loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET); 00084 loaded = loaded && char_set->LoadSupportedCharList( 00085 tessdata_manager->GetDataFilePtr(), tess_unicharset); 00086 char_set->unicharset_ = &char_set->cube_unicharset_; 00087 } else { 00088 loaded = char_set->LoadSupportedCharList(charset_fp, NULL); 00089 char_set->unicharset_ = tess_unicharset; 00090 } 00091 if (!loaded) { 00092 delete char_set; 00093 return false; 00094 } 00095 00096 char_set->init_ = true; 00097 return char_set; 00098 } 00099 00100 // Load the list of supported chars from the given data file pointer. 00101 bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) { 00102 if (init_) 00103 return true; 00104 00105 char str_line[256]; 00106 // init hash table 00107 memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); 00108 // read the char count 00109 if (fgets(str_line, sizeof(str_line), fp) == NULL) { 00110 fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " 00111 "read char count.\n"); 00112 return false; 00113 } 00114 class_cnt_ = atoi(str_line); 00115 if (class_cnt_ < 2) { 00116 fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid " 00117 "class count: %d\n", class_cnt_); 00118 return false; 00119 } 00120 // memory for class strings 00121 class_strings_ = new string_32*[class_cnt_]; 00122 if (class_strings_ == NULL) { 00123 fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " 00124 "allocate memory for class strings.\n"); 00125 return false; 00126 } 00127 // memory for unicharset map 00128 if (tess_unicharset) { 00129 unicharset_map_ = new int[class_cnt_]; 00130 if (unicharset_map_ == NULL) { 00131 fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " 00132 "allocate memory for unicharset map.\n"); 00133 return false; 00134 } 00135 } 00136 00137 // Read in character strings and add to hash table 00138 for (int class_id = 0; class_id < class_cnt_; class_id++) { 00139 // Read the class string 00140 if (fgets(str_line, sizeof(str_line), fp) == NULL) { 00141 fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): " 00142 "could not read class string with class_id=%d.\n", class_id); 00143 return false; 00144 } 00145 // Terminate at space if any 00146 char *p = strchr(str_line, ' '); 00147 if (p != NULL) 00148 *p = '\0'; 00149 // Convert to UTF32 and store 00150 string_32 str32; 00151 // Convert NULL to a space 00152 if (strcmp(str_line, "NULL") == 0) { 00153 strcpy(str_line, " "); 00154 } 00155 CubeUtils::UTF8ToUTF32(str_line, &str32); 00156 class_strings_[class_id] = new string_32(str32); 00157 if (class_strings_[class_id] == NULL) { 00158 fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): could not " 00159 "allocate memory for class string with class_id=%d.\n", class_id); 00160 return false; 00161 } 00162 00163 // Add to hash-table 00164 int hash_val = Hash(reinterpret_cast<const char_32 *>(str32.c_str())); 00165 if (hash_bin_size_[hash_val] >= kMaxHashSize) { 00166 fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash " 00167 "table is full.\n"); 00168 return false; 00169 } 00170 hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id; 00171 00172 if (tess_unicharset != NULL) { 00173 // Add class id to unicharset map 00174 UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line); 00175 if (tess_id == INVALID_UNICHAR_ID) { 00176 tess_unicharset->unichar_insert(str_line); 00177 tess_id = tess_unicharset->unichar_to_id(str_line); 00178 } 00179 ASSERT_HOST(tess_id != INVALID_UNICHAR_ID); 00180 unicharset_map_[class_id] = tess_id; 00181 } 00182 } 00183 return true; 00184 } 00185 00186 } // tesseract