Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: char_bigrams.cpp 00003 * Description: Implementation of a Character Bigrams Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2007 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <algorithm> 00021 #include <math.h> 00022 #include <string> 00023 #include <vector> 00024 00025 #include "char_bigrams.h" 00026 #include "cube_utils.h" 00027 #include "ndminx.h" 00028 #include "cube_const.h" 00029 00030 namespace tesseract { 00031 00032 CharBigrams::CharBigrams() { 00033 memset(&bigram_table_, 0, sizeof(bigram_table_)); 00034 } 00035 00036 CharBigrams::~CharBigrams() { 00037 if (bigram_table_.char_bigram != NULL) { 00038 for (int ch1 = 0; ch1 <= bigram_table_.max_char; ch1++) { 00039 CharBigram *char_bigram = bigram_table_.char_bigram + ch1; 00040 00041 if (char_bigram->bigram != NULL) { 00042 delete []char_bigram->bigram; 00043 } 00044 } 00045 delete []bigram_table_.char_bigram; 00046 } 00047 } 00048 00049 CharBigrams *CharBigrams::Create(const string &data_file_path, 00050 const string &lang) { 00051 string file_name; 00052 string str; 00053 00054 file_name = data_file_path + lang; 00055 file_name += ".cube.bigrams"; 00056 00057 // load the string into memory 00058 if (!CubeUtils::ReadFileToString(file_name, &str)) { 00059 return NULL; 00060 } 00061 00062 // construct a new object 00063 CharBigrams *char_bigrams_obj = new CharBigrams(); 00064 if (char_bigrams_obj == NULL) { 00065 fprintf(stderr, "Cube ERROR (CharBigrams::Create): could not create " 00066 "character bigrams object.\n"); 00067 return NULL; 00068 } 00069 CharBigramTable *table = &char_bigrams_obj->bigram_table_; 00070 00071 table->total_cnt = 0; 00072 table->max_char = -1; 00073 table->char_bigram = NULL; 00074 00075 // split into lines 00076 vector<string> str_vec; 00077 CubeUtils::SplitStringUsing(str, "\r\n", &str_vec); 00078 00079 for (int big = 0; big < str_vec.size(); big++) { 00080 char_32 ch1; 00081 char_32 ch2; 00082 int cnt; 00083 if (sscanf(str_vec[big].c_str(), "%d %x %x", &cnt, &ch1, &ch2) != 3) { 00084 fprintf(stderr, "Cube ERROR (CharBigrams::Create): invalid format " 00085 "reading line: %s\n", str_vec[big].c_str()); 00086 return NULL; 00087 } 00088 00089 // expand the bigram table 00090 if (ch1 > table->max_char) { 00091 CharBigram *char_bigram = new CharBigram[ch1 + 1]; 00092 if (char_bigram == NULL) { 00093 fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating " 00094 "additional memory for character bigram table.\n"); 00095 return NULL; 00096 } 00097 00098 if (table->char_bigram != NULL && table->max_char >= 0) { 00099 memcpy(char_bigram, table->char_bigram, 00100 (table->max_char + 1) * sizeof(*char_bigram)); 00101 00102 delete []table->char_bigram; 00103 } 00104 table->char_bigram = char_bigram; 00105 00106 // init 00107 for (int new_big = table->max_char + 1; new_big <= ch1; new_big++) { 00108 table->char_bigram[new_big].total_cnt = 0; 00109 table->char_bigram[new_big].max_char = -1; 00110 table->char_bigram[new_big].bigram = NULL; 00111 } 00112 table->max_char = ch1; 00113 } 00114 00115 if (ch2 > table->char_bigram[ch1].max_char) { 00116 Bigram *bigram = new Bigram[ch2 + 1]; 00117 if (bigram == NULL) { 00118 fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating " 00119 "memory for bigram.\n"); 00120 return NULL; 00121 } 00122 00123 if (table->char_bigram[ch1].bigram != NULL && 00124 table->char_bigram[ch1].max_char >= 0) { 00125 memcpy(bigram, table->char_bigram[ch1].bigram, 00126 (table->char_bigram[ch1].max_char + 1) * sizeof(*bigram)); 00127 delete []table->char_bigram[ch1].bigram; 00128 } 00129 table->char_bigram[ch1].bigram = bigram; 00130 00131 // init 00132 for (int new_big = table->char_bigram[ch1].max_char + 1; 00133 new_big <= ch2; new_big++) { 00134 table->char_bigram[ch1].bigram[new_big].cnt = 0; 00135 } 00136 table->char_bigram[ch1].max_char = ch2; 00137 } 00138 00139 table->char_bigram[ch1].bigram[ch2].cnt = cnt; 00140 table->char_bigram[ch1].total_cnt += cnt; 00141 table->total_cnt += cnt; 00142 } 00143 00144 // compute costs (-log probs) 00145 table->worst_cost = static_cast<int>( 00146 -PROB2COST_SCALE * log(0.5 / table->total_cnt)); 00147 for (char_32 ch1 = 0; ch1 <= table->max_char; ch1++) { 00148 for (char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) { 00149 int cnt = table->char_bigram[ch1].bigram[ch2].cnt; 00150 table->char_bigram[ch1].bigram[ch2].cost = 00151 static_cast<int>(-PROB2COST_SCALE * 00152 log(MAX(0.5, static_cast<double>(cnt)) / 00153 table->total_cnt)); 00154 } 00155 } 00156 return char_bigrams_obj; 00157 } 00158 00159 int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const { 00160 if (ch1 > bigram_table_.max_char) { 00161 return bigram_table_.worst_cost; 00162 } 00163 if (ch2 > bigram_table_.char_bigram[ch1].max_char) { 00164 return bigram_table_.worst_cost; 00165 } 00166 return bigram_table_.char_bigram[ch1].bigram[ch2].cost; 00167 } 00168 00169 int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const { 00170 if (!char_32_ptr || char_32_ptr[0] == 0) { 00171 return bigram_table_.worst_cost; 00172 } 00173 int cost = MeanCostWithSpaces(char_32_ptr); 00174 if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant && 00175 CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) { 00176 char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set); 00177 if (lower_32 && lower_32[0] != 0) { 00178 int cost_lower = MeanCostWithSpaces(lower_32); 00179 cost = MIN(cost, cost_lower); 00180 delete [] lower_32; 00181 } 00182 char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set); 00183 if (upper_32 && upper_32[0] != 0) { 00184 int cost_upper = MeanCostWithSpaces(upper_32); 00185 cost = MIN(cost, cost_upper); 00186 delete [] upper_32; 00187 } 00188 } 00189 return cost; 00190 } 00191 00192 int CharBigrams::MeanCostWithSpaces(const char_32 *char_32_ptr) const { 00193 if (!char_32_ptr) 00194 return bigram_table_.worst_cost; 00195 int len = CubeUtils::StrLen(char_32_ptr); 00196 int cost = 0; 00197 int c = 0; 00198 cost = PairCost(' ', char_32_ptr[0]); 00199 for (c = 1; c < len; c++) { 00200 cost += PairCost(char_32_ptr[c - 1], char_32_ptr[c]); 00201 } 00202 cost += PairCost(char_32_ptr[len - 1], ' '); 00203 return static_cast<int>(cost / static_cast<double>(len + 1)); 00204 } 00205 } // namespace tesseract