Tesseract 3.02
/**********************************************************************
 * File:        word_unigrams.cpp
 * Description: Implementation of the Word Unigrams Class
 * Author:      Ahmad Abdulkader
 * Created:     2008
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <math.h>
#include <string>
#include <vector>
#include <algorithm>

#include "const.h"
#include "cube_utils.h"
#include "ndminx.h"
#include "word_unigrams.h"

namespace tesseract {

WordUnigrams::WordUnigrams() {
  costs_ = NULL;
  words_ = NULL;
  word_cnt_ = 0;
}

WordUnigrams::~WordUnigrams() {
  if (words_ != NULL) {
    if (words_[0] != NULL) {
      delete []words_[0];
    }

    delete []words_;
    words_ = NULL;
  }

  if (costs_ != NULL) {
    delete []costs_;
  }
}
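// Input format expected by Create() below: <data_file_path><lang>.cube.word-freq
// is a plain-text file of whitespace-separated (word, integer cost) pairs,
// with the words in lexicographic order. A hypothetical fragment:
//   aardvark 97
//   apple 54
//   zebra 88
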
// Load the word-list and unigrams from file and create an object
// The word list is assumed to be sorted in lexicographic order.
WordUnigrams *WordUnigrams::Create(const string &data_file_path,
                                   const string &lang) {
  string file_name;
  string str;

  file_name = data_file_path + lang;
  file_name += ".cube.word-freq";

  // load the string into memory
  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
    return NULL;
  }

  // split into lines
  vector<string> str_vec;
  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
  if (str_vec.size() < 2) {
    return NULL;
  }

  // allocate memory
  WordUnigrams *word_unigrams_obj = new WordUnigrams();
  if (word_unigrams_obj == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
            "word unigrams object.\n");
    return NULL;
  }

  int full_len = str.length();
  int word_cnt = str_vec.size() / 2;
  word_unigrams_obj->words_ = new char*[word_cnt];
  word_unigrams_obj->costs_ = new int[word_cnt];

  if (word_unigrams_obj->words_ == NULL ||
      word_unigrams_obj->costs_ == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  word_unigrams_obj->words_[0] = new char[full_len];
  if (word_unigrams_obj->words_[0] == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  // construct sorted list of words and costs
  word_unigrams_obj->word_cnt_ = 0;
  char *char_buff = word_unigrams_obj->words_[0];
  word_cnt = 0;
  int max_cost = 0;

  for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
    word_unigrams_obj->words_[word_cnt] = char_buff;

    strcpy(char_buff, str_vec[wrd].c_str());
    char_buff += (str_vec[wrd].length() + 1);

    if (sscanf(str_vec[wrd + 1].c_str(), "%d",
               word_unigrams_obj->costs_ + word_cnt) != 1) {
      fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
              "word unigram data.\n");
      delete word_unigrams_obj;
      return NULL;
    }
    // update max cost
    max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
    word_cnt++;
  }
  word_unigrams_obj->word_cnt_ = word_cnt;

  // compute the not-in-list-cost by assuming that a word not in the list
  // lies in the tail of the Zipf distribution described below.
  // [ahmadab]: This can be computed as follows:
  // - Given that the distribution of words follows Zipf's law:
  //   (F = K / (rank ^ S)), where S is slightly > 1.0
  // - Number of words in the list is N
  // - The mean frequency of a word that did not appear in the list is the
  //   area under the rest of the Zipf's curve divided by 2 (the mean)
  // - The area would be the bound integral from N to infinity =
  //   (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
  //   = max_cost + 2*LOG(N)
  word_unigrams_obj->not_in_list_cost_ = max_cost +
      (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
  // success
  return word_unigrams_obj;
}
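// A worked instance of the unlisted-word penalty computed at the end of
// Create() (the numbers here are illustrative only): for a list of
// N = 50000 words, not_in_list_cost_ = max_cost + 2 * Prob2Cost(1.0 / 50000),
// i.e. the cost of the rarest listed word plus twice the cost of a
// 1-in-50000 event, matching the max_cost + 2*LOG(N) bound derived in the
// comment block above.
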
// Split input into space-separated tokens, strip trailing punctuation
// from each, determine case properties, call UTF-8 flavor of cost
// function on each word, and aggregate all into single mean word
// cost.
int WordUnigrams::Cost(const char_32 *key_str32,
                       LangModel *lang_mod,
                       CharSet *char_set) const {
  if (!key_str32)
    return 0;
  // convert string to UTF8 to split into space-separated words
  string key_str;
  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
  vector<string> words;
  CubeUtils::SplitStringUsing(key_str, " \t", &words);

  // no words => no cost
  if (words.size() <= 0) {
    return 0;
  }

  // aggregate the costs of all the words
  int cost = 0;
  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
    // convert each word back to UTF32 for analyzing case and punctuation
    string_32 str32;
    CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
    int len = CubeUtils::StrLen(str32.c_str());

    // strip all trailing punctuation
    string clean_str;
    int clean_len = len;
    bool trunc = false;
    while (clean_len > 0 &&
           lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
      --clean_len;
      trunc = true;
    }

    // If either the original string was not truncated (no trailing
    // punctuation) or the entire string was removed (all characters
    // are trailing punctuation), evaluate original word as is;
    // otherwise, copy all but the trailing punctuation characters
    char_32 *clean_str32 = NULL;
    if (clean_len == 0 || !trunc) {
      clean_str32 = CubeUtils::StrDup(str32.c_str());
    } else {
      clean_str32 = new char_32[clean_len + 1];
      for (int i = 0; i < clean_len; ++i) {
        clean_str32[i] = str32[i];
      }
      clean_str32[clean_len] = '\0';
    }
    ASSERT_HOST(clean_str32 != NULL);

    string str8;
    CubeUtils::UTF32ToUTF8(clean_str32, &str8);
    int word_cost = CostInternal(str8.c_str());

    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
        word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
        delete [] upper_32;
      }
    }

    if (clean_len >= kMinLengthNumOrCaseInvariant) {
      // if characters are all numeric, incur 0 word cost
      bool is_numeric = true;
      for (int i = 0; i < clean_len; ++i) {
        if (!lang_mod->IsDigit(clean_str32[i]))
          is_numeric = false;
      }
      if (is_numeric)
        word_cost = 0;
    }
    delete [] clean_str32;
    cost += word_cost;
  }  // word_idx

  // return the mean cost
  return static_cast<int>(cost / static_cast<double>(words.size()));
}

// Search for UTF-8 string using binary search of sorted words_ array.
int WordUnigrams::CostInternal(const char *key_str) const {
  if (strlen(key_str) == 0)
    return not_in_list_cost_;
  int hi = word_cnt_ - 1;
  int lo = 0;
  while (lo <= hi) {
    int current = (hi + lo) / 2;
    int comp = strcmp(key_str, words_[current]);
    // a match
    if (comp == 0) {
      return costs_[current];
    }
    if (comp < 0) {
      // go lower
      hi = current - 1;
    } else {
      // go higher
      lo = current + 1;
    }
  }
  return not_in_list_cost_;
}
}  // namespace tesseract
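
// Illustrative usage sketch; the function and variable names below are
// hypothetical, and it assumes the caller already owns the LangModel and
// CharSet objects (as the Cube recognition context does). Kept inside
// #if 0 so it does not affect compilation.
#if 0
static int ExampleWordCost(const std::string &data_path,
                           const char_32 *word_str32,
                           tesseract::LangModel *lang_mod,
                           tesseract::CharSet *char_set) {
  // Loads <data_path>eng.cube.word-freq into a WordUnigrams object.
  tesseract::WordUnigrams *unigrams =
      tesseract::WordUnigrams::Create(data_path, "eng");
  if (unigrams == NULL)
    return 0;  // treat a missing word-freq file as "no unigram cost"
  // Mean unigram cost of the space-separated words in word_str32.
  int cost = unigrams->Cost(word_str32, lang_mod, char_set);
  delete unigrams;
  return cost;
}
#endif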