Tesseract 3.02
tesseract-ocr/cube/word_unigrams.cpp
/**********************************************************************
 * File:        word_unigrams.cpp
 * Description: Implementation of the Word Unigrams Class
 * Author:      Ahmad Abdulkader
 * Created:     2008
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <string>
#include <vector>
#include <algorithm>

#include "const.h"
#include "cube_utils.h"
#include "ndminx.h"
#include "word_unigrams.h"

namespace tesseract {

WordUnigrams::WordUnigrams() {
  costs_ = NULL;
  words_ = NULL;
  word_cnt_ = 0;
}

WordUnigrams::~WordUnigrams() {
  if (words_ != NULL) {
    if (words_[0] != NULL) {
      delete []words_[0];
    }

    delete []words_;
    words_ = NULL;
  }

  if (costs_ != NULL) {
    delete []costs_;
  }
}

// Load the word list and unigram costs from file and create a
// WordUnigrams object. The word list is assumed to be sorted in
// lexicographic order.
WordUnigrams *WordUnigrams::Create(const string &data_file_path,
                                   const string &lang) {
  string file_name;
  string str;

  file_name = data_file_path + lang;
  file_name += ".cube.word-freq";

  // load the string into memory
  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
    return NULL;
  }

  // split into whitespace-separated tokens (words and costs alternate)
  vector<string> str_vec;
  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
  if (str_vec.size() < 2) {
    return NULL;
  }

  // allocate memory
  WordUnigrams *word_unigrams_obj = new WordUnigrams();
  if (word_unigrams_obj == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
            "word unigrams object.\n");
    return NULL;
  }

  int full_len = str.length();
  int word_cnt = str_vec.size() / 2;
  word_unigrams_obj->words_ = new char*[word_cnt];
  word_unigrams_obj->costs_ = new int[word_cnt];

  if (word_unigrams_obj->words_ == NULL ||
      word_unigrams_obj->costs_ == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  word_unigrams_obj->words_[0] = new char[full_len];
  if (word_unigrams_obj->words_[0] == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  // construct sorted list of words and costs
  word_unigrams_obj->word_cnt_ = 0;
  char *char_buff = word_unigrams_obj->words_[0];
  word_cnt = 0;
  int max_cost = 0;

  for (int wrd = 0; wrd + 1 < str_vec.size(); wrd += 2) {
    word_unigrams_obj->words_[word_cnt] = char_buff;

    strcpy(char_buff, str_vec[wrd].c_str());
    char_buff += (str_vec[wrd].length() + 1);

    if (sscanf(str_vec[wrd + 1].c_str(), "%d",
               word_unigrams_obj->costs_ + word_cnt) != 1) {
      fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
              "word unigram data.\n");
      delete word_unigrams_obj;
      return NULL;
    }
    // update max cost
    max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
    word_cnt++;
  }
  word_unigrams_obj->word_cnt_ = word_cnt;

  // Compute the not-in-list cost, i.e., the cost assigned to a word that
  // does not appear in the list.
  // [ahmadab]: This can be computed as follows:
  // - Given that the distribution of words follows Zipf's law:
  //   (F = K / (rank ^ S)), where S is slightly > 1.0
  // - Number of words in the list is N
  // - The mean frequency of a word that did not appear in the list is the
  //   area under the rest of the Zipf curve divided by 2 (the mean)
  // - The area would be the integral from N to infinity =
  //   (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
  //   = max_cost + 2*LOG(N)
  word_unigrams_obj->not_in_list_cost_ = max_cost +
      (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
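  // Illustrative arithmetic, assuming CubeUtils::Prob2Cost(p) is
  // proportional to -log(p) (its exact scaling is defined elsewhere):
  // 2 * Prob2Cost(1.0 / word_cnt) realizes the 2*LOG(N) term above, so
  // for a list of N = 100,000 words the penalty added on top of max_cost
  // corresponds to roughly 2 * ln(100000) ~= 23 in natural-log units.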
  // success
  return word_unigrams_obj;
}

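// Minimal usage sketch; the tessdata directory and the "eng" language code
// below are illustrative placeholders. Note that Create() concatenates
// data_file_path + lang + ".cube.word-freq" verbatim, so the path must end
// with a directory separator. The caller owns the returned object.
//
//   WordUnigrams *unigrams =
//       WordUnigrams::Create("/usr/local/share/tessdata/", "eng");
//   if (unigrams != NULL) {
//     // ... use unigrams->Cost() on candidate words during recognition ...
//     delete unigrams;
//   }
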
// Split input into space-separated tokens, strip trailing punctuation
// from each, determine case properties, call UTF-8 flavor of cost
// function on each word, and aggregate all into single mean word
// cost.
int WordUnigrams::Cost(const char_32 *key_str32,
                       LangModel *lang_mod,
                       CharSet *char_set) const {
  if (!key_str32)
    return 0;
  // convert string to UTF8 to split into space-separated words
  string key_str;
  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
  vector<string> words;
  CubeUtils::SplitStringUsing(key_str, " \t", &words);

  // no words => no cost
  if (words.size() <= 0) {
    return 0;
  }

  // aggregate the costs of all the words
  int cost = 0;
  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
    // convert each word back to UTF32 for analyzing case and punctuation
    string_32 str32;
    CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
    int len = CubeUtils::StrLen(str32.c_str());

    // strip all trailing punctuation
    string clean_str;
    int clean_len = len;
    bool trunc = false;
    while (clean_len > 0 &&
           lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
      --clean_len;
      trunc = true;
    }

    // If either the original string was not truncated (no trailing
    // punctuation) or the entire string was removed (all characters
    // are trailing punctuation), evaluate original word as is;
    // otherwise, copy all but the trailing punctuation characters
    char_32 *clean_str32 = NULL;
    if (clean_len == 0 || !trunc) {
      clean_str32 = CubeUtils::StrDup(str32.c_str());
    } else {
      clean_str32 = new char_32[clean_len + 1];
      for (int i = 0; i < clean_len; ++i) {
        clean_str32[i] = str32[i];
      }
      clean_str32[clean_len] = '\0';
    }
    ASSERT_HOST(clean_str32 != NULL);

    string str8;
    CubeUtils::UTF32ToUTF8(clean_str32, &str8);
    int word_cost = CostInternal(str8.c_str());

    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
        word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
        delete [] upper_32;
      }
    }

    if (clean_len >= kMinLengthNumOrCaseInvariant) {
      // if characters are all numeric, incur 0 word cost
      bool is_numeric = true;
      for (int i = 0; i < clean_len; ++i) {
        if (!lang_mod->IsDigit(clean_str32[i]))
          is_numeric = false;
      }
      if (is_numeric)
        word_cost = 0;
    }
    delete [] clean_str32;
    cost += word_cost;
  }  // word_idx

  // return the mean cost
  return static_cast<int>(cost / static_cast<double>(words.size()));
}

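// Illustrative walk-through, assuming ',' and '!' are classified as
// trailing punctuation by the language model: the input "Hello, world!" is
// split into the tokens "Hello," and "world!", which are stripped to
// "Hello" and "world". If a stripped token is at least
// kMinLengthNumOrCaseInvariant characters long (the constant is defined
// elsewhere) and CubeUtils::IsCaseInvariant() accepts it, the costs of its
// all-lower-case and all-upper-case forms ("hello", "HELLO") are also
// looked up and the minimum is kept. A purely numeric token such as "1984"
// incurs a word cost of 0, and the function returns the mean of the
// per-token costs.
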
// Search for UTF-8 string using binary search of sorted words_ array.
int WordUnigrams::CostInternal(const char *key_str) const {
  if (strlen(key_str) == 0)
    return not_in_list_cost_;
  int hi = word_cnt_ - 1;
  int lo = 0;
  while (lo <= hi) {
    int current = (hi + lo) / 2;
    int comp = strcmp(key_str, words_[current]);
    // a match
    if (comp == 0) {
      return costs_[current];
    }
    if (comp < 0) {
      // go lower
      hi = current - 1;
    } else {
      // go higher
      lo = current + 1;
    }
  }
  return not_in_list_cost_;
}
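
// Illustrative lookups, assuming a hypothetical three-entry table: because
// the search uses strcmp(), words_ must be sorted in byte-wise order of the
// UTF-8 strings, in which all upper-case ASCII letters precede lower-case
// ones.
//
//   words_ = { "OCR", "Zebra", "apple" }     // strcmp (byte-wise) order
//   CostInternal("apple")  ->  costs_[2]
//   CostInternal("Apple")  ->  not_in_list_cost_  (exact byte match needed)
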
}  // namespace tesseract