Tesseract
3.02
|
#include <word_unigrams.h>
Public Member Functions | |
WordUnigrams () | |
~WordUnigrams () | |
int | Cost (const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const |
Static Public Member Functions | |
static WordUnigrams * | Create (const string &data_file_path, const string &lang) |
Protected Member Functions | |
int | CostInternal (const char *str) const |
Definition at line 34 of file word_unigrams.h.
tesseract::WordUnigrams::WordUnigrams | ( | ) |
Definition at line 32 of file word_unigrams.cpp.
tesseract::WordUnigrams::~WordUnigrams | ( | ) |
int tesseract::WordUnigrams::Cost | ( | const char_32 * | str32, |
LangModel * | lang_mod, | ||
CharSet * | char_set | ||
) | const |
Definition at line 150 of file word_unigrams.cpp.
{
  // Computes the mean unigram cost of the words in key_str32.
  //   key_str32: NUL-terminated UTF-32 string; NULL yields a cost of 0.
  //   lang_mod:  queried for trailing punctuation and digits.
  //   char_set:  used for the case-invariance test and case conversion.
  // Returns the sum of the per-word costs divided by the number of words,
  // truncated to int, or 0 when there is nothing to score.
  if (!key_str32) {
    return 0;
  }

  // convert string to UTF8 to split into space-separated words
  string key_str;
  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
  vector<string> words;
  CubeUtils::SplitStringUsing(key_str, " \t", &words);

  // no words => no cost
  if (words.empty()) {
    return 0;
  }

  // aggregate the costs of all the words
  int cost = 0;
  for (size_t word_idx = 0; word_idx < words.size(); word_idx++) {
    // convert each word back to UTF32 for analyzing case and punctuation
    string_32 str32;
    CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
    int len = CubeUtils::StrLen(str32.c_str());

    // strip all trailing punctuation
    int clean_len = len;
    bool trunc = false;
    while (clean_len > 0 &&
           lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
      --clean_len;
      trunc = true;
    }

    // If either the original string was not truncated (no trailing
    // punctuation) or the entire string was removed (all characters
    // are trailing punctuation), evaluate original word as is;
    // otherwise, copy all but the trailing punctuation characters
    char_32 *clean_str32 = NULL;
    if (clean_len == 0 || !trunc) {
      clean_str32 = CubeUtils::StrDup(str32.c_str());
    } else {
      clean_str32 = new char_32[clean_len + 1];
      for (int i = 0; i < clean_len; ++i) {
        clean_str32[i] = str32[i];
      }
      clean_str32[clean_len] = '\0';
    }
    ASSERT_HOST(clean_str32 != NULL);

    string str8;
    CubeUtils::UTF32ToUTF8(clean_str32, &str8);
    int word_cost = CostInternal(str8.c_str());

    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
        word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
        delete [] upper_32;
      }
    }

    if (clean_len >= kMinLengthNumOrCaseInvariant) {
      // if characters are all numeric, incur 0 word cost
      bool is_numeric = true;
      for (int i = 0; i < clean_len; ++i) {
        if (!lang_mod->IsDigit(clean_str32[i])) {
          is_numeric = false;
          break;  // one non-digit settles it; no need to scan the rest
        }
      }
      if (is_numeric) {
        word_cost = 0;
      }
    }

    // clean_str32 is freed with delete [] on both allocation paths above
    delete [] clean_str32;
    cost += word_cost;
  }  // word_idx

  // return the mean cost
  return static_cast<int>(cost / static_cast<double>(words.size()));
}
int tesseract::WordUnigrams::CostInternal | ( | const char * | str | ) | const [protected] |
Definition at line 243 of file word_unigrams.cpp.
{
  // Looks up key_str in words_ and returns the matching entry of costs_,
  // or not_in_list_cost_ when the key is empty or no entry matches.
  if (key_str[0] == '\0') {
    return not_in_list_cost_;
  }

  // Binary search over the inclusive index range [low, high].
  // NOTE(review): this presumes words_ is ordered consistently with strcmp;
  // nothing in this function establishes that ordering - confirm at the loader.
  int low = 0;
  int high = word_cnt_ - 1;
  while (low <= high) {
    const int mid = (high + low) / 2;
    const int order = strcmp(key_str, words_[mid]);
    if (order < 0) {
      // key sorts before the probe: keep the lower half
      high = mid - 1;
    } else if (order > 0) {
      // key sorts after the probe: keep the upper half
      low = mid + 1;
    } else {
      // a match
      return costs_[mid];
    }
  }

  return not_in_list_cost_;
}
WordUnigrams * tesseract::WordUnigrams::Create | ( | const string & | data_file_path, |
const string & | lang | ||
) | [static] |
Definition at line 55 of file word_unigrams.cpp.
{
  // Builds a WordUnigrams object from the file
  // data_file_path + lang + ".cube.word-freq".
  // The file contents are split on the delimiter set "\r\n \t" and the
  // resulting tokens are consumed in pairs: a word followed by its cost.
  // Returns NULL if the file cannot be read, if it yields fewer than two
  // tokens, or if a cost token cannot be parsed as an integer.
  // A trailing token with no partner (odd token count) is ignored.
  string file_name;
  string str;

  file_name = data_file_path + lang;
  file_name += ".cube.word-freq";

  // load the string into memory
  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
    return NULL;
  }

  // split into tokens
  vector<string> str_vec;
  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
  if (str_vec.size() < 2) {
    return NULL;
  }

  // allocate memory
  WordUnigrams *word_unigrams_obj = new WordUnigrams();
  if (word_unigrams_obj == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
            "word unigrams object.\n");
    return NULL;
  }

  // One slot per complete (word, cost) pair.
  int full_len = str.length();
  int word_cnt = str_vec.size() / 2;
  word_unigrams_obj->words_ = new char*[word_cnt];
  word_unigrams_obj->costs_ = new int[word_cnt];

  if (word_unigrams_obj->words_ == NULL ||
      word_unigrams_obj->costs_ == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  // All words share a single character buffer anchored at words_[0]; each
  // words_[i] points at the start of its NUL-terminated word inside it.
  // NOTE(review): sizing the buffer to the file length presumes the splitter
  // never returns empty tokens - confirm against SplitStringUsing.
  word_unigrams_obj->words_[0] = new char[full_len];
  if (word_unigrams_obj->words_[0] == NULL) {
    fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
            "word unigram fields.\n");
    delete word_unigrams_obj;
    return NULL;
  }

  // construct list of words and costs, in file order
  word_unigrams_obj->word_cnt_ = 0;
  char *char_buff = word_unigrams_obj->words_[0];
  word_cnt = 0;
  int max_cost = 0;

  // Only visit complete pairs. With an odd token count the previous bound
  // (wrd < size) ran one extra iteration that read str_vec[size] and wrote
  // past the end of words_ and costs_.
  for (size_t wrd = 0; wrd + 1 < str_vec.size(); wrd += 2) {
    word_unigrams_obj->words_[word_cnt] = char_buff;

    strcpy(char_buff, str_vec[wrd].c_str());
    char_buff += (str_vec[wrd].length() + 1);

    if (sscanf(str_vec[wrd + 1].c_str(), "%d",
               word_unigrams_obj->costs_ + word_cnt) != 1) {
      fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
              "word unigram data.\n");
      delete word_unigrams_obj;
      return NULL;
    }

    // update max cost
    max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
    word_cnt++;
  }
  word_unigrams_obj->word_cnt_ = word_cnt;

  // compute the not-in-list-cost, i.e. the cost of a word not in the list
  // [ahmadab]: This can be computed as follows:
  // - Given that the distribution of words follow Zipf's law:
  //   (F = K / (rank ^ S)), where s is slightly > 1.0
  // - Number of words in the list is N
  // - The mean frequency of a word that did not appear in the list is the
  //   area under the rest of the Zipf's curve divided by 2 (the mean)
  // - The area would be the bound integral from N to infinity =
  //   (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
  //   = max_cost + 2*LOG(N)
  word_unigrams_obj->not_in_list_cost_ = max_cost +
      (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));

  // success
  return word_unigrams_obj;
}