Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: word_unigrams.h 00003 * Description: Declaration of the Word Unigrams Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The WordUnigram class holds the unigrams of the most frequent set of words 00021 // in a language. It is an optional component of the Cube OCR engine. If 00022 // present, the unigram cost of a word is aggregated with the other costs 00023 // (Recognition, Language Model, Size) to compute a cost for a word. 00024 // The word list is assumed to be sorted in lexicographic order. 00025 00026 #ifndef WORD_UNIGRAMS_H 00027 #define WORD_UNIGRAMS_H 00028 00029 #include <string> 00030 #include "char_set.h" 00031 #include "lang_model.h" 00032 00033 namespace tesseract { 00034 class WordUnigrams { 00035 public: 00036 WordUnigrams(); 00037 ~WordUnigrams(); 00038 // Load the word-list and unigrams from file and create an object 00039 // The word list is assumed to be sorted 00040 static WordUnigrams *Create(const string &data_file_path, 00041 const string &lang); 00042 // Compute the unigram cost of a UTF-32 string. Splits into 00043 // space-separated tokens, strips trailing punctuation from each 00044 // token, evaluates case properties, and calls internal Cost() 00045 // function on UTF-8 version. To avoid unnecessarily penalizing 00046 // all-one-case words or capitalized words (first-letter 00047 // upper-case and remaining letters lower-case) when not all 00048 // versions of the word appear in the <lang>.cube.word-freq file, a 00049 // case-invariant cost is computed in those cases, assuming the word 00050 // meets a minimum length. 00051 int Cost(const char_32 *str32, LangModel *lang_mod, 00052 CharSet *char_set) const; 00053 protected: 00054 // Compute the word unigram cost of a UTF-8 string with binary 00055 // search of sorted words_ array. 00056 int CostInternal(const char *str) const; 00057 private: 00058 // Only words this length or greater qualify for all-numeric or 00059 // case-invariant word unigram cost. 00060 static const int kMinLengthNumOrCaseInvariant = 4; 00061 00062 int word_cnt_; 00063 char **words_; 00064 int *costs_; 00065 int not_in_list_cost_; 00066 }; 00067 } 00068 00069 #endif // WORD_UNIGRAMS_H