tesseract-doc/word__unigrams_8h_source.html

00001  /**********************************************************************
00002  * File:        word_unigrams.h
00003  * Description: Declaration of the Word Unigrams Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 // The WordUnigrams class holds the unigrams of the most frequent set of words
00021 // in a language. It is an optional component of the Cube OCR engine. If
00022 // present, the unigram cost of a word is aggregated with the other costs
00023 // (Recognition, Language Model, Size) to compute a cost for a word.
00024 // The word list is assumed to be sorted in lexicographic order.
00025
00026 #ifndef WORD_UNIGRAMS_H
00027 #define WORD_UNIGRAMS_H
00028
00029 #include <string>
00030 #include "char_set.h"
00031 #include "lang_model.h"
00032
00033 namespace tesseract {
00034 class WordUnigrams {
00035  public:
00036   WordUnigrams();
00037   ~WordUnigrams();
00038   // Load the word-list and unigrams from file and create an object
00039   // The word list is assumed to be sorted
00040   static WordUnigrams *Create(const string &data_file_path,
00041                               const string &lang);
00042   // Compute the unigram cost of a UTF-32 string. Splits into
00043   // space-separated tokens, strips trailing punctuation from each
00044   // token, evaluates case properties, and calls internal Cost()
00045   // function on UTF-8 version. To avoid unnecessarily penalizing
00046   // all-one-case words or capitalized words (first-letter
00047   // upper-case and remaining letters lower-case) when not all
00048   // versions of the word appear in the <lang>.cube.word-freq file, a
00049   // case-invariant cost is computed in those cases, assuming the word
00050   // meets a minimum length.
00051   int Cost(const char_32 *str32, LangModel *lang_mod,
00052            CharSet *char_set) const;
00053  protected:
00054   // Compute the word unigram cost of a UTF-8 string with binary
00055   // search of sorted words_ array.
00056   int CostInternal(const char *str) const;
00057  private:
00058   // Only words this length or greater qualify for all-numeric or
00059   // case-invariant word unigram cost.
00060   static const int kMinLengthNumOrCaseInvariant = 4;
00061
00062   int word_cnt_;
00063   char **words_;
00064   int *costs_;
00065   int not_in_list_cost_;
00066 };
00067 }
00068
00069 #endif  // WORD_UNIGRAMS_H