Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: char_bigrams.h 00003 * Description: Declaration of a Character Bigrams Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2007 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The CharBigram class represents the interface to the character bigram 00021 // table used by Cube 00022 // A CharBigram object can be constructed from the Char Bigrams file 00023 // Given a sequence of characters, the "Cost" method returns the Char Bigram 00024 // cost of the string according to the table 00025 00026 #ifndef CHAR_BIGRAMS_H 00027 #define CHAR_BIGRAMS_H 00028 00029 #include <string> 00030 #include "char_set.h" 00031 00032 namespace tesseract { 00033 00034 // structure representing a single bigram value 00035 struct Bigram { 00036 int cnt; 00037 int cost; 00038 }; 00039 00040 // structure representing the char bigram array of characters 00041 // following a specific character 00042 struct CharBigram { 00043 int total_cnt; 00044 char_32 max_char; 00045 Bigram *bigram; 00046 }; 00047 00048 // structure representing the whole bigram table 00049 struct CharBigramTable { 00050 int total_cnt; 00051 int worst_cost; 00052 char_32 max_char; 00053 CharBigram *char_bigram; 00054 }; 00055 00056 class CharBigrams { 00057 public: 00058 CharBigrams(); 00059 ~CharBigrams(); 00060 // Construct the CharBigrams class from a file 00061 static CharBigrams *Create(const string &data_file_path, 00062 const string &lang); 00063 // Top-level function to return the mean character bigram cost of a 00064 // sequence of characters. If char_set is not NULL, use 00065 // tesseract functions to return a case-invariant cost. 00066 // This avoids unnecessarily penalizing all-one-case words or 00067 // capitalized words (first-letter upper-case and remaining letters 00068 // lower-case). 00069 int Cost(const char_32 *str, CharSet *char_set) const; 00070 00071 protected: 00072 // Returns the character bigram cost of two characters. 00073 int PairCost(char_32 ch1, char_32 ch2) const; 00074 // Returns the mean character bigram cost of a sequence of 00075 // characters. Adds a space at the beginning and end to account for 00076 // cost of starting and ending characters. 00077 int MeanCostWithSpaces(const char_32 *char_32_ptr) const; 00078 00079 private: 00080 // Only words this length or greater qualify for case-invariant character 00081 // bigram cost. 00082 static const int kMinLengthCaseInvariant = 4; 00083 00084 00085 CharBigramTable bigram_table_; 00086 }; 00087 } 00088 00089 #endif // CHAR_BIGRAMS_H