Tesseract  3.02
tesseract-ocr/cube/char_bigrams.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        char_bigrams.h
00003  * Description: Declaration of a Character Bigrams Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The CharBigrams class represents the interface to the character bigram
00021 // table used by Cube.
00022 // A CharBigrams object can be constructed from the char bigrams file.
00023 // Given a sequence of characters, the "Cost" method returns the char bigram
00024 // cost of the string according to the table.
00025 
00026 #ifndef CHAR_BIGRAMS_H
00027 #define CHAR_BIGRAMS_H
00028 
00029 #include <string>
00030 #include "char_set.h"
00031 
00032 namespace tesseract {
00033 
00034 // One entry of the character bigram table: a single count/cost pair.
00035 struct Bigram {
00036   int cnt;   // count stored for this bigram (what is counted is not shown here)
00037   int cost;  // cost stored for this bigram; presumably read by PairCost -- confirm
00038 };
00039 
00040 // The array of bigram entries for the characters that follow one
00041 // specific leading character.
00042 struct CharBigram {
00043   int total_cnt;     // presumably the sum of bigram[].cnt for this character -- confirm
00044   char_32 max_char;  // presumably the highest following-character code in bigram[] -- confirm
00045   Bigram *bigram;    // entries for following characters; ownership/indexing not shown here
00046 };
00047 
00048 // The complete bigram table, holding one CharBigram per leading character.
00049 struct CharBigramTable {
00050   int total_cnt;            // presumably the total count over the whole table -- confirm
00051   int worst_cost;           // presumably the highest cost in the table -- confirm against loader
00052   char_32 max_char;         // presumably the highest leading-character code in char_bigram[]
00053   CharBigram *char_bigram;  // per-leading-character arrays; ownership/indexing not shown here
00054 };
00055 
00056 class CharBigrams {  // Interface to the character bigram table used by Cube.
00057  public:
00058   CharBigrams();   // NOTE(review): body not shown; presumably leaves the table empty
00059   ~CharBigrams();  // NOTE(review): body not shown; presumably frees the table arrays
00060   // Factory: constructs a CharBigrams object from the char bigrams file.
00061   static CharBigrams *Create(const std::string &data_file_path,
00062                              const std::string &lang);
00063   // Top-level entry point: returns the mean character bigram cost of the
00064   // character sequence str.  When char_set is not NULL, tesseract
00065   // functions are used to return a case-invariant cost, which
00066   // avoids unnecessarily penalizing all-one-case words and capitalized
00067   // words (first letter upper-case, remaining letters lower-case).
00068   // NOTE(review): str is presumably 0-terminated -- no length is passed.
00069   int Cost(const char_32 *str, CharSet *char_set) const;
00070 
00071  protected:
00072   // Returns the character bigram cost of the pair (ch1, ch2).
00073   int PairCost(char_32 ch1, char_32 ch2) const;
00074   // Returns the mean character bigram cost of the sequence char_32_ptr.
00075   // A space is added at the beginning and at the end so that the cost of
00076   // the starting and ending characters is accounted for.
00077   int MeanCostWithSpaces(const char_32 *char_32_ptr) const;
00078 
00079  private:
00080   // Minimum word length: only words at least this long qualify for the
00081   // case-invariant character bigram cost.
00082   static const int kMinLengthCaseInvariant = 4;
00083   // Bigram table data used by this object.  NOTE(review): the table holds raw
00084   // pointers and no copy operations are declared -- confirm it is never copied.
00085   CharBigramTable bigram_table_;
00086 };
00087 }
00088 
00089 #endif  // CHAR_BIGRAMS_H