Tesseract  3.02
tesseract-ocr/cube/word_list_lang_model.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        word_list_lang_model.h
00003  * Description: Declaration of the Word List Language Model Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The WordListLangModel class abstracts a language model that is based on
00021 // a list of words. It inherits from the LangModel abstract class.
00022 // Besides providing the methods inherited from the LangModel abstract class,
00023 // the class provides methods to add new strings to the language model:
00024 // AddString & AddString32.
00025 
00026 #ifndef WORD_LIST_LANG_MODEL_H
00027 #define WORD_LIST_LANG_MODEL_H
00028 
00029 #include <vector>
00030 
00031 #include "cube_reco_context.h"
00032 #include "lang_model.h"
00033 #include "tess_lang_mod_edge.h"
00034 
00035 namespace tesseract {
00036 
00037 class Trie;
00038 
00039 class WordListLangModel : public LangModel {  // language model based on a list of words
00040  public:
00041   explicit WordListLangModel(CubeRecoContext *cntxt);
00042   ~WordListLangModel();
00043   // Returns an edge pointer to the root.
00044   LangModEdge *Root();
00045   // Returns the edges that fan out of the specified edge, and their count (presumably via edge_cnt -- confirm in the .cpp).
00046   LangModEdge **GetEdges(CharAltList *alt_list,
00047                          LangModEdge *edge,
00048                          int *edge_cnt);
00049   // Returns whether a sequence of 32-bit characters is valid within this language
00050   // model or net. An EndOfWord flag is specified; if true, the sequence has
00051   // to end on a valid word. The function also optionally returns the list
00052   // of language model edges traversed to parse the string.
00053   bool IsValidSequence(const char_32 *sequence,
00054                        bool eow_flag,
00055                        LangModEdge **edges);
00056   bool IsLeadingPunc(char_32 ch) { return false; }  // not yet implemented; always returns false
00057   bool IsTrailingPunc(char_32 ch) { return false; }  // not yet implemented; always returns false
00058   bool IsDigit(char_32 ch) { return false; }  // not yet implemented; always returns false
00059   // Adds a new UTF-8 string to the language model.
00060   bool AddString(const char *char_ptr);
00061   // Adds a new UTF-32 string to the language model.
00062   bool AddString32(const char_32 *char_32_ptr);
00063   // Computes all the variants of a 32-bit string in terms of the class-ids.
00064   // This is needed for languages that have ligatures. A word can then have
00065   // more than one spelling in terms of the class-ids.
00066   static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
00067                            string_32 str32,
00068                            vector<WERD_CHOICE *> *word_variants);
00069  private:
00070   // Constants needed to configure the language model.
00071   static const int kMaxEdge = 512;
00072   static const int kMaxDawgEdges = 20000;
00073 
00074   CubeRecoContext *cntxt_;
00075   Trie *dawg_;
00076   bool init_;
00077   // Initializes the language model.
00078   bool Init();
00079   // Cleans up the language model.
00080   void Cleanup();
00081   // Recursive helper function for the public WordVariants() overload.
00082   static void WordVariants(
00083       const CharSet &char_set,
00084       string_32 prefix_str32, WERD_CHOICE *word_so_far,
00085       string_32 str32,
00086       vector<WERD_CHOICE *> *word_variants);
00087 };
00088 }  // tesseract
00089 
00090 #endif  // WORD_LIST_LANG_MODEL_H