Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: word_list_lang_model.h 00003 * Description: Declaration of the Word List Language Model Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The WordListLangModel class abstracts a language model that is based on 00021 // a list of words. It inherits from the LangModel abstract class 00022 // Besides providing the methods inherited from the LangModel abstract class, 00023 // the class provided methods to add new strings to the Language Model: 00024 // AddString & AddString32 00025 00026 #ifndef WORD_LIST_LANG_MODEL_H 00027 #define WORD_LIST_LANG_MODEL_H 00028 00029 #include <vector> 00030 00031 #include "cube_reco_context.h" 00032 #include "lang_model.h" 00033 #include "tess_lang_mod_edge.h" 00034 00035 namespace tesseract { 00036 00037 class Trie; 00038 00039 class WordListLangModel : public LangModel { 00040 public: 00041 explicit WordListLangModel(CubeRecoContext *cntxt); 00042 ~WordListLangModel(); 00043 // Returns an edge pointer to the Root 00044 LangModEdge *Root(); 00045 // Returns the edges that fan-out of the specified edge and their count 00046 LangModEdge **GetEdges(CharAltList *alt_list, 00047 LangModEdge *edge, 00048 int *edge_cnt); 00049 // Returns is a sequence of 32-bit characters are valid within this language 00050 // model or net. And EndOfWord flag is specified. If true, the sequence has 00051 // to end on a valid word. The function also optionally returns the list 00052 // of language model edges traversed to parse the string 00053 bool IsValidSequence(const char_32 *sequence, 00054 bool eow_flag, 00055 LangModEdge **edges); 00056 bool IsLeadingPunc(char_32 ch) { return false; } // not yet implemented 00057 bool IsTrailingPunc(char_32 ch) { return false; } // not yet implemented 00058 bool IsDigit(char_32 ch) { return false; } // not yet implemented 00059 // Adds a new UTF-8 string to the language model 00060 bool AddString(const char *char_ptr); 00061 // Adds a new UTF-32 string to the language model 00062 bool AddString32(const char_32 *char_32_ptr); 00063 // Compute all the variants of a 32-bit string in terms of the class-ids. 00064 // This is needed for languages that have ligatures. A word can then have 00065 // more than one spelling in terms of the class-ids. 00066 static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, 00067 string_32 str32, 00068 vector<WERD_CHOICE *> *word_variants); 00069 private: 00070 // constants needed to configure the language model 00071 static const int kMaxEdge = 512; 00072 static const int kMaxDawgEdges = 20000; 00073 00074 CubeRecoContext *cntxt_; 00075 Trie *dawg_; 00076 bool init_; 00077 // Initialize the language model 00078 bool Init(); 00079 // Cleanup 00080 void Cleanup(); 00081 // Recursive helper function for WordVariants(). 00082 static void WordVariants( 00083 const CharSet &char_set, 00084 string_32 prefix_str32, WERD_CHOICE *word_so_far, 00085 string_32 str32, 00086 vector<WERD_CHOICE *> *word_variants); 00087 }; 00088 } // tesseract 00089 00090 #endif // WORD_LIST_LANG_MODEL_H