Tesseract
3.02
|
/**********************************************************************
 * File:        lang_model.h
 * Description: Declaration of the Language Model Base Class
 * Author:      Ahmad Abdulkader
 * Created:     2007
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// The LanguageModel class abstracts a State machine that is modeled as a Trie
// structure.
The state machine models the language being recognized by the OCR 00022 // Engine 00023 // This is an abstract class that is to be inherited by any language model 00024 00025 #ifndef LANG_MODEL_H 00026 #define LANG_MODEL_H 00027 00028 #include "lang_mod_edge.h" 00029 #include "char_altlist.h" 00030 #include "char_set.h" 00031 #include "tuning_params.h" 00032 00033 namespace tesseract { 00034 class LangModel { 00035 public: 00036 LangModel() { 00037 ood_enabled_ = true; 00038 numeric_enabled_ = true; 00039 word_list_enabled_ = true; 00040 punc_enabled_ = true; 00041 } 00042 virtual ~LangModel() {} 00043 00044 // Returns an edge pointer to the Root 00045 virtual LangModEdge *Root() = 0; 00046 // Returns the edges that fan-out of the specified edge and their count 00047 virtual LangModEdge **GetEdges(CharAltList *alt_list, 00048 LangModEdge *parent_edge, 00049 int *edge_cnt) = 0; 00050 // Returns is a sequence of 32-bit characters are valid within this language 00051 // model or net. And EndOfWord flag is specified. If true, the sequence has 00052 // to end on a valid word. 
The function also optionally returns the list 00053 // of language model edges traversed to parse the string 00054 virtual bool IsValidSequence(const char_32 *str, bool eow_flag, 00055 LangModEdge **edge_array = NULL) = 0; 00056 virtual bool IsLeadingPunc(char_32 ch) = 0; 00057 virtual bool IsTrailingPunc(char_32 ch) = 0; 00058 virtual bool IsDigit(char_32 ch) = 0; 00059 00060 // accessor functions 00061 inline bool OOD() { return ood_enabled_; } 00062 inline bool Numeric() { return numeric_enabled_; } 00063 inline bool WordList() { return word_list_enabled_; } 00064 inline bool Punc() { return punc_enabled_; } 00065 inline void SetOOD(bool ood) { ood_enabled_ = ood; } 00066 inline void SetNumeric(bool numeric) { numeric_enabled_ = numeric; } 00067 inline void SetWordList(bool word_list) { word_list_enabled_ = word_list; } 00068 inline void SetPunc(bool punc_enabled) { punc_enabled_ = punc_enabled; } 00069 00070 protected: 00071 bool ood_enabled_; 00072 bool numeric_enabled_; 00073 bool word_list_enabled_; 00074 bool punc_enabled_; 00075 }; 00076 } 00077 00078 #endif // LANG_MODEL_H