Tesseract  3.02
tesseract-ocr/cube/lang_model.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        lang_model.h
00003  * Description: Declaration of the Language Model Base Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The LangModel class abstracts a state machine that is modeled as a trie
00021 // structure. The state machine models the language being recognized by the
00022 // OCR engine.
00023 // This is an abstract class that is to be inherited by any language model.
00024 
00025 #ifndef LANG_MODEL_H
00026 #define LANG_MODEL_H
00027 
00028 #include "lang_mod_edge.h"
00029 #include "char_altlist.h"
00030 #include "char_set.h"
00031 #include "tuning_params.h"
00032 
00033 namespace tesseract {
00034 class LangModel {
00035  public:
00036   LangModel() {
00037     ood_enabled_ = true;
00038     numeric_enabled_ = true;
00039     word_list_enabled_ = true;
00040     punc_enabled_ = true;
00041   }
00042   virtual ~LangModel() {}
00043 
00044   // Returns an edge pointer to the Root
00045   virtual LangModEdge *Root() = 0;
00046   // Returns the edges that fan-out of the specified edge and their count
00047   virtual LangModEdge **GetEdges(CharAltList *alt_list,
00048                                  LangModEdge *parent_edge,
00049                                  int *edge_cnt) = 0;
00050   // Returns is a sequence of 32-bit characters are valid within this language
00051   // model or net. And EndOfWord flag is specified. If true, the sequence has
00052   // to end on a valid word. The function also optionally returns the list
00053   // of language model edges traversed to parse the string
00054   virtual bool IsValidSequence(const char_32 *str, bool eow_flag,
00055                                LangModEdge **edge_array = NULL) = 0;
00056   virtual bool IsLeadingPunc(char_32 ch) = 0;
00057   virtual bool IsTrailingPunc(char_32 ch) = 0;
00058   virtual bool IsDigit(char_32 ch) = 0;
00059 
00060   // accessor functions
00061   inline bool OOD() { return ood_enabled_; }
00062   inline bool Numeric() { return numeric_enabled_; }
00063   inline bool WordList() { return word_list_enabled_; }
00064   inline bool Punc() { return punc_enabled_; }
00065   inline void SetOOD(bool ood) { ood_enabled_ = ood; }
00066   inline void SetNumeric(bool numeric) { numeric_enabled_ = numeric; }
00067   inline void SetWordList(bool word_list) { word_list_enabled_ = word_list; }
00068   inline void SetPunc(bool punc_enabled) { punc_enabled_ = punc_enabled; }
00069 
00070  protected:
00071   bool ood_enabled_;
00072   bool numeric_enabled_;
00073   bool word_list_enabled_;
00074   bool punc_enabled_;
00075 };
00076 }
00077 
00078 #endif  // LANG_MODEL_H