// Tesseract 3.02
00001 /********************************************************************** 00002 * File: tess_lang_model.h 00003 * Description: Declaration of the Tesseract Language Model Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESS_LANG_MODEL_H 00021 #define TESS_LANG_MODEL_H 00022 00023 #include <string> 00024 00025 #include "char_altlist.h" 00026 #include "cube_reco_context.h" 00027 #include "cube_tuning_params.h" 00028 #include "dict.h" 00029 #include "lang_model.h" 00030 #include "tessdatamanager.h" 00031 #include "tess_lang_mod_edge.h" 00032 00033 namespace tesseract { 00034 00035 const int kStateCnt = 4; 00036 const int kNumLiteralCnt = 5; 00037 00038 class TessLangModel : public LangModel { 00039 public: 00040 TessLangModel(const string &lm_params, 00041 const string &data_file_path, 00042 bool load_system_dawg, 00043 TessdataManager *tessdata_manager, 00044 CubeRecoContext *cntxt); 00045 ~TessLangModel() { 00046 if (word_dawgs_ != NULL) { 00047 word_dawgs_->delete_data_pointers(); 00048 delete word_dawgs_; 00049 } 00050 } 00051 00052 // returns a pointer to the root of the language model 00053 inline TessLangModEdge *Root() { 00054 return NULL; 00055 } 00056 00057 // The general fan-out generation function. 
Returns the list of edges 00058 // fanning-out of the specified edge and their count. If an AltList is 00059 // specified, only the class-ids with a minimum cost are considered 00060 LangModEdge **GetEdges(CharAltList *alt_list, 00061 LangModEdge *edge, 00062 int *edge_cnt); 00063 // Determines if a sequence of 32-bit chars is valid in this language model 00064 // starting from the root. If the eow_flag is ON, also checks for 00065 // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last 00066 // edge 00067 bool IsValidSequence(const char_32 *sequence, bool eow_flag, 00068 LangModEdge **final_edge = NULL); 00069 bool IsLeadingPunc(char_32 ch); 00070 bool IsTrailingPunc(char_32 ch); 00071 bool IsDigit(char_32 ch); 00072 00073 void RemoveInvalidCharacters(string *lm_str); 00074 private: 00075 // static LM state machines 00076 static const Dawg *ood_dawg_; 00077 static const Dawg *number_dawg_; 00078 static const int num_state_machine_[kStateCnt][kNumLiteralCnt]; 00079 static const int num_max_repeat_[kStateCnt]; 00080 // word_dawgs_ should only be loaded if cube has its own version of the 00081 // unicharset (different from the one used by tesseract) and therefore 00082 // can not use the dawgs loaded for tesseract (since the unichar ids 00083 // encoded in the dawgs differ). 00084 DawgVector *word_dawgs_; 00085 00086 static int max_edge_; 00087 static int max_ood_shape_cost_; 00088 00089 // remaining language model elements needed by cube. 
These get loaded from 00090 // the .lm file 00091 string lead_punc_; 00092 string trail_punc_; 00093 string num_lead_punc_; 00094 string num_trail_punc_; 00095 string operators_; 00096 string digits_; 00097 string alphas_; 00098 // String of characters in RHS of each line of <lang>.cube.lm 00099 // Each element is hard-coded to correspond to a specific token type 00100 // (see LoadLangModelElements) 00101 string *literal_str_[kNumLiteralCnt]; 00102 // Recognition context needed to access language properties 00103 // (case, cursive,..) 00104 CubeRecoContext *cntxt_; 00105 bool has_case_; 00106 00107 // computes and returns the edges that fan out of an edge ref 00108 int FanOut(CharAltList *alt_list, 00109 const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask, 00110 const char_32 *str, bool root_flag, LangModEdge **edge_array); 00111 // generate edges from an NULL terminated string 00112 // (used for punctuation, operators and digits) 00113 int Edges(const char *strng, const Dawg *dawg, 00114 EDGE_REF edge_ref, EDGE_REF edge_ref_mask, 00115 LangModEdge **edge_array); 00116 // Generate the edges fanning-out from an edge in the number state machine 00117 int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array); 00118 // Generate OOD edges 00119 int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref, 00120 EDGE_REF edge_ref_mask, LangModEdge **edge_array); 00121 // Cleanup an edge array 00122 void FreeEdges(int edge_cnt, LangModEdge **edge_array); 00123 // Determines if a sequence of 32-bit chars is valid in this language model 00124 // starting from the specified edge. If the eow_flag is ON, also checks for 00125 // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last 00126 // edge 00127 bool IsValidSequence(LangModEdge *edge, const char_32 *sequence, 00128 bool eow_flag, LangModEdge **final_edge); 00129 // Parse language model elements from the given string, which should 00130 // have been loaded from <lang>.cube.lm file, e.g. 
in CubeRecoContext 00131 bool LoadLangModelElements(const string &lm_params); 00132 00133 // Returns the number of word Dawgs in the language model. 00134 int NumDawgs() const; 00135 00136 // Returns the dawgs with the given index from either the dawgs 00137 // stored by the Tesseract object, or the word_dawgs_. 00138 const Dawg *GetDawg(int index) const; 00139 }; 00140 } // tesseract 00141 00142 #endif // TESS_LANG_MODEL_H