Tesseract  3.02
tesseract-ocr/cube/tess_lang_model.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tess_lang_model.h
00003  * Description: Declaration of the Tesseract Language Model Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESS_LANG_MODEL_H
00021 #define TESS_LANG_MODEL_H
00022 
00023 #include <string>
00024 
00025 #include "char_altlist.h"
00026 #include "cube_reco_context.h"
00027 #include "cube_tuning_params.h"
00028 #include "dict.h"
00029 #include "lang_model.h"
00030 #include "tessdatamanager.h"
00031 #include "tess_lang_mod_edge.h"
00032 
00033 namespace tesseract {
00034 
00035 const int kStateCnt = 4;
00036 const int kNumLiteralCnt = 5;
00037 
00038 class TessLangModel : public LangModel {
00039  public:
00040   TessLangModel(const string &lm_params,
00041                 const string &data_file_path,
00042                 bool load_system_dawg,
00043                 TessdataManager *tessdata_manager,
00044                 CubeRecoContext *cntxt);
00045   ~TessLangModel() {
00046     if (word_dawgs_ != NULL) {
00047       word_dawgs_->delete_data_pointers();
00048       delete word_dawgs_;
00049     }
00050   }
00051 
00052   // returns a pointer to the root of the language model
00053   inline TessLangModEdge *Root() {
00054     return NULL;
00055   }
00056 
00057   // The general fan-out generation function. Returns the list of edges
00058   // fanning-out of the specified edge and their count. If an AltList is
00059   // specified, only the class-ids with a minimum cost are considered
00060   LangModEdge **GetEdges(CharAltList *alt_list,
00061                          LangModEdge *edge,
00062                          int *edge_cnt);
00063   // Determines if a sequence of 32-bit chars is valid in this language model
00064   // starting from the root. If the eow_flag is ON, also checks for
00065   // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
00066   // edge
00067   bool IsValidSequence(const char_32 *sequence, bool eow_flag,
00068                        LangModEdge **final_edge = NULL);
00069   bool IsLeadingPunc(char_32 ch);
00070   bool IsTrailingPunc(char_32 ch);
00071   bool IsDigit(char_32 ch);
00072 
00073   void RemoveInvalidCharacters(string *lm_str);
00074  private:
00075   // static LM state machines
00076   static const Dawg *ood_dawg_;
00077   static const Dawg *number_dawg_;
00078   static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
00079   static const int num_max_repeat_[kStateCnt];
00080   // word_dawgs_ should only be loaded if cube has its own version of the
00081   // unicharset (different from the one used by tesseract) and therefore
00082   // can not use the dawgs loaded for tesseract (since the unichar ids
00083   // encoded in the dawgs differ).
00084   DawgVector *word_dawgs_;
00085 
00086   static int max_edge_;
00087   static int max_ood_shape_cost_;
00088 
00089   // remaining language model elements needed by cube. These get loaded from
00090   // the .lm file
00091   string lead_punc_;
00092   string trail_punc_;
00093   string num_lead_punc_;
00094   string num_trail_punc_;
00095   string operators_;
00096   string digits_;
00097   string alphas_;
00098   // String of characters in RHS of each line of <lang>.cube.lm
00099   // Each element is hard-coded to correspond to a specific token type
00100   // (see LoadLangModelElements)
00101   string *literal_str_[kNumLiteralCnt];
00102   // Recognition context needed to access language properties
00103   // (case, cursive,..)
00104   CubeRecoContext *cntxt_;
00105   bool has_case_;
00106 
00107   // computes and returns the edges that fan out of an edge ref
00108   int FanOut(CharAltList *alt_list,
00109              const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
00110              const char_32 *str, bool root_flag, LangModEdge **edge_array);
00111   // generate edges from an NULL terminated string
00112   // (used for punctuation, operators and digits)
00113   int Edges(const char *strng, const Dawg *dawg,
00114             EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
00115             LangModEdge **edge_array);
00116   // Generate the edges fanning-out from an edge in the number state machine
00117   int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);
00118   // Generate OOD edges
00119   int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
00120                EDGE_REF edge_ref_mask, LangModEdge **edge_array);
00121   // Cleanup an edge array
00122   void FreeEdges(int edge_cnt, LangModEdge **edge_array);
00123   // Determines if a sequence of 32-bit chars is valid in this language model
00124   // starting from the specified edge. If the eow_flag is ON, also checks for
00125   // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last
00126   // edge
00127   bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
00128                        bool eow_flag, LangModEdge **final_edge);
00129   // Parse language model elements from the given string, which should
00130   // have been loaded from <lang>.cube.lm file, e.g. in CubeRecoContext
00131   bool LoadLangModelElements(const string &lm_params);
00132 
00133   // Returns the number of word Dawgs in the language model.
00134   int NumDawgs() const;
00135 
00136   // Returns the dawgs with the given index from either the dawgs
00137   // stored by the Tesseract object, or the word_dawgs_.
00138   const Dawg *GetDawg(int index) const;
00139 };
00140 }  // tesseract
00141 
00142 #endif  // TESS_LANG_MODEL_H