Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: word_list_lang_model.cpp 00003 * Description: Implementation of the Word List Language Model Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string> 00021 #include <vector> 00022 #include "word_list_lang_model.h" 00023 #include "cube_utils.h" 00024 00025 #include "ratngs.h" 00026 #include "trie.h" 00027 00028 namespace tesseract { 00029 WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) { 00030 cntxt_ = cntxt; 00031 dawg_ = NULL; 00032 init_ = false; 00033 } 00034 00035 WordListLangModel::~WordListLangModel() { 00036 Cleanup(); 00037 } 00038 00039 // Cleanup 00040 void WordListLangModel::Cleanup() { 00041 if (dawg_ != NULL) { 00042 delete dawg_; 00043 dawg_ = NULL; 00044 } 00045 init_ = false; 00046 } 00047 00048 // Initialize the language model 00049 bool WordListLangModel::Init() { 00050 if (init_ == true) { 00051 return true; 00052 } 00053 // The last parameter to the Trie constructor (the debug level) is set to 00054 // false for now, until Cube has a way to express its preferred debug level. 00055 dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM, 00056 WordListLangModel::kMaxDawgEdges, 00057 cntxt_->CharacterSet()->ClassCount(), false); 00058 if (dawg_ == NULL) { 00059 return false; 00060 } 00061 init_ = true; 00062 return true; 00063 } 00064 00065 // return a pointer to the root 00066 LangModEdge * WordListLangModel::Root() { 00067 return NULL; 00068 } 00069 00070 // return the edges emerging from the current state 00071 LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list, 00072 LangModEdge *edge, 00073 int *edge_cnt) { 00074 // initialize if necessary 00075 if (init_ == false) { 00076 if (Init() == false) { 00077 return false; 00078 } 00079 } 00080 00081 (*edge_cnt) = 0; 00082 00083 EDGE_REF edge_ref; 00084 00085 TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge); 00086 00087 if (tess_lm_edge == NULL) { 00088 edge_ref = 0; 00089 } else { 00090 edge_ref = tess_lm_edge->EndEdge(); 00091 00092 // advance node 00093 edge_ref = dawg_->next_node(edge_ref); 00094 if (edge_ref == 0) { 00095 return 0; 00096 } 00097 } 00098 00099 // allocate memory for edges 00100 LangModEdge **edge_array = new LangModEdge *[kMaxEdge]; 00101 if (edge_array == NULL) { 00102 return NULL; 00103 } 00104 00105 // now get all the emerging edges 00106 (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref, 00107 edge_array + (*edge_cnt)); 00108 00109 return edge_array; 00110 } 00111 00112 // returns true if the char_32 is supported by the language model 00113 // TODO(ahmadab) currently not implemented 00114 bool WordListLangModel::IsValidSequence(const char_32 *sequence, 00115 bool terminal, LangModEdge **edges) { 00116 return false; 00117 } 00118 00119 // Recursive helper function for WordVariants(). 00120 void WordListLangModel::WordVariants(const CharSet &char_set, 00121 string_32 prefix_str32, 00122 WERD_CHOICE *word_so_far, 00123 string_32 str32, 00124 vector<WERD_CHOICE *> *word_variants) { 00125 int str_len = str32.length(); 00126 if (str_len == 0) { 00127 if (word_so_far->length() > 0) { 00128 word_variants->push_back(new WERD_CHOICE(*word_so_far)); 00129 } 00130 } else { 00131 // Try out all the possible prefixes of the str32. 00132 for (int len = 1; len <= str_len; len++) { 00133 // Check if prefix is supported in character set. 00134 string_32 str_pref32 = str32.substr(0, len); 00135 int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>( 00136 str_pref32.c_str())); 00137 if (class_id <= 0) { 00138 continue; 00139 } else { 00140 string_32 new_prefix_str32 = prefix_str32 + str_pref32; 00141 string_32 new_str32 = str32.substr(len); 00142 word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0); 00143 WordVariants(char_set, new_prefix_str32, word_so_far, new_str32, 00144 word_variants); 00145 word_so_far->remove_last_unichar_id(); 00146 } 00147 } 00148 } 00149 } 00150 00151 // Compute all the variants of a 32-bit string in terms of the class-ids 00152 // This is needed for languages that have ligatures. A word can then have more 00153 // than one spelling in terms of the class-ids 00154 void WordListLangModel::WordVariants(const CharSet &char_set, 00155 const UNICHARSET *uchset, string_32 str32, 00156 vector<WERD_CHOICE *> *word_variants) { 00157 for (int i = 0; i < word_variants->size(); i++) { 00158 delete (*word_variants)[i]; 00159 } 00160 word_variants->clear(); 00161 string_32 prefix_str32; 00162 WERD_CHOICE word_so_far(uchset); 00163 WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants); 00164 } 00165 00166 // add a new UTF-8 string to the lang model 00167 bool WordListLangModel::AddString(const char *char_ptr) { 00168 if (!init_ && !Init()) { // initialize if necessary 00169 return false; 00170 } 00171 00172 string_32 str32; 00173 CubeUtils::UTF8ToUTF32(char_ptr, &str32); 00174 if (str32.length() < 1) { 00175 return false; 00176 } 00177 return AddString32(str32.c_str()); 00178 } 00179 00180 // add a new UTF-32 string to the lang model 00181 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) { 00182 if (char_32_ptr == NULL) { 00183 return false; 00184 } 00185 // get all the word variants 00186 vector<WERD_CHOICE *> word_variants; 00187 WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(), 00188 char_32_ptr, &word_variants); 00189 00190 if (word_variants.size() > 0) { 00191 // find the shortest variant 00192 int shortest_word = 0; 00193 for (int word = 1; word < word_variants.size(); word++) { 00194 if (word_variants[shortest_word]->length() > 00195 word_variants[word]->length()) { 00196 shortest_word = word; 00197 } 00198 } 00199 // only add the shortest grapheme interpretation of string to the word list 00200 dawg_->add_word_to_dawg(*word_variants[shortest_word]); 00201 } 00202 for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; } 00203 return true; 00204 } 00205 00206 }