Tesseract  3.02
tesseract-ocr/cube/word_list_lang_model.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        word_list_lang_model.cpp
00003  * Description: Implementation of the Word List Language Model Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <string>
00021 #include <vector>
00022 #include "word_list_lang_model.h"
00023 #include "cube_utils.h"
00024 
00025 #include "ratngs.h"
00026 #include "trie.h"
00027 
00028 namespace tesseract {
00029 WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {
00030   cntxt_ = cntxt;
00031   dawg_ = NULL;
00032   init_ = false;
00033 }
00034 
00035 WordListLangModel::~WordListLangModel() {
00036   Cleanup();
00037 }
00038 
00039 // Cleanup
00040 void WordListLangModel::Cleanup() {
00041   if (dawg_ != NULL) {
00042     delete dawg_;
00043     dawg_ = NULL;
00044   }
00045   init_ = false;
00046 }
00047 
00048 // Initialize the language model
00049 bool WordListLangModel::Init() {
00050   if (init_ == true) {
00051     return true;
00052   }
00053   // The last parameter to the Trie constructor (the debug level) is set to
00054   // false for now, until Cube has a way to express its preferred debug level.
00055   dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
00056                    WordListLangModel::kMaxDawgEdges,
00057                    cntxt_->CharacterSet()->ClassCount(), false);
00058   if (dawg_ == NULL) {
00059     return false;
00060   }
00061   init_ = true;
00062   return true;
00063 }
00064 
00065 // return a pointer to the root
00066 LangModEdge * WordListLangModel::Root() {
00067   return NULL;
00068 }
00069 
00070 // return the edges emerging from the current state
00071 LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
00072                                           LangModEdge *edge,
00073                                           int *edge_cnt) {
00074   // initialize if necessary
00075   if (init_ == false) {
00076     if (Init() == false) {
00077       return false;
00078     }
00079   }
00080 
00081   (*edge_cnt) = 0;
00082 
00083   EDGE_REF edge_ref;
00084 
00085   TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
00086 
00087   if (tess_lm_edge == NULL) {
00088     edge_ref = 0;
00089   } else {
00090     edge_ref = tess_lm_edge->EndEdge();
00091 
00092     // advance node
00093     edge_ref = dawg_->next_node(edge_ref);
00094     if (edge_ref == 0) {
00095       return 0;
00096     }
00097   }
00098 
00099   // allocate memory for edges
00100   LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
00101   if (edge_array == NULL) {
00102     return NULL;
00103   }
00104 
00105   // now get all the emerging edges
00106   (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
00107                                                  edge_array + (*edge_cnt));
00108 
00109   return edge_array;
00110 }
00111 
00112 // returns true if the char_32 is supported by the language model
00113 // TODO(ahmadab) currently not implemented
00114 bool WordListLangModel::IsValidSequence(const char_32 *sequence,
00115                                         bool terminal, LangModEdge **edges) {
00116   return false;
00117 }
00118 
00119 // Recursive helper function for WordVariants().
00120 void WordListLangModel::WordVariants(const CharSet &char_set,
00121                                      string_32 prefix_str32,
00122                                      WERD_CHOICE *word_so_far,
00123                                      string_32 str32,
00124                                      vector<WERD_CHOICE *> *word_variants) {
00125   int str_len = str32.length();
00126   if (str_len == 0) {
00127     if (word_so_far->length() > 0) {
00128       word_variants->push_back(new WERD_CHOICE(*word_so_far));
00129     }
00130   } else {
00131     // Try out all the possible prefixes of the str32.
00132     for (int len = 1; len <= str_len; len++) {
00133       // Check if prefix is supported in character set.
00134       string_32 str_pref32 = str32.substr(0, len);
00135       int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
00136           str_pref32.c_str()));
00137       if (class_id <= 0) {
00138         continue;
00139       } else {
00140         string_32 new_prefix_str32 = prefix_str32 + str_pref32;
00141         string_32 new_str32 = str32.substr(len);
00142         word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
00143         WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
00144                      word_variants);
00145         word_so_far->remove_last_unichar_id();
00146       }
00147     }
00148   }
00149 }
00150 
00151 // Compute all the variants of a 32-bit string in terms of the class-ids
00152 // This is needed for languages that have ligatures. A word can then have more
00153 // than one spelling in terms of the class-ids
00154 void WordListLangModel::WordVariants(const CharSet &char_set,
00155                                      const UNICHARSET *uchset, string_32 str32,
00156                                      vector<WERD_CHOICE *> *word_variants) {
00157   for (int i = 0; i < word_variants->size(); i++) {
00158     delete (*word_variants)[i];
00159   }
00160   word_variants->clear();
00161   string_32 prefix_str32;
00162   WERD_CHOICE word_so_far(uchset);
00163   WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
00164 }
00165 
00166 // add a new UTF-8 string to the lang model
00167 bool WordListLangModel::AddString(const char *char_ptr) {
00168   if (!init_ && !Init()) {  // initialize if necessary
00169     return false;
00170   }
00171 
00172   string_32 str32;
00173   CubeUtils::UTF8ToUTF32(char_ptr, &str32);
00174   if (str32.length() < 1) {
00175     return false;
00176   }
00177   return AddString32(str32.c_str());
00178 }
00179 
00180 // add a new UTF-32 string to the lang model
00181 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
00182   if (char_32_ptr == NULL) {
00183     return false;
00184   }
00185   // get all the word variants
00186   vector<WERD_CHOICE *> word_variants;
00187   WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
00188                char_32_ptr, &word_variants);
00189 
00190   if (word_variants.size() > 0) {
00191     // find the shortest variant
00192     int shortest_word = 0;
00193     for (int word = 1; word < word_variants.size(); word++) {
00194       if (word_variants[shortest_word]->length() >
00195           word_variants[word]->length()) {
00196         shortest_word = word;
00197       }
00198     }
00199     // only add the shortest grapheme interpretation of string to the word list
00200     dawg_->add_word_to_dawg(*word_variants[shortest_word]);
00201   }
00202   for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
00203   return true;
00204 }
00205 
00206 }