Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: cube_reco_context.cpp 00003 * Description: Implementation of the Cube Recognition Context Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2007 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string> 00021 #include <limits.h> 00022 00023 #include "cube_reco_context.h" 00024 00025 #include "classifier_factory.h" 00026 #include "cube_tuning_params.h" 00027 #include "dict.h" 00028 #include "feature_bmp.h" 00029 #include "tessdatamanager.h" 00030 #include "tesseractclass.h" 00031 #include "tess_lang_model.h" 00032 00033 namespace tesseract { 00034 00035 // Instantiate a CubeRecoContext object using a Tesseract object. 00036 // CubeRecoContext will not take ownership of tess_obj, but will 00037 // record the pointer to it and will make use of various Tesseract 00038 // components (language model, flags, etc). Thus the caller should 00039 // keep tess_obj alive so long as the instantiated CubeRecoContext is used. 00040 CubeRecoContext::CubeRecoContext(Tesseract *tess_obj) { 00041 tess_obj_ = tess_obj; 00042 lang_ = ""; 00043 loaded_ = false; 00044 lang_mod_ = NULL; 00045 params_ = NULL; 00046 char_classifier_ = NULL; 00047 char_set_ = NULL; 00048 word_size_model_ = NULL; 00049 char_bigrams_ = NULL; 00050 word_unigrams_ = NULL; 00051 noisy_input_ = false; 00052 size_normalization_ = false; 00053 } 00054 00055 CubeRecoContext::~CubeRecoContext() { 00056 if (char_classifier_ != NULL) { 00057 delete char_classifier_; 00058 char_classifier_ = NULL; 00059 } 00060 00061 if (word_size_model_ != NULL) { 00062 delete word_size_model_; 00063 word_size_model_ = NULL; 00064 } 00065 00066 if (char_set_ != NULL) { 00067 delete char_set_; 00068 char_set_ = NULL; 00069 } 00070 00071 if (char_bigrams_ != NULL) { 00072 delete char_bigrams_; 00073 char_bigrams_ = NULL; 00074 } 00075 00076 if (word_unigrams_ != NULL) { 00077 delete word_unigrams_; 00078 word_unigrams_ = NULL; 00079 } 00080 00081 if (lang_mod_ != NULL) { 00082 delete lang_mod_; 00083 lang_mod_ = NULL; 00084 } 00085 00086 if (params_ != NULL) { 00087 delete params_; 00088 params_ = NULL; 00089 } 00090 } 00091 00092 // Returns the path of the data files by looking up the TESSDATA_PREFIX 00093 // environment variable and appending a "tessdata" directory to it 00094 bool CubeRecoContext::GetDataFilePath(string *path) const { 00095 *path = tess_obj_->datadir.string(); 00096 return true; 00097 } 00098 00099 // The object initialization function that loads all the necessary 00100 // components of a RecoContext. TessdataManager is used to load the 00101 // data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET 00102 // component is present, Cube will be instantiated with the unicharset 00103 // specified in this component and the corresponding dictionary 00104 // (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to 00105 // Tesseract's. Otherwise, TessdataManager will assume that Cube will 00106 // be using Tesseract's unicharset and dawgs, and will load the 00107 // unicharset from the TESSDATA_UNICHARSET component and will load the 00108 // dawgs from TESSDATA_*_DAWG components. 00109 bool CubeRecoContext::Load(TessdataManager *tessdata_manager, 00110 UNICHARSET *tess_unicharset) { 00111 ASSERT_HOST(tess_obj_ != NULL); 00112 tess_unicharset_ = tess_unicharset; 00113 string data_file_path; 00114 00115 // Get the data file path. 00116 if (GetDataFilePath(&data_file_path) == false) { 00117 fprintf(stderr, "Unable to get data file path\n"); 00118 return false; 00119 } 00120 00121 // Get the language from the Tesseract object. 00122 lang_ = tess_obj_->lang.string(); 00123 00124 // Create the char set. 00125 if ((char_set_ = 00126 CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) { 00127 fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load " 00128 "CharSet\n"); 00129 return false; 00130 } 00131 // Create the language model. 00132 string lm_file_name = data_file_path + lang_ + ".cube.lm"; 00133 string lm_params; 00134 if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) { 00135 fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube " 00136 "language model params from %s\n", lm_file_name.c_str()); 00137 return false; 00138 } 00139 lang_mod_ = new TessLangModel(lm_params, data_file_path, 00140 tess_obj_->getDict().load_system_dawg, 00141 tessdata_manager, this); 00142 if (lang_mod_ == NULL) { 00143 fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create " 00144 "TessLangModel\n"); 00145 return false; 00146 } 00147 00148 // Create the optional char bigrams object. 00149 char_bigrams_ = CharBigrams::Create(data_file_path, lang_); 00150 00151 // Create the optional word unigrams object. 00152 word_unigrams_ = WordUnigrams::Create(data_file_path, lang_); 00153 00154 // Create the optional size model. 00155 word_size_model_ = WordSizeModel::Create(data_file_path, lang_, 00156 char_set_, Contextual()); 00157 00158 // Load tuning params. 00159 params_ = CubeTuningParams::Create(data_file_path, lang_); 00160 if (params_ == NULL) { 00161 fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read " 00162 "CubeTuningParams from %s\n", data_file_path.c_str()); 00163 return false; 00164 } 00165 00166 // Create the char classifier. 00167 char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_, 00168 lang_mod_, char_set_, 00169 params_); 00170 if (char_classifier_ == NULL) { 00171 fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load " 00172 "CharClassifierFactory object from %s\n", data_file_path.c_str()); 00173 return false; 00174 } 00175 00176 loaded_ = true; 00177 00178 return true; 00179 } 00180 00181 // Creates a CubeRecoContext object using a tesseract object 00182 CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj, 00183 TessdataManager *tessdata_manager, 00184 UNICHARSET *tess_unicharset) { 00185 // create the object 00186 CubeRecoContext *cntxt = new CubeRecoContext(tess_obj); 00187 if (cntxt == NULL) { 00188 fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create " 00189 "CubeRecoContext object\n"); 00190 return NULL; 00191 } 00192 // load the necessary components 00193 if (cntxt->Load(tessdata_manager, tess_unicharset) == false) { 00194 fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init " 00195 "CubeRecoContext object\n"); 00196 delete cntxt; 00197 return NULL; 00198 } 00199 // success 00200 return cntxt; 00201 } 00202 } // tesseract}