Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: tesseract_cube_combiner.h 00003 * Description: Declaration of the Tesseract & Cube results combiner Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The TesseractCubeCombiner class provides the functionality of combining 00021 // the recognition results of Tesseract and Cube at the word level 00022 00023 #include <algorithm> 00024 #include <string> 00025 #include <vector> 00026 #include <wctype.h> 00027 00028 #include "tesseract_cube_combiner.h" 00029 00030 #include "cube_object.h" 00031 #include "cube_reco_context.h" 00032 #include "cube_utils.h" 00033 #include "neural_net.h" 00034 #include "tesseractclass.h" 00035 #include "word_altlist.h" 00036 00037 namespace tesseract { 00038 00039 TesseractCubeCombiner::TesseractCubeCombiner(CubeRecoContext *cube_cntxt) { 00040 cube_cntxt_ = cube_cntxt; 00041 combiner_net_ = NULL; 00042 } 00043 00044 TesseractCubeCombiner::~TesseractCubeCombiner() { 00045 if (combiner_net_ != NULL) { 00046 delete combiner_net_; 00047 combiner_net_ = NULL; 00048 } 00049 } 00050 00051 bool TesseractCubeCombiner::LoadCombinerNet() { 00052 ASSERT_HOST(cube_cntxt_); 00053 // Compute the path of the combiner net 00054 string data_path; 00055 cube_cntxt_->GetDataFilePath(&data_path); 00056 string net_file_name = data_path + cube_cntxt_->Lang() + 00057 ".tesseract_cube.nn"; 00058 00059 // Return false if file does not exist 00060 FILE *fp = fopen(net_file_name.c_str(), "rb"); 00061 if (fp == NULL) 00062 return false; 00063 else 00064 fclose(fp); 00065 00066 // Load and validate net 00067 combiner_net_ = NeuralNet::FromFile(net_file_name); 00068 if (combiner_net_ == NULL) { 00069 tprintf("Could not read combiner net file %s", net_file_name.c_str()); 00070 return false; 00071 } else if (combiner_net_->out_cnt() != 2) { 00072 tprintf("Invalid combiner net file %s! Output count != 2\n", 00073 net_file_name.c_str()); 00074 delete combiner_net_; 00075 combiner_net_ = NULL; 00076 return false; 00077 } 00078 return true; 00079 } 00080 00081 // Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally 00082 // strips punc and/or normalizes case and then converts back 00083 string TesseractCubeCombiner::NormalizeString(const string &str, 00084 bool remove_punc, 00085 bool norm_case) { 00086 // convert to UTF32 00087 string_32 str32; 00088 CubeUtils::UTF8ToUTF32(str.c_str(), &str32); 00089 // strip punc and normalize 00090 string_32 new_str32; 00091 for (int idx = 0; idx < str32.length(); idx++) { 00092 // if no punc removal is required or not a punctuation character 00093 if (!remove_punc || iswpunct(str32[idx]) == 0) { 00094 char_32 norm_char = str32[idx]; 00095 // normalize case if required 00096 if (norm_case && iswalpha(norm_char)) { 00097 norm_char = towlower(norm_char); 00098 } 00099 new_str32.push_back(norm_char); 00100 } 00101 } 00102 // convert back to UTF8 00103 string new_str; 00104 CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str); 00105 return new_str; 00106 } 00107 00108 // Compares 2 strings optionally ignoring punctuation 00109 int TesseractCubeCombiner::CompareStrings(const string &str1, 00110 const string &str2, 00111 bool ignore_punc, 00112 bool ignore_case) { 00113 if (!ignore_punc && !ignore_case) { 00114 return str1.compare(str2); 00115 } 00116 string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case); 00117 string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case); 00118 return norm_str1.compare(norm_str2); 00119 } 00120 00121 // Check if a string is a valid Tess dict word or not 00122 bool TesseractCubeCombiner::ValidWord(const string &str) { 00123 return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str()) 00124 > 0); 00125 } 00126 00127 // Public method for computing the combiner features. The agreement 00128 // output parameter will be true if both answers are identical, 00129 // and false otherwise. 00130 bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, 00131 int tess_confidence, 00132 CubeObject *cube_obj, 00133 WordAltList *cube_alt_list, 00134 vector<double> *features, 00135 bool *agreement) { 00136 features->clear(); 00137 *agreement = false; 00138 if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) 00139 return false; 00140 00141 // Get Cube's best string; return false if empty 00142 char_32 *cube_best_str32 = cube_alt_list->Alt(0); 00143 if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1) 00144 return false; 00145 string cube_best_str; 00146 int cube_best_cost = cube_alt_list->AltCost(0); 00147 int cube_best_bigram_cost = 0; 00148 bool cube_best_bigram_cost_valid = true; 00149 if (cube_cntxt_->Bigrams()) 00150 cube_best_bigram_cost = cube_cntxt_->Bigrams()-> 00151 Cost(cube_best_str32, cube_cntxt_->CharacterSet()); 00152 else 00153 cube_best_bigram_cost_valid = false; 00154 CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str); 00155 00156 // Get Tesseract's UTF32 string 00157 string_32 tess_str32; 00158 CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32); 00159 00160 // Compute agreement flag 00161 *agreement = (tess_str.compare(cube_best_str) == 0); 00162 00163 // Get Cube's second best string; if empty, return false 00164 char_32 *cube_next_best_str32; 00165 string cube_next_best_str; 00166 int cube_next_best_cost = WORST_COST; 00167 if (cube_alt_list->AltCount() > 1) { 00168 cube_next_best_str32 = cube_alt_list->Alt(1); 00169 if (cube_next_best_str32 == NULL || 00170 CubeUtils::StrLen(cube_next_best_str32) == 0) { 00171 return false; 00172 } 00173 cube_next_best_cost = cube_alt_list->AltCost(1); 00174 CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str); 00175 } 00176 // Rank of Tesseract's top result in Cube's alternate list 00177 int tess_rank = 0; 00178 for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) { 00179 string alt_str; 00180 CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str); 00181 if (alt_str == tess_str) 00182 break; 00183 } 00184 00185 // Cube's cost for tesseract's result. Note that this modifies the 00186 // state of cube_obj, including its alternate list by calling RecognizeWord() 00187 int tess_cost = cube_obj->WordCost(tess_str.c_str()); 00188 // Cube's bigram cost of Tesseract's string 00189 int tess_bigram_cost = 0; 00190 int tess_bigram_cost_valid = true; 00191 if (cube_cntxt_->Bigrams()) 00192 tess_bigram_cost = cube_cntxt_->Bigrams()-> 00193 Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet()); 00194 else 00195 tess_bigram_cost_valid = false; 00196 00197 // Tesseract confidence 00198 features->push_back(tess_confidence); 00199 // Cube cost of Tesseract string 00200 features->push_back(tess_cost); 00201 // Cube Rank of Tesseract string 00202 features->push_back(tess_rank); 00203 // length of Tesseract OCR string 00204 features->push_back(tess_str.length()); 00205 // Tesseract OCR string in dictionary 00206 features->push_back(ValidWord(tess_str)); 00207 if (tess_bigram_cost_valid) { 00208 // bigram cost of Tesseract string 00209 features->push_back(tess_bigram_cost); 00210 } 00211 // Cube tess_cost of Cube best string 00212 features->push_back(cube_best_cost); 00213 // Cube tess_cost of Cube next best string 00214 features->push_back(cube_next_best_cost); 00215 // length of Cube string 00216 features->push_back(cube_best_str.length()); 00217 // Cube string in dictionary 00218 features->push_back(ValidWord(cube_best_str)); 00219 if (cube_best_bigram_cost_valid) { 00220 // bigram cost of Cube string 00221 features->push_back(cube_best_bigram_cost); 00222 } 00223 // case-insensitive string comparison, including punctuation 00224 int compare_nocase_punc = CompareStrings(cube_best_str.c_str(), 00225 tess_str.c_str(), false, true); 00226 features->push_back(compare_nocase_punc == 0); 00227 // case-sensitive string comparison, ignoring punctuation 00228 int compare_case_nopunc = CompareStrings(cube_best_str.c_str(), 00229 tess_str.c_str(), true, false); 00230 features->push_back(compare_case_nopunc == 0); 00231 // case-insensitive string comparison, ignoring punctuation 00232 int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(), 00233 tess_str.c_str(), true, true); 00234 features->push_back(compare_nocase_nopunc == 0); 00235 return true; 00236 } 00237 00238 // The CubeObject parameter is used for 2 purposes: 1) to retrieve 00239 // cube's alt list, and 2) to compute cube's word cost for the 00240 // tesseract result. The call to CubeObject::WordCost() modifies 00241 // the object's alternate list, so previous state will be lost. 00242 float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res, 00243 CubeObject *cube_obj) { 00244 // If no combiner is loaded or the cube object is undefined, 00245 // tesseract wins with probability 1.0 00246 if (combiner_net_ == NULL || cube_obj == NULL) { 00247 tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " 00248 "Cube objects not initialized; defaulting to Tesseract\n"); 00249 return 1.0; 00250 } 00251 00252 // Retrieve the alternate list from the CubeObject's current state. 00253 // If the alt list empty, tesseract wins with probability 1.0 00254 WordAltList *cube_alt_list = cube_obj->AlternateList(); 00255 if (cube_alt_list == NULL) 00256 cube_alt_list = cube_obj->RecognizeWord(); 00257 if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) { 00258 tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " 00259 "Cube returned no results; defaulting to Tesseract\n"); 00260 return 1.0; 00261 } 00262 return CombineResults(tess_res, cube_obj, cube_alt_list); 00263 } 00264 00265 // The alt_list parameter is expected to have been extracted from the 00266 // CubeObject that recognized the word to be combined. The cube_obj 00267 // parameter passed may be either same instance or a separate instance to 00268 // be used only by the combiner. In both cases, its alternate 00269 // list will be modified by an internal call to RecognizeWord(). 00270 float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res, 00271 CubeObject *cube_obj, 00272 WordAltList *cube_alt_list) { 00273 // If no combiner is loaded or the cube object is undefined, or the 00274 // alt list is empty, tesseract wins with probability 1.0 00275 if (combiner_net_ == NULL || cube_obj == NULL || 00276 cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) { 00277 tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " 00278 "Cube result cannot be retrieved; defaulting to Tesseract\n"); 00279 return 1.0; 00280 } 00281 00282 // Tesseract result string, tesseract confidence, and cost of 00283 // tesseract result according to cube 00284 string tess_str = tess_res->best_choice->unichar_string().string(); 00285 // Map certainty [-20.0, 0.0] to confidence [0, 100] 00286 int tess_confidence = MIN(100, MAX(1, static_cast<int>( 00287 100 + (5 * tess_res->best_choice->certainty())))); 00288 00289 // Compute the combiner features. If feature computation fails or 00290 // answers are identical, tesseract wins with probability 1.0 00291 vector<double> features; 00292 bool agreement; 00293 bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence, 00294 cube_obj, cube_alt_list, 00295 &features, &agreement); 00296 if (!combiner_success || agreement) 00297 return 1.0; 00298 00299 // Classify combiner feature vector and return output (probability 00300 // of tesseract class). 00301 double net_out[2]; 00302 if (!combiner_net_->FeedForward(&features[0], net_out)) 00303 return 1.0; 00304 return net_out[1]; 00305 } 00306 }