Tesseract  3.02
tesseract-ocr/ccmain/tesseract_cube_combiner.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tesseract_cube_combiner.cpp
00003  * Description: Implementation of the Tesseract & Cube results combiner Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The TesseractCubeCombiner class provides the functionality of combining
00021 // the recognition results of Tesseract and Cube at the word level
00022 
00023 #include <algorithm>
00024 #include <string>
00025 #include <vector>
00026 #include <wctype.h>
00027 
00028 #include "tesseract_cube_combiner.h"
00029 
00030 #include "cube_object.h"
00031 #include "cube_reco_context.h"
00032 #include "cube_utils.h"
00033 #include "neural_net.h"
00034 #include "tesseractclass.h"
00035 #include "word_altlist.h"
00036 
00037 namespace tesseract {
00038 
00039 TesseractCubeCombiner::TesseractCubeCombiner(CubeRecoContext *cube_cntxt) {
00040   cube_cntxt_ = cube_cntxt;
00041   combiner_net_ = NULL;
00042 }
00043 
00044 TesseractCubeCombiner::~TesseractCubeCombiner() {
00045   if (combiner_net_ != NULL) {
00046     delete combiner_net_;
00047     combiner_net_ = NULL;
00048   }
00049 }
00050 
00051 bool TesseractCubeCombiner::LoadCombinerNet() {
00052   ASSERT_HOST(cube_cntxt_);
00053   // Compute the path of the combiner net
00054   string data_path;
00055   cube_cntxt_->GetDataFilePath(&data_path);
00056   string net_file_name =  data_path + cube_cntxt_->Lang() +
00057                           ".tesseract_cube.nn";
00058 
00059   // Return false if file does not exist
00060   FILE *fp = fopen(net_file_name.c_str(), "rb");
00061   if (fp == NULL)
00062     return false;
00063   else
00064     fclose(fp);
00065 
00066   // Load and validate net
00067   combiner_net_ = NeuralNet::FromFile(net_file_name);
00068   if (combiner_net_ == NULL) {
00069     tprintf("Could not read combiner net file %s", net_file_name.c_str());
00070     return false;
00071   } else if (combiner_net_->out_cnt() != 2) {
00072     tprintf("Invalid combiner net file %s! Output count != 2\n",
00073             net_file_name.c_str());
00074     delete combiner_net_;
00075     combiner_net_ = NULL;
00076     return false;
00077   }
00078   return true;
00079 }
00080 
00081 // Normalize a UTF-8 string. Converts the UTF-8 string to UTF32 and optionally
00082 // strips punc and/or normalizes case and then converts back
00083 string TesseractCubeCombiner::NormalizeString(const string &str,
00084                                               bool remove_punc,
00085                                               bool norm_case) {
00086   // convert to UTF32
00087   string_32 str32;
00088   CubeUtils::UTF8ToUTF32(str.c_str(), &str32);
00089   // strip punc and normalize
00090   string_32 new_str32;
00091   for (int idx = 0; idx < str32.length(); idx++) {
00092     // if no punc removal is required or not a punctuation character
00093     if (!remove_punc || iswpunct(str32[idx]) == 0) {
00094       char_32 norm_char = str32[idx];
00095       // normalize case if required
00096       if (norm_case && iswalpha(norm_char)) {
00097         norm_char = towlower(norm_char);
00098       }
00099       new_str32.push_back(norm_char);
00100     }
00101   }
00102   // convert back to UTF8
00103   string new_str;
00104   CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str);
00105   return new_str;
00106 }
00107 
00108 // Compares 2 strings optionally ignoring punctuation
00109 int TesseractCubeCombiner::CompareStrings(const string &str1,
00110                                           const string &str2,
00111                                           bool ignore_punc,
00112                                           bool ignore_case) {
00113   if (!ignore_punc && !ignore_case) {
00114     return str1.compare(str2);
00115   }
00116   string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case);
00117   string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case);
00118   return norm_str1.compare(norm_str2);
00119 }
00120 
00121 // Check if a string is a valid Tess dict word or not
00122 bool TesseractCubeCombiner::ValidWord(const string &str) {
00123   return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str())
00124           > 0);
00125 }
00126 
00127 // Public method for computing the combiner features. The agreement
00128 // output parameter will be true if both answers are identical,
00129 // and false otherwise.
00130 bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
00131                                                     int tess_confidence,
00132                                                     CubeObject *cube_obj,
00133                                                     WordAltList *cube_alt_list,
00134                                                     vector<double> *features,
00135                                                     bool *agreement) {
00136   features->clear();
00137   *agreement = false;
00138   if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0)
00139     return false;
00140 
00141   // Get Cube's best string; return false if empty
00142   char_32 *cube_best_str32 = cube_alt_list->Alt(0);
00143   if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1)
00144     return false;
00145   string cube_best_str;
00146   int cube_best_cost = cube_alt_list->AltCost(0);
00147   int cube_best_bigram_cost = 0;
00148   bool cube_best_bigram_cost_valid = true;
00149   if (cube_cntxt_->Bigrams())
00150     cube_best_bigram_cost = cube_cntxt_->Bigrams()->
00151         Cost(cube_best_str32, cube_cntxt_->CharacterSet());
00152   else
00153     cube_best_bigram_cost_valid = false;
00154   CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str);
00155 
00156   // Get Tesseract's UTF32 string
00157   string_32 tess_str32;
00158   CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32);
00159 
00160   // Compute agreement flag
00161   *agreement = (tess_str.compare(cube_best_str) == 0);
00162 
00163   // Get Cube's second best string; if empty, return false
00164   char_32 *cube_next_best_str32;
00165   string cube_next_best_str;
00166   int cube_next_best_cost = WORST_COST;
00167   if (cube_alt_list->AltCount() > 1) {
00168     cube_next_best_str32 = cube_alt_list->Alt(1);
00169     if (cube_next_best_str32 == NULL ||
00170         CubeUtils::StrLen(cube_next_best_str32) == 0) {
00171       return false;
00172     }
00173     cube_next_best_cost = cube_alt_list->AltCost(1);
00174     CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str);
00175   }
00176   // Rank of Tesseract's top result in Cube's alternate list
00177   int tess_rank = 0;
00178   for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) {
00179     string alt_str;
00180     CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str);
00181     if (alt_str == tess_str)
00182       break;
00183   }
00184 
00185   // Cube's cost for tesseract's result. Note that this modifies the
00186   // state of cube_obj, including its alternate list by calling RecognizeWord()
00187   int tess_cost = cube_obj->WordCost(tess_str.c_str());
00188   // Cube's bigram cost of Tesseract's string
00189   int tess_bigram_cost = 0;
00190   int tess_bigram_cost_valid = true;
00191   if (cube_cntxt_->Bigrams())
00192     tess_bigram_cost = cube_cntxt_->Bigrams()->
00193         Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet());
00194   else
00195     tess_bigram_cost_valid = false;
00196 
00197   // Tesseract confidence
00198   features->push_back(tess_confidence);
00199   // Cube cost of Tesseract string
00200   features->push_back(tess_cost);
00201   // Cube Rank of Tesseract string
00202   features->push_back(tess_rank);
00203   // length of Tesseract OCR string
00204   features->push_back(tess_str.length());
00205   // Tesseract OCR string in dictionary
00206   features->push_back(ValidWord(tess_str));
00207   if (tess_bigram_cost_valid) {
00208     // bigram cost of Tesseract string
00209     features->push_back(tess_bigram_cost);
00210   }
00211   // Cube tess_cost of Cube best string
00212   features->push_back(cube_best_cost);
00213   // Cube tess_cost of Cube next best string
00214   features->push_back(cube_next_best_cost);
00215   // length of Cube string
00216   features->push_back(cube_best_str.length());
00217   // Cube string in dictionary
00218   features->push_back(ValidWord(cube_best_str));
00219   if (cube_best_bigram_cost_valid) {
00220     // bigram cost of Cube string
00221     features->push_back(cube_best_bigram_cost);
00222   }
00223   // case-insensitive string comparison, including punctuation
00224   int compare_nocase_punc = CompareStrings(cube_best_str.c_str(),
00225                                            tess_str.c_str(), false, true);
00226   features->push_back(compare_nocase_punc == 0);
00227   // case-sensitive string comparison, ignoring punctuation
00228   int compare_case_nopunc = CompareStrings(cube_best_str.c_str(),
00229                                            tess_str.c_str(), true, false);
00230   features->push_back(compare_case_nopunc == 0);
00231   // case-insensitive string comparison, ignoring punctuation
00232   int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(),
00233                                              tess_str.c_str(), true, true);
00234   features->push_back(compare_nocase_nopunc == 0);
00235   return true;
00236 }
00237 
00238 // The CubeObject parameter is used for 2 purposes: 1) to retrieve
00239 // cube's alt list, and 2) to compute cube's word cost for the
00240 // tesseract result. The call to CubeObject::WordCost() modifies
00241 // the object's alternate list, so previous state will be lost.
00242 float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
00243                                             CubeObject *cube_obj) {
00244   // If no combiner is loaded or the cube object is undefined,
00245   // tesseract wins with probability 1.0
00246   if (combiner_net_ == NULL || cube_obj == NULL) {
00247     tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
00248             "Cube objects not initialized; defaulting to Tesseract\n");
00249     return 1.0;
00250   }
00251 
00252   // Retrieve the alternate list from the CubeObject's current state.
00253   // If the alt list empty, tesseract wins with probability 1.0
00254   WordAltList *cube_alt_list = cube_obj->AlternateList();
00255   if (cube_alt_list == NULL)
00256     cube_alt_list = cube_obj->RecognizeWord();
00257   if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
00258     tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
00259             "Cube returned no results; defaulting to Tesseract\n");
00260     return 1.0;
00261   }
00262   return CombineResults(tess_res, cube_obj, cube_alt_list);
00263 }
00264 
00265 // The alt_list parameter is expected to have been extracted from the
00266 // CubeObject that recognized the word to be combined. The cube_obj
00267 // parameter passed may be either same instance or a separate instance to
00268 // be used only by the combiner. In both cases, its alternate
00269 // list will be modified by an internal call to RecognizeWord().
00270 float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res,
00271                                             CubeObject *cube_obj,
00272                                             WordAltList *cube_alt_list) {
00273   // If no combiner is loaded or the cube object is undefined, or the
00274   // alt list is empty, tesseract wins with probability 1.0
00275   if (combiner_net_ == NULL || cube_obj == NULL ||
00276       cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) {
00277     tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): "
00278             "Cube result cannot be retrieved; defaulting to Tesseract\n");
00279     return 1.0;
00280   }
00281 
00282   // Tesseract result string, tesseract confidence, and cost of
00283   // tesseract result according to cube
00284   string tess_str = tess_res->best_choice->unichar_string().string();
00285   // Map certainty [-20.0, 0.0] to confidence [0, 100]
00286   int tess_confidence = MIN(100, MAX(1, static_cast<int>(
00287       100 + (5 * tess_res->best_choice->certainty()))));
00288 
00289   // Compute the combiner features. If feature computation fails or
00290   // answers are identical, tesseract wins with probability 1.0
00291   vector<double> features;
00292   bool agreement;
00293   bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence,
00294                                                   cube_obj, cube_alt_list,
00295                                                   &features, &agreement);
00296   if (!combiner_success || agreement)
00297     return 1.0;
00298 
00299   // Classify combiner feature vector and return output (probability
00300   // of tesseract class).
00301   double net_out[2];
00302   if (!combiner_net_->FeedForward(&features[0], net_out))
00303     return 1.0;
00304   return net_out[1];
00305 }
00306 }