Tesseract  3.02
tesseract-ocr/ccmain/adaptions.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        adaptions.cpp  (Formerly adaptions.c)
00003  * Description: Functions used to adapt to blobs already confidently
00004  *                                      identified
00005  * Author:              Chris Newton
00006  * Created:             Thu Oct  7 10:17:28 BST 1993
00007  *
00008  * (C) Copyright 1992, Hewlett-Packard Ltd.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #ifdef _MSC_VER
00022 #pragma warning(disable:4244)  // Conversion warnings
00023 #pragma warning(disable:4305)  // int/float warnings
00024 #endif
00025 
00026 #include "mfcpch.h"
00027 
00028 #ifdef __UNIX__
00029 #include          <assert.h>
00030 #endif
00031 #include          <ctype.h>
00032 #include          <string.h>
00033 #include          "tessbox.h"
00034 #include          "tessvars.h"
00035 #include          "memry.h"
00036 #include          "imgs.h"
00037 #include          "scaleimg.h"
00038 #include          "reject.h"
00039 #include          "control.h"
00040 #include          "stopper.h"
00041 #include          "secname.h"
00042 #include          "tesseractclass.h"
00043 
00044 // Include automatically generated configuration file if running autoconf.
00045 #ifdef HAVE_CONFIG_H
00046 #include "config_auto.h"
00047 #endif
00048 
00049 namespace tesseract {
00050 BOOL8 Tesseract::word_adaptable(  //should we adapt?
00051                                 WERD_RES *word,
00052                                 uinT16 mode) {
00053   if (tessedit_adaption_debug) {
00054     tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
00055           word->best_choice == NULL ? "" :
00056           word->best_choice->unichar_string().string(),
00057           word->best_choice->rating(), word->best_choice->certainty());
00058   }
00059 
00060   BOOL8 status = FALSE;
00061   BITS16 flags(mode);
00062 
00063   enum MODES
00064   {
00065     ADAPTABLE_WERD,
00066     ACCEPTABLE_WERD,
00067     CHECK_DAWGS,
00068     CHECK_SPACES,
00069     CHECK_ONE_ELL_CONFLICT,
00070     CHECK_AMBIG_WERD
00071   };
00072 
00073   /*
00074   0: NO adaption
00075   */
00076   if (mode == 0) {
00077     if (tessedit_adaption_debug) tprintf("adaption disabled\n");
00078     return FALSE;
00079   }
00080 
00081   if (flags.bit (ADAPTABLE_WERD)) {
00082     status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
00083     if (tessedit_adaption_debug && !status) {
00084       tprintf("tess_would_adapt bit is false\n");
00085     }
00086   }
00087 
00088   if (flags.bit (ACCEPTABLE_WERD)) {
00089     status |= word->tess_accepted;
00090     if (tessedit_adaption_debug && !status) {
00091       tprintf("tess_accepted bit is false\n");
00092     }
00093   }
00094 
00095   if (!status) {                  // If not set then
00096     return FALSE;                // ignore other checks
00097   }
00098 
00099   if (flags.bit (CHECK_DAWGS) &&
00100     (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00101     (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00102     (word->best_choice->permuter () != USER_DAWG_PERM) &&
00103     (word->best_choice->permuter () != NUMBER_PERM)) {
00104     if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
00105     return FALSE;
00106   }
00107 
00108   if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
00109     if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
00110     return FALSE;
00111   }
00112 
00113   if (flags.bit (CHECK_SPACES) &&
00114     (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
00115     if (tessedit_adaption_debug) tprintf("word contains spaces\n");
00116     return FALSE;
00117   }
00118 
00119 //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
00120   if (flags.bit (CHECK_AMBIG_WERD) &&
00121       !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
00122     if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
00123     return FALSE;
00124   }
00125 
00126   // Do not adapt to words that are composed from fragments if
00127   // tessedit_adapt_to_char_fragments is false.
00128   if (!tessedit_adapt_to_char_fragments) {
00129     const char *fragment_lengths = word->best_choice->fragment_lengths();
00130     if (fragment_lengths != NULL && *fragment_lengths != '\0') {
00131       for (int i = 0; i < word->best_choice->length(); ++i) {
00132         if (fragment_lengths[i] > 1) {
00133           if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
00134           return false;  // found a character composed from fragments
00135         }
00136       }
00137     }
00138   }
00139 
00140   if (tessedit_adaption_debug) {
00141     tprintf("returning status %d\n", status);
00142   }
00143   return status;
00144 }
00145 
00146 }  // namespace tesseract