tesseract-doc/reject_8cpp_source.html

00001 /**********************************************************************
00002  * File:        reject.cpp  (Formerly reject.c)
00003  * Description: Rejection functions used in tessedit
00004  * Author:              Phil Cheatle
00005  * Created:             Wed Sep 23 16:50:21 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #pragma warning(disable:4305)  // int/float warnings
00023 #endif
00024
00025 #include "mfcpch.h"
00026
00027 #include          "tessvars.h"
00028 #ifdef __UNIX__
00029 #include          <assert.h>
00030 #include          <errno.h>
00031 #endif
00032 #include          "scanutils.h"
00033 #include          <ctype.h>
00034 #include          <string.h>
00035 #include          "memry.h"
00036 #include          "reject.h"
00037 #include          "tfacep.h"
00038 #include          "imgs.h"
00039 #include          "control.h"
00040 #include          "docqual.h"
00041 #include          "secname.h"
00042 #include          "globals.h"
00043 #include          "helpers.h"
00044
00045 /* #define SECURE_NAMES done in secnames.h when necessary */
00046
00047 #include "tesseractclass.h"
00048 #include          "notdll.h"
00049
00050 // Include automatically generated configuration file if running autoconf.
00051 #ifdef HAVE_CONFIG_H
00052 #include "config_auto.h"
00053 #endif
00054
00055 CLISTIZEH (STRING) CLISTIZE (STRING)  // NOTE(review): presumably expands to the CLIST wrapper declarations and definitions for STRING -- confirm against the header that defines these macros.
00056
00057 /*************************************************************************
00058  * set_done()
00059  *
00060  * Set the done flag based on the word acceptability criteria
00061  *************************************************************************/
00062
00063 namespace tesseract {
00064 void Tesseract::set_done(  //set done flag
00065                          WERD_RES *word,
00066                          inT16 pass) {
00067   /*
00068   0: Original heuristic used in Tesseract and Ray's prototype Resaljet
00069   */
00070   if (tessedit_ok_mode == 0) {
00071     /* NOTE - done even if word contains some or all spaces !!! */
00072     word->done = word->tess_accepted;
00073   }
00074   /*
00075   1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
00076   */
00077   else if (tessedit_ok_mode == 1) {
00078     word->done = word->tess_accepted &&
00079       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
00080
00081     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00082       word->done = FALSE;
00083   }
00084   /*
00085   2: as 1 + only accept dict words or numerics in pass 1
00086   */
00087   else if (tessedit_ok_mode == 2) {
00088     word->done = word->tess_accepted &&
00089       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
00090
00091     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00092       word->done = FALSE;
00093
00094     if (word->done &&
00095       (pass == 1) &&
00096       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00097       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00098       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00099     (word->best_choice->permuter () != NUMBER_PERM)) {
00100       #ifndef SECURE_NAMES
00101       if (tessedit_rejection_debug)
00102         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00103           word->best_choice->unichar_string().string ());
00104       #endif
00105       word->done = FALSE;
00106     }
00107   }
00108   /*
00109   3: as 2 + only accept dict words or numerics in pass 2 as well
00110   */
00111   else if (tessedit_ok_mode == 3) {
00112     word->done = word->tess_accepted &&
00113       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
00114
00115     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00116       word->done = FALSE;
00117
00118     if (word->done &&
00119       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00120       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00121       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00122     (word->best_choice->permuter () != NUMBER_PERM)) {
00123       #ifndef SECURE_NAMES
00124       if (tessedit_rejection_debug)
00125         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00126           word->best_choice->unichar_string().string ());
00127       #endif
00128       word->done = FALSE;
00129     }
00130   }
00131   /*
00132   4: as 2 + reject dict ambigs in pass 1
00133   */
00134   else if (tessedit_ok_mode == 4) {
00135     word->done = word->tess_accepted &&
00136       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
00137
00138     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00139       word->done = FALSE;
00140
00141     if (word->done &&
00142       (pass == 1) &&
00143       (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00144       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00145       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00146       (word->best_choice->permuter () != NUMBER_PERM)) ||
00147     (test_ambig_word (word)))) {
00148       #ifndef SECURE_NAMES
00149       if (tessedit_rejection_debug)
00150         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00151           word->best_choice->unichar_string().string ());
00152       #endif
00153       word->done = FALSE;
00154     }
00155   }
00156   /*
00157   5: as 3 + reject dict ambigs in both passes
00158   */
00159   else if (tessedit_ok_mode == 5) {
00160     word->done = word->tess_accepted &&
00161       (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
00162
00163     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00164       word->done = FALSE;
00165
00166     if (word->done &&
00167       (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00168       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00169       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00170       (word->best_choice->permuter () != NUMBER_PERM)) ||
00171     (test_ambig_word (word)))) {
00172       #ifndef SECURE_NAMES
00173       if (tessedit_rejection_debug)
00174         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00175           word->best_choice->unichar_string().string ());
00176       #endif
00177       word->done = FALSE;
00178     }
00179   }
00180
00181   else {
00182     tprintf ("BAD tessedit_ok_mode\n");
00183     err_exit();
00184   }
00185 }
00186
00187
00188 /*************************************************************************
00189  * make_reject_map()
00190  *
00191  * Sets the done flag to indicate whether the result is acceptable.
00192  *
00193  * Sets a reject map for the word.
00194  *************************************************************************/
00195 void Tesseract::make_reject_map(      //make rej map for wd //detailed results
00196                                 WERD_RES *word,
00197                                 BLOB_CHOICE_LIST_CLIST *blob_choices,
00198                                 ROW *row,
00199                                 inT16 pass  //1st or 2nd?
00200                                ) {
00201   int i;
00202   int offset;
00203
00204   flip_0O(word);
00205   check_debug_pt(word, -1);     // For trap only
00206   set_done(word, pass);  // Set acceptance
00207   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
00208   reject_blanks(word);
00209   /*
00210   0: Rays original heuristic - the baseline
00211   */
00212   if (tessedit_reject_mode == 0) {
00213     if (!word->done)
00214       reject_poor_matches(word, blob_choices);
00215   } else if (tessedit_reject_mode == 5) {
00216     /*
00217     5: Reject I/1/l from words where there is no strong contextual confirmation;
00218       the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
00219       and the whole of any words which are very small
00220     */
00221     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
00222       word->reject_map.rej_word_small_xht();
00223     } else {
00224       one_ell_conflict(word, TRUE);
00225       /*
00226         Originally the code here just used the done flag. Now I have duplicated
00227         and unpacked the conditions for setting the done flag so that each
00228         mechanism can be turned on or off independently. This works WITHOUT
00229         affecting the done flag setting.
00230       */
00231       if (rej_use_tess_accepted && !word->tess_accepted)
00232         word->reject_map.rej_word_not_tess_accepted ();
00233
00234       if (rej_use_tess_blanks &&
00235         (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
00236         word->reject_map.rej_word_contains_blanks ();
00237
00238       WERD_CHOICE* best_choice = word->best_choice;
00239       if (rej_use_good_perm) {
00240         if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
00241              best_choice->permuter() == FREQ_DAWG_PERM ||
00242              best_choice->permuter() == USER_DAWG_PERM) &&
00243             (!rej_use_sensible_wd ||
00244              acceptable_word_string(*word->uch_set,
00245                                     best_choice->unichar_string().string(),
00246                                     best_choice->unichar_lengths().string()) !=
00247                                         AC_UNACCEPTABLE)) {
00248           // PASSED TEST
00249         } else if (best_choice->permuter() == NUMBER_PERM) {
00250           if (rej_alphas_in_number_perm) {
00251             for (i = 0, offset = 0;
00252                  best_choice->unichar_string()[offset] != '\0';
00253                  offset += best_choice->unichar_lengths()[i++]) {
00254               if (word->reject_map[i].accepted() &&
00255                   word->uch_set->get_isalpha(
00256                       best_choice->unichar_string().string() + offset,
00257                       best_choice->unichar_lengths()[i]))
00258                 word->reject_map[i].setrej_bad_permuter();
00259               // rej alpha
00260             }
00261           }
00262         } else {
00263           word->reject_map.rej_word_bad_permuter();
00264         }
00265       }
00266       /* Ambig word rejection was here once !!*/
00267     }
00268   } else {
00269     tprintf("BAD tessedit_reject_mode\n");
00270     err_exit();
00271   }
00272
00273   if (tessedit_image_border > -1)
00274     reject_edge_blobs(word);
00275
00276   check_debug_pt (word, 10);
00277   if (tessedit_rejection_debug) {
00278     tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
00279     tprintf("Certainty: %f     Rating: %f\n",
00280       word->best_choice->certainty (), word->best_choice->rating ());
00281     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
00282   }
00283
00284   flip_hyphens(word);
00285   check_debug_pt(word, 20);
00286 }
00287 }  // namespace tesseract
00288
00289
00290 void reject_blanks(WERD_RES *word) {
00291   inT16 i;
00292   inT16 offset;
00293
00294   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00295        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00296     if (word->best_choice->unichar_string()[offset] == ' ')
00297                                  //rej unrecognised blobs
00298       word->reject_map[i].setrej_tess_failure ();
00299   }
00300 }
00301
00302 namespace tesseract {
00303 void Tesseract::reject_I_1_L(WERD_RES *word) {
00304   inT16 i;
00305   inT16 offset;
00306
00307   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00308        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00309     if (STRING (conflict_set_I_l_1).
00310     contains (word->best_choice->unichar_string()[offset])) {
00311                                  //rej 1Il conflict
00312       word->reject_map[i].setrej_1Il_conflict ();
00313     }
00314   }
00315 }
00316 }  // namespace tesseract
00317
00318
00319 void reject_poor_matches(  //detailed results
00320                          WERD_RES *word,
00321                          BLOB_CHOICE_LIST_CLIST *blob_choices) {
00322   float threshold;
00323   inT16 i = 0;
00324   inT16 offset = 0;
00325                                  //super iterator
00326   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00327   BLOB_CHOICE_IT choice_it;      //real iterator
00328
00329   #ifndef SECURE_NAMES
00330   if (strlen(word->best_choice->unichar_lengths().string()) !=
00331       list_it.length()) {
00332     tprintf
00333       ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
00334       word->best_choice->unichar_string().string(),
00335       strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
00336       word->box_word->length());
00337   }
00338   #endif
00339   ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
00340     list_it.length ());
00341   ASSERT_HOST(word->box_word->length() == list_it.length());
00342   threshold = compute_reject_threshold (blob_choices);
00343
00344   for (list_it.mark_cycle_pt ();
00345   !list_it.cycled_list (); list_it.forward (), i++,
00346            offset += word->best_choice->unichar_lengths()[i]) {
00347     /* NB - only compares the threshold against the TOP choice char in the
00348       choices list for a blob !! - the selected one may be below the threshold
00349     */
00350     choice_it.set_to_list (list_it.data ());
00351     if ((word->best_choice->unichar_string()[offset] == ' ') ||
00352       (choice_it.length () == 0))
00353                                  //rej unrecognised blobs
00354       word->reject_map[i].setrej_tess_failure ();
00355     else if (choice_it.data ()->certainty () < threshold)
00356                                  //rej poor score blob
00357       word->reject_map[i].setrej_poor_match ();
00358   }
00359 }
00360
00361
00362 /**********************************************************************
00363  * compute_reject_threshold
00364  *
00365  * Set a rejection threshold for this word.
00366  * Initially this is a trivial function which looks for the largest
00367  * gap in the certainty value.
00368  **********************************************************************/
00369
00370 float compute_reject_threshold(  //compute threshold //detailed results
00371                                BLOB_CHOICE_LIST_CLIST *blob_choices) {
00372   inT16 index;                   //to ratings
00373   inT16 blob_count;              //no of blobs in word
00374   inT16 ok_blob_count = 0;       //non TESS rej blobs in word
00375   float *ratings;                //array of confidences
00376   float threshold;               //rejection threshold
00377   float bestgap;                 //biggest gap
00378   float gapstart;                //bottom of gap
00379                                  //super iterator
00380   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00381   BLOB_CHOICE_IT choice_it;      //real iterator
00382
00383   blob_count = blob_choices->length ();
00384   ratings = (float *) alloc_mem (blob_count * sizeof (float));
00385   for (list_it.mark_cycle_pt (), index = 0;
00386   !list_it.cycled_list (); list_it.forward (), index++) {
00387     choice_it.set_to_list (list_it.data ());
00388     if (choice_it.length () > 0) {
00389       ratings[ok_blob_count] = choice_it.data ()->certainty ();
00390       //get in an array
00391       //                 tprintf("Rating[%d]=%c %g %g\n",
00392       //                         index,choice_it.data()->char_class(),
00393       //                         choice_it.data()->rating(),choice_it.data()->certainty());
00394       ok_blob_count++;
00395     }
00396   }
00397   ASSERT_HOST (index == blob_count);
00398   qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
00399   //sort them
00400   bestgap = 0;
00401   gapstart = ratings[0] - 1;     //all reject if none better
00402   if (ok_blob_count >= 3) {
00403     for (index = 0; index < ok_blob_count - 1; index++) {
00404       if (ratings[index + 1] - ratings[index] > bestgap) {
00405         bestgap = ratings[index + 1] - ratings[index];
00406         //find biggest
00407         gapstart = ratings[index];
00408       }
00409     }
00410   }
00411   threshold = gapstart + bestgap / 2;
00412   //      tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
00413   //              ratings[0],ratings[index],bestgap,threshold);
00414
00415   free_mem(ratings);
00416   return threshold;
00417 }
00418
00419
00420 /*************************************************************************
00421  * reject_edge_blobs()
00422  *
00423  * If the word is perilously close to the edge of the image, reject those blobs
00424  * in the word which are too close to the edge as they could be clipped.
00425  *************************************************************************/
00426 namespace tesseract {
00427 void Tesseract::reject_edge_blobs(WERD_RES *word) {
00428   TBOX word_box = word->word->bounding_box();
00429   // Use the box_word as it is already denormed back to image coordinates.
00430   int blobcount = word->box_word->length();
00431
00432   if (word_box.left() < tessedit_image_border ||
00433       word_box.bottom() < tessedit_image_border ||
00434       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00435       word_box.top() + tessedit_image_border > ImageHeight() - 1) {
00436     ASSERT_HOST(word->reject_map.length() == blobcount);
00437     for (int blobindex = 0; blobindex < blobcount; blobindex++) {
00438       TBOX blob_box = word->box_word->BlobBox(blobindex);
00439       if (blob_box.left() < tessedit_image_border ||
00440           blob_box.bottom() < tessedit_image_border ||
00441           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00442           blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
00443         word->reject_map[blobindex].setrej_edge_char();
00444         // Close to edge
00445       }
00446     }
00447   }
00448 }
00449
00450 /**********************************************************************
00451  * one_ell_conflict()
00452  *
00453  * Identify words where there is a potential I/l/1 error.
00454  * - A bundle of contextual heuristics!
00455  **********************************************************************/
00456 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00457   const char *word;
00458   const char *lengths;
00459   inT16 word_len;                //its length
00460   inT16 first_alphanum_index_;
00461   inT16 first_alphanum_offset_;
00462   inT16 i;
00463   inT16 offset;
00464   BOOL8 non_conflict_set_char;   //non conf set a/n?
00465   BOOL8 conflict = FALSE;
00466   BOOL8 allow_1s;
00467   ACCEPTABLE_WERD_TYPE word_type;
00468   BOOL8 dict_perm_type;
00469   BOOL8 dict_word_ok;
00470   int dict_word_type;
00471
00472   word = word_res->best_choice->unichar_string().string ();
00473   lengths = word_res->best_choice->unichar_lengths().string();
00474   word_len = strlen (lengths);
00475   /*
00476     If there are no occurrences of the conflict set characters then the word
00477     is OK.
00478   */
00479   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00480     return FALSE;
00481
00482   /*
00483     There is a conflict if there are NO other (confirmed) alphanumerics apart
00484     from those in the conflict set.
00485   */
00486
00487   for (i = 0, offset = 0, non_conflict_set_char = FALSE;
00488        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
00489     non_conflict_set_char =
00490         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
00491             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
00492         !STRING (conflict_set_I_l_1).contains (word[offset]);
00493   if (!non_conflict_set_char) {
00494     if (update_map)
00495       reject_I_1_L(word_res);
00496     return TRUE;
00497   }
00498
00499   /*
00500     If the word is accepted by a dawg permuter, and the first alpha character
00501     is "I" or "l", check to see if the alternative is also a dawg word. If it
00502     is, then there is a potential error otherwise the word is ok.
00503   */
00504
00505   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00506     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00507     (rej_trust_doc_dawg &&
00508     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00509     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00510   dict_word_type = dict_word(*(word_res->best_choice));
00511   dict_word_ok = (dict_word_type > 0) &&
00512     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00513
00514   if ((rej_1Il_use_dict_word && dict_word_ok) ||
00515     (rej_1Il_trust_permuter_type && dict_perm_type) ||
00516   (dict_perm_type && dict_word_ok)) {
00517     first_alphanum_index_ = first_alphanum_index (word, lengths);
00518     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00519     if (lengths[first_alphanum_index_] == 1 &&
00520         word[first_alphanum_offset_] == 'I') {
00521       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00522       if (safe_dict_word(word_res) > 0) {
00523         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00524         if (update_map)
00525           word_res->reject_map[first_alphanum_index_].
00526             setrej_1Il_conflict();
00527         return TRUE;
00528       }
00529       else {
00530         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00531         return FALSE;
00532       }
00533     }
00534
00535     if (lengths[first_alphanum_index_] == 1 &&
00536         word[first_alphanum_offset_] == 'l') {
00537       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00538       if (safe_dict_word(word_res) > 0) {
00539         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00540         if (update_map)
00541           word_res->reject_map[first_alphanum_index_].
00542             setrej_1Il_conflict();
00543         return TRUE;
00544       }
00545       else {
00546         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00547         return FALSE;
00548       }
00549     }
00550     return FALSE;
00551   }
00552
00553   /*
00554     NEW 1Il code. The old code relied on permuter types too much. In fact,
00555     tess will use TOP_CHOICE permute for good things like "palette".
00556     In this code the string is examined independently to see if it looks like
00557     a well formed word.
00558   */
00559
00560   /*
00561     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
00562     dictionary word.
00563   */
00564   first_alphanum_index_ = first_alphanum_index (word, lengths);
00565   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00566   if (lengths[first_alphanum_index_] == 1 &&
00567       word[first_alphanum_offset_] == 'l') {
00568     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00569     if (safe_dict_word(word_res) > 0)
00570       return FALSE;
00571     else
00572       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00573   }
00574   else if (lengths[first_alphanum_index_] == 1 &&
00575            word[first_alphanum_offset_] == 'I') {
00576     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00577     if (safe_dict_word(word_res) > 0)
00578       return FALSE;
00579     else
00580       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00581   }
00582   /*
00583     For strings containing digits:
00584       If there are no alphas OR the numeric permuter liked the word,
00585         reject any non 1 conflict chs
00586       Else reject all conflict chs
00587   */
00588   if (word_contains_non_1_digit (word, lengths)) {
00589     allow_1s = (alpha_count (word, lengths) == 0) ||
00590       (word_res->best_choice->permuter () == NUMBER_PERM);
00591
00592     inT16 offset;
00593     conflict = FALSE;
00594     for (i = 0, offset = 0; word[offset] != '\0';
00595          offset += word_res->best_choice->unichar_lengths()[i++]) {
00596       if ((!allow_1s || (word[offset] != '1')) &&
00597       STRING (conflict_set_I_l_1).contains (word[offset])) {
00598         if (update_map)
00599           word_res->reject_map[i].setrej_1Il_conflict ();
00600         conflict = TRUE;
00601       }
00602     }
00603     return conflict;
00604   }
00605   /*
00606     For anything else. See if it conforms to an acceptable word type. If so,
00607     treat accordingly.
00608   */
00609   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
00610   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
00611     first_alphanum_index_ = first_alphanum_index (word, lengths);
00612     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00613     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
00614       if (update_map)
00615         word_res->reject_map[first_alphanum_index_].
00616             setrej_1Il_conflict ();
00617       return TRUE;
00618     }
00619     else
00620       return FALSE;
00621   }
00622   else if (word_type == AC_UPPER_CASE) {
00623     return FALSE;
00624   }
00625   else {
00626     if (update_map)
00627       reject_I_1_L(word_res);
00628     return TRUE;
00629   }
00630 }
00631
00632
00633 inT16 Tesseract::first_alphanum_index(const char *word,
00634                                       const char *word_lengths) {
00635   inT16 i;
00636   inT16 offset;
00637
00638   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00639     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00640         unicharset.get_isdigit(word + offset, word_lengths[i]))
00641       return i;
00642   }
00643   return -1;
00644 }
00645
00646 inT16 Tesseract::first_alphanum_offset(const char *word,
00647                                        const char *word_lengths) {
00648   inT16 i;
00649   inT16 offset;
00650
00651   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00652     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00653         unicharset.get_isdigit(word + offset, word_lengths[i]))
00654       return offset;
00655   }
00656   return -1;
00657 }
00658
00659 inT16 Tesseract::alpha_count(const char *word,
00660                              const char *word_lengths) {
00661   inT16 i;
00662   inT16 offset;
00663   inT16 count = 0;
00664
00665   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00666     if (unicharset.get_isalpha (word + offset, word_lengths[i]))
00667       count++;
00668   }
00669   return count;
00670 }
00671
00672
00673 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
00674                                            const char *word_lengths) {
00675   inT16 i;
00676   inT16 offset;
00677
00678   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00679     if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
00680         (word_lengths[i] != 1 || word[offset] != '1'))
00681       return TRUE;
00682   }
00683   return FALSE;
00684 }
00685
00686
00687 BOOL8 Tesseract::test_ambig_word(  //test for ambiguity
00688                                  WERD_RES *word) {
00689     BOOL8 ambig = FALSE;
00690
00691     if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00692       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00693     (word->best_choice->permuter () == USER_DAWG_PERM)) {
00694       ambig = !getDict().NoDangerousAmbig(
00695           word->best_choice, NULL, false, NULL, NULL);
00696   }
00697   return ambig;
00698 }
00699
00700
00701 /*************************************************************************
00702  * dont_allow_1Il()
00703  * Don't unreject LONE accepted 1Il conflict set chars
00704  *************************************************************************/
00705 void Tesseract::dont_allow_1Il(WERD_RES *word) {
00706   int i = 0;
00707   int offset;
00708   int word_len = word->reject_map.length();
00709   const char *s = word->best_choice->unichar_string().string();
00710   const char *lengths = word->best_choice->unichar_lengths().string();
00711   BOOL8 accepted_1Il = FALSE;
00712
00713   for (i = 0, offset = 0; i < word_len;
00714        offset += word->best_choice->unichar_lengths()[i++]) {
00715     if (word->reject_map[i].accepted()) {
00716       if (STRING(conflict_set_I_l_1).contains(s[offset])) {
00717         accepted_1Il = TRUE;
00718       } else {
00719         if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
00720             word->uch_set->get_isdigit(s + offset, lengths[i]))
00721           return;                // >=1 non 1Il ch accepted
00722       }
00723     }
00724   }
00725   if (!accepted_1Il)
00726     return;                      //Nothing to worry about
00727
00728   for (i = 0, offset = 0; i < word_len;
00729        offset += word->best_choice->unichar_lengths()[i++]) {
00730     if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
00731       word->reject_map[i].accepted())
00732       word->reject_map[i].setrej_postNN_1Il();
00733   }
00734 }
00735
00736
00737 inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
00738   int count = 0;
00739   const WERD_CHOICE *best_choice = word_res->best_choice;
00740   for (int i = 0; i < word_res->reject_map.length(); ++i) {
00741     if ((word_res->reject_map[i].accepted()) &&
00742         (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
00743             word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
00744       count++;
00745     }
00746   }
00747   return count;
00748 }
00749
00750
00751 // reject all if most rejected.
00752 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
00753   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
00754
00755   if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
00756     rej_whole_of_mostly_reject_word_fract)
00757     word->reject_map.rej_word_mostly_rej();
00758 }
00759
00760
00761 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
00762   inT16 char_quality;
00763   inT16 accepted_char_quality;
00764
00765   if (word->best_choice->unichar_lengths().length() <= 1)
00766     return FALSE;
00767
00768   if (!STRING(ok_repeated_ch_non_alphanum_wds).
00769     contains(word->best_choice->unichar_string()[0]))
00770     return FALSE;
00771
00772   UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
00773   for (int i = 1; i < word->best_choice->length(); ++i) {
00774     if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
00775   }
00776
00777   word_char_quality(word, row, &char_quality, &accepted_char_quality);
00778
00779   if ((word->best_choice->unichar_lengths().length () == char_quality) &&
00780     (char_quality == accepted_char_quality))
00781     return TRUE;
00782   else
00783     return FALSE;
00784 }
00785
00786 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
00787   const WERD_CHOICE &word = *werd_res->best_choice;
00788   int dict_word_type = werd_res->tesseract->dict_word(word);
00789   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
00790 }
00791
00792 // Note: After running this function word_res->best_choice->blob_choices()
00793 // might not contain the right BLOB_CHOICE coresponding to each character
00794 // in word_res->best_choice. However, the length of blob_choices and
00795 // word_res->best_choice will remain the same.
00796 void Tesseract::flip_hyphens(WERD_RES *word_res) {
00797   WERD_CHOICE *best_choice = word_res->best_choice;
00798   int i;
00799   int prev_right = -9999;
00800   int next_left;
00801   TBOX out_box;
00802   float aspect_ratio;
00803
00804   if (tessedit_lower_flip_hyphen <= 1)
00805     return;
00806
00807   TBLOB* blob = word_res->rebuild_word->blobs;
00808   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00809   bool modified = false;
00810   for (i = 0; i < best_choice->length() && blob != NULL; ++i,
00811        blob = blob->next) {
00812     out_box = blob->bounding_box();
00813     if (blob->next == NULL)
00814       next_left = 9999;
00815     else
00816       next_left = blob->next->bounding_box().left();
00817     // Dont touch small or touching blobs - it is too dangerous.
00818     if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
00819         (out_box.left() > prev_right) && (out_box.right() < next_left)) {
00820       aspect_ratio = out_box.width() / (float) out_box.height();
00821       if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
00822         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
00823             word_res->uch_set->contains_unichar_id(unichar_dash) &&
00824             word_res->uch_set->get_enabled(unichar_dash)) {
00825           /* Certain HYPHEN */
00826           best_choice->set_unichar_id(unichar_dash, i);
00827           modified = true;
00828           if (word_res->reject_map[i].rejected())
00829             word_res->reject_map[i].setrej_hyphen_accept();
00830         }
00831         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
00832           word_res->reject_map[i].accepted())
00833                                  //Suspected HYPHEN
00834           word_res->reject_map[i].setrej_hyphen ();
00835       }
00836       else if (best_choice->unichar_id(i) == unichar_dash) {
00837         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
00838           (word_res->reject_map[i].rejected()))
00839           word_res->reject_map[i].setrej_hyphen_accept();
00840         //Certain HYPHEN
00841
00842         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
00843           (word_res->reject_map[i].accepted()))
00844                                  //Suspected HYPHEN
00845           word_res->reject_map[i].setrej_hyphen();
00846       }
00847     }
00848     prev_right = out_box.right();
00849   }
00850 }
00851
00852 // Note: After running this function word_res->best_choice->blob_choices()
00853 // might not contain the right BLOB_CHOICE coresponding to each character
00854 // in word_res->best_choice. However, the length of blob_choices and
00855 // word_res->best_choice will remain the same.
00856 void Tesseract::flip_0O(WERD_RES *word_res) {
00857   WERD_CHOICE *best_choice = word_res->best_choice;
00858   int i;
00859   TBOX out_box;
00860
00861   if (!tessedit_flip_0O)
00862     return;
00863
00864   TBLOB* blob = word_res->rebuild_word->blobs;
00865   for (i = 0; i < best_choice->length() && blob != NULL; ++i,
00866        blob = blob->next) {
00867     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
00868         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
00869       out_box = blob->bounding_box();
00870       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
00871         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
00872         return;                  //Beware words with sub/superscripts
00873     }
00874   }
00875   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
00876   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
00877   if (unichar_0 == INVALID_UNICHAR_ID ||
00878       !word_res->uch_set->get_enabled(unichar_0) ||
00879       unichar_O == INVALID_UNICHAR_ID ||
00880       !word_res->uch_set->get_enabled(unichar_O)) {
00881     return;  // 0 or O are not present/enabled in unicharset
00882   }
00883   bool modified = false;
00884   for (i = 1; i < best_choice->length(); ++i) {
00885     if (best_choice->unichar_id(i) == unichar_0 ||
00886         best_choice->unichar_id(i) == unichar_O) {
00887       /* A0A */
00888       if ((i+1) < best_choice->length() &&
00889           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00890           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00891         best_choice->set_unichar_id(unichar_O, i);
00892         modified = true;
00893       }
00894       /* A00A */
00895       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00896           (i+1) < best_choice->length() &&
00897           (best_choice->unichar_id(i+1) == unichar_0 ||
00898            best_choice->unichar_id(i+1) == unichar_O) &&
00899           (i+2) < best_choice->length() &&
00900           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
00901         best_choice->set_unichar_id(unichar_O, i);
00902         modified = true;
00903         i++;
00904       }
00905       /* AA0<non digit or end of word> */
00906       if ((i > 1) &&
00907           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
00908           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00909           (((i+1) < best_choice->length() &&
00910             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
00911             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
00912             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
00913            (i == best_choice->length() - 1))) {
00914         best_choice->set_unichar_id(unichar_O, i);
00915         modified = true;
00916       }
00917       /* 9O9 */
00918       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00919           (i+1) < best_choice->length() &&
00920           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00921         best_choice->set_unichar_id(unichar_0, i);
00922         modified = true;
00923       }
00924       /* 9OOO */
00925       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00926           (i+2) < best_choice->length() &&
00927           (best_choice->unichar_id(i+1) == unichar_0 ||
00928            best_choice->unichar_id(i+1) == unichar_O) &&
00929           (best_choice->unichar_id(i+2) == unichar_0 ||
00930            best_choice->unichar_id(i+2) == unichar_O)) {
00931         best_choice->set_unichar_id(unichar_0, i);
00932         best_choice->set_unichar_id(unichar_0, i+1);
00933         best_choice->set_unichar_id(unichar_0, i+2);
00934         modified = true;
00935         i += 2;
00936       }
00937       /* 9OO<non upper> */
00938       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00939           (i+2) < best_choice->length() &&
00940           (best_choice->unichar_id(i+1) == unichar_0 ||
00941           best_choice->unichar_id(i+1) == unichar_O) &&
00942           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
00943         best_choice->set_unichar_id(unichar_0, i);
00944         best_choice->set_unichar_id(unichar_0, i+1);
00945         modified = true;
00946         i++;
00947       }
00948       /* 9O<non upper> */
00949       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00950           (i+1) < best_choice->length() &&
00951           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
00952         best_choice->set_unichar_id(unichar_0, i);
00953       }
00954       /* 9[.,]OOO.. */
00955       if ((i > 1) &&
00956           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
00957               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
00958           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
00959            best_choice->unichar_id(i-2) == unichar_O)) {
00960         if (best_choice->unichar_id(i-2) == unichar_O) {
00961           best_choice->set_unichar_id(unichar_0, i-2);
00962           modified = true;
00963         }
00964         while (i < best_choice->length() &&
00965                (best_choice->unichar_id(i) == unichar_O ||
00966                 best_choice->unichar_id(i) == unichar_0)) {
00967           best_choice->set_unichar_id(unichar_0, i);
00968           modified = true;
00969           i++;
00970         }
00971         i--;
00972       }
00973     }
00974   }
00975 }
00976
00977 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00978   return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
00979 }
00980
00981 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00982   return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
00983 }
00984 }  // namespace tesseract