Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: reject.cpp (Formerly reject.c) 00003 * Description: Rejection functions used in tessedit 00004 * Author: Phil Cheatle 00005 * Created: Wed Sep 23 16:50:21 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #pragma warning(disable:4305) // int/float warnings 00023 #endif 00024 00025 #include "mfcpch.h" 00026 00027 #include "tessvars.h" 00028 #ifdef __UNIX__ 00029 #include <assert.h> 00030 #include <errno.h> 00031 #endif 00032 #include "scanutils.h" 00033 #include <ctype.h> 00034 #include <string.h> 00035 #include "memry.h" 00036 #include "reject.h" 00037 #include "tfacep.h" 00038 #include "imgs.h" 00039 #include "control.h" 00040 #include "docqual.h" 00041 #include "secname.h" 00042 #include "globals.h" 00043 #include "helpers.h" 00044 00045 /* #define SECURE_NAMES done in secnames.h when necessary */ 00046 00047 #include "tesseractclass.h" 00048 #include "notdll.h" 00049 00050 // Include automatically generated configuration file if running autoconf. 00051 #ifdef HAVE_CONFIG_H 00052 #include "config_auto.h" 00053 #endif 00054 00055 CLISTIZEH (STRING) CLISTIZE (STRING) 00056 00057 /************************************************************************* 00058 * set_done() 00059 * 00060 * Set the done flag based on the word acceptability criteria 00061 *************************************************************************/ 00062 00063 namespace tesseract { 00064 void Tesseract::set_done( //set done flag 00065 WERD_RES *word, 00066 inT16 pass) { 00067 /* 00068 0: Original heuristic used in Tesseract and Ray's prototype Resaljet 00069 */ 00070 if (tessedit_ok_mode == 0) { 00071 /* NOTE - done even if word contains some or all spaces !!! */ 00072 word->done = word->tess_accepted; 00073 } 00074 /* 00075 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts 00076 */ 00077 else if (tessedit_ok_mode == 1) { 00078 word->done = word->tess_accepted && 00079 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); 00080 00081 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) 00082 word->done = FALSE; 00083 } 00084 /* 00085 2: as 1 + only accept dict words or numerics in pass 1 00086 */ 00087 else if (tessedit_ok_mode == 2) { 00088 word->done = word->tess_accepted && 00089 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); 00090 00091 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) 00092 word->done = FALSE; 00093 00094 if (word->done && 00095 (pass == 1) && 00096 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && 00097 (word->best_choice->permuter () != FREQ_DAWG_PERM) && 00098 (word->best_choice->permuter () != USER_DAWG_PERM) && 00099 (word->best_choice->permuter () != NUMBER_PERM)) { 00100 #ifndef SECURE_NAMES 00101 if (tessedit_rejection_debug) 00102 tprintf ("\nVETO Tess accepting poor word \"%s\"\n", 00103 word->best_choice->unichar_string().string ()); 00104 #endif 00105 word->done = FALSE; 00106 } 00107 } 00108 /* 00109 3: as 2 + only accept dict words or numerics in pass 2 as well 00110 */ 00111 else if (tessedit_ok_mode == 3) { 00112 word->done = word->tess_accepted && 00113 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); 00114 00115 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) 00116 word->done = FALSE; 00117 00118 if (word->done && 00119 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && 00120 (word->best_choice->permuter () != FREQ_DAWG_PERM) && 00121 (word->best_choice->permuter () != USER_DAWG_PERM) && 00122 (word->best_choice->permuter () != NUMBER_PERM)) { 00123 #ifndef SECURE_NAMES 00124 if (tessedit_rejection_debug) 00125 tprintf ("\nVETO Tess accepting poor word \"%s\"\n", 00126 word->best_choice->unichar_string().string ()); 00127 #endif 00128 word->done = FALSE; 00129 } 00130 } 00131 /* 00132 4: as 2 + reject dict ambigs in pass 1 00133 */ 00134 else if (tessedit_ok_mode == 4) { 00135 word->done = word->tess_accepted && 00136 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); 00137 00138 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) 00139 word->done = FALSE; 00140 00141 if (word->done && 00142 (pass == 1) && 00143 (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && 00144 (word->best_choice->permuter () != FREQ_DAWG_PERM) && 00145 (word->best_choice->permuter () != USER_DAWG_PERM) && 00146 (word->best_choice->permuter () != NUMBER_PERM)) || 00147 (test_ambig_word (word)))) { 00148 #ifndef SECURE_NAMES 00149 if (tessedit_rejection_debug) 00150 tprintf ("\nVETO Tess accepting poor word \"%s\"\n", 00151 word->best_choice->unichar_string().string ()); 00152 #endif 00153 word->done = FALSE; 00154 } 00155 } 00156 /* 00157 5: as 3 + reject dict ambigs in both passes 00158 */ 00159 else if (tessedit_ok_mode == 5) { 00160 word->done = word->tess_accepted && 00161 (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); 00162 00163 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) 00164 word->done = FALSE; 00165 00166 if (word->done && 00167 (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && 00168 (word->best_choice->permuter () != FREQ_DAWG_PERM) && 00169 (word->best_choice->permuter () != USER_DAWG_PERM) && 00170 (word->best_choice->permuter () != NUMBER_PERM)) || 00171 (test_ambig_word (word)))) { 00172 #ifndef SECURE_NAMES 00173 if (tessedit_rejection_debug) 00174 tprintf ("\nVETO Tess accepting poor word \"%s\"\n", 00175 word->best_choice->unichar_string().string ()); 00176 #endif 00177 word->done = FALSE; 00178 } 00179 } 00180 00181 else { 00182 tprintf ("BAD tessedit_ok_mode\n"); 00183 err_exit(); 00184 } 00185 } 00186 00187 00188 /************************************************************************* 00189 * make_reject_map() 00190 * 00191 * Sets the done flag to indicate whether the resylt is acceptable. 00192 * 00193 * Sets a reject map for the word. 00194 *************************************************************************/ 00195 void Tesseract::make_reject_map( //make rej map for wd //detailed results 00196 WERD_RES *word, 00197 BLOB_CHOICE_LIST_CLIST *blob_choices, 00198 ROW *row, 00199 inT16 pass //1st or 2nd? 00200 ) { 00201 int i; 00202 int offset; 00203 00204 flip_0O(word); 00205 check_debug_pt(word, -1); // For trap only 00206 set_done(word, pass); // Set acceptance 00207 word->reject_map.initialise(word->best_choice->unichar_lengths().length()); 00208 reject_blanks(word); 00209 /* 00210 0: Rays original heuristic - the baseline 00211 */ 00212 if (tessedit_reject_mode == 0) { 00213 if (!word->done) 00214 reject_poor_matches(word, blob_choices); 00215 } else if (tessedit_reject_mode == 5) { 00216 /* 00217 5: Reject I/1/l from words where there is no strong contextual confirmation; 00218 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); 00219 and the whole of any words which are very small 00220 */ 00221 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { 00222 word->reject_map.rej_word_small_xht(); 00223 } else { 00224 one_ell_conflict(word, TRUE); 00225 /* 00226 Originally the code here just used the done flag. Now I have duplicated 00227 and unpacked the conditions for setting the done flag so that each 00228 mechanism can be turned on or off independently. This works WITHOUT 00229 affecting the done flag setting. 00230 */ 00231 if (rej_use_tess_accepted && !word->tess_accepted) 00232 word->reject_map.rej_word_not_tess_accepted (); 00233 00234 if (rej_use_tess_blanks && 00235 (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) 00236 word->reject_map.rej_word_contains_blanks (); 00237 00238 WERD_CHOICE* best_choice = word->best_choice; 00239 if (rej_use_good_perm) { 00240 if ((best_choice->permuter() == SYSTEM_DAWG_PERM || 00241 best_choice->permuter() == FREQ_DAWG_PERM || 00242 best_choice->permuter() == USER_DAWG_PERM) && 00243 (!rej_use_sensible_wd || 00244 acceptable_word_string(*word->uch_set, 00245 best_choice->unichar_string().string(), 00246 best_choice->unichar_lengths().string()) != 00247 AC_UNACCEPTABLE)) { 00248 // PASSED TEST 00249 } else if (best_choice->permuter() == NUMBER_PERM) { 00250 if (rej_alphas_in_number_perm) { 00251 for (i = 0, offset = 0; 00252 best_choice->unichar_string()[offset] != '\0'; 00253 offset += best_choice->unichar_lengths()[i++]) { 00254 if (word->reject_map[i].accepted() && 00255 word->uch_set->get_isalpha( 00256 best_choice->unichar_string().string() + offset, 00257 best_choice->unichar_lengths()[i])) 00258 word->reject_map[i].setrej_bad_permuter(); 00259 // rej alpha 00260 } 00261 } 00262 } else { 00263 word->reject_map.rej_word_bad_permuter(); 00264 } 00265 } 00266 /* Ambig word rejection was here once !!*/ 00267 } 00268 } else { 00269 tprintf("BAD tessedit_reject_mode\n"); 00270 err_exit(); 00271 } 00272 00273 if (tessedit_image_border > -1) 00274 reject_edge_blobs(word); 00275 00276 check_debug_pt (word, 10); 00277 if (tessedit_rejection_debug) { 00278 tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); 00279 tprintf("Certainty: %f Rating: %f\n", 00280 word->best_choice->certainty (), word->best_choice->rating ()); 00281 tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); 00282 } 00283 00284 flip_hyphens(word); 00285 check_debug_pt(word, 20); 00286 } 00287 } // namespace tesseract 00288 00289 00290 void reject_blanks(WERD_RES *word) { 00291 inT16 i; 00292 inT16 offset; 00293 00294 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00295 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00296 if (word->best_choice->unichar_string()[offset] == ' ') 00297 //rej unrecognised blobs 00298 word->reject_map[i].setrej_tess_failure (); 00299 } 00300 } 00301 00302 namespace tesseract { 00303 void Tesseract::reject_I_1_L(WERD_RES *word) { 00304 inT16 i; 00305 inT16 offset; 00306 00307 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00308 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00309 if (STRING (conflict_set_I_l_1). 00310 contains (word->best_choice->unichar_string()[offset])) { 00311 //rej 1Il conflict 00312 word->reject_map[i].setrej_1Il_conflict (); 00313 } 00314 } 00315 } 00316 } // namespace tesseract 00317 00318 00319 void reject_poor_matches( //detailed results 00320 WERD_RES *word, 00321 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00322 float threshold; 00323 inT16 i = 0; 00324 inT16 offset = 0; 00325 //super iterator 00326 BLOB_CHOICE_LIST_C_IT list_it = blob_choices; 00327 BLOB_CHOICE_IT choice_it; //real iterator 00328 00329 #ifndef SECURE_NAMES 00330 if (strlen(word->best_choice->unichar_lengths().string()) != 00331 list_it.length()) { 00332 tprintf 00333 ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n", 00334 word->best_choice->unichar_string().string(), 00335 strlen (word->best_choice->unichar_lengths().string()), list_it.length(), 00336 word->box_word->length()); 00337 } 00338 #endif 00339 ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) == 00340 list_it.length ()); 00341 ASSERT_HOST(word->box_word->length() == list_it.length()); 00342 threshold = compute_reject_threshold (blob_choices); 00343 00344 for (list_it.mark_cycle_pt (); 00345 !list_it.cycled_list (); list_it.forward (), i++, 00346 offset += word->best_choice->unichar_lengths()[i]) { 00347 /* NB - only compares the threshold against the TOP choice char in the 00348 choices list for a blob !! - the selected one may be below the threshold 00349 */ 00350 choice_it.set_to_list (list_it.data ()); 00351 if ((word->best_choice->unichar_string()[offset] == ' ') || 00352 (choice_it.length () == 0)) 00353 //rej unrecognised blobs 00354 word->reject_map[i].setrej_tess_failure (); 00355 else if (choice_it.data ()->certainty () < threshold) 00356 //rej poor score blob 00357 word->reject_map[i].setrej_poor_match (); 00358 } 00359 } 00360 00361 00362 /********************************************************************** 00363 * compute_reject_threshold 00364 * 00365 * Set a rejection threshold for this word. 00366 * Initially this is a trivial function which looks for the largest 00367 * gap in the certainty value. 00368 **********************************************************************/ 00369 00370 float compute_reject_threshold( //compute threshold //detailed results 00371 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00372 inT16 index; //to ratings 00373 inT16 blob_count; //no of blobs in word 00374 inT16 ok_blob_count = 0; //non TESS rej blobs in word 00375 float *ratings; //array of confidences 00376 float threshold; //rejection threshold 00377 float bestgap; //biggest gap 00378 float gapstart; //bottom of gap 00379 //super iterator 00380 BLOB_CHOICE_LIST_C_IT list_it = blob_choices; 00381 BLOB_CHOICE_IT choice_it; //real iterator 00382 00383 blob_count = blob_choices->length (); 00384 ratings = (float *) alloc_mem (blob_count * sizeof (float)); 00385 for (list_it.mark_cycle_pt (), index = 0; 00386 !list_it.cycled_list (); list_it.forward (), index++) { 00387 choice_it.set_to_list (list_it.data ()); 00388 if (choice_it.length () > 0) { 00389 ratings[ok_blob_count] = choice_it.data ()->certainty (); 00390 //get in an array 00391 // tprintf("Rating[%d]=%c %g %g\n", 00392 // index,choice_it.data()->char_class(), 00393 // choice_it.data()->rating(),choice_it.data()->certainty()); 00394 ok_blob_count++; 00395 } 00396 } 00397 ASSERT_HOST (index == blob_count); 00398 qsort (ratings, ok_blob_count, sizeof (float), sort_floats); 00399 //sort them 00400 bestgap = 0; 00401 gapstart = ratings[0] - 1; //all reject if none better 00402 if (ok_blob_count >= 3) { 00403 for (index = 0; index < ok_blob_count - 1; index++) { 00404 if (ratings[index + 1] - ratings[index] > bestgap) { 00405 bestgap = ratings[index + 1] - ratings[index]; 00406 //find biggest 00407 gapstart = ratings[index]; 00408 } 00409 } 00410 } 00411 threshold = gapstart + bestgap / 2; 00412 // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n", 00413 // ratings[0],ratings[index],bestgap,threshold); 00414 00415 free_mem(ratings); 00416 return threshold; 00417 } 00418 00419 00420 /************************************************************************* 00421 * reject_edge_blobs() 00422 * 00423 * If the word is perilously close to the edge of the image, reject those blobs 00424 * in the word which are too close to the edge as they could be clipped. 00425 *************************************************************************/ 00426 namespace tesseract { 00427 void Tesseract::reject_edge_blobs(WERD_RES *word) { 00428 TBOX word_box = word->word->bounding_box(); 00429 // Use the box_word as it is already denormed back to image coordinates. 00430 int blobcount = word->box_word->length(); 00431 00432 if (word_box.left() < tessedit_image_border || 00433 word_box.bottom() < tessedit_image_border || 00434 word_box.right() + tessedit_image_border > ImageWidth() - 1 || 00435 word_box.top() + tessedit_image_border > ImageHeight() - 1) { 00436 ASSERT_HOST(word->reject_map.length() == blobcount); 00437 for (int blobindex = 0; blobindex < blobcount; blobindex++) { 00438 TBOX blob_box = word->box_word->BlobBox(blobindex); 00439 if (blob_box.left() < tessedit_image_border || 00440 blob_box.bottom() < tessedit_image_border || 00441 blob_box.right() + tessedit_image_border > ImageWidth() - 1 || 00442 blob_box.top() + tessedit_image_border > ImageHeight() - 1) { 00443 word->reject_map[blobindex].setrej_edge_char(); 00444 // Close to edge 00445 } 00446 } 00447 } 00448 } 00449 00450 /********************************************************************** 00451 * one_ell_conflict() 00452 * 00453 * Identify words where there is a potential I/l/1 error. 00454 * - A bundle of contextual heuristics! 00455 **********************************************************************/ 00456 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { 00457 const char *word; 00458 const char *lengths; 00459 inT16 word_len; //its length 00460 inT16 first_alphanum_index_; 00461 inT16 first_alphanum_offset_; 00462 inT16 i; 00463 inT16 offset; 00464 BOOL8 non_conflict_set_char; //non conf set a/n? 00465 BOOL8 conflict = FALSE; 00466 BOOL8 allow_1s; 00467 ACCEPTABLE_WERD_TYPE word_type; 00468 BOOL8 dict_perm_type; 00469 BOOL8 dict_word_ok; 00470 int dict_word_type; 00471 00472 word = word_res->best_choice->unichar_string().string (); 00473 lengths = word_res->best_choice->unichar_lengths().string(); 00474 word_len = strlen (lengths); 00475 /* 00476 If there are no occurrences of the conflict set characters then the word 00477 is OK. 00478 */ 00479 if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) 00480 return FALSE; 00481 00482 /* 00483 There is a conflict if there are NO other (confirmed) alphanumerics apart 00484 from those in the conflict set. 00485 */ 00486 00487 for (i = 0, offset = 0, non_conflict_set_char = FALSE; 00488 (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) 00489 non_conflict_set_char = 00490 (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || 00491 word_res->uch_set->get_isdigit(word + offset, lengths[i])) && 00492 !STRING (conflict_set_I_l_1).contains (word[offset]); 00493 if (!non_conflict_set_char) { 00494 if (update_map) 00495 reject_I_1_L(word_res); 00496 return TRUE; 00497 } 00498 00499 /* 00500 If the word is accepted by a dawg permuter, and the first alpha character 00501 is "I" or "l", check to see if the alternative is also a dawg word. If it 00502 is, then there is a potential error otherwise the word is ok. 00503 */ 00504 00505 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || 00506 (word_res->best_choice->permuter () == USER_DAWG_PERM) || 00507 (rej_trust_doc_dawg && 00508 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || 00509 (word_res->best_choice->permuter () == FREQ_DAWG_PERM); 00510 dict_word_type = dict_word(*(word_res->best_choice)); 00511 dict_word_ok = (dict_word_type > 0) && 00512 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); 00513 00514 if ((rej_1Il_use_dict_word && dict_word_ok) || 00515 (rej_1Il_trust_permuter_type && dict_perm_type) || 00516 (dict_perm_type && dict_word_ok)) { 00517 first_alphanum_index_ = first_alphanum_index (word, lengths); 00518 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00519 if (lengths[first_alphanum_index_] == 1 && 00520 word[first_alphanum_offset_] == 'I') { 00521 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00522 if (safe_dict_word(word_res) > 0) { 00523 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00524 if (update_map) 00525 word_res->reject_map[first_alphanum_index_]. 00526 setrej_1Il_conflict(); 00527 return TRUE; 00528 } 00529 else { 00530 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00531 return FALSE; 00532 } 00533 } 00534 00535 if (lengths[first_alphanum_index_] == 1 && 00536 word[first_alphanum_offset_] == 'l') { 00537 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00538 if (safe_dict_word(word_res) > 0) { 00539 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00540 if (update_map) 00541 word_res->reject_map[first_alphanum_index_]. 00542 setrej_1Il_conflict(); 00543 return TRUE; 00544 } 00545 else { 00546 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00547 return FALSE; 00548 } 00549 } 00550 return FALSE; 00551 } 00552 00553 /* 00554 NEW 1Il code. The old code relied on permuter types too much. In fact, 00555 tess will use TOP_CHOICE permute for good things like "palette". 00556 In this code the string is examined independently to see if it looks like 00557 a well formed word. 00558 */ 00559 00560 /* 00561 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a 00562 dictionary word. 00563 */ 00564 first_alphanum_index_ = first_alphanum_index (word, lengths); 00565 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00566 if (lengths[first_alphanum_index_] == 1 && 00567 word[first_alphanum_offset_] == 'l') { 00568 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00569 if (safe_dict_word(word_res) > 0) 00570 return FALSE; 00571 else 00572 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00573 } 00574 else if (lengths[first_alphanum_index_] == 1 && 00575 word[first_alphanum_offset_] == 'I') { 00576 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00577 if (safe_dict_word(word_res) > 0) 00578 return FALSE; 00579 else 00580 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00581 } 00582 /* 00583 For strings containing digits: 00584 If there are no alphas OR the numeric permuter liked the word, 00585 reject any non 1 conflict chs 00586 Else reject all conflict chs 00587 */ 00588 if (word_contains_non_1_digit (word, lengths)) { 00589 allow_1s = (alpha_count (word, lengths) == 0) || 00590 (word_res->best_choice->permuter () == NUMBER_PERM); 00591 00592 inT16 offset; 00593 conflict = FALSE; 00594 for (i = 0, offset = 0; word[offset] != '\0'; 00595 offset += word_res->best_choice->unichar_lengths()[i++]) { 00596 if ((!allow_1s || (word[offset] != '1')) && 00597 STRING (conflict_set_I_l_1).contains (word[offset])) { 00598 if (update_map) 00599 word_res->reject_map[i].setrej_1Il_conflict (); 00600 conflict = TRUE; 00601 } 00602 } 00603 return conflict; 00604 } 00605 /* 00606 For anything else. See if it conforms to an acceptable word type. If so, 00607 treat accordingly. 00608 */ 00609 word_type = acceptable_word_string(*word_res->uch_set, word, lengths); 00610 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { 00611 first_alphanum_index_ = first_alphanum_index (word, lengths); 00612 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00613 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) { 00614 if (update_map) 00615 word_res->reject_map[first_alphanum_index_]. 00616 setrej_1Il_conflict (); 00617 return TRUE; 00618 } 00619 else 00620 return FALSE; 00621 } 00622 else if (word_type == AC_UPPER_CASE) { 00623 return FALSE; 00624 } 00625 else { 00626 if (update_map) 00627 reject_I_1_L(word_res); 00628 return TRUE; 00629 } 00630 } 00631 00632 00633 inT16 Tesseract::first_alphanum_index(const char *word, 00634 const char *word_lengths) { 00635 inT16 i; 00636 inT16 offset; 00637 00638 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00639 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00640 unicharset.get_isdigit(word + offset, word_lengths[i])) 00641 return i; 00642 } 00643 return -1; 00644 } 00645 00646 inT16 Tesseract::first_alphanum_offset(const char *word, 00647 const char *word_lengths) { 00648 inT16 i; 00649 inT16 offset; 00650 00651 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00652 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00653 unicharset.get_isdigit(word + offset, word_lengths[i])) 00654 return offset; 00655 } 00656 return -1; 00657 } 00658 00659 inT16 Tesseract::alpha_count(const char *word, 00660 const char *word_lengths) { 00661 inT16 i; 00662 inT16 offset; 00663 inT16 count = 0; 00664 00665 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00666 if (unicharset.get_isalpha (word + offset, word_lengths[i])) 00667 count++; 00668 } 00669 return count; 00670 } 00671 00672 00673 BOOL8 Tesseract::word_contains_non_1_digit(const char *word, 00674 const char *word_lengths) { 00675 inT16 i; 00676 inT16 offset; 00677 00678 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00679 if (unicharset.get_isdigit (word + offset, word_lengths[i]) && 00680 (word_lengths[i] != 1 || word[offset] != '1')) 00681 return TRUE; 00682 } 00683 return FALSE; 00684 } 00685 00686 00687 BOOL8 Tesseract::test_ambig_word( //test for ambiguity 00688 WERD_RES *word) { 00689 BOOL8 ambig = FALSE; 00690 00691 if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || 00692 (word->best_choice->permuter () == FREQ_DAWG_PERM) || 00693 (word->best_choice->permuter () == USER_DAWG_PERM)) { 00694 ambig = !getDict().NoDangerousAmbig( 00695 word->best_choice, NULL, false, NULL, NULL); 00696 } 00697 return ambig; 00698 } 00699 00700 00701 /************************************************************************* 00702 * dont_allow_1Il() 00703 * Dont unreject LONE accepted 1Il conflict set chars 00704 *************************************************************************/ 00705 void Tesseract::dont_allow_1Il(WERD_RES *word) { 00706 int i = 0; 00707 int offset; 00708 int word_len = word->reject_map.length(); 00709 const char *s = word->best_choice->unichar_string().string(); 00710 const char *lengths = word->best_choice->unichar_lengths().string(); 00711 BOOL8 accepted_1Il = FALSE; 00712 00713 for (i = 0, offset = 0; i < word_len; 00714 offset += word->best_choice->unichar_lengths()[i++]) { 00715 if (word->reject_map[i].accepted()) { 00716 if (STRING(conflict_set_I_l_1).contains(s[offset])) { 00717 accepted_1Il = TRUE; 00718 } else { 00719 if (word->uch_set->get_isalpha(s + offset, lengths[i]) || 00720 word->uch_set->get_isdigit(s + offset, lengths[i])) 00721 return; // >=1 non 1Il ch accepted 00722 } 00723 } 00724 } 00725 if (!accepted_1Il) 00726 return; //Nothing to worry about 00727 00728 for (i = 0, offset = 0; i < word_len; 00729 offset += word->best_choice->unichar_lengths()[i++]) { 00730 if (STRING(conflict_set_I_l_1).contains(s[offset]) && 00731 word->reject_map[i].accepted()) 00732 word->reject_map[i].setrej_postNN_1Il(); 00733 } 00734 } 00735 00736 00737 inT16 Tesseract::count_alphanums(WERD_RES *word_res) { 00738 int count = 0; 00739 const WERD_CHOICE *best_choice = word_res->best_choice; 00740 for (int i = 0; i < word_res->reject_map.length(); ++i) { 00741 if ((word_res->reject_map[i].accepted()) && 00742 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || 00743 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { 00744 count++; 00745 } 00746 } 00747 return count; 00748 } 00749 00750 00751 // reject all if most rejected. 00752 void Tesseract::reject_mostly_rejects(WERD_RES *word) { 00753 /* Reject the whole of the word if the fraction of rejects exceeds a limit */ 00754 00755 if ((float) word->reject_map.reject_count() / word->reject_map.length() >= 00756 rej_whole_of_mostly_reject_word_fract) 00757 word->reject_map.rej_word_mostly_rej(); 00758 } 00759 00760 00761 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { 00762 inT16 char_quality; 00763 inT16 accepted_char_quality; 00764 00765 if (word->best_choice->unichar_lengths().length() <= 1) 00766 return FALSE; 00767 00768 if (!STRING(ok_repeated_ch_non_alphanum_wds). 00769 contains(word->best_choice->unichar_string()[0])) 00770 return FALSE; 00771 00772 UNICHAR_ID uch_id = word->best_choice->unichar_id(0); 00773 for (int i = 1; i < word->best_choice->length(); ++i) { 00774 if (word->best_choice->unichar_id(i) != uch_id) return FALSE; 00775 } 00776 00777 word_char_quality(word, row, &char_quality, &accepted_char_quality); 00778 00779 if ((word->best_choice->unichar_lengths().length () == char_quality) && 00780 (char_quality == accepted_char_quality)) 00781 return TRUE; 00782 else 00783 return FALSE; 00784 } 00785 00786 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) { 00787 const WERD_CHOICE &word = *werd_res->best_choice; 00788 int dict_word_type = werd_res->tesseract->dict_word(word); 00789 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; 00790 } 00791 00792 // Note: After running this function word_res->best_choice->blob_choices() 00793 // might not contain the right BLOB_CHOICE coresponding to each character 00794 // in word_res->best_choice. However, the length of blob_choices and 00795 // word_res->best_choice will remain the same. 00796 void Tesseract::flip_hyphens(WERD_RES *word_res) { 00797 WERD_CHOICE *best_choice = word_res->best_choice; 00798 int i; 00799 int prev_right = -9999; 00800 int next_left; 00801 TBOX out_box; 00802 float aspect_ratio; 00803 00804 if (tessedit_lower_flip_hyphen <= 1) 00805 return; 00806 00807 TBLOB* blob = word_res->rebuild_word->blobs; 00808 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); 00809 bool modified = false; 00810 for (i = 0; i < best_choice->length() && blob != NULL; ++i, 00811 blob = blob->next) { 00812 out_box = blob->bounding_box(); 00813 if (blob->next == NULL) 00814 next_left = 9999; 00815 else 00816 next_left = blob->next->bounding_box().left(); 00817 // Dont touch small or touching blobs - it is too dangerous. 00818 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && 00819 (out_box.left() > prev_right) && (out_box.right() < next_left)) { 00820 aspect_ratio = out_box.width() / (float) out_box.height(); 00821 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { 00822 if (aspect_ratio >= tessedit_upper_flip_hyphen && 00823 word_res->uch_set->contains_unichar_id(unichar_dash) && 00824 word_res->uch_set->get_enabled(unichar_dash)) { 00825 /* Certain HYPHEN */ 00826 best_choice->set_unichar_id(unichar_dash, i); 00827 modified = true; 00828 if (word_res->reject_map[i].rejected()) 00829 word_res->reject_map[i].setrej_hyphen_accept(); 00830 } 00831 if ((aspect_ratio > tessedit_lower_flip_hyphen) && 00832 word_res->reject_map[i].accepted()) 00833 //Suspected HYPHEN 00834 word_res->reject_map[i].setrej_hyphen (); 00835 } 00836 else if (best_choice->unichar_id(i) == unichar_dash) { 00837 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && 00838 (word_res->reject_map[i].rejected())) 00839 word_res->reject_map[i].setrej_hyphen_accept(); 00840 //Certain HYPHEN 00841 00842 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && 00843 (word_res->reject_map[i].accepted())) 00844 //Suspected HYPHEN 00845 word_res->reject_map[i].setrej_hyphen(); 00846 } 00847 } 00848 prev_right = out_box.right(); 00849 } 00850 } 00851 00852 // Note: After running this function word_res->best_choice->blob_choices() 00853 // might not contain the right BLOB_CHOICE coresponding to each character 00854 // in word_res->best_choice. However, the length of blob_choices and 00855 // word_res->best_choice will remain the same. 00856 void Tesseract::flip_0O(WERD_RES *word_res) { 00857 WERD_CHOICE *best_choice = word_res->best_choice; 00858 int i; 00859 TBOX out_box; 00860 00861 if (!tessedit_flip_0O) 00862 return; 00863 00864 TBLOB* blob = word_res->rebuild_word->blobs; 00865 for (i = 0; i < best_choice->length() && blob != NULL; ++i, 00866 blob = blob->next) { 00867 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || 00868 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { 00869 out_box = blob->bounding_box(); 00870 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || 00871 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) 00872 return; //Beware words with sub/superscripts 00873 } 00874 } 00875 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); 00876 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); 00877 if (unichar_0 == INVALID_UNICHAR_ID || 00878 !word_res->uch_set->get_enabled(unichar_0) || 00879 unichar_O == INVALID_UNICHAR_ID || 00880 !word_res->uch_set->get_enabled(unichar_O)) { 00881 return; // 0 or O are not present/enabled in unicharset 00882 } 00883 bool modified = false; 00884 for (i = 1; i < best_choice->length(); ++i) { 00885 if (best_choice->unichar_id(i) == unichar_0 || 00886 best_choice->unichar_id(i) == unichar_O) { 00887 /* A0A */ 00888 if ((i+1) < best_choice->length() && 00889 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00890 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) { 00891 best_choice->set_unichar_id(unichar_O, i); 00892 modified = true; 00893 } 00894 /* A00A */ 00895 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00896 (i+1) < best_choice->length() && 00897 (best_choice->unichar_id(i+1) == unichar_0 || 00898 best_choice->unichar_id(i+1) == unichar_O) && 00899 (i+2) < best_choice->length() && 00900 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) { 00901 best_choice->set_unichar_id(unichar_O, i); 00902 modified = true; 00903 i++; 00904 } 00905 /* AA0<non digit or end of word> */ 00906 if ((i > 1) && 00907 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) && 00908 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00909 (((i+1) < best_choice->length() && 00910 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) && 00911 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") && 00912 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) || 00913 (i == best_choice->length() - 1))) { 00914 best_choice->set_unichar_id(unichar_O, i); 00915 modified = true; 00916 } 00917 /* 9O9 */ 00918 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00919 (i+1) < best_choice->length() && 00920 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) { 00921 best_choice->set_unichar_id(unichar_0, i); 00922 modified = true; 00923 } 00924 /* 9OOO */ 00925 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00926 (i+2) < best_choice->length() && 00927 (best_choice->unichar_id(i+1) == unichar_0 || 00928 best_choice->unichar_id(i+1) == unichar_O) && 00929 (best_choice->unichar_id(i+2) == unichar_0 || 00930 best_choice->unichar_id(i+2) == unichar_O)) { 00931 best_choice->set_unichar_id(unichar_0, i); 00932 best_choice->set_unichar_id(unichar_0, i+1); 00933 best_choice->set_unichar_id(unichar_0, i+2); 00934 modified = true; 00935 i += 2; 00936 } 00937 /* 9OO<non upper> */ 00938 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00939 (i+2) < best_choice->length() && 00940 (best_choice->unichar_id(i+1) == unichar_0 || 00941 best_choice->unichar_id(i+1) == unichar_O) && 00942 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) { 00943 best_choice->set_unichar_id(unichar_0, i); 00944 best_choice->set_unichar_id(unichar_0, i+1); 00945 modified = true; 00946 i++; 00947 } 00948 /* 9O<non upper> */ 00949 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00950 (i+1) < best_choice->length() && 00951 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) { 00952 best_choice->set_unichar_id(unichar_0, i); 00953 } 00954 /* 9[.,]OOO.. */ 00955 if ((i > 1) && 00956 (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") || 00957 word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) && 00958 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) || 00959 best_choice->unichar_id(i-2) == unichar_O)) { 00960 if (best_choice->unichar_id(i-2) == unichar_O) { 00961 best_choice->set_unichar_id(unichar_0, i-2); 00962 modified = true; 00963 } 00964 while (i < best_choice->length() && 00965 (best_choice->unichar_id(i) == unichar_O || 00966 best_choice->unichar_id(i) == unichar_0)) { 00967 best_choice->set_unichar_id(unichar_0, i); 00968 modified = true; 00969 i++; 00970 } 00971 i--; 00972 } 00973 } 00974 } 00975 } 00976 00977 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) { 00978 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O"); 00979 } 00980 00981 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) { 00982 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0"); 00983 } 00984 } // namespace tesseract