Tesseract
3.02
|
00001 /****************************************************************** 00002 * File: docqual.cpp (Formerly docqual.c) 00003 * Description: Document Quality Metrics 00004 * Author: Phil Cheatle 00005 * Created: Mon May 9 11:27:28 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "mfcpch.h" 00025 #include <ctype.h> 00026 #include "docqual.h" 00027 #include "tfacep.h" 00028 #include "reject.h" 00029 #include "tesscallback.h" 00030 #include "tessvars.h" 00031 #include "secname.h" 00032 #include "globals.h" 00033 #include "tesseractclass.h" 00034 00035 namespace tesseract{ 00036 00037 // A little class to provide the callbacks as we have no pre-bound args. 00038 struct DocQualCallbacks { 00039 explicit DocQualCallbacks(WERD_RES* word0) 00040 : word(word0), match_count(0), accepted_match_count(0) {} 00041 00042 void CountMatchingBlobs(int index) { 00043 ++match_count; 00044 } 00045 00046 void CountAcceptedBlobs(int index) { 00047 if (word->reject_map[index].accepted()) 00048 ++accepted_match_count; 00049 ++match_count; 00050 } 00051 00052 void AcceptIfGoodQuality(int index) { 00053 if (word->reject_map[index].accept_if_good_quality()) 00054 word->reject_map[index].setrej_quality_accept(); 00055 } 00056 00057 WERD_RES* word; 00058 inT16 match_count; 00059 inT16 accepted_match_count; 00060 }; 00061 00062 /************************************************************************* 00063 * word_blob_quality() 00064 * How many blobs in the box_word are identical to those of the inword? 00065 * ASSUME blobs in both initial word and box_word are in ascending order of 00066 * left hand blob edge. 00067 *************************************************************************/ 00068 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) { 00069 if (word->bln_boxes == NULL || 00070 word->rebuild_word == NULL || word->rebuild_word->blobs == NULL) 00071 return 0; 00072 00073 DocQualCallbacks cb(word); 00074 word->bln_boxes->ProcessMatchedBlobs( 00075 *word->rebuild_word, 00076 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs)); 00077 return cb.match_count; 00078 } 00079 00080 inT16 Tesseract::word_outline_errs(WERD_RES *word) { 00081 inT16 i = 0; 00082 inT16 err_count = 0; 00083 00084 if (word->rebuild_word != NULL) { 00085 TBLOB* blob = word->rebuild_word->blobs; 00086 for (; blob != NULL; blob = blob->next) { 00087 err_count += count_outline_errs(word->best_choice->unichar_string()[i], 00088 blob->NumOutlines()); 00089 i++; 00090 } 00091 } 00092 return err_count; 00093 } 00094 00095 /************************************************************************* 00096 * word_char_quality() 00097 * Combination of blob quality and outline quality - how many good chars are 00098 * there? - I.e chars which pass the blob AND outline tests. 00099 *************************************************************************/ 00100 void Tesseract::word_char_quality(WERD_RES *word, 00101 ROW *row, 00102 inT16 *match_count, 00103 inT16 *accepted_match_count) { 00104 if (word->bln_boxes == NULL || 00105 word->rebuild_word == NULL || word->rebuild_word->blobs == NULL) 00106 return; 00107 00108 DocQualCallbacks cb(word); 00109 word->bln_boxes->ProcessMatchedBlobs( 00110 *word->rebuild_word, 00111 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs)); 00112 *match_count = cb.match_count; 00113 *accepted_match_count = cb.accepted_match_count; 00114 } 00115 00116 /************************************************************************* 00117 * unrej_good_chs() 00118 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks 00119 *************************************************************************/ 00120 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) { 00121 if (word->bln_boxes == NULL || 00122 word->rebuild_word == NULL || word->rebuild_word->blobs == NULL) 00123 return; 00124 00125 DocQualCallbacks cb(word); 00126 word->bln_boxes->ProcessMatchedBlobs( 00127 *word->rebuild_word, 00128 NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality)); 00129 } 00130 00131 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { 00132 int expected_outline_count; 00133 00134 if (STRING (outlines_odd).contains (c)) 00135 return 0; //Dont use this char 00136 else if (STRING (outlines_2).contains (c)) 00137 expected_outline_count = 2; 00138 else 00139 expected_outline_count = 1; 00140 return abs (outline_count - expected_outline_count); 00141 } 00142 00143 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, 00144 BOOL8 good_quality_doc) { 00145 if ((tessedit_good_quality_unrej && good_quality_doc)) 00146 unrej_good_quality_words(page_res_it); 00147 doc_and_block_rejection(page_res_it, good_quality_doc); 00148 if (unlv_tilde_crunching) { 00149 tilde_crunch(page_res_it); 00150 tilde_delete(page_res_it); 00151 } 00152 } 00153 00154 00155 /************************************************************************* 00156 * unrej_good_quality_words() 00157 * Accept potential rejects in words which pass the following checks: 00158 * - Contains a potential reject 00159 * - Word looks like a sensible alpha word. 00160 * - Word segmentation is the same as the original image 00161 * - All characters have the expected number of outlines 00162 * NOTE - the rejection counts are recalculated after unrejection 00163 * - CANT do it in a single pass without a bit of fiddling 00164 * - keep it simple but inefficient 00165 *************************************************************************/ 00166 void Tesseract::unrej_good_quality_words( //unreject potential 00167 PAGE_RES_IT &page_res_it) { 00168 WERD_RES *word; 00169 ROW_RES *current_row; 00170 BLOCK_RES *current_block; 00171 int i; 00172 00173 page_res_it.restart_page (); 00174 while (page_res_it.word () != NULL) { 00175 check_debug_pt (page_res_it.word (), 100); 00176 if (bland_unrej) { 00177 word = page_res_it.word (); 00178 for (i = 0; i < word->reject_map.length (); i++) { 00179 if (word->reject_map[i].accept_if_good_quality ()) 00180 word->reject_map[i].setrej_quality_accept (); 00181 } 00182 page_res_it.forward (); 00183 } 00184 else if ((page_res_it.row ()->char_count > 0) && 00185 ((page_res_it.row ()->rej_count / 00186 (float) page_res_it.row ()->char_count) <= 00187 quality_rowrej_pc)) { 00188 word = page_res_it.word (); 00189 if (word->reject_map.quality_recoverable_rejects() && 00190 (tessedit_unrej_any_wd || 00191 acceptable_word_string(*word->uch_set, 00192 word->best_choice->unichar_string().string(), 00193 word->best_choice->unichar_lengths().string()) 00194 != AC_UNACCEPTABLE)) { 00195 unrej_good_chs(word, page_res_it.row ()->row); 00196 } 00197 page_res_it.forward (); 00198 } 00199 else { 00200 /* Skip to end of dodgy row */ 00201 current_row = page_res_it.row (); 00202 while ((page_res_it.word () != NULL) && 00203 (page_res_it.row () == current_row)) 00204 page_res_it.forward (); 00205 } 00206 check_debug_pt (page_res_it.word (), 110); 00207 } 00208 page_res_it.restart_page (); 00209 page_res_it.page_res->char_count = 0; 00210 page_res_it.page_res->rej_count = 0; 00211 current_block = NULL; 00212 current_row = NULL; 00213 while (page_res_it.word () != NULL) { 00214 if (current_block != page_res_it.block ()) { 00215 current_block = page_res_it.block (); 00216 current_block->char_count = 0; 00217 current_block->rej_count = 0; 00218 } 00219 if (current_row != page_res_it.row ()) { 00220 current_row = page_res_it.row (); 00221 current_row->char_count = 0; 00222 current_row->rej_count = 0; 00223 current_row->whole_word_rej_count = 0; 00224 } 00225 page_res_it.rej_stat_word (); 00226 page_res_it.forward (); 00227 } 00228 } 00229 00230 00231 /************************************************************************* 00232 * doc_and_block_rejection() 00233 * 00234 * If the page has too many rejects - reject all of it. 00235 * If any block has too many rejects - reject all words in the block 00236 *************************************************************************/ 00237 00238 void Tesseract::doc_and_block_rejection( //reject big chunks 00239 PAGE_RES_IT &page_res_it, 00240 BOOL8 good_quality_doc) { 00241 inT16 block_no = 0; 00242 inT16 row_no = 0; 00243 BLOCK_RES *current_block; 00244 ROW_RES *current_row; 00245 00246 BOOL8 rej_word; 00247 BOOL8 prev_word_rejected; 00248 inT16 char_quality = 0; 00249 inT16 accepted_char_quality; 00250 00251 if (page_res_it.page_res->rej_count * 100.0 / 00252 page_res_it.page_res->char_count > tessedit_reject_doc_percent) { 00253 reject_whole_page(page_res_it); 00254 if (tessedit_debug_doc_rejection) { 00255 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", 00256 page_res_it.page_res->char_count, 00257 page_res_it.page_res->rej_count); 00258 } 00259 } else { 00260 if (tessedit_debug_doc_rejection) { 00261 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", 00262 page_res_it.page_res->char_count, 00263 page_res_it.page_res->rej_count); 00264 } 00265 00266 /* Walk blocks testing for block rejection */ 00267 00268 page_res_it.restart_page(); 00269 WERD_RES* word; 00270 while ((word = page_res_it.word()) != NULL) { 00271 current_block = page_res_it.block(); 00272 block_no = current_block->block->index(); 00273 if (current_block->char_count > 0 && 00274 (current_block->rej_count * 100.0 / current_block->char_count) > 00275 tessedit_reject_block_percent) { 00276 if (tessedit_debug_block_rejection) { 00277 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", 00278 block_no, current_block->char_count, 00279 current_block->rej_count); 00280 } 00281 prev_word_rejected = FALSE; 00282 while ((word = page_res_it.word()) != NULL && 00283 (page_res_it.block() == current_block)) { 00284 if (tessedit_preserve_blk_rej_perfect_wds) { 00285 rej_word = word->reject_map.reject_count() > 0 || 00286 word->reject_map.length () < tessedit_preserve_min_wd_len; 00287 if (rej_word && tessedit_dont_blkrej_good_wds && 00288 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00289 acceptable_word_string( 00290 *word->uch_set, 00291 word->best_choice->unichar_string().string(), 00292 word->best_choice->unichar_lengths().string()) != 00293 AC_UNACCEPTABLE) { 00294 word_char_quality(word, page_res_it.row()->row, 00295 &char_quality, 00296 &accepted_char_quality); 00297 rej_word = char_quality != word->reject_map.length(); 00298 } 00299 } else { 00300 rej_word = TRUE; 00301 } 00302 if (rej_word) { 00303 /* 00304 Reject spacing if both current and prev words are rejected. 00305 NOTE - this is NOT restricted to FUZZY spaces. - When tried this 00306 generated more space errors. 00307 */ 00308 if (tessedit_use_reject_spaces && 00309 prev_word_rejected && 00310 page_res_it.prev_row() == page_res_it.row() && 00311 word->word->space() == 1) 00312 word->reject_spaces = TRUE; 00313 word->reject_map.rej_word_block_rej(); 00314 } 00315 prev_word_rejected = rej_word; 00316 page_res_it.forward(); 00317 } 00318 } else { 00319 if (tessedit_debug_block_rejection) { 00320 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", 00321 block_no, page_res_it.block()->char_count, 00322 page_res_it.block()->rej_count); 00323 } 00324 00325 /* Walk rows in block testing for row rejection */ 00326 row_no = 0; 00327 while ((word = page_res_it.word()) != NULL && 00328 page_res_it.block() == current_block) { 00329 current_row = page_res_it.row(); 00330 row_no++; 00331 /* Reject whole row if: 00332 fraction of chars on row which are rejected exceed a limit AND 00333 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a 00334 limit 00335 */ 00336 if (current_row->char_count > 0 && 00337 (current_row->rej_count * 100.0 / current_row->char_count) > 00338 tessedit_reject_row_percent && 00339 (current_row->whole_word_rej_count * 100.0 / 00340 current_row->rej_count) < 00341 tessedit_whole_wd_rej_row_percent) { 00342 if (tessedit_debug_block_rejection) { 00343 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", 00344 row_no, current_row->char_count, 00345 current_row->rej_count); 00346 } 00347 prev_word_rejected = FALSE; 00348 while ((word = page_res_it.word()) != NULL && 00349 page_res_it.row () == current_row) { 00350 /* Preserve words on good docs unless they are mostly rejected*/ 00351 if (!tessedit_row_rej_good_docs && good_quality_doc) { 00352 rej_word = word->reject_map.reject_count() / 00353 static_cast<float>(word->reject_map.length()) > 00354 tessedit_good_doc_still_rowrej_wd; 00355 } else if (tessedit_preserve_row_rej_perfect_wds) { 00356 /* Preserve perfect words anyway */ 00357 rej_word = word->reject_map.reject_count() > 0 || 00358 word->reject_map.length () < tessedit_preserve_min_wd_len; 00359 if (rej_word && tessedit_dont_rowrej_good_wds && 00360 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00361 acceptable_word_string(*word->uch_set, 00362 word->best_choice->unichar_string().string(), 00363 word->best_choice->unichar_lengths().string()) != 00364 AC_UNACCEPTABLE) { 00365 word_char_quality(word, page_res_it.row()->row, 00366 &char_quality, 00367 &accepted_char_quality); 00368 rej_word = char_quality != word->reject_map.length(); 00369 } 00370 } else { 00371 rej_word = TRUE; 00372 } 00373 if (rej_word) { 00374 /* 00375 Reject spacing if both current and prev words are rejected. 00376 NOTE - this is NOT restricted to FUZZY spaces. - When tried 00377 this generated more space errors. 00378 */ 00379 if (tessedit_use_reject_spaces && 00380 prev_word_rejected && 00381 page_res_it.prev_row() == page_res_it.row() && 00382 word->word->space () == 1) 00383 word->reject_spaces = TRUE; 00384 word->reject_map.rej_word_row_rej(); 00385 } 00386 prev_word_rejected = rej_word; 00387 page_res_it.forward(); 00388 } 00389 } else { 00390 if (tessedit_debug_block_rejection) { 00391 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", 00392 row_no, current_row->char_count, current_row->rej_count); 00393 } 00394 while (page_res_it.word() != NULL && 00395 page_res_it.row() == current_row) 00396 page_res_it.forward(); 00397 } 00398 } 00399 } 00400 } 00401 } 00402 } 00403 00404 } // namespace tesseract 00405 00406 00407 /************************************************************************* 00408 * reject_whole_page() 00409 * Dont believe any of it - set the reject map to 00..00 in all words 00410 * 00411 *************************************************************************/ 00412 00413 void reject_whole_page(PAGE_RES_IT &page_res_it) { 00414 page_res_it.restart_page (); 00415 while (page_res_it.word () != NULL) { 00416 page_res_it.word ()->reject_map.rej_word_doc_rej (); 00417 page_res_it.forward (); 00418 } 00419 //whole page is rejected 00420 page_res_it.page_res->rejected = TRUE; 00421 } 00422 00423 namespace tesseract { 00424 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { 00425 WERD_RES *word; 00426 GARBAGE_LEVEL garbage_level; 00427 PAGE_RES_IT copy_it; 00428 BOOL8 prev_potential_marked = FALSE; 00429 BOOL8 found_terrible_word = FALSE; 00430 BOOL8 ok_dict_word; 00431 00432 page_res_it.restart_page(); 00433 while (page_res_it.word() != NULL) { 00434 POLY_BLOCK* pb = page_res_it.block()->block->poly_block(); 00435 if (pb != NULL && !pb->IsText()) { 00436 page_res_it.forward(); 00437 continue; 00438 } 00439 word = page_res_it.word(); 00440 00441 if (crunch_early_convert_bad_unlv_chs) 00442 convert_bad_unlv_chs(word); 00443 00444 if (crunch_early_merge_tess_fails) 00445 word->merge_tess_fails(); 00446 00447 if (word->reject_map.accept_count () != 0) { 00448 found_terrible_word = FALSE; 00449 //Forget earlier potential crunches 00450 prev_potential_marked = FALSE; 00451 } 00452 else { 00453 ok_dict_word = safe_dict_word(word); 00454 garbage_level = garbage_word (word, ok_dict_word); 00455 00456 if ((garbage_level != G_NEVER_CRUNCH) && 00457 (terrible_word_crunch (word, garbage_level))) { 00458 if (crunch_debug > 0) { 00459 tprintf ("T CRUNCHING: \"%s\"\n", 00460 word->best_choice->unichar_string().string()); 00461 } 00462 word->unlv_crunch_mode = CR_KEEP_SPACE; 00463 if (prev_potential_marked) { 00464 while (copy_it.word () != word) { 00465 if (crunch_debug > 0) { 00466 tprintf ("P1 CRUNCHING: \"%s\"\n", 00467 copy_it.word()->best_choice->unichar_string().string()); 00468 } 00469 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; 00470 copy_it.forward (); 00471 } 00472 prev_potential_marked = FALSE; 00473 } 00474 found_terrible_word = TRUE; 00475 } 00476 else if ((garbage_level != G_NEVER_CRUNCH) && 00477 (potential_word_crunch (word, 00478 garbage_level, ok_dict_word))) { 00479 if (found_terrible_word) { 00480 if (crunch_debug > 0) { 00481 tprintf ("P2 CRUNCHING: \"%s\"\n", 00482 word->best_choice->unichar_string().string()); 00483 } 00484 word->unlv_crunch_mode = CR_KEEP_SPACE; 00485 } 00486 else if (!prev_potential_marked) { 00487 copy_it = page_res_it; 00488 prev_potential_marked = TRUE; 00489 if (crunch_debug > 1) { 00490 tprintf ("P3 CRUNCHING: \"%s\"\n", 00491 word->best_choice->unichar_string().string()); 00492 } 00493 } 00494 } 00495 else { 00496 found_terrible_word = FALSE; 00497 //Forget earlier potential crunches 00498 prev_potential_marked = FALSE; 00499 if (crunch_debug > 2) { 00500 tprintf ("NO CRUNCH: \"%s\"\n", 00501 word->best_choice->unichar_string().string()); 00502 } 00503 } 00504 } 00505 page_res_it.forward (); 00506 } 00507 } 00508 00509 00510 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word, 00511 GARBAGE_LEVEL garbage_level) { 00512 float rating_per_ch; 00513 int adjusted_len; 00514 int crunch_mode = 0; 00515 00516 if ((word->best_choice->unichar_string().length () == 0) || 00517 (strspn (word->best_choice->unichar_string().string(), " ") == 00518 word->best_choice->unichar_string().length ())) 00519 crunch_mode = 1; 00520 else { 00521 adjusted_len = word->reject_map.length (); 00522 if (adjusted_len > crunch_rating_max) 00523 adjusted_len = crunch_rating_max; 00524 rating_per_ch = word->best_choice->rating () / adjusted_len; 00525 00526 if (rating_per_ch > crunch_terrible_rating) 00527 crunch_mode = 2; 00528 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) 00529 crunch_mode = 3; 00530 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && 00531 (garbage_level != G_OK)) 00532 crunch_mode = 4; 00533 else if ((rating_per_ch > crunch_poor_garbage_rate) && 00534 (garbage_level != G_OK)) 00535 crunch_mode = 5; 00536 } 00537 if (crunch_mode > 0) { 00538 if (crunch_debug > 2) { 00539 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", 00540 crunch_mode, word->best_choice->unichar_string().string()); 00541 } 00542 return TRUE; 00543 } 00544 else 00545 return FALSE; 00546 } 00547 00548 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, 00549 GARBAGE_LEVEL garbage_level, 00550 BOOL8 ok_dict_word) { 00551 float rating_per_ch; 00552 int adjusted_len; 00553 const char *str = word->best_choice->unichar_string().string(); 00554 const char *lengths = word->best_choice->unichar_lengths().string(); 00555 BOOL8 word_crunchable; 00556 int poor_indicator_count = 0; 00557 00558 word_crunchable = !crunch_leave_accept_strings || 00559 word->reject_map.length() < 3 || 00560 (acceptable_word_string(*word->uch_set, 00561 str, lengths) == AC_UNACCEPTABLE && 00562 !ok_dict_word); 00563 00564 adjusted_len = word->reject_map.length(); 00565 if (adjusted_len > 10) 00566 adjusted_len = 10; 00567 rating_per_ch = word->best_choice->rating() / adjusted_len; 00568 00569 if (rating_per_ch > crunch_pot_poor_rate) { 00570 if (crunch_debug > 2) { 00571 tprintf("Potential poor rating on \"%s\"\n", 00572 word->best_choice->unichar_string().string()); 00573 } 00574 poor_indicator_count++; 00575 } 00576 00577 if (word_crunchable && 00578 word->best_choice->certainty() < crunch_pot_poor_cert) { 00579 if (crunch_debug > 2) { 00580 tprintf("Potential poor cert on \"%s\"\n", 00581 word->best_choice->unichar_string().string()); 00582 } 00583 poor_indicator_count++; 00584 } 00585 00586 if (garbage_level != G_OK) { 00587 if (crunch_debug > 2) { 00588 tprintf("Potential garbage on \"%s\"\n", 00589 word->best_choice->unichar_string().string()); 00590 } 00591 poor_indicator_count++; 00592 } 00593 return poor_indicator_count >= crunch_pot_indicators; 00594 } 00595 00596 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { 00597 WERD_RES *word; 00598 PAGE_RES_IT copy_it; 00599 BOOL8 deleting_from_bol = FALSE; 00600 BOOL8 marked_delete_point = FALSE; 00601 inT16 debug_delete_mode; 00602 CRUNCH_MODE delete_mode; 00603 inT16 x_debug_delete_mode; 00604 CRUNCH_MODE x_delete_mode; 00605 00606 page_res_it.restart_page(); 00607 while (page_res_it.word() != NULL) { 00608 word = page_res_it.word(); 00609 00610 delete_mode = word_deletable (word, debug_delete_mode); 00611 if (delete_mode != CR_NONE) { 00612 if (word->word->flag (W_BOL) || deleting_from_bol) { 00613 if (crunch_debug > 0) { 00614 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", 00615 debug_delete_mode, 00616 word->best_choice->unichar_string().string()); 00617 } 00618 word->unlv_crunch_mode = delete_mode; 00619 deleting_from_bol = TRUE; 00620 } else if (word->word->flag(W_EOL)) { 00621 if (marked_delete_point) { 00622 while (copy_it.word() != word) { 00623 x_delete_mode = word_deletable (copy_it.word (), 00624 x_debug_delete_mode); 00625 if (crunch_debug > 0) { 00626 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00627 x_debug_delete_mode, 00628 copy_it.word()->best_choice->unichar_string().string()); 00629 } 00630 copy_it.word ()->unlv_crunch_mode = x_delete_mode; 00631 copy_it.forward (); 00632 } 00633 } 00634 if (crunch_debug > 0) { 00635 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00636 debug_delete_mode, 00637 word->best_choice->unichar_string().string()); 00638 } 00639 word->unlv_crunch_mode = delete_mode; 00640 deleting_from_bol = FALSE; 00641 marked_delete_point = FALSE; 00642 } 00643 else { 00644 if (!marked_delete_point) { 00645 copy_it = page_res_it; 00646 marked_delete_point = TRUE; 00647 } 00648 } 00649 } 00650 else { 00651 deleting_from_bol = FALSE; 00652 //Forget earlier potential crunches 00653 marked_delete_point = FALSE; 00654 } 00655 /* 00656 The following step has been left till now as the tess fails are used to 00657 determine if the word is deletable. 00658 */ 00659 if (!crunch_early_merge_tess_fails) 00660 word->merge_tess_fails(); 00661 page_res_it.forward (); 00662 } 00663 } 00664 00665 00666 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { 00667 int i; 00668 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); 00669 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); 00670 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); 00671 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); 00672 bool modified = false; 00673 for (i = 0; i < word_res->reject_map.length(); ++i) { 00674 if (word_res->best_choice->unichar_id(i) == unichar_tilde) { 00675 word_res->best_choice->set_unichar_id(unichar_dash, i); 00676 modified = true; 00677 if (word_res->reject_map[i].accepted ()) 00678 word_res->reject_map[i].setrej_unlv_rej (); 00679 } 00680 if (word_res->best_choice->unichar_id(i) == unichar_pow) { 00681 word_res->best_choice->set_unichar_id(unichar_space, i); 00682 modified = true; 00683 if (word_res->reject_map[i].accepted ()) 00684 word_res->reject_map[i].setrej_unlv_rej (); 00685 } 00686 } 00687 } 00688 00689 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { 00690 enum STATES 00691 { 00692 JUNK, 00693 FIRST_UPPER, 00694 FIRST_LOWER, 00695 FIRST_NUM, 00696 SUBSEQUENT_UPPER, 00697 SUBSEQUENT_LOWER, 00698 SUBSEQUENT_NUM 00699 }; 00700 const char *str = word->best_choice->unichar_string().string(); 00701 const char *lengths = word->best_choice->unichar_lengths().string(); 00702 STATES state = JUNK; 00703 int len = 0; 00704 int isolated_digits = 0; 00705 int isolated_alphas = 0; 00706 int bad_char_count = 0; 00707 int tess_rejs = 0; 00708 int dodgy_chars = 0; 00709 int ok_chars; 00710 UNICHAR_ID last_char = -1; 00711 int alpha_repetition_count = 0; 00712 int longest_alpha_repetition_count = 0; 00713 int longest_lower_run_len = 0; 00714 int lower_string_count = 0; 00715 int longest_upper_run_len = 0; 00716 int upper_string_count = 0; 00717 int total_alpha_count = 0; 00718 int total_digit_count = 0; 00719 00720 for (; *str != '\0'; str += *(lengths++)) { 00721 len++; 00722 if (word->uch_set->get_isupper (str, *lengths)) { 00723 total_alpha_count++; 00724 switch (state) { 00725 case SUBSEQUENT_UPPER: 00726 case FIRST_UPPER: 00727 state = SUBSEQUENT_UPPER; 00728 upper_string_count++; 00729 if (longest_upper_run_len < upper_string_count) 00730 longest_upper_run_len = upper_string_count; 00731 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00732 alpha_repetition_count++; 00733 if (longest_alpha_repetition_count < alpha_repetition_count) { 00734 longest_alpha_repetition_count = alpha_repetition_count; 00735 } 00736 } 00737 else { 00738 last_char = word->uch_set->unichar_to_id(str, *lengths); 00739 alpha_repetition_count = 1; 00740 } 00741 break; 00742 case FIRST_NUM: 00743 isolated_digits++; 00744 default: 00745 state = FIRST_UPPER; 00746 last_char = word->uch_set->unichar_to_id(str, *lengths); 00747 alpha_repetition_count = 1; 00748 upper_string_count = 1; 00749 break; 00750 } 00751 } 00752 else if (word->uch_set->get_islower (str, *lengths)) { 00753 total_alpha_count++; 00754 switch (state) { 00755 case SUBSEQUENT_LOWER: 00756 case FIRST_LOWER: 00757 state = SUBSEQUENT_LOWER; 00758 lower_string_count++; 00759 if (longest_lower_run_len < lower_string_count) 00760 longest_lower_run_len = lower_string_count; 00761 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00762 alpha_repetition_count++; 00763 if (longest_alpha_repetition_count < alpha_repetition_count) { 00764 longest_alpha_repetition_count = alpha_repetition_count; 00765 } 00766 } 00767 else { 00768 last_char = word->uch_set->unichar_to_id(str, *lengths); 00769 alpha_repetition_count = 1; 00770 } 00771 break; 00772 case FIRST_NUM: 00773 isolated_digits++; 00774 default: 00775 state = FIRST_LOWER; 00776 last_char = word->uch_set->unichar_to_id(str, *lengths); 00777 alpha_repetition_count = 1; 00778 lower_string_count = 1; 00779 break; 00780 } 00781 } 00782 else if (word->uch_set->get_isdigit (str, *lengths)) { 00783 total_digit_count++; 00784 switch (state) { 00785 case FIRST_NUM: 00786 state = SUBSEQUENT_NUM; 00787 case SUBSEQUENT_NUM: 00788 break; 00789 case FIRST_UPPER: 00790 case FIRST_LOWER: 00791 isolated_alphas++; 00792 default: 00793 state = FIRST_NUM; 00794 break; 00795 } 00796 } 00797 else { 00798 if (*lengths == 1 && *str == ' ') 00799 tess_rejs++; 00800 else 00801 bad_char_count++; 00802 switch (state) { 00803 case FIRST_NUM: 00804 isolated_digits++; 00805 break; 00806 case FIRST_UPPER: 00807 case FIRST_LOWER: 00808 isolated_alphas++; 00809 default: 00810 break; 00811 } 00812 state = JUNK; 00813 } 00814 } 00815 00816 switch (state) { 00817 case FIRST_NUM: 00818 isolated_digits++; 00819 break; 00820 case FIRST_UPPER: 00821 case FIRST_LOWER: 00822 isolated_alphas++; 00823 default: 00824 break; 00825 } 00826 00827 if (crunch_include_numerals) { 00828 total_alpha_count += total_digit_count - isolated_digits; 00829 } 00830 00831 if (crunch_leave_ok_strings && len >= 4 && 00832 2 * (total_alpha_count - isolated_alphas) > len && 00833 longest_alpha_repetition_count < crunch_long_repetitions) { 00834 if ((crunch_accept_ok && 00835 acceptable_word_string(*word->uch_set, str, lengths) != 00836 AC_UNACCEPTABLE) || 00837 longest_lower_run_len > crunch_leave_lc_strings || 00838 longest_upper_run_len > crunch_leave_uc_strings) 00839 return G_NEVER_CRUNCH; 00840 } 00841 if (word->reject_map.length() > 1 && 00842 strpbrk(str, " ") == NULL && 00843 (word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00844 word->best_choice->permuter() == FREQ_DAWG_PERM || 00845 word->best_choice->permuter() == USER_DAWG_PERM || 00846 word->best_choice->permuter() == NUMBER_PERM || 00847 acceptable_word_string(*word->uch_set, str, lengths) != 00848 AC_UNACCEPTABLE || ok_dict_word)) 00849 return G_OK; 00850 00851 ok_chars = len - bad_char_count - isolated_digits - 00852 isolated_alphas - tess_rejs; 00853 00854 if (crunch_debug > 3) { 00855 tprintf("garbage_word: \"%s\"\n", 00856 word->best_choice->unichar_string().string()); 00857 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", 00858 len, 00859 bad_char_count, isolated_digits, isolated_alphas, tess_rejs); 00860 } 00861 if (bad_char_count == 0 && 00862 tess_rejs == 0 && 00863 (len > isolated_digits + isolated_alphas || len <= 2)) 00864 return G_OK; 00865 00866 if (tess_rejs > ok_chars || 00867 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) 00868 return G_TERRIBLE; 00869 00870 if (len > 4) { 00871 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + 00872 isolated_alphas; 00873 if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5) 00874 return G_DODGY; 00875 else 00876 return G_OK; 00877 } else { 00878 dodgy_chars = 2 * tess_rejs + bad_char_count; 00879 if ((len == 4 && dodgy_chars > 2) || 00880 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) 00881 return G_DODGY; 00882 else 00883 return G_OK; 00884 } 00885 } 00886 00887 00888 /************************************************************************* 00889 * word_deletable() 00890 * DELETE WERDS AT ENDS OF ROWS IF 00891 * Word is crunched && 00892 * ( string length = 0 OR 00893 * > 50% of chars are "|" (before merging) OR 00894 * certainty < -10 OR 00895 * rating /char > 60 OR 00896 * TOP of word is more than 0.5 xht BELOW baseline OR 00897 * BOTTOM of word is more than 0.5 xht ABOVE xht OR 00898 * length of word < 3xht OR 00899 * height of word < 0.7 xht OR 00900 * height of word > 3.0 xht OR 00901 * >75% of the outline BBs have longest dimension < 0.5xht 00902 *************************************************************************/ 00903 00904 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) { 00905 int word_len = word->reject_map.length (); 00906 float rating_per_ch; 00907 TBOX box; //BB of word 00908 00909 if (word->unlv_crunch_mode == CR_NONE) { 00910 delete_mode = 0; 00911 return CR_NONE; 00912 } 00913 00914 if (word_len == 0) { 00915 delete_mode = 1; 00916 return CR_DELETE; 00917 } 00918 00919 if (word->rebuild_word != NULL) { 00920 // Cube leaves rebuild_word NULL. 00921 box = word->rebuild_word->bounding_box(); 00922 if (box.height () < crunch_del_min_ht * kBlnXHeight) { 00923 delete_mode = 4; 00924 return CR_DELETE; 00925 } 00926 00927 if (noise_outlines(word->rebuild_word)) { 00928 delete_mode = 5; 00929 return CR_DELETE; 00930 } 00931 } 00932 00933 if ((failure_count (word) * 1.5) > word_len) { 00934 delete_mode = 2; 00935 return CR_LOOSE_SPACE; 00936 } 00937 00938 if (word->best_choice->certainty () < crunch_del_cert) { 00939 delete_mode = 7; 00940 return CR_LOOSE_SPACE; 00941 } 00942 00943 rating_per_ch = word->best_choice->rating () / word_len; 00944 00945 if (rating_per_ch > crunch_del_rating) { 00946 delete_mode = 8; 00947 return CR_LOOSE_SPACE; 00948 } 00949 00950 if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) { 00951 delete_mode = 9; 00952 return CR_LOOSE_SPACE; 00953 } 00954 00955 if (box.bottom () > 00956 kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) { 00957 delete_mode = 10; 00958 return CR_LOOSE_SPACE; 00959 } 00960 00961 if (box.height () > crunch_del_max_ht * kBlnXHeight) { 00962 delete_mode = 11; 00963 return CR_LOOSE_SPACE; 00964 } 00965 00966 if (box.width () < crunch_del_min_width * kBlnXHeight) { 00967 delete_mode = 3; 00968 return CR_LOOSE_SPACE; 00969 } 00970 00971 delete_mode = 0; 00972 return CR_NONE; 00973 } 00974 00975 inT16 Tesseract::failure_count(WERD_RES *word) { 00976 const char *str = word->best_choice->unichar_string().string(); 00977 int tess_rejs = 0; 00978 00979 for (; *str != '\0'; str++) { 00980 if (*str == ' ') 00981 tess_rejs++; 00982 } 00983 return tess_rejs; 00984 } 00985 00986 00987 BOOL8 Tesseract::noise_outlines(TWERD *word) { 00988 TBOX box; // BB of outline 00989 inT16 outline_count = 0; 00990 inT16 small_outline_count = 0; 00991 inT16 max_dimension; 00992 float small_limit = kBlnXHeight * crunch_small_outlines_size; 00993 00994 for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) { 00995 for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) { 00996 outline_count++; 00997 box = ol->bounding_box(); 00998 if (box.height() > box.width()) 00999 max_dimension = box.height(); 01000 else 01001 max_dimension = box.width(); 01002 if (max_dimension < small_limit) 01003 small_outline_count++; 01004 } 01005 } 01006 return (small_outline_count >= outline_count); 01007 } 01008 } // namespace tesseract