Tesseract
3.02
|
00001 /****************************************************************** 00002 * File: fixspace.cpp (Formerly fixspace.c) 00003 * Description: Implements a pass over the page res, exploring the alternative 00004 * spacing possibilities, trying to use context to improve the 00005 * word spacing 00006 * Author: Phil Cheatle 00007 * Created: Thu Oct 21 11:38:43 BST 1993 00008 * 00009 * (C) Copyright 1993, Hewlett-Packard Ltd. 00010 ** Licensed under the Apache License, Version 2.0 (the "License"); 00011 ** you may not use this file except in compliance with the License. 00012 ** You may obtain a copy of the License at 00013 ** http://www.apache.org/licenses/LICENSE-2.0 00014 ** Unless required by applicable law or agreed to in writing, software 00015 ** distributed under the License is distributed on an "AS IS" BASIS, 00016 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 ** See the License for the specific language governing permissions and 00018 ** limitations under the License. 00019 * 00020 **********************************************************************/ 00021 00022 #include "mfcpch.h" 00023 #include <ctype.h> 00024 #include "reject.h" 00025 #include "statistc.h" 00026 #include "control.h" 00027 #include "fixspace.h" 00028 #include "genblob.h" 00029 #include "tessvars.h" 00030 #include "tessbox.h" 00031 #include "secname.h" 00032 #include "globals.h" 00033 #include "tesseractclass.h" 00034 00035 #define PERFECT_WERDS 999 00036 #define MAXSPACING 128 /*max expected spacing in pix */ 00037 00038 namespace tesseract { 00049 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, 00050 inT32 word_count, 00051 PAGE_RES *page_res) { 00052 BLOCK_RES_IT block_res_it; 00053 ROW_RES_IT row_res_it; 00054 WERD_RES_IT word_res_it_from; 00055 WERD_RES_IT word_res_it_to; 00056 WERD_RES *word_res; 00057 WERD_RES_LIST fuzzy_space_words; 00058 inT16 new_length; 00059 BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds 00060 inT32 word_index; // current word 00061 00062 block_res_it.set_to_list(&page_res->block_res_list); 00063 word_index = 0; 00064 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); 00065 block_res_it.forward()) { 00066 row_res_it.set_to_list(&block_res_it.data()->row_res_list); 00067 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); 00068 row_res_it.forward()) { 00069 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); 00070 while (!word_res_it_from.at_last()) { 00071 word_res = word_res_it_from.data(); 00072 while (!word_res_it_from.at_last() && 00073 !(word_res->combination || 00074 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || 00075 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { 00076 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, 00077 block_res_it.data()->block); 00078 word_res = word_res_it_from.forward(); 00079 word_index++; 00080 if (monitor != NULL) { 00081 monitor->ocr_alive = TRUE; 00082 monitor->progress = 90 + 5 * word_index / word_count; 00083 if (monitor->deadline_exceeded() || 00084 (monitor->cancel != NULL && 00085 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) 00086 return; 00087 } 00088 } 00089 00090 if (!word_res_it_from.at_last()) { 00091 word_res_it_to = word_res_it_from; 00092 prevent_null_wd_fixsp = 00093 word_res->word->cblob_list()->empty(); 00094 if (check_debug_pt(word_res, 60)) 00095 debug_fix_space_level.set_value(10); 00096 word_res_it_to.forward(); 00097 word_index++; 00098 if (monitor != NULL) { 00099 monitor->ocr_alive = TRUE; 00100 monitor->progress = 90 + 5 * word_index / word_count; 00101 if (monitor->deadline_exceeded() || 00102 (monitor->cancel != NULL && 00103 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) 00104 return; 00105 } 00106 while (!word_res_it_to.at_last () && 00107 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || 00108 word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { 00109 if (check_debug_pt(word_res, 60)) 00110 debug_fix_space_level.set_value(10); 00111 if (word_res->word->cblob_list()->empty()) 00112 prevent_null_wd_fixsp = TRUE; 00113 word_res = word_res_it_to.forward(); 00114 } 00115 if (check_debug_pt(word_res, 60)) 00116 debug_fix_space_level.set_value(10); 00117 if (word_res->word->cblob_list()->empty()) 00118 prevent_null_wd_fixsp = TRUE; 00119 if (prevent_null_wd_fixsp) { 00120 word_res_it_from = word_res_it_to; 00121 } else { 00122 fuzzy_space_words.assign_to_sublist(&word_res_it_from, 00123 &word_res_it_to); 00124 fix_fuzzy_space_list(fuzzy_space_words, 00125 row_res_it.data()->row, 00126 block_res_it.data()->block); 00127 new_length = fuzzy_space_words.length(); 00128 word_res_it_from.add_list_before(&fuzzy_space_words); 00129 for (; 00130 !word_res_it_from.at_last() && new_length > 0; 00131 new_length--) { 00132 word_res_it_from.forward(); 00133 } 00134 } 00135 if (test_pt) 00136 debug_fix_space_level.set_value(0); 00137 } 00138 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, 00139 block_res_it.data()->block); 00140 // Last word in row 00141 } 00142 } 00143 } 00144 } 00145 00146 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, 00147 ROW *row, 00148 BLOCK* block) { 00149 inT16 best_score; 00150 WERD_RES_LIST current_perm; 00151 inT16 current_score; 00152 BOOL8 improved = FALSE; 00153 00154 best_score = eval_word_spacing(best_perm); // default score 00155 dump_words(best_perm, best_score, 1, improved); 00156 00157 if (best_score != PERFECT_WERDS) 00158 initialise_search(best_perm, current_perm); 00159 00160 while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { 00161 match_current_words(current_perm, row, block); 00162 current_score = eval_word_spacing(current_perm); 00163 dump_words(current_perm, current_score, 2, improved); 00164 if (current_score > best_score) { 00165 best_perm.clear(); 00166 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); 00167 best_score = current_score; 00168 improved = TRUE; 00169 } 00170 if (current_score < PERFECT_WERDS) 00171 transform_to_next_perm(current_perm); 00172 } 00173 dump_words(best_perm, best_score, 3, improved); 00174 } 00175 00176 } // namespace tesseract 00177 00178 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { 00179 WERD_RES_IT src_it(&src_list); 00180 WERD_RES_IT new_it(&new_list); 00181 WERD_RES *src_wd; 00182 WERD_RES *new_wd; 00183 00184 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00185 src_wd = src_it.data(); 00186 if (!src_wd->combination) { 00187 new_wd = new WERD_RES(*src_wd); 00188 new_wd->combination = FALSE; 00189 new_wd->part_of_combo = FALSE; 00190 new_it.add_after_then_move(new_wd); 00191 } 00192 } 00193 } 00194 00195 00196 namespace tesseract { 00197 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, 00198 BLOCK* block) { 00199 WERD_RES_IT word_it(&words); 00200 WERD_RES *word; 00201 // Since we are not using PAGE_RES to iterate over words, we need to update 00202 // prev_word_best_choice_ before calling classify_word_pass2(). 00203 prev_word_best_choice_ = NULL; 00204 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00205 word = word_it.data(); 00206 if ((!word->part_of_combo) && (word->box_word == NULL)) { 00207 classify_word_and_language(&Tesseract::classify_word_pass2, 00208 block, row, word); 00209 } 00210 prev_word_best_choice_ = word->best_choice; 00211 } 00212 } 00213 00214 00240 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { 00241 WERD_RES_IT word_res_it(&word_res_list); 00242 inT16 total_score = 0; 00243 inT16 word_count = 0; 00244 inT16 done_word_count = 0; 00245 inT16 word_len; 00246 inT16 i; 00247 inT16 offset; 00248 WERD_RES *word; // current word 00249 inT16 prev_word_score = 0; 00250 BOOL8 prev_word_done = FALSE; 00251 BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"? 00252 BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0 00253 BOOL8 current_char_1 = FALSE; 00254 BOOL8 current_word_ok_so_far; 00255 STRING punct_chars = "!\"`',.:;"; 00256 BOOL8 prev_char_punct = FALSE; 00257 BOOL8 current_char_punct = FALSE; 00258 BOOL8 word_done = FALSE; 00259 00260 do { 00261 word = word_res_it.data(); 00262 word_done = fixspace_thinks_word_done(word); 00263 word_count++; 00264 if (word->tess_failed) { 00265 total_score += prev_word_score; 00266 if (prev_word_done) 00267 done_word_count++; 00268 prev_word_score = 0; 00269 prev_char_1 = FALSE; 00270 prev_char_digit = FALSE; 00271 prev_word_done = FALSE; 00272 } else { 00273 /* 00274 Can we add the prev word score and potentially count this word? 00275 Yes IF it didnt end in a 1 when the first char of this word is a digit 00276 AND it didnt end in a digit when the first char of this word is a 1 00277 */ 00278 word_len = word->reject_map.length(); 00279 current_word_ok_so_far = FALSE; 00280 if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) || 00281 (prev_char_digit && ( 00282 (word_done && 00283 word->best_choice->unichar_lengths().string()[0] == 1 && 00284 word->best_choice->unichar_string()[0] == '1') || 00285 (!word_done && STRING(conflict_set_I_l_1).contains( 00286 word->best_choice->unichar_string()[0])))))) { 00287 total_score += prev_word_score; 00288 if (prev_word_done) 00289 done_word_count++; 00290 current_word_ok_so_far = word_done; 00291 } 00292 00293 if (current_word_ok_so_far) { 00294 prev_word_done = TRUE; 00295 prev_word_score = word_len; 00296 } else { 00297 prev_word_done = FALSE; 00298 prev_word_score = 0; 00299 } 00300 00301 /* Add 1 to total score for every joined 1 regardless of context and 00302 rejtn */ 00303 for (i = 0, prev_char_1 = FALSE; i < word_len; i++) { 00304 current_char_1 = word->best_choice->unichar_string()[i] == '1'; 00305 if (prev_char_1 || (current_char_1 && (i > 0))) 00306 total_score++; 00307 prev_char_1 = current_char_1; 00308 } 00309 00310 /* Add 1 to total score for every joined punctuation regardless of context 00311 and rejtn */ 00312 if (tessedit_prefer_joined_punct) { 00313 for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len; 00314 offset += word->best_choice->unichar_lengths()[i++]) { 00315 current_char_punct = 00316 punct_chars.contains(word->best_choice->unichar_string()[offset]); 00317 if (prev_char_punct || (current_char_punct && i > 0)) 00318 total_score++; 00319 prev_char_punct = current_char_punct; 00320 } 00321 } 00322 prev_char_digit = digit_or_numeric_punct(word, word_len - 1); 00323 for (i = 0, offset = 0; i < word_len - 1; 00324 offset += word->best_choice->unichar_lengths()[i++]); 00325 prev_char_1 = 00326 ((word_done && (word->best_choice->unichar_string()[offset] == '1')) 00327 || (!word_done && STRING(conflict_set_I_l_1).contains( 00328 word->best_choice->unichar_string()[offset]))); 00329 } 00330 /* Find next word */ 00331 do { 00332 word_res_it.forward(); 00333 } while (word_res_it.data()->part_of_combo); 00334 } while (!word_res_it.at_first()); 00335 total_score += prev_word_score; 00336 if (prev_word_done) 00337 done_word_count++; 00338 if (done_word_count == word_count) 00339 return PERFECT_WERDS; 00340 else 00341 return total_score; 00342 } 00343 00344 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { 00345 int i; 00346 int offset; 00347 00348 for (i = 0, offset = 0; i < char_position; 00349 offset += word->best_choice->unichar_lengths()[i++]); 00350 return ( 00351 word->uch_set->get_isdigit( 00352 word->best_choice->unichar_string().string() + offset, 00353 word->best_choice->unichar_lengths()[i]) || 00354 (word->best_choice->permuter() == NUMBER_PERM && 00355 STRING(numeric_punctuation).contains( 00356 word->best_choice->unichar_string().string()[offset]))); 00357 } 00358 00359 } // namespace tesseract 00360 00361 00373 void transform_to_next_perm(WERD_RES_LIST &words) { 00374 WERD_RES_IT word_it(&words); 00375 WERD_RES_IT prev_word_it(&words); 00376 WERD_RES *word; 00377 WERD_RES *prev_word; 00378 WERD_RES *combo; 00379 WERD *copy_word; 00380 inT16 prev_right = -MAX_INT16; 00381 TBOX box; 00382 inT16 gap; 00383 inT16 min_gap = MAX_INT16; 00384 00385 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00386 word = word_it.data(); 00387 if (!word->part_of_combo) { 00388 box = word->word->bounding_box(); 00389 if (prev_right > -MAX_INT16) { 00390 gap = box.left() - prev_right; 00391 if (gap < min_gap) 00392 min_gap = gap; 00393 } 00394 prev_right = box.right(); 00395 } 00396 } 00397 if (min_gap < MAX_INT16) { 00398 prev_right = -MAX_INT16; // back to start 00399 word_it.set_to_list(&words); 00400 // Note: we can't use cycle_pt due to inserted combos at start of list. 00401 for (; (prev_right == -MAX_INT16) || !word_it.at_first(); 00402 word_it.forward()) { 00403 word = word_it.data(); 00404 if (!word->part_of_combo) { 00405 box = word->word->bounding_box(); 00406 if (prev_right > -MAX_INT16) { 00407 gap = box.left() - prev_right; 00408 if (gap <= min_gap) { 00409 prev_word = prev_word_it.data(); 00410 if (prev_word->combination) { 00411 combo = prev_word; 00412 } else { 00413 /* Make a new combination and insert before 00414 * the first word being joined. */ 00415 copy_word = new WERD; 00416 *copy_word = *(prev_word->word); 00417 // deep copy 00418 combo = new WERD_RES(copy_word); 00419 combo->combination = TRUE; 00420 combo->x_height = prev_word->x_height; 00421 prev_word->part_of_combo = TRUE; 00422 prev_word_it.add_before_then_move(combo); 00423 } 00424 combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); 00425 if (word->combination) { 00426 combo->word->join_on(word->word); 00427 // Move blobs to combo 00428 // old combo no longer needed 00429 delete word_it.extract(); 00430 } else { 00431 // Copy current wd to combo 00432 combo->copy_on(word); 00433 word->part_of_combo = TRUE; 00434 } 00435 combo->done = FALSE; 00436 combo->ClearResults(); 00437 } else { 00438 prev_word_it = word_it; // catch up 00439 } 00440 } 00441 prev_right = box.right(); 00442 } 00443 } 00444 } else { 00445 words.clear(); // signal termination 00446 } 00447 } 00448 00449 namespace tesseract { 00450 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score, 00451 inT16 mode, BOOL8 improved) { 00452 WERD_RES_IT word_res_it(&perm); 00453 00454 if (debug_fix_space_level > 0) { 00455 if (mode == 1) { 00456 stats_.dump_words_str = ""; 00457 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00458 word_res_it.forward()) { 00459 if (!word_res_it.data()->part_of_combo) { 00460 stats_.dump_words_str += 00461 word_res_it.data()->best_choice->unichar_string(); 00462 stats_.dump_words_str += ' '; 00463 } 00464 } 00465 } 00466 00467 #ifndef SECURE_NAMES 00468 if (debug_fix_space_level > 1) { 00469 switch (mode) { 00470 case 1: 00471 tprintf("EXTRACTED (%d): \"", score); 00472 break; 00473 case 2: 00474 tprintf("TESTED (%d): \"", score); 00475 break; 00476 case 3: 00477 tprintf("RETURNED (%d): \"", score); 00478 break; 00479 } 00480 00481 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00482 word_res_it.forward()) { 00483 if (!word_res_it.data()->part_of_combo) { 00484 tprintf("%s/%1d ", 00485 word_res_it.data()->best_choice->unichar_string().string(), 00486 (int)word_res_it.data()->best_choice->permuter()); 00487 } 00488 } 00489 tprintf("\"\n"); 00490 } else if (improved) { 00491 tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string()); 00492 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00493 word_res_it.forward()) { 00494 if (!word_res_it.data()->part_of_combo) { 00495 tprintf("%s/%1d ", 00496 word_res_it.data()->best_choice->unichar_string().string(), 00497 (int)word_res_it.data()->best_choice->permuter()); 00498 } 00499 } 00500 tprintf("\"\n"); 00501 } 00502 #endif 00503 } 00504 } 00505 00506 00515 BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) { 00516 TBOX box; 00517 inT16 prev_right = -MAX_INT16; 00518 inT16 gap; 00519 inT16 max_gap = -MAX_INT16; 00520 inT16 max_gap_count = 0; 00521 STATS gap_stats(0, MAXSPACING); 00522 BOOL8 result; 00523 const ROW *row = word->denorm.row(); 00524 float max_non_space; 00525 float normalised_max_nonspace; 00526 inT16 i = 0; 00527 inT16 offset = 0; 00528 STRING punct_chars = "\"`',.:;"; 00529 00530 for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL; 00531 blob = blob->next) { 00532 box = blob->bounding_box(); 00533 if ((prev_right > -MAX_INT16) && 00534 (!punct_chars.contains( 00535 word->best_choice->unichar_string() 00536 [offset - word->best_choice->unichar_lengths()[i - 1]]) && 00537 !punct_chars.contains( 00538 word->best_choice->unichar_string()[offset]))) { 00539 gap = box.left() - prev_right; 00540 if (gap < max_gap) { 00541 gap_stats.add(gap, 1); 00542 } else if (gap == max_gap) { 00543 max_gap_count++; 00544 } else { 00545 if (max_gap_count > 0) 00546 gap_stats.add(max_gap, max_gap_count); 00547 max_gap = gap; 00548 max_gap_count = 1; 00549 } 00550 } 00551 prev_right = box.right(); 00552 offset += word->best_choice->unichar_lengths()[i++]; 00553 } 00554 00555 max_non_space = (row->space() + 3 * row->kern()) / 4; 00556 normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height(); 00557 00558 result = ( 00559 gap_stats.get_total() == 0 || 00560 max_gap <= normalised_max_nonspace || 00561 (gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) || 00562 (gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean())); 00563 #ifndef SECURE_NAMES 00564 if ((debug_fix_space_level > 1)) { 00565 if (result) { 00566 tprintf( 00567 "ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d " 00568 "total=%d mean=%f median=%f\n", 00569 word->best_choice->unichar_string().string(), normalised_max_nonspace, 00570 max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(), 00571 gap_stats.median()); 00572 } else { 00573 tprintf( 00574 "REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d " 00575 "total=%d mean=%f median=%f\n", 00576 word->best_choice->unichar_string().string(), normalised_max_nonspace, 00577 max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(), 00578 gap_stats.median()); 00579 } 00580 } 00581 #endif 00582 00583 return result; 00584 } 00585 00586 BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) { 00587 if (word->done) 00588 return TRUE; 00589 00590 /* 00591 Use all the standard pass 2 conditions for mode 5 in set_done() in 00592 reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT 00593 CARE WHETHER WE HAVE of/at on/an etc. 00594 */ 00595 if (fixsp_done_mode > 0 && 00596 (word->tess_accepted || 00597 (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || 00598 fixsp_done_mode == 3) && 00599 (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) && 00600 ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) || 00601 (word->best_choice->permuter() == FREQ_DAWG_PERM) || 00602 (word->best_choice->permuter() == USER_DAWG_PERM) || 00603 (word->best_choice->permuter() == NUMBER_PERM))) { 00604 return TRUE; 00605 } else { 00606 return FALSE; 00607 } 00608 } 00609 00610 00618 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, 00619 BLOCK* block) { 00620 WERD_RES *word_res; 00621 WERD_RES_LIST sub_word_list; 00622 WERD_RES_IT sub_word_list_it(&sub_word_list); 00623 inT16 blob_index; 00624 inT16 new_length; 00625 float junk; 00626 00627 word_res = word_res_it.data(); 00628 if (word_res->word->flag(W_REP_CHAR) || 00629 word_res->combination || 00630 word_res->part_of_combo || 00631 !word_res->word->flag(W_DONT_CHOP)) 00632 return; 00633 00634 blob_index = worst_noise_blob(word_res, &junk); 00635 if (blob_index < 0) 00636 return; 00637 00638 if (debug_fix_space_level > 1) { 00639 tprintf("FP fixspace working on \"%s\"\n", 00640 word_res->best_choice->unichar_string().string()); 00641 } 00642 word_res->word->rej_cblob_list()->sort(c_blob_comparator); 00643 sub_word_list_it.add_after_stay_put(word_res_it.extract()); 00644 fix_noisy_space_list(sub_word_list, row, block); 00645 new_length = sub_word_list.length(); 00646 word_res_it.add_list_before(&sub_word_list); 00647 for (; !word_res_it.at_last() && new_length > 1; new_length--) { 00648 word_res_it.forward(); 00649 } 00650 } 00651 00652 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, 00653 BLOCK* block) { 00654 inT16 best_score; 00655 WERD_RES_IT best_perm_it(&best_perm); 00656 WERD_RES_LIST current_perm; 00657 WERD_RES_IT current_perm_it(¤t_perm); 00658 WERD_RES *old_word_res; 00659 WERD_RES *new_word_res; 00660 inT16 current_score; 00661 BOOL8 improved = FALSE; 00662 00663 best_score = fp_eval_word_spacing(best_perm); // default score 00664 00665 dump_words(best_perm, best_score, 1, improved); 00666 00667 new_word_res = new WERD_RES; 00668 old_word_res = best_perm_it.data(); 00669 old_word_res->combination = TRUE; // Kludge to force deep copy 00670 *new_word_res = *old_word_res; // deep copy 00671 old_word_res->combination = FALSE; // Undo kludge 00672 current_perm_it.add_to_end(new_word_res); 00673 00674 break_noisiest_blob_word(current_perm); 00675 00676 while (best_score != PERFECT_WERDS && !current_perm.empty()) { 00677 match_current_words(current_perm, row, block); 00678 current_score = fp_eval_word_spacing(current_perm); 00679 dump_words(current_perm, current_score, 2, improved); 00680 if (current_score > best_score) { 00681 best_perm.clear(); 00682 best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); 00683 best_score = current_score; 00684 improved = TRUE; 00685 } 00686 if (current_score < PERFECT_WERDS) { 00687 break_noisiest_blob_word(current_perm); 00688 } 00689 } 00690 dump_words(best_perm, best_score, 3, improved); 00691 } 00692 00693 00699 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { 00700 WERD_RES_IT word_it(&words); 00701 WERD_RES_IT worst_word_it; 00702 float worst_noise_score = 9999; 00703 int worst_blob_index = -1; // Noisiest blob of noisiest wd 00704 int blob_index; // of wds noisiest blob 00705 float noise_score; // of wds noisiest blob 00706 WERD_RES *word_res; 00707 C_BLOB_IT blob_it; 00708 C_BLOB_IT rej_cblob_it; 00709 C_BLOB_LIST new_blob_list; 00710 C_BLOB_IT new_blob_it; 00711 C_BLOB_IT new_rej_cblob_it; 00712 WERD *new_word; 00713 inT16 start_of_noise_blob; 00714 inT16 i; 00715 00716 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00717 blob_index = worst_noise_blob(word_it.data(), &noise_score); 00718 if (blob_index > -1 && worst_noise_score > noise_score) { 00719 worst_noise_score = noise_score; 00720 worst_blob_index = blob_index; 00721 worst_word_it = word_it; 00722 } 00723 } 00724 if (worst_blob_index < 0) { 00725 words.clear(); // signal termination 00726 return; 00727 } 00728 00729 /* Now split the worst_word_it */ 00730 00731 word_res = worst_word_it.data(); 00732 00733 /* Move blobs before noise blob to a new bloblist */ 00734 00735 new_blob_it.set_to_list(&new_blob_list); 00736 blob_it.set_to_list(word_res->word->cblob_list()); 00737 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { 00738 new_blob_it.add_after_then_move(blob_it.extract()); 00739 } 00740 start_of_noise_blob = blob_it.data()->bounding_box().left(); 00741 delete blob_it.extract(); // throw out noise blob 00742 00743 new_word = new WERD(&new_blob_list, word_res->word); 00744 new_word->set_flag(W_EOL, FALSE); 00745 word_res->word->set_flag(W_BOL, FALSE); 00746 word_res->word->set_blanks(1); // After break 00747 00748 new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); 00749 rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); 00750 for (; 00751 (!rej_cblob_it.empty() && 00752 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); 00753 rej_cblob_it.forward()) { 00754 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); 00755 } 00756 00757 WERD_RES* new_word_res = new WERD_RES(new_word); 00758 new_word_res->combination = TRUE; 00759 worst_word_it.add_before_then_move(new_word_res); 00760 00761 word_res->ClearResults(); 00762 } 00763 00764 inT16 Tesseract::worst_noise_blob(WERD_RES *word_res, 00765 float *worst_noise_score) { 00766 float noise_score[512]; 00767 int i; 00768 int min_noise_blob; // 1st contender 00769 int max_noise_blob; // last contender 00770 int non_noise_count; 00771 int worst_noise_blob; // Worst blob 00772 float small_limit = kBlnXHeight * fixsp_small_outlines_size; 00773 float non_noise_limit = kBlnXHeight * 0.8; 00774 00775 if (word_res->rebuild_word == NULL) 00776 return -1; // Can't handle cube words. 00777 00778 TBLOB* blob = word_res->rebuild_word->blobs; 00779 // Normalised. 00780 int blob_count = word_res->box_word->length(); 00781 ASSERT_HOST(blob_count <= 512); 00782 if (blob_count < 5) 00783 return -1; // too short to split 00784 00785 /* Get the noise scores for all blobs */ 00786 00787 #ifndef SECURE_NAMES 00788 if (debug_fix_space_level > 5) 00789 tprintf("FP fixspace Noise metrics for \"%s\": ", 00790 word_res->best_choice->unichar_string().string()); 00791 #endif 00792 00793 for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) { 00794 if (word_res->reject_map[i].accepted()) 00795 noise_score[i] = non_noise_limit; 00796 else 00797 noise_score[i] = blob_noise_score(blob); 00798 00799 if (debug_fix_space_level > 5) 00800 tprintf("%1.1f ", noise_score[i]); 00801 } 00802 if (debug_fix_space_level > 5) 00803 tprintf("\n"); 00804 00805 /* Now find the worst one which is far enough away from the end of the word */ 00806 00807 non_noise_count = 0; 00808 for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) { 00809 if (noise_score[i] >= non_noise_limit) { 00810 non_noise_count++; 00811 } 00812 } 00813 if (non_noise_count < fixsp_non_noise_limit) 00814 return -1; 00815 00816 min_noise_blob = i; 00817 00818 non_noise_count = 0; 00819 for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; 00820 i--) { 00821 if (noise_score[i] >= non_noise_limit) { 00822 non_noise_count++; 00823 } 00824 } 00825 if (non_noise_count < fixsp_non_noise_limit) 00826 return -1; 00827 00828 max_noise_blob = i; 00829 00830 if (min_noise_blob > max_noise_blob) 00831 return -1; 00832 00833 *worst_noise_score = small_limit; 00834 worst_noise_blob = -1; 00835 for (i = min_noise_blob; i <= max_noise_blob; i++) { 00836 if (noise_score[i] < *worst_noise_score) { 00837 worst_noise_blob = i; 00838 *worst_noise_score = noise_score[i]; 00839 } 00840 } 00841 return worst_noise_blob; 00842 } 00843 00844 float Tesseract::blob_noise_score(TBLOB *blob) { 00845 TBOX box; // BB of outline 00846 inT16 outline_count = 0; 00847 inT16 max_dimension; 00848 inT16 largest_outline_dimension = 0; 00849 00850 for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) { 00851 outline_count++; 00852 box = ol->bounding_box(); 00853 if (box.height() > box.width()) { 00854 max_dimension = box.height(); 00855 } else { 00856 max_dimension = box.width(); 00857 } 00858 00859 if (largest_outline_dimension < max_dimension) 00860 largest_outline_dimension = max_dimension; 00861 } 00862 00863 if (outline_count > 5) { 00864 // penalise LOTS of blobs 00865 largest_outline_dimension *= 2; 00866 } 00867 00868 box = blob->bounding_box(); 00869 if (box.bottom() > kBlnBaselineOffset * 4 || 00870 box.top() < kBlnBaselineOffset / 2) { 00871 // Lax blob is if high or low 00872 largest_outline_dimension /= 2; 00873 } 00874 00875 return largest_outline_dimension; 00876 } 00877 } // namespace tesseract 00878 00879 void fixspace_dbg(WERD_RES *word) { 00880 TBOX box = word->word->bounding_box(); 00881 BOOL8 show_map_detail = FALSE; 00882 inT16 i; 00883 00884 box.print(); 00885 tprintf(" \"%s\" ", word->best_choice->unichar_string().string()); 00886 tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", 00887 word->word->cblob_list()->length(), 00888 word->rebuild_word->NumBlobs(), 00889 word->box_word->length()); 00890 word->reject_map.print(debug_fp); 00891 tprintf("\n"); 00892 if (show_map_detail) { 00893 tprintf("\"%s\"\n", word->best_choice->unichar_string().string()); 00894 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 00895 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 00896 word->reject_map[i].full_print(debug_fp); 00897 } 00898 } 00899 00900 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 00901 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 00902 } 00903 00904 00913 namespace tesseract { 00914 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { 00915 WERD_RES_IT word_it(&word_res_list); 00916 WERD_RES *word; 00917 inT16 word_length; 00918 inT16 score = 0; 00919 inT16 i; 00920 float small_limit = kBlnXHeight * fixsp_small_outlines_size; 00921 00922 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00923 word = word_it.data(); 00924 if (word->rebuild_word == NULL) 00925 continue; // Can't handle cube words. 00926 word_length = word->reject_map.length(); 00927 if (word->done || 00928 word->tess_accepted || 00929 word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00930 word->best_choice->permuter() == FREQ_DAWG_PERM || 00931 word->best_choice->permuter() == USER_DAWG_PERM || 00932 safe_dict_word(word) > 0) { 00933 TBLOB* blob = word->rebuild_word->blobs; 00934 UNICHAR_ID space = word->uch_set->unichar_to_id(" "); 00935 for (i = 0; i < word->best_choice->length() && blob != NULL; 00936 ++i, blob = blob->next) { 00937 if (word->best_choice->unichar_id(i) == space || 00938 blob_noise_score(blob) < small_limit) { 00939 score -= 1; // penalise possibly erroneous non-space 00940 } else if (word->reject_map[i].accepted()) { 00941 score++; 00942 } 00943 } 00944 } 00945 } 00946 if (score < 0) 00947 score = 0; 00948 return score; 00949 } 00950 00951 } // namespace tesseract