Tesseract
3.02
|
00001 /****************************************************************** 00002 * File: control.cpp (Formerly control.c) 00003 * Description: Module-independent matcher controller. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 11:09:58 BST 1992 00006 * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle 00007 * 00008 * (C) Copyright 1992, Hewlett-Packard Ltd. 00009 ** Licensed under the Apache License, Version 2.0 (the "License"); 00010 ** you may not use this file except in compliance with the License. 00011 ** You may obtain a copy of the License at 00012 ** http://www.apache.org/licenses/LICENSE-2.0 00013 ** Unless required by applicable law or agreed to in writing, software 00014 ** distributed under the License is distributed on an "AS IS" BASIS, 00015 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 ** See the License for the specific language governing permissions and 00017 ** limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #include "mfcpch.h" 00022 00023 #include <string.h> 00024 #include <math.h> 00025 #ifdef __UNIX__ 00026 #include <assert.h> 00027 #include <unistd.h> 00028 #include <errno.h> 00029 #endif 00030 #include <ctype.h> 00031 #include "ocrclass.h" 00032 #include "werdit.h" 00033 #include "drawfx.h" 00034 #include "tfacep.h" 00035 #include "tessbox.h" 00036 #include "tessvars.h" 00037 #include "pgedit.h" 00038 #include "reject.h" 00039 #include "fixspace.h" 00040 #include "docqual.h" 00041 #include "control.h" 00042 #include "secname.h" 00043 #include "output.h" 00044 #include "callcpp.h" 00045 #include "notdll.h" 00046 #include "globals.h" 00047 #include "sorthelper.h" 00048 #include "tesseractclass.h" 00049 00050 // Include automatically generated configuration file if running autoconf. 00051 #ifdef HAVE_CONFIG_H 00052 #include "config_auto.h" 00053 #endif 00054 00055 #define MIN_FONT_ROW_COUNT 8 00056 #define MAX_XHEIGHT_DIFF 3 00057 00058 const char* const kBackUpConfigFile = "tempconfigdata.config"; 00059 // Multiple of x-height to make a repeated word have spaces in it. 00060 const double kRepcharGapThreshold = 0.5; 00061 00062 00071 namespace tesseract { 00072 void Tesseract::recog_pseudo_word(PAGE_RES* page_res, 00073 TBOX &selection_box) { 00074 WERD *word; 00075 ROW *pseudo_row; // row of word 00076 BLOCK *pseudo_block; // block of word 00077 00078 word = make_pseudo_word(page_res, selection_box, 00079 pseudo_block, pseudo_row); 00080 if (word != NULL) { 00081 WERD_RES word_res(word); 00082 recog_interactive(pseudo_block, pseudo_row, &word_res); 00083 delete word; 00084 } 00085 } 00086 00087 00097 BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { 00098 inT16 char_qual; 00099 inT16 good_char_qual; 00100 00101 classify_word_and_language(&Tesseract::classify_word_pass2, 00102 block, row, word_res); 00103 if (tessedit_debug_quality_metrics) { 00104 word_char_quality(word_res, row, &char_qual, &good_char_qual); 00105 tprintf 00106 ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n", 00107 word_res->reject_map.length(), word_blob_quality(word_res, row), 00108 word_outline_errs(word_res), char_qual, good_char_qual); 00109 } 00110 return TRUE; 00111 } 00112 00113 // Helper function to check for a target word and handle it appropriately. 00114 // Inspired by Jetsoft's requirement to process only single words on pass2 00115 // and beyond. 00116 // If word_config is not null: 00117 // If the word_box and target_word_box overlap, read the word_config file 00118 // else reset to previous config data. 00119 // return true. 00120 // else 00121 // If the word_box and target_word_box overlap or pass <= 1, return true. 00122 // Note that this function uses a fixed temporary file for storing the previous 00123 // configs, so it is neither thread-safe, nor process-safe, but the assumption 00124 // is that it will only be used for one debug window at a time. 00125 // 00126 // Since this function is used for debugging (and not to change OCR results) 00127 // set only debug params from the word config file. 00128 bool Tesseract::ProcessTargetWord(const TBOX& word_box, 00129 const TBOX& target_word_box, 00130 const char* word_config, 00131 int pass) { 00132 if (word_config != NULL) { 00133 if (word_box.major_overlap(target_word_box)) { 00134 if (backup_config_file_ == NULL) { 00135 backup_config_file_ = kBackUpConfigFile; 00136 FILE* config_fp = fopen(backup_config_file_, "wb"); 00137 ParamUtils::PrintParams(config_fp, params()); 00138 fclose(config_fp); 00139 ParamUtils::ReadParamsFile(word_config, 00140 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00141 params()); 00142 } 00143 } else { 00144 if (backup_config_file_ != NULL) { 00145 ParamUtils::ReadParamsFile(backup_config_file_, 00146 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00147 params()); 00148 backup_config_file_ = NULL; 00149 } 00150 } 00151 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { 00152 return false; 00153 } 00154 return true; 00155 } 00156 00178 bool Tesseract::recog_all_words(PAGE_RES* page_res, 00179 ETEXT_DESC* monitor, 00180 const TBOX* target_word_box, 00181 const char* word_config, 00182 int dopasses) { 00183 PAGE_RES_IT page_res_it; 00184 inT32 word_index; // current word 00185 00186 if (tessedit_minimal_rej_pass1) { 00187 tessedit_test_adaption.set_value (TRUE); 00188 tessedit_minimal_rejection.set_value (TRUE); 00189 } 00190 00191 // Before the main recognition loop below, walk through the whole page and set 00192 // up fake words. That way, if we run out of time a user will still get the 00193 // expected best_choice and box_words out the end; they'll just be empty. 00194 page_res_it.page_res = page_res; 00195 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00196 page_res_it.forward()) { 00197 page_res_it.word()->SetupFake(unicharset); 00198 } 00199 00200 if (dopasses==0 || dopasses==1) { 00201 page_res_it.page_res=page_res; 00202 page_res_it.restart_page(); 00203 00204 // ****************** Pass 1 ******************* 00205 00206 // Clear adaptive classifier at the beginning of the page if it is full. 00207 // This is done only at the beginning of the page to ensure that the 00208 // classifier is not reset at an arbitrary point while processing the page, 00209 // which would cripple Passes 2+ if the reset happens towards the end of 00210 // Pass 1 on a page with very difficult text. 00211 // TODO(daria): preemptively clear the classifier if it is almost full. 00212 if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal(); 00213 // Now check the sub-langs as well. 00214 for (int i = 0; i < sub_langs_.size(); ++i) { 00215 if (sub_langs_[i]->AdaptiveClassifierIsFull()) 00216 sub_langs_[i]->ResetAdaptiveClassifierInternal(); 00217 } 00218 00219 stats_.word_count = 0; 00220 if (monitor != NULL) { 00221 monitor->ocr_alive = TRUE; 00222 while (page_res_it.word() != NULL) { 00223 stats_.word_count++; 00224 page_res_it.forward(); 00225 } 00226 page_res_it.restart_page(); 00227 } else { 00228 stats_.word_count = 1; 00229 } 00230 00231 word_index = 0; 00232 00233 stats_.dict_words = 0; 00234 stats_.doc_blob_quality = 0; 00235 stats_.doc_outline_errs = 0; 00236 stats_.doc_char_quality = 0; 00237 stats_.good_char_count = 0; 00238 stats_.doc_good_char_quality = 0; 00239 00240 most_recently_used_ = this; 00241 while (page_res_it.word() != NULL) { 00242 set_global_loc_code(LOC_PASS1); 00243 word_index++; 00244 if (monitor != NULL) { 00245 monitor->ocr_alive = TRUE; 00246 monitor->progress = 30 + 50 * word_index / stats_.word_count; 00247 if (monitor->deadline_exceeded() || 00248 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00249 stats_.dict_words))) 00250 return false; 00251 } 00252 if (target_word_box && 00253 !ProcessTargetWord(page_res_it.word()->word->bounding_box(), 00254 *target_word_box, word_config, 1)) { 00255 page_res_it.forward(); 00256 continue; 00257 } 00258 classify_word_and_language(&Tesseract::classify_word_pass1, 00259 page_res_it.block()->block, 00260 page_res_it.row()->row, 00261 page_res_it.word()); 00262 if (page_res_it.word()->word->flag(W_REP_CHAR)) { 00263 fix_rep_char(&page_res_it); 00264 page_res_it.forward(); 00265 continue; 00266 } 00267 if (tessedit_dump_choices) { 00268 word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); 00269 tprintf("Pass1: %s [%s]\n", 00270 page_res_it.word()->best_choice->unichar_string().string(), 00271 page_res_it.word()->best_choice->debug_string().string()); 00272 } 00273 00274 // tessedit_test_adaption enables testing of the accuracy of the 00275 // input to the adaptive classifier. 00276 if (tessedit_test_adaption && !tessedit_minimal_rejection) { 00277 if (!word_adaptable (page_res_it.word(), 00278 tessedit_test_adaption_mode)) { 00279 page_res_it.word()->reject_map.rej_word_tess_failure(); 00280 // FAKE PERM REJ 00281 } else { 00282 // Override rejection mechanisms for this word. 00283 UNICHAR_ID space = unicharset.unichar_to_id(" "); 00284 for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) { 00285 if ((page_res_it.word()->best_choice->unichar_id(i) != space) && 00286 page_res_it.word()->reject_map[i].rejected()) 00287 page_res_it.word()->reject_map[i].setrej_minimal_rej_accept(); 00288 } 00289 } 00290 } 00291 00292 // Count dict words. 00293 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) 00294 ++(stats_.dict_words); 00295 00296 // Update misadaption log (we only need to do it on pass 1, since 00297 // adaption only happens on this pass). 00298 if (page_res_it.word()->blamer_bundle != NULL && 00299 page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) { 00300 page_res->misadaption_log.push_back( 00301 page_res_it.word()->blamer_bundle->misadaption_debug); 00302 } 00303 00304 page_res_it.forward(); 00305 } 00306 } 00307 00308 if (dopasses == 1) return true; 00309 00310 // ****************** Pass 2 ******************* 00311 page_res_it.restart_page(); 00312 word_index = 0; 00313 most_recently_used_ = this; 00314 while (!tessedit_test_adaption && page_res_it.word() != NULL) { 00315 set_global_loc_code(LOC_PASS2); 00316 word_index++; 00317 if (monitor != NULL) { 00318 monitor->ocr_alive = TRUE; 00319 monitor->progress = 80 + 10 * word_index / stats_.word_count; 00320 if (monitor->deadline_exceeded() || 00321 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00322 stats_.dict_words))) 00323 return false; 00324 } 00325 00326 // changed by jetsoft 00327 // specific to its needs to extract one word when need 00328 if (target_word_box && 00329 !ProcessTargetWord(page_res_it.word()->word->bounding_box(), 00330 *target_word_box, word_config, 2)) { 00331 page_res_it.forward(); 00332 continue; 00333 } 00334 // end jetsoft 00335 00336 classify_word_and_language(&Tesseract::classify_word_pass2, 00337 page_res_it.block()->block, 00338 page_res_it.row()->row, 00339 page_res_it.word()); 00340 if (page_res_it.word()->word->flag(W_REP_CHAR) && 00341 !page_res_it.word()->done) { 00342 fix_rep_char(&page_res_it); 00343 page_res_it.forward(); 00344 continue; 00345 } 00346 if (tessedit_dump_choices) { 00347 word_dumper(NULL, page_res_it.row()->row, page_res_it.word()); 00348 tprintf("Pass2: %s [%s]\n", 00349 page_res_it.word()->best_choice->unichar_string().string(), 00350 page_res_it.word()->best_choice->debug_string().string()); 00351 } 00352 page_res_it.forward(); 00353 } 00354 00355 // The next passes can only be run if tesseract has been used, as cube 00356 // doesn't set all the necessary outputs in WERD_RES. 00357 if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || 00358 tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00359 // ****************** Pass 3 ******************* 00360 // Fix fuzzy spaces. 00361 set_global_loc_code(LOC_FUZZY_SPACE); 00362 00363 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces 00364 && !tessedit_word_for_word && !right_to_left()) 00365 fix_fuzzy_spaces(monitor, stats_.word_count, page_res); 00366 00367 // ****************** Pass 4 ******************* 00368 if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res); 00369 00370 // ****************** Pass 5,6 ******************* 00371 rejection_passes(page_res, monitor, target_word_box, word_config); 00372 00373 // ****************** Pass 7 ******************* 00374 // Cube combiner. 00375 // If cube is loaded and its combiner is present, run it. 00376 if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00377 run_cube_combiner(page_res); 00378 } 00379 00380 // ****************** Pass 8 ******************* 00381 font_recognition_pass(page_res); 00382 00383 // ****************** Pass 9 ******************* 00384 // Check the correctness of the final results. 00385 blamer_pass(page_res); 00386 } 00387 00388 if (!save_blob_choices) { 00389 // We aren't saving the blob choices so get rid of them now. 00390 // set_blob_choices() does a deep clear. 00391 page_res_it.restart_page(); 00392 while (page_res_it.word() != NULL) { 00393 WERD_RES* word = page_res_it.word(); 00394 word->best_choice->set_blob_choices(NULL); 00395 page_res_it.forward(); 00396 } 00397 } 00398 00399 // Write results pass. 00400 set_global_loc_code(LOC_WRITE_RESULTS); 00401 // This is now redundant, but retained commented so show how to obtain 00402 // bounding boxes and style information. 00403 00404 // changed by jetsoft 00405 // needed for dll to output memory structure 00406 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) 00407 output_pass(page_res_it, target_word_box); 00408 // end jetsoft 00409 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00410 static_cast<int>(tessedit_pageseg_mode)); 00411 textord_.CleanupSingleRowResult(pageseg_mode, page_res); 00412 00413 if (monitor != NULL) { 00414 monitor->progress = 100; 00415 } 00416 return true; 00417 } 00418 00419 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { 00420 PAGE_RES_IT word_it(page_res); 00421 00422 WERD_RES *w_prev = NULL; 00423 WERD_RES *w = word_it.word(); 00424 while (1) { 00425 w_prev = w; 00426 while (word_it.forward() != NULL && 00427 (!word_it.word() || word_it.word()->part_of_combo)) { 00428 // advance word_it, skipping over parts of combos 00429 } 00430 if (!word_it.word()) break; 00431 w = word_it.word(); 00432 if (!w || !w_prev || w->uch_set != w_prev->uch_set) { 00433 continue; 00434 } 00435 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { 00436 if (tessedit_bigram_debug) { 00437 tprintf("Skipping because one of the words is W_REP_CHAR\n"); 00438 } 00439 continue; 00440 } 00441 // Two words sharing the same language model, excellent! 00442 if (w->alt_choices.empty()) { 00443 if (tessedit_bigram_debug) { 00444 tprintf("Alt choices not set up for word choice: %s\n", 00445 w->best_choice->unichar_string().string()); 00446 } 00447 continue; 00448 } 00449 if (w_prev->alt_choices.empty()) { 00450 if (tessedit_bigram_debug) { 00451 tprintf("Alt choices not set up for word choice: %s\n", 00452 w_prev->best_choice->unichar_string().string()); 00453 } 00454 continue; 00455 } 00456 00457 // We saved alternate choices, excellent! 00458 GenericVector<WERD_CHOICE *> overrides_word1; 00459 GenericVector<GenericVector<int> *> overrides_word1_state; 00460 GenericVector<WERD_CHOICE *> overrides_word2; 00461 GenericVector<GenericVector<int> *> overrides_word2_state; 00462 00463 STRING orig_w1_str = w_prev->best_choice->unichar_string(); 00464 STRING orig_w2_str = w->best_choice->unichar_string(); 00465 WERD_CHOICE prev_best(w->uch_set); 00466 { 00467 int w1start, w1end; 00468 w_prev->WithoutFootnoteSpan(&w1start, &w1end); 00469 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); 00470 } 00471 WERD_CHOICE this_best(w->uch_set); 00472 { 00473 int w2start, w2end; 00474 w->WithoutFootnoteSpan(&w2start, &w2end); 00475 this_best = w->best_choice->shallow_copy(w2start, w2end); 00476 } 00477 00478 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { 00479 if (tessedit_bigram_debug) { 00480 tprintf("Top choice \"%s %s\" verified by bigram model.\n", 00481 orig_w1_str.string(), orig_w2_str.string()); 00482 } 00483 continue; 00484 } 00485 if (tessedit_bigram_debug > 2) { 00486 tprintf("Examining alt choices for \"%s %s\".\n", 00487 orig_w1_str.string(), orig_w2_str.string()); 00488 } 00489 if (tessedit_bigram_debug > 1) { 00490 if (w_prev->alt_choices.size() > 1) { 00491 print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices); 00492 } 00493 if (w->alt_choices.size() > 1) { 00494 print_word_alternates_list(w->best_choice, &w->alt_choices); 00495 } 00496 } 00497 float best_rating = 0.0; 00498 int best_idx = 0; 00499 for (int i = 0; i < w_prev->alt_choices.size(); i++) { 00500 WERD_CHOICE *p1 = w_prev->alt_choices.get(i); 00501 WERD_CHOICE strip1(w->uch_set); 00502 { 00503 int p1start, p1end; 00504 w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i), 00505 &p1start, &p1end); 00506 strip1 = p1->shallow_copy(p1start, p1end); 00507 } 00508 for (int j = 0; j < w->alt_choices.size(); j++) { 00509 WERD_CHOICE *p2 = w->alt_choices.get(j); 00510 WERD_CHOICE strip2(w->uch_set); 00511 { 00512 int p2start, p2end; 00513 w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end); 00514 strip2 = p2->shallow_copy(p2start, p2end); 00515 } 00516 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { 00517 overrides_word1.push_back(p1); 00518 overrides_word1_state.push_back(&w_prev->alt_states.get(i)); 00519 overrides_word2.push_back(p2); 00520 overrides_word2_state.push_back(&w->alt_states.get(j)); 00521 if (overrides_word1.size() == 1 || 00522 p1->rating() + p2->rating() < best_rating) { 00523 best_rating = p1->rating() + p2->rating(); 00524 best_idx = overrides_word1.size() - 1; 00525 } 00526 } 00527 } 00528 } 00529 if (overrides_word1.size() >= 1) { 00530 // Excellent, we have some bigram matches. 00531 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, 00532 *overrides_word1[best_idx]) && 00533 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, 00534 *overrides_word2[best_idx])) { 00535 if (tessedit_bigram_debug > 1) { 00536 tprintf("Top choice \"%s %s\" verified (sans case) by bigram " 00537 "model.\n", orig_w1_str.string(), orig_w2_str.string()); 00538 } 00539 continue; 00540 } 00541 STRING new_w1_str = overrides_word1[best_idx]->unichar_string(); 00542 STRING new_w2_str = overrides_word2[best_idx]->unichar_string(); 00543 if (new_w1_str != orig_w1_str) { 00544 w_prev->ReplaceBestChoice(*overrides_word1[best_idx], 00545 *overrides_word1_state[best_idx]); 00546 } 00547 if (new_w2_str != orig_w2_str) { 00548 w->ReplaceBestChoice(*overrides_word2[best_idx], 00549 *overrides_word2_state[best_idx]); 00550 } 00551 if (tessedit_bigram_debug > 0) { 00552 STRING choices_description; 00553 int num_bigram_choices 00554 = overrides_word1.size() * overrides_word2.size(); 00555 if (num_bigram_choices == 1) { 00556 choices_description = "This was the unique bigram choice."; 00557 } else { 00558 if (tessedit_bigram_debug > 1) { 00559 STRING bigrams_list; 00560 const int kMaxChoicesToPrint = 20; 00561 for (int i = 0; i < overrides_word1.size() && 00562 i < kMaxChoicesToPrint; i++) { 00563 if (i > 0) { bigrams_list += ", "; } 00564 WERD_CHOICE *p1 = overrides_word1[i]; 00565 WERD_CHOICE *p2 = overrides_word2[i]; 00566 bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); 00567 if (i == kMaxChoicesToPrint) { 00568 bigrams_list += " ..."; 00569 } 00570 } 00571 choices_description = "There were many choices: {"; 00572 choices_description += bigrams_list; 00573 choices_description += "}"; 00574 } else { 00575 choices_description.add_str_int("There were ", num_bigram_choices); 00576 choices_description += " compatible bigrams."; 00577 } 00578 } 00579 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", 00580 orig_w1_str.string(), orig_w2_str.string(), 00581 new_w1_str.string(), new_w2_str.string(), 00582 choices_description.string()); 00583 } 00584 } 00585 } 00586 } 00587 00588 void Tesseract::rejection_passes(PAGE_RES* page_res, 00589 ETEXT_DESC* monitor, 00590 const TBOX* target_word_box, 00591 const char* word_config) { 00592 PAGE_RES_IT page_res_it(page_res); 00593 // ****************** Pass 5 ******************* 00594 // Gather statistics on rejects. 00595 int word_index = 0; 00596 while (!tessedit_test_adaption && page_res_it.word() != NULL) { 00597 set_global_loc_code(LOC_MM_ADAPT); 00598 WERD_RES* word = page_res_it.word(); 00599 word_index++; 00600 if (monitor != NULL) { 00601 monitor->ocr_alive = TRUE; 00602 monitor->progress = 95 + 5 * word_index / stats_.word_count; 00603 } 00604 if (word->rebuild_word == NULL) { 00605 // Word was not processed by tesseract. 00606 page_res_it.forward(); 00607 continue; 00608 } 00609 check_debug_pt(word, 70); 00610 00611 // changed by jetsoft 00612 // specific to its needs to extract one word when need 00613 if (target_word_box && 00614 !ProcessTargetWord(word->word->bounding_box(), 00615 *target_word_box, word_config, 4)) { 00616 page_res_it.forward(); 00617 continue; 00618 } 00619 // end jetsoft 00620 00621 page_res_it.rej_stat_word(); 00622 int chars_in_word = word->reject_map.length(); 00623 int rejects_in_word = word->reject_map.reject_count(); 00624 00625 int blob_quality = word_blob_quality(word, page_res_it.row()->row); 00626 stats_.doc_blob_quality += blob_quality; 00627 int outline_errs = word_outline_errs(word); 00628 stats_.doc_outline_errs += outline_errs; 00629 inT16 all_char_quality; 00630 inT16 accepted_all_char_quality; 00631 word_char_quality(word, page_res_it.row()->row, 00632 &all_char_quality, &accepted_all_char_quality); 00633 stats_.doc_char_quality += all_char_quality; 00634 uinT8 permuter_type = word->best_choice->permuter(); 00635 if ((permuter_type == SYSTEM_DAWG_PERM) || 00636 (permuter_type == FREQ_DAWG_PERM) || 00637 (permuter_type == USER_DAWG_PERM)) { 00638 stats_.good_char_count += chars_in_word - rejects_in_word; 00639 stats_.doc_good_char_quality += accepted_all_char_quality; 00640 } 00641 check_debug_pt(word, 80); 00642 if (tessedit_reject_bad_qual_wds && 00643 (blob_quality == 0) && (outline_errs >= chars_in_word)) 00644 word->reject_map.rej_word_bad_quality(); 00645 check_debug_pt(word, 90); 00646 page_res_it.forward(); 00647 } 00648 00649 if (tessedit_debug_quality_metrics) { 00650 tprintf 00651 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" 00652 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", 00653 page_res->char_count, page_res->rej_count, 00654 page_res->rej_count / static_cast<float>(page_res->char_count), 00655 stats_.doc_blob_quality, 00656 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), 00657 stats_.doc_outline_errs, 00658 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), 00659 stats_.doc_char_quality, 00660 stats_.doc_char_quality / static_cast<float>(page_res->char_count), 00661 stats_.doc_good_char_quality, 00662 (stats_.good_char_count > 0) ? 00663 (stats_.doc_good_char_quality / 00664 static_cast<float>(stats_.good_char_count)) : 0.0); 00665 } 00666 BOOL8 good_quality_doc = 00667 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= 00668 quality_rej_pc) && 00669 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= 00670 quality_blob_pc) && 00671 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= 00672 quality_outline_pc) && 00673 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= 00674 quality_char_pc); 00675 00676 // ****************** Pass 6 ******************* 00677 // Do whole document or whole block rejection pass 00678 if (!tessedit_test_adaption) { 00679 set_global_loc_code(LOC_DOC_BLK_REJ); 00680 quality_based_rejection(page_res_it, good_quality_doc); 00681 } 00682 } 00683 00684 void Tesseract::blamer_pass(PAGE_RES* page_res) { 00685 if (!wordrec_run_blamer) return; 00686 PAGE_RES_IT page_res_it(page_res); 00687 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00688 page_res_it.forward()) { 00689 WERD_RES *word = page_res_it.word(); 00690 if (word->blamer_bundle == NULL) { 00691 word->blamer_bundle = new BlamerBundle(); 00692 word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT; 00693 word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason(); 00694 word->blamer_bundle->debug += " to blame"; 00695 } else if (word->blamer_bundle->incorrect_result_reason == 00696 IRR_NO_TRUTH) { 00697 word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", 00698 word->best_choice, wordrec_debug_blamer); 00699 } else { 00700 bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice, 00701 word->blamer_bundle->truth_text); 00702 IncorrectResultReason irr = 00703 word->blamer_bundle->incorrect_result_reason; 00704 if (irr == IRR_CORRECT && !correct) { 00705 STRING debug = "Choice is incorrect after recognition"; 00706 word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug, 00707 word->best_choice, 00708 wordrec_debug_blamer); 00709 } else if (irr != IRR_CORRECT && correct) { 00710 if (wordrec_debug_blamer) { 00711 tprintf("Corrected %s\n", word->blamer_bundle->debug.string()); 00712 } 00713 word->blamer_bundle->incorrect_result_reason = IRR_CORRECT; 00714 word->blamer_bundle->debug = ""; 00715 } 00716 } 00717 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++; 00718 } 00719 tprintf("Blame reasons:\n"); 00720 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { 00721 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName( 00722 static_cast<IncorrectResultReason>(bl)), 00723 page_res->blame_reasons[bl]); 00724 } 00725 if (page_res->misadaption_log.length() > 0) { 00726 tprintf("Misadaption log:\n"); 00727 for (int i = 0; i < page_res->misadaption_log.length(); ++i) { 00728 tprintf("%s\n", page_res->misadaption_log[i].string()); 00729 } 00730 } 00731 } 00732 00733 // Helper returns true if the new_word is better than the word, using a 00734 // simple test of better certainty AND rating (to reduce false positives 00735 // from cube) or a dictionary vs non-dictionary word. 00736 static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) { 00737 if (new_word.best_choice == NULL) { 00738 return false; // New one no good. 00739 } 00740 if (word.best_choice == NULL) { 00741 return true; // Old one no good. 00742 } 00743 if (new_word.best_choice->certainty() > word.best_choice->certainty() && 00744 new_word.best_choice->rating() < word.best_choice->rating()) { 00745 return true; // New word has better confidence. 00746 } 00747 if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) && 00748 Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) { 00749 return true; // New word is from a dictionary. 00750 } 00751 return false; // New word is no better. 00752 } 00753 00754 // Helper to recognize the word using the given (language-specific) tesseract. 00755 // Returns true if the result was better than previously. 00756 bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row, 00757 WordRecognizer recognizer) { 00758 if (classify_debug_level || cube_debug_level) { 00759 tprintf("Retrying word using lang %s, oem %d\n", 00760 lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); 00761 } 00762 // Setup a trial WERD_RES in which to classify. 00763 WERD_RES lang_word; 00764 lang_word.InitForRetryRecognition(*word); 00765 // Run the recognizer on the word. 00766 // Initial version is a bit of a hack based on better certainty and rating 00767 // (to reduce false positives from cube) or a dictionary vs non-dictionary 00768 // word. 00769 (this->*recognizer)(block, row, &lang_word); 00770 bool new_is_better = NewWordBetter(*word, lang_word); 00771 if (classify_debug_level || cube_debug_level) { 00772 if (lang_word.best_choice == NULL) { 00773 tprintf("New result %s better:%s\n", 00774 new_is_better ? "IS" : "NOT"); 00775 } else { 00776 tprintf("New result %s better:%s, r=%g, c=%g\n", 00777 new_is_better ? "IS" : "NOT", 00778 lang_word.best_choice->unichar_string().string(), 00779 lang_word.best_choice->rating(), 00780 lang_word.best_choice->certainty()); 00781 } 00782 } 00783 if (new_is_better) { 00784 word->ConsumeWordResults(&lang_word); 00785 } 00786 return new_is_better; 00787 } 00788 00789 // Generic function for classifying a word. Can be used either for pass1 or 00790 // pass2 according to the function passed to recognizer. 00791 // word block and row are the current location in the document's PAGE_RES. 00792 // Recognizes in the current language, and if successful that is all. 00793 // If recognition was not successful, tries all available languages until 00794 // it gets a successful result or runs out of languages. Keeps the best result. 00795 void Tesseract::classify_word_and_language(WordRecognizer recognizer, 00796 BLOCK* block, 00797 ROW *row, 00798 WERD_RES *word) { 00799 if (classify_debug_level || cube_debug_level) { 00800 tprintf("Processing word with lang %s at:", 00801 most_recently_used_->lang.string()); 00802 word->word->bounding_box().print(); 00803 } 00804 const char* result_type = "Initial"; 00805 bool initially_done = !word->tess_failed && word->done; 00806 if (initially_done) { 00807 // If done on pass1, we reuse the tesseract that did it, and don't try 00808 // any more. The only need to call the classifier at all is for the 00809 // cube combiner and xheight fixing (which may be bogus on a done word.) 00810 most_recently_used_ = word->tesseract; 00811 result_type = "Already done"; 00812 } 00813 (most_recently_used_->*recognizer)(block, row, word); 00814 if (!word->tess_failed && word->tess_accepted) 00815 result_type = "Accepted"; 00816 if (classify_debug_level || cube_debug_level) { 00817 tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n", 00818 result_type, 00819 word->best_choice->unichar_string().string(), 00820 word->best_choice->rating(), 00821 word->best_choice->certainty(), 00822 word->tess_accepted, word->tess_would_adapt); 00823 } 00824 if (word->tess_failed || !word->tess_accepted) { 00825 // Try all the other languages to see if they are any better. 00826 Tesseract* previous_used = most_recently_used_; 00827 if (most_recently_used_ != this) { 00828 if (classify_debug_level) { 00829 tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string()); 00830 } 00831 if (RetryWithLanguage(word, block, row, recognizer)) { 00832 most_recently_used_ = this; 00833 if (!word->tess_failed && word->tess_accepted) 00834 return; // No need to look at the others. 00835 } 00836 } 00837 00838 for (int i = 0; i < sub_langs_.size(); ++i) { 00839 if (sub_langs_[i] != previous_used) { 00840 if (classify_debug_level) { 00841 tprintf("Retrying with sub-Tesseract[%d] lang: %s\n", 00842 i, sub_langs_[i]->lang.string()); 00843 } 00844 if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) { 00845 most_recently_used_ = sub_langs_[i]; 00846 if (!word->tess_failed && word->tess_accepted) 00847 return; // No need to look at the others. 00848 } 00849 } 00850 } 00851 } 00852 } 00853 00860 void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) { 00861 // If we only intend to run cube - run it and return. 00862 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 00863 cube_word_pass1(block, row, word); 00864 return; 00865 } 00866 00867 BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); 00868 BOOL8 adapt_ok; 00869 const char *rejmap; 00870 inT16 index; 00871 STRING mapstr = ""; 00872 00873 check_debug_pt(word, 0); 00874 if (word->SetupForTessRecognition(unicharset, this, BestPix(), 00875 classify_bln_numeric_mode, 00876 this->textord_use_cjk_fp_model, 00877 row, block)) 00878 tess_segment_pass1(word, blob_choices); 00879 if (!word->tess_failed) { 00880 /* 00881 The adaption step used to be here. It has been moved to after 00882 make_reject_map so that we know whether the word will be accepted in the 00883 first pass or not. This move will PREVENT adaption to words containing 00884 double quotes because the word will not be identical to what tess thinks 00885 its best choice is. (See CurrentBestChoiceIs in 00886 stopper.cpp which is used by AdaptableWord in 00887 adaptmatch.cpp) 00888 */ 00889 00890 if (!word->word->flag(W_REP_CHAR)) { 00891 // TODO(daria) delete these hacks when replaced by more generic code. 00892 // Convert '' (double single) to " (single double). 00893 word->fix_quotes(blob_choices); 00894 if (tessedit_fix_hyphens) // turn -- to - 00895 word->fix_hyphens(blob_choices); 00896 00897 word->tess_accepted = tess_acceptable_word(word->best_choice, 00898 word->raw_choice); 00899 00900 word->tess_would_adapt = word->best_choice && word->raw_choice && 00901 AdaptableWord(word->rebuild_word, 00902 *word->best_choice, 00903 *word->raw_choice); 00904 // Also sets word->done flag 00905 make_reject_map(word, blob_choices, row, 1); 00906 00907 adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); 00908 00909 if (adapt_ok || tessedit_tess_adapt_to_rejmap) { 00910 if (!tessedit_tess_adapt_to_rejmap) { 00911 rejmap = NULL; 00912 } else { 00913 ASSERT_HOST(word->reject_map.length() == 00914 word->best_choice->length()); 00915 00916 for (index = 0; index < word->reject_map.length(); index++) { 00917 if (adapt_ok || word->reject_map[index].accepted()) 00918 mapstr += '1'; 00919 else 00920 mapstr += '0'; 00921 } 00922 rejmap = mapstr.string(); 00923 } 00924 // Send word to adaptive classifier for training. 00925 word->BestChoiceToCorrectText(); 00926 set_word_fonts(word, blob_choices); 00927 LearnWord(NULL, rejmap, word); 00928 // Mark misadaptions if running blamer. 00929 if (word->blamer_bundle != NULL && 00930 word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && 00931 !ChoiceIsCorrect(*word->uch_set, word->best_choice, 00932 word->blamer_bundle->truth_text)) { 00933 word->blamer_bundle->misadaption_debug ="misadapt to word ("; 00934 word->blamer_bundle->misadaption_debug += 00935 word->best_choice->permuter_name(); 00936 word->blamer_bundle->misadaption_debug += "): "; 00937 word->blamer_bundle->FillDebugString( 00938 "", word->best_choice, &(word->blamer_bundle->misadaption_debug)); 00939 if (wordrec_debug_blamer) { 00940 tprintf("%s\n", word->blamer_bundle->misadaption_debug.string()); 00941 } 00942 } 00943 } 00944 00945 if (tessedit_enable_doc_dict) 00946 tess_add_doc_word(word->best_choice); 00947 } 00948 } 00949 00950 // Save best choices in the WERD_CHOICE if needed 00951 word->best_choice->set_blob_choices(blob_choices); 00952 } 00953 00954 // Helper to report the result of the xheight fix. 00955 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, 00956 WERD_RES* word, WERD_RES* new_word) { 00957 tprintf("New XHT Match:%s = %s ", 00958 word->best_choice->unichar_string().string(), 00959 word->best_choice->debug_string().string()); 00960 word->reject_map.print(debug_fp); 00961 tprintf(" -> %s = %s ", 00962 new_word->best_choice->unichar_string().string(), 00963 new_word->best_choice->debug_string().string()); 00964 new_word->reject_map.print(debug_fp); 00965 tprintf(" %s->%s %s %s\n", 00966 word->guessed_x_ht ? "GUESS" : "CERT", 00967 new_word->guessed_x_ht ? "GUESS" : "CERT", 00968 new_x_ht > 0.1 ? "STILL DOUBT" : "OK", 00969 accept_new_word ? "ACCEPTED" : ""); 00970 } 00971 00972 // Run the x-height fix-up, based on min/max top/bottom information in 00973 // unicharset. 00974 // Returns true if the word was changed. 00975 // See the comment in fixxht.cpp for a description of the overall process. 00976 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { 00977 bool accept_new_x_ht = false; 00978 int original_misfits = CountMisfitTops(word); 00979 if (original_misfits == 0) 00980 return false; 00981 float new_x_ht = ComputeCompatibleXheight(word); 00982 if (new_x_ht > 0.0f) { 00983 WERD_RES new_x_ht_word(word->word); 00984 if (word->blamer_bundle != NULL) { 00985 new_x_ht_word.blamer_bundle = new BlamerBundle(); 00986 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); 00987 } 00988 new_x_ht_word.x_height = new_x_ht; 00989 new_x_ht_word.caps_height = 0.0; 00990 match_word_pass2(&new_x_ht_word, row, block); 00991 if (!new_x_ht_word.tess_failed) { 00992 int new_misfits = CountMisfitTops(&new_x_ht_word); 00993 if (debug_x_ht_level >= 1) { 00994 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", 00995 original_misfits, word->x_height, 00996 new_misfits, new_x_ht); 00997 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", 00998 word->best_choice->rating(), word->best_choice->certainty(), 00999 new_x_ht_word.best_choice->rating(), 01000 new_x_ht_word.best_choice->certainty()); 01001 } 01002 // The misfits must improve and either the rating or certainty. 01003 accept_new_x_ht = new_misfits < original_misfits && 01004 (new_x_ht_word.best_choice->certainty() > 01005 word->best_choice->certainty() || 01006 new_x_ht_word.best_choice->rating() < 01007 word->best_choice->rating()); 01008 if (debug_x_ht_level >= 1) { 01009 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); 01010 } 01011 } 01012 if (accept_new_x_ht) { 01013 word->ConsumeWordResults(&new_x_ht_word); 01014 return true; 01015 } 01016 } 01017 return false; 01018 } 01019 01026 void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) { 01027 // Return if we do not want to run Tesseract. 01028 if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && 01029 tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) 01030 return; 01031 01032 bool done_this_pass = false; 01033 set_global_subloc_code(SUBLOC_NORM); 01034 check_debug_pt(word, 30); 01035 if (!word->done || tessedit_training_tess) { 01036 word->caps_height = 0.0; 01037 if (word->x_height == 0.0f) 01038 word->x_height = row->x_height(); 01039 match_word_pass2(word, row, block); 01040 done_this_pass = TRUE; 01041 check_debug_pt(word, 40); 01042 } 01043 01044 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { 01045 bool accept_new_xht = false; 01046 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight()) { 01047 // Use the tops and bottoms since they are available. 01048 accept_new_xht = TrainedXheightFix(word, block, row); 01049 } 01050 if (accept_new_xht) 01051 done_this_pass = true; 01052 // Test for small caps. Word capheight must be close to block xheight, 01053 // and word must contain no lower case letters, and at least one upper case. 01054 double small_cap_xheight = block->x_height() * kXHeightCapRatio; 01055 double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0; 01056 if (unicharset.script_has_xheight() && 01057 small_cap_xheight - small_cap_delta <= word->x_height && 01058 word->x_height <= small_cap_xheight + small_cap_delta) { 01059 // Scan for upper/lower. 01060 int num_upper = 0; 01061 int num_lower = 0; 01062 for (int i = 0; i < word->best_choice->length(); ++i) { 01063 if (unicharset.get_isupper(word->best_choice->unichar_id(i))) 01064 ++num_upper; 01065 else if (unicharset.get_islower(word->best_choice->unichar_id(i))) 01066 ++num_lower; 01067 } 01068 if (num_upper > 0 && num_lower == 0) 01069 word->small_caps = true; 01070 } 01071 word->SetScriptPositions(); 01072 01073 set_global_subloc_code(SUBLOC_NORM); 01074 } 01075 #ifndef GRAPHICS_DISABLED 01076 if (tessedit_display_outwords) { 01077 if (fx_win == NULL) 01078 create_fx_win(); 01079 clear_fx_win(); 01080 word->rebuild_word->plot(fx_win); 01081 TBOX wbox = word->rebuild_word->bounding_box(); 01082 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), 01083 wbox.right(), wbox.bottom()); 01084 ScrollView::Update(); 01085 } 01086 #endif 01087 set_global_subloc_code(SUBLOC_NORM); 01088 check_debug_pt(word, 50); 01089 } 01090 01091 01098 void Tesseract::match_word_pass2(WERD_RES *word, //word to do 01099 ROW *row, 01100 BLOCK* block) { 01101 BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST(); 01102 01103 if (word->SetupForTessRecognition(unicharset, this, BestPix(), 01104 classify_bln_numeric_mode, 01105 this->textord_use_cjk_fp_model, 01106 row, block)) 01107 tess_segment_pass2(word, blob_choices); 01108 01109 if (!word->tess_failed) { 01110 if (!word->word->flag (W_REP_CHAR)) { 01111 word->fix_quotes(blob_choices); 01112 if (tessedit_fix_hyphens) 01113 word->fix_hyphens(blob_choices); 01114 /* Dont trust fix_quotes! - though I think I've fixed the bug */ 01115 if (word->best_choice->length() != word->box_word->length() || 01116 word->best_choice->length() != blob_choices->length()) { 01117 tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" 01118 " #Blobs=%d; #Choices=%d\n", 01119 word->best_choice->debug_string().string(), 01120 word->best_choice->length(), 01121 word->box_word->length(), blob_choices->length()); 01122 01123 } 01124 word->tess_accepted = tess_acceptable_word(word->best_choice, 01125 word->raw_choice); 01126 01127 make_reject_map (word, blob_choices, row, 2); 01128 } 01129 } 01130 01131 // Save best choices in the WERD_CHOICE if needed 01132 word->best_choice->set_blob_choices(blob_choices); 01133 set_word_fonts(word, blob_choices); 01134 01135 assert (word->raw_choice != NULL); 01136 } 01137 01138 // Helper to find the BLOB_CHOICE in the bc_list that matches the given 01139 // unichar_id, or NULL if there is no match. 01140 static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id, 01141 BLOB_CHOICE_LIST* bc_list) { 01142 // Find the corresponding best BLOB_CHOICE. 01143 BLOB_CHOICE_IT choice_it(bc_list); 01144 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 01145 choice_it.forward()) { 01146 BLOB_CHOICE* choice = choice_it.data(); 01147 if (choice->unichar_id() == char_id) { 01148 return choice; 01149 } 01150 } 01151 return NULL; 01152 } 01153 01154 // Helper to return the best rated BLOB_CHOICE in the whole word that matches 01155 // the given char_id, or NULL if none can be found. 01156 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id, 01157 WERD_RES* word_res) { 01158 // Find the corresponding best BLOB_CHOICE from any position in the word_res. 01159 BLOB_CHOICE* best_choice = NULL; 01160 BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices()); 01161 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 01162 BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data()); 01163 if (choice != NULL) { 01164 if (best_choice == NULL || choice->rating() < best_choice->rating()) 01165 best_choice = choice; 01166 } 01167 } 01168 return best_choice; 01169 } 01170 01171 // Helper to insert blob_choice in each location in the leader word if there is 01172 // no matching BLOB_CHOICE there already, and correct any incorrect results 01173 // in the best_choice. 01174 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice, 01175 WERD_RES* word_res) { 01176 WERD_CHOICE* word = word_res->best_choice; 01177 BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices()); 01178 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 01179 BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(), 01180 bc_it.data()); 01181 if (choice == NULL) { 01182 BLOB_CHOICE_IT choice_it(bc_it.data()); 01183 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice)); 01184 } 01185 } 01186 // Correct any incorrect results in word. 01187 for (int i = 0; i < word->length(); ++i) { 01188 if (word->unichar_id(i) != blob_choice->unichar_id()) 01189 word->set_unichar_id(blob_choice->unichar_id(), i); 01190 } 01191 } 01192 01200 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { 01201 WERD_RES *word_res = page_res_it->word(); 01202 const WERD_CHOICE &word = *(word_res->best_choice); 01203 01204 // Find the frequency of each unique character in the word. 01205 UNICHAR_ID space = word_res->uch_set->unichar_to_id(" "); 01206 SortHelper<UNICHAR_ID> rep_ch(word.length()); 01207 for (int i = 0; i < word.length(); ++i) { 01208 if (word.unichar_id(i) != space) 01209 rep_ch.Add(word.unichar_id(i), 1); 01210 } 01211 01212 // Find the most frequent result. 01213 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char 01214 int max_count = rep_ch.MaxCount(&maxch_id); 01215 // Find the best exemplar of a classifier result for maxch_id. 01216 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res); 01217 if (best_choice == NULL) { 01218 tprintf("Failed to find a choice for %s, occurring %d times\n", 01219 word_res->uch_set->debug_str(maxch_id).string(), max_count); 01220 return; 01221 } 01222 word_res->done = TRUE; 01223 01224 // Measure the mean space. 01225 int total_gap = 0; 01226 int gap_count = 0; 01227 WERD* werd = word_res->word; 01228 C_BLOB_IT blob_it(werd->cblob_list()); 01229 C_BLOB* prev_blob = blob_it.data(); 01230 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) { 01231 C_BLOB* blob = blob_it.data(); 01232 int gap = blob->bounding_box().left(); 01233 gap -= prev_blob->bounding_box().right(); 01234 total_gap += gap; 01235 ++gap_count; 01236 prev_blob = blob; 01237 } 01238 if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) { 01239 // Needs spaces between. 01240 ExplodeRepeatedWord(best_choice, page_res_it); 01241 } else { 01242 // Just correct existing classification. 01243 CorrectRepcharChoices(best_choice, word_res); 01244 word_res->reject_map.initialise(word.length()); 01245 } 01246 } 01247 01248 // Explode the word at the given iterator location into individual words 01249 // of a single given unichar_id defined by best_choice. 01250 // The original word is deleted, and the replacements copy most of their 01251 // fields from the original. 01252 void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice, 01253 PAGE_RES_IT* page_res_it) { 01254 WERD_RES *word_res = page_res_it->word(); 01255 ASSERT_HOST(best_choice != NULL); 01256 01257 // Make a new word for each blob in the original. 01258 WERD* werd = word_res->word; 01259 C_BLOB_IT blob_it(werd->cblob_list()); 01260 for (; !blob_it.empty(); blob_it.forward()) { 01261 bool first_blob = blob_it.at_first(); 01262 bool last_blob = blob_it.at_last(); 01263 WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob, 01264 blob_it.extract()); 01265 // Note that blamer_bundle (truth information) is not copied, which is 01266 // desirable, since the newly inserted words would not have the original 01267 // bounding box corresponding to the one recorded in truth fields. 01268 WERD_RES* rep_word = 01269 page_res_it->InsertSimpleCloneWord(*word_res, blob_word); 01270 // Setup the single char WERD_RES 01271 if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(), 01272 false, 01273 this->textord_use_cjk_fp_model, 01274 page_res_it->row()->row, 01275 page_res_it->block()->block)) { 01276 rep_word->CloneChoppedToRebuild(); 01277 BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); 01278 rep_word->FakeClassifyWord(1, &blob_choice); 01279 } 01280 } 01281 page_res_it->DeleteCurrentWord(); 01282 } 01283 01284 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( 01285 const UNICHARSET& char_set, const char *s, const char *lengths) { 01286 int i = 0; 01287 int offset = 0; 01288 int leading_punct_count; 01289 int upper_count = 0; 01290 int hyphen_pos = -1; 01291 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; 01292 01293 if (strlen (lengths) > 20) 01294 return word_type; 01295 01296 /* Single Leading punctuation char*/ 01297 01298 if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset])) 01299 offset += lengths[i++]; 01300 leading_punct_count = i; 01301 01302 /* Initial cap */ 01303 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { 01304 offset += lengths[i++]; 01305 upper_count++; 01306 } 01307 if (upper_count > 1) { 01308 word_type = AC_UPPER_CASE; 01309 } else { 01310 /* Lower case word, possibly with an initial cap */ 01311 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { 01312 offset += lengths[i++]; 01313 } 01314 if (i - leading_punct_count < quality_min_initial_alphas_reqd) 01315 goto not_a_word; 01316 /* 01317 Allow a single hyphen in a lower case word 01318 - dont trust upper case - I've seen several cases of "H" -> "I-I" 01319 */ 01320 if (lengths[i] == 1 && s[offset] == '-') { 01321 hyphen_pos = i; 01322 offset += lengths[i++]; 01323 if (s[offset] != '\0') { 01324 while ((s[offset] != '\0') && 01325 char_set.get_islower(s + offset, lengths[i])) { 01326 offset += lengths[i++]; 01327 } 01328 if (i < hyphen_pos + 3) 01329 goto not_a_word; 01330 } 01331 } else { 01332 /* Allow "'s" in NON hyphenated lower case words */ 01333 if (lengths[i] == 1 && (s[offset] == '\'') && 01334 lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { 01335 offset += lengths[i++]; 01336 offset += lengths[i++]; 01337 } 01338 } 01339 if (upper_count > 0) 01340 word_type = AC_INITIAL_CAP; 01341 else 01342 word_type = AC_LOWER_CASE; 01343 } 01344 01345 /* Up to two different, constrained trailing punctuation chars */ 01346 if (lengths[i] == 1 && s[offset] != '\0' && 01347 STRING(chs_trailing_punct1).contains(s[offset])) 01348 offset += lengths[i++]; 01349 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && 01350 s[offset - lengths[i - 1]] != s[offset] && 01351 STRING(chs_trailing_punct2).contains (s[offset])) 01352 offset += lengths[i++]; 01353 01354 if (s[offset] != '\0') 01355 word_type = AC_UNACCEPTABLE; 01356 01357 not_a_word: 01358 01359 if (word_type == AC_UNACCEPTABLE) { 01360 /* Look for abbreviation string */ 01361 i = 0; 01362 offset = 0; 01363 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { 01364 word_type = AC_UC_ABBREV; 01365 while (s[offset] != '\0' && 01366 char_set.get_isupper(s + offset, lengths[i]) && 01367 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01368 offset += lengths[i++]; 01369 offset += lengths[i++]; 01370 } 01371 } 01372 else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { 01373 word_type = AC_LC_ABBREV; 01374 while (s[offset] != '\0' && 01375 char_set.get_islower(s + offset, lengths[i]) && 01376 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01377 offset += lengths[i++]; 01378 offset += lengths[i++]; 01379 } 01380 } 01381 if (s[offset] != '\0') 01382 word_type = AC_UNACCEPTABLE; 01383 } 01384 01385 return word_type; 01386 } 01387 01388 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) { 01389 BOOL8 show_map_detail = FALSE; 01390 inT16 i; 01391 01392 #ifndef SECURE_NAMES 01393 if (!test_pt) 01394 return FALSE; 01395 01396 tessedit_rejection_debug.set_value (FALSE); 01397 debug_x_ht_level.set_value (0); 01398 01399 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) { 01400 if (location < 0) 01401 return TRUE; // For breakpoint use 01402 tessedit_rejection_debug.set_value (TRUE); 01403 debug_x_ht_level.set_value (20); 01404 tprintf ("\n\nTESTWD::"); 01405 switch (location) { 01406 case 0: 01407 tprintf ("classify_word_pass1 start\n"); 01408 word->word->print(); 01409 break; 01410 case 10: 01411 tprintf ("make_reject_map: initial map"); 01412 break; 01413 case 20: 01414 tprintf ("make_reject_map: after NN"); 01415 break; 01416 case 30: 01417 tprintf ("classify_word_pass2 - START"); 01418 break; 01419 case 40: 01420 tprintf ("classify_word_pass2 - Pre Xht"); 01421 break; 01422 case 50: 01423 tprintf ("classify_word_pass2 - END"); 01424 show_map_detail = TRUE; 01425 break; 01426 case 60: 01427 tprintf ("fixspace"); 01428 break; 01429 case 70: 01430 tprintf ("MM pass START"); 01431 break; 01432 case 80: 01433 tprintf ("MM pass END"); 01434 break; 01435 case 90: 01436 tprintf ("After Poor quality rejection"); 01437 break; 01438 case 100: 01439 tprintf ("unrej_good_quality_words - START"); 01440 break; 01441 case 110: 01442 tprintf ("unrej_good_quality_words - END"); 01443 break; 01444 case 120: 01445 tprintf ("Write results pass"); 01446 show_map_detail = TRUE; 01447 break; 01448 } 01449 tprintf(" \"%s\" ", 01450 word->best_choice->unichar_string().string()); 01451 word->reject_map.print (debug_fp); 01452 tprintf ("\n"); 01453 if (show_map_detail) { 01454 tprintf ("\"%s\"\n", word->best_choice->unichar_string().string()); 01455 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 01456 tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 01457 word->reject_map[i].full_print(debug_fp); 01458 } 01459 } 01460 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 01461 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 01462 return TRUE; 01463 } 01464 else 01465 #endif 01466 return FALSE; 01467 } 01468 01474 static void find_modal_font( //good chars in word 01475 STATS *fonts, //font stats 01476 inT16 *font_out, //output font 01477 inT8 *font_count //output count 01478 ) { 01479 inT16 font; //font index 01480 inT32 count; //pile couat 01481 01482 if (fonts->get_total () > 0) { 01483 font = (inT16) fonts->mode (); 01484 *font_out = font; 01485 count = fonts->pile_count (font); 01486 *font_count = count < MAX_INT8 ? count : MAX_INT8; 01487 fonts->add (font, -*font_count); 01488 } 01489 else { 01490 *font_out = -1; 01491 *font_count = 0; 01492 } 01493 } 01494 01500 void Tesseract::set_word_fonts(WERD_RES *word, 01501 BLOB_CHOICE_LIST_CLIST *blob_choices) { 01502 if (blob_choices == NULL) return; 01503 // Don't try to set the word fonts for a cube word, as the configs 01504 // will be meaningless. 01505 if (word->chopped_word == NULL) return; 01506 01507 inT32 index; // char id index 01508 // character iterator 01509 BLOB_CHOICE_LIST_C_IT char_it = blob_choices; 01510 BLOB_CHOICE_IT choice_it; // choice iterator 01511 int fontinfo_size = get_fontinfo_table().size(); 01512 int fontset_size = get_fontset_table().size(); 01513 if (fontinfo_size == 0 || fontset_size == 0) return; 01514 STATS fonts(0, fontinfo_size); // font counters 01515 01516 word->italic = 0; 01517 word->bold = 0; 01518 if (!word->best_choice_fontinfo_ids.empty()) { 01519 word->best_choice_fontinfo_ids.clear(); 01520 } 01521 // Compute the modal font for the word 01522 for (char_it.mark_cycle_pt(), index = 0; 01523 !char_it.cycled_list(); ++index, char_it.forward()) { 01524 UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index); 01525 choice_it.set_to_list(char_it.data()); 01526 if (tessedit_debug_fonts) { 01527 tprintf("Examining fonts in %s\n", 01528 word->best_choice->debug_string().string()); 01529 } 01530 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 01531 choice_it.forward()) { 01532 UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id(); 01533 if (blob_ch_id == word_ch_id) { 01534 if (tessedit_debug_fonts) { 01535 tprintf("%s font %s (%d) font2 %s (%d)\n", 01536 word->uch_set->id_to_unichar(blob_ch_id), 01537 choice_it.data()->fontinfo_id() < 0 ? "unknown" : 01538 fontinfo_table_.get(choice_it.data()->fontinfo_id()).name, 01539 choice_it.data()->fontinfo_id(), 01540 choice_it.data()->fontinfo_id2() < 0 ? "unknown" : 01541 fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name, 01542 choice_it.data()->fontinfo_id2()); 01543 } 01544 // 1st choice font gets 2 pts, 2nd choice 1 pt. 01545 if (choice_it.data()->fontinfo_id() >= 0) { 01546 fonts.add(choice_it.data()->fontinfo_id(), 2); 01547 } 01548 if (choice_it.data()->fontinfo_id2() >= 0) { 01549 fonts.add(choice_it.data()->fontinfo_id2(), 1); 01550 } 01551 break; 01552 } 01553 } 01554 } 01555 inT16 font_id1, font_id2; 01556 find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count); 01557 find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count); 01558 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL; 01559 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; 01560 // All the blobs get the word's best choice font. 01561 for (int i = 0; i < word->best_choice->length(); ++i) { 01562 word->best_choice_fontinfo_ids.push_back(font_id1); 01563 } 01564 if (word->fontinfo_id_count > 0) { 01565 FontInfo fi = fontinfo_table_.get(font_id1); 01566 if (tessedit_debug_fonts) { 01567 if (word->fontinfo_id2_count > 0) { 01568 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", 01569 fi.name, word->fontinfo_id_count, 01570 fontinfo_table_.get(font_id2).name, 01571 word->fontinfo_id2_count); 01572 } else { 01573 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", 01574 fi.name, word->fontinfo_id_count); 01575 } 01576 } 01577 // 1st choices got 2 pts, so we need to halve the score for the mode. 01578 word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; 01579 word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; 01580 } 01581 } 01582 01583 01590 void Tesseract::font_recognition_pass(PAGE_RES* page_res) { 01591 PAGE_RES_IT page_res_it(page_res); 01592 WERD_RES *word; // current word 01593 STATS doc_fonts(0, font_table_size_); // font counters 01594 01595 // Gather font id statistics. 01596 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01597 page_res_it.forward()) { 01598 word = page_res_it.word(); 01599 if (word->fontinfo != NULL) { 01600 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); 01601 } 01602 if (word->fontinfo2 != NULL) { 01603 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); 01604 } 01605 } 01606 inT16 doc_font; // modal font 01607 inT8 doc_font_count; // modal font 01608 find_modal_font(&doc_fonts, &doc_font, &doc_font_count); 01609 if (doc_font_count == 0) 01610 return; 01611 // Get the modal font pointer. 01612 const FontInfo* modal_font = NULL; 01613 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01614 page_res_it.forward()) { 01615 word = page_res_it.word(); 01616 if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) { 01617 modal_font = word->fontinfo; 01618 break; 01619 } 01620 if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) { 01621 modal_font = word->fontinfo2; 01622 break; 01623 } 01624 } 01625 ASSERT_HOST(modal_font != NULL); 01626 01627 // Assign modal font to weak words. 01628 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01629 page_res_it.forward()) { 01630 word = page_res_it.word(); 01631 int length = word->best_choice->length(); 01632 01633 // 1st choices got 2 pts, so we need to halve the score for the mode. 01634 int count = (word->fontinfo_id_count + 1) / 2; 01635 if (!(count == length || (length > 3 && count >= length * 3 / 4))) { 01636 word->fontinfo = modal_font; 01637 // Counts only get 1 as it came from the doc. 01638 word->fontinfo_id_count = 1; 01639 word->italic = modal_font->is_italic() ? 1 : -1; 01640 word->bold = modal_font->is_bold() ? 1 : -1; 01641 } 01642 } 01643 } 01644 01645 } // namespace tesseract