tesseract-doc/control_8cpp_source.html

00001 /******************************************************************
00002  * File:        control.cpp  (Formerly control.c)
00003  * Description: Module-independent matcher controller.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Thu Apr 23 11:09:58 BST 1992
00006  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
00007  *
00008  * (C) Copyright 1992, Hewlett-Packard Ltd.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020
00021 #include "mfcpch.h"
00022
00023 #include <string.h>
00024 #include <math.h>
00025 #ifdef __UNIX__
00026 #include <assert.h>
00027 #include <unistd.h>
00028 #include <errno.h>
00029 #endif
00030 #include <ctype.h>
00031 #include "ocrclass.h"
00032 #include "werdit.h"
00033 #include "drawfx.h"
00034 #include "tfacep.h"
00035 #include "tessbox.h"
00036 #include "tessvars.h"
00037 #include "pgedit.h"
00038 #include "reject.h"
00039 #include "fixspace.h"
00040 #include "docqual.h"
00041 #include "control.h"
00042 #include "secname.h"
00043 #include "output.h"
00044 #include "callcpp.h"
00045 #include "notdll.h"
00046 #include "globals.h"
00047 #include "sorthelper.h"
00048 #include "tesseractclass.h"
00049
00050 // Include automatically generated configuration file if running autoconf.
00051 #ifdef HAVE_CONFIG_H
00052 #include "config_auto.h"
00053 #endif
00054
// NOTE(review): MIN_FONT_ROW_COUNT and MAX_XHEIGHT_DIFF are not referenced in
// the part of this file visible here; their exact meaning should be confirmed
// against their users before changing the values.
00055 #define MIN_FONT_ROW_COUNT  8
00056 #define MAX_XHEIGHT_DIFF  3
00057
// Name of the fixed temporary file into which ProcessTargetWord dumps the
// current parameters before applying a per-word config, and from which it
// reads them back afterwards.
00058 const char* const kBackUpConfigFile = "tempconfigdata.config";
00059 // Multiple of x-height to make a repeated word have spaces in it.
00060 const double kRepcharGapThreshold = 0.5;
00061
00062
00071 namespace tesseract {
00072 void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
00073                                   TBOX &selection_box) {
00074   WERD *word;
00075   ROW *pseudo_row;               // row of word
00076   BLOCK *pseudo_block;           // block of word
00077
00078   word = make_pseudo_word(page_res, selection_box,
00079                           pseudo_block, pseudo_row);
00080   if (word != NULL) {
00081     WERD_RES word_res(word);
00082     recog_interactive(pseudo_block, pseudo_row, &word_res);
00083     delete word;
00084   }
00085 }
00086
00087
00097 BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) {
00098   inT16 char_qual;
00099   inT16 good_char_qual;
00100
00101   classify_word_and_language(&Tesseract::classify_word_pass2,
00102                              block, row, word_res);
00103   if (tessedit_debug_quality_metrics) {
00104     word_char_quality(word_res, row, &char_qual, &good_char_qual);
00105     tprintf
00106       ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
00107       word_res->reject_map.length(), word_blob_quality(word_res, row),
00108       word_outline_errs(word_res), char_qual, good_char_qual);
00109   }
00110   return TRUE;
00111 }
00112
00113 // Helper function to check for a target word and handle it appropriately.
00114 // Inspired by Jetsoft's requirement to process only single words on pass2
00115 // and beyond.
00116 // If word_config is not null:
00117 //   If the word_box and target_word_box overlap, read the word_config file
00118 //   else reset to previous config data.
00119 //   return true.
00120 // else
00121 //   If the word_box and target_word_box overlap or pass <= 1, return true.
00122 // Note that this function uses a fixed temporary file for storing the previous
00123 // configs, so it is neither thread-safe, nor process-safe, but the assumption
00124 // is that it will only be used for one debug window at a time.
00125 //
00126 // Since this function is used for debugging (and not to change OCR results)
00127 // set only debug params from the word config file.
00128 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
00129                                   const TBOX& target_word_box,
00130                                   const char* word_config,
00131                                   int pass) {
00132   if (word_config != NULL) {
00133     if (word_box.major_overlap(target_word_box)) {
00134       if (backup_config_file_ == NULL) {
00135         backup_config_file_ = kBackUpConfigFile;
00136         FILE* config_fp = fopen(backup_config_file_, "wb");
00137         ParamUtils::PrintParams(config_fp, params());
00138         fclose(config_fp);
00139         ParamUtils::ReadParamsFile(word_config,
00140                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00141                                    params());
00142       }
00143     } else {
00144       if (backup_config_file_ != NULL) {
00145         ParamUtils::ReadParamsFile(backup_config_file_,
00146                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00147                                    params());
00148         backup_config_file_ = NULL;
00149       }
00150     }
00151   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
00152     return false;
00153   }
00154   return true;
00155 }
00156
00178 bool Tesseract::recog_all_words(PAGE_RES* page_res,
00179                                 ETEXT_DESC* monitor,
00180                                 const TBOX* target_word_box,
00181                                 const char* word_config,
00182                                 int dopasses) {
00183   PAGE_RES_IT page_res_it;
00184   inT32 word_index;              // current word
00185
00186   if (tessedit_minimal_rej_pass1) {
00187     tessedit_test_adaption.set_value (TRUE);
00188     tessedit_minimal_rejection.set_value (TRUE);
00189   }
00190
00191   // Before the main recognition loop below, walk through the whole page and set
00192   // up fake words.  That way, if we run out of time a user will still get the
00193   // expected best_choice and box_words out the end; they'll just be empty.
00194   page_res_it.page_res = page_res;
00195   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00196        page_res_it.forward()) {
00197     page_res_it.word()->SetupFake(unicharset);
00198   }
00199
00200   if (dopasses==0 || dopasses==1) {
00201     page_res_it.page_res=page_res;
00202     page_res_it.restart_page();
00203
00204     // ****************** Pass 1 *******************
00205
00206     // Clear adaptive classifier at the beginning of the page if it is full.
00207     // This is done only at the beginning of the page to ensure that the
00208     // classifier is not reset at an arbitrary point while processing the page,
00209     // which would cripple Passes 2+ if the reset happens towards the end of
00210     // Pass 1 on a page with very difficult text.
00211     // TODO(daria): preemptively clear the classifier if it is almost full.
00212     if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal();
00213     // Now check the sub-langs as well.
00214     for (int i = 0; i < sub_langs_.size(); ++i) {
00215       if (sub_langs_[i]->AdaptiveClassifierIsFull())
00216         sub_langs_[i]->ResetAdaptiveClassifierInternal();
00217     }
00218
00219     stats_.word_count = 0;
00220     if (monitor != NULL) {
00221       monitor->ocr_alive = TRUE;
00222       while (page_res_it.word() != NULL) {
00223         stats_.word_count++;
00224         page_res_it.forward();
00225       }
00226       page_res_it.restart_page();
00227     } else {
00228       stats_.word_count = 1;
00229     }
00230
00231     word_index = 0;
00232
00233     stats_.dict_words = 0;
00234     stats_.doc_blob_quality = 0;
00235     stats_.doc_outline_errs = 0;
00236     stats_.doc_char_quality = 0;
00237     stats_.good_char_count = 0;
00238     stats_.doc_good_char_quality = 0;
00239
00240     most_recently_used_ = this;
00241     while (page_res_it.word() != NULL) {
00242       set_global_loc_code(LOC_PASS1);
00243       word_index++;
00244       if (monitor != NULL) {
00245         monitor->ocr_alive = TRUE;
00246         monitor->progress = 30 + 50 * word_index / stats_.word_count;
00247         if (monitor->deadline_exceeded() ||
00248             (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00249                                                            stats_.dict_words)))
00250           return false;
00251       }
00252       if (target_word_box &&
00253           !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
00254                              *target_word_box, word_config, 1)) {
00255         page_res_it.forward();
00256         continue;
00257       }
00258       classify_word_and_language(&Tesseract::classify_word_pass1,
00259                                  page_res_it.block()->block,
00260                                  page_res_it.row()->row,
00261                                  page_res_it.word());
00262       if (page_res_it.word()->word->flag(W_REP_CHAR)) {
00263         fix_rep_char(&page_res_it);
00264         page_res_it.forward();
00265         continue;
00266       }
00267       if (tessedit_dump_choices) {
00268         word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
00269         tprintf("Pass1: %s [%s]\n",
00270                 page_res_it.word()->best_choice->unichar_string().string(),
00271                 page_res_it.word()->best_choice->debug_string().string());
00272       }
00273
00274       // tessedit_test_adaption enables testing of the accuracy of the
00275       // input to the adaptive classifier.
00276       if (tessedit_test_adaption && !tessedit_minimal_rejection) {
00277         if (!word_adaptable (page_res_it.word(),
00278           tessedit_test_adaption_mode)) {
00279           page_res_it.word()->reject_map.rej_word_tess_failure();
00280           // FAKE PERM REJ
00281         } else {
00282           // Override rejection mechanisms for this word.
00283           UNICHAR_ID space = unicharset.unichar_to_id(" ");
00284           for (int i = 0; i < page_res_it.word()->best_choice->length(); i++) {
00285             if ((page_res_it.word()->best_choice->unichar_id(i) != space) &&
00286                 page_res_it.word()->reject_map[i].rejected())
00287               page_res_it.word()->reject_map[i].setrej_minimal_rej_accept();
00288           }
00289         }
00290       }
00291
00292       // Count dict words.
00293       if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
00294         ++(stats_.dict_words);
00295
00296       // Update misadaption log (we only need to do it on pass 1, since
00297       // adaption only happens on this pass).
00298       if (page_res_it.word()->blamer_bundle != NULL &&
00299           page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
00300         page_res->misadaption_log.push_back(
00301             page_res_it.word()->blamer_bundle->misadaption_debug);
00302       }
00303
00304       page_res_it.forward();
00305     }
00306   }
00307
00308   if (dopasses == 1) return true;
00309
00310   // ****************** Pass 2 *******************
00311   page_res_it.restart_page();
00312   word_index = 0;
00313   most_recently_used_ = this;
00314   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
00315     set_global_loc_code(LOC_PASS2);
00316     word_index++;
00317     if (monitor != NULL) {
00318       monitor->ocr_alive = TRUE;
00319       monitor->progress = 80 + 10 * word_index / stats_.word_count;
00320       if (monitor->deadline_exceeded() ||
00321           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00322                                                          stats_.dict_words)))
00323         return false;
00324     }
00325
00326     // changed by jetsoft
00327     // specific to its needs to extract one word when need
00328     if (target_word_box &&
00329         !ProcessTargetWord(page_res_it.word()->word->bounding_box(),
00330                            *target_word_box, word_config, 2)) {
00331       page_res_it.forward();
00332       continue;
00333     }
00334     // end jetsoft
00335
00336     classify_word_and_language(&Tesseract::classify_word_pass2,
00337                                page_res_it.block()->block,
00338                                page_res_it.row()->row,
00339                                page_res_it.word());
00340     if (page_res_it.word()->word->flag(W_REP_CHAR) &&
00341         !page_res_it.word()->done) {
00342       fix_rep_char(&page_res_it);
00343       page_res_it.forward();
00344       continue;
00345     }
00346     if (tessedit_dump_choices) {
00347       word_dumper(NULL, page_res_it.row()->row, page_res_it.word());
00348       tprintf("Pass2: %s [%s]\n",
00349               page_res_it.word()->best_choice->unichar_string().string(),
00350               page_res_it.word()->best_choice->debug_string().string());
00351     }
00352     page_res_it.forward();
00353   }
00354
00355   // The next passes can only be run if tesseract has been used, as cube
00356   // doesn't set all the necessary outputs in WERD_RES.
00357   if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
00358       tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00359     // ****************** Pass 3 *******************
00360     // Fix fuzzy spaces.
00361     set_global_loc_code(LOC_FUZZY_SPACE);
00362
00363     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00364         && !tessedit_word_for_word && !right_to_left())
00365       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
00366
00367     // ****************** Pass 4 *******************
00368     if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
00369
00370     // ****************** Pass 5,6 *******************
00371     rejection_passes(page_res, monitor, target_word_box, word_config);
00372
00373     // ****************** Pass 7 *******************
00374     // Cube combiner.
00375     // If cube is loaded and its combiner is present, run it.
00376     if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00377       run_cube_combiner(page_res);
00378     }
00379
00380     // ****************** Pass 8 *******************
00381     font_recognition_pass(page_res);
00382
00383     // ****************** Pass 9 *******************
00384     // Check the correctness of the final results.
00385     blamer_pass(page_res);
00386   }
00387
00388   if (!save_blob_choices) {
00389     // We aren't saving the blob choices so get rid of them now.
00390     // set_blob_choices() does a deep clear.
00391     page_res_it.restart_page();
00392     while (page_res_it.word() != NULL) {
00393       WERD_RES* word = page_res_it.word();
00394       word->best_choice->set_blob_choices(NULL);
00395       page_res_it.forward();
00396     }
00397   }
00398
00399   // Write results pass.
00400   set_global_loc_code(LOC_WRITE_RESULTS);
00401   // This is now redundant, but retained commented so show how to obtain
00402   // bounding boxes and style information.
00403
00404   // changed by jetsoft
00405   // needed for dll to output memory structure
00406   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
00407     output_pass(page_res_it, target_word_box);
00408   // end jetsoft
00409   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00410       static_cast<int>(tessedit_pageseg_mode));
00411   textord_.CleanupSingleRowResult(pageseg_mode, page_res);
00412
00413   if (monitor != NULL) {
00414     monitor->progress = 100;
00415   }
00416   return true;
00417 }
00418
00419 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
00420   PAGE_RES_IT word_it(page_res);
00421
00422   WERD_RES *w_prev = NULL;
00423   WERD_RES *w = word_it.word();
00424   while (1) {
00425     w_prev = w;
00426     while (word_it.forward() != NULL &&
00427            (!word_it.word() || word_it.word()->part_of_combo)) {
00428       // advance word_it, skipping over parts of combos
00429     }
00430     if (!word_it.word()) break;
00431     w = word_it.word();
00432     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
00433       continue;
00434     }
00435     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
00436       if (tessedit_bigram_debug) {
00437         tprintf("Skipping because one of the words is W_REP_CHAR\n");
00438       }
00439       continue;
00440     }
00441     // Two words sharing the same language model, excellent!
00442     if (w->alt_choices.empty()) {
00443       if (tessedit_bigram_debug) {
00444         tprintf("Alt choices not set up for word choice: %s\n",
00445                 w->best_choice->unichar_string().string());
00446       }
00447       continue;
00448     }
00449     if (w_prev->alt_choices.empty()) {
00450       if (tessedit_bigram_debug) {
00451         tprintf("Alt choices not set up for word choice: %s\n",
00452                 w_prev->best_choice->unichar_string().string());
00453       }
00454       continue;
00455     }
00456
00457     // We saved alternate choices, excellent!
00458     GenericVector<WERD_CHOICE *> overrides_word1;
00459     GenericVector<GenericVector<int> *> overrides_word1_state;
00460     GenericVector<WERD_CHOICE *> overrides_word2;
00461     GenericVector<GenericVector<int> *> overrides_word2_state;
00462
00463     STRING orig_w1_str = w_prev->best_choice->unichar_string();
00464     STRING orig_w2_str = w->best_choice->unichar_string();
00465     WERD_CHOICE prev_best(w->uch_set);
00466     {
00467       int w1start, w1end;
00468       w_prev->WithoutFootnoteSpan(&w1start, &w1end);
00469       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
00470     }
00471     WERD_CHOICE this_best(w->uch_set);
00472     {
00473       int w2start, w2end;
00474       w->WithoutFootnoteSpan(&w2start, &w2end);
00475       this_best = w->best_choice->shallow_copy(w2start, w2end);
00476     }
00477
00478     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
00479       if (tessedit_bigram_debug) {
00480         tprintf("Top choice \"%s %s\" verified by bigram model.\n",
00481                 orig_w1_str.string(), orig_w2_str.string());
00482       }
00483       continue;
00484     }
00485     if (tessedit_bigram_debug > 2) {
00486       tprintf("Examining alt choices for \"%s %s\".\n",
00487               orig_w1_str.string(), orig_w2_str.string());
00488     }
00489     if (tessedit_bigram_debug > 1) {
00490       if (w_prev->alt_choices.size() > 1) {
00491         print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
00492       }
00493       if (w->alt_choices.size() > 1) {
00494         print_word_alternates_list(w->best_choice, &w->alt_choices);
00495       }
00496     }
00497     float best_rating = 0.0;
00498     int best_idx = 0;
00499     for (int i = 0; i < w_prev->alt_choices.size(); i++) {
00500       WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
00501       WERD_CHOICE strip1(w->uch_set);
00502       {
00503         int p1start, p1end;
00504         w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
00505                                     &p1start, &p1end);
00506         strip1 = p1->shallow_copy(p1start, p1end);
00507       }
00508       for (int j = 0; j < w->alt_choices.size(); j++) {
00509         WERD_CHOICE *p2 = w->alt_choices.get(j);
00510         WERD_CHOICE strip2(w->uch_set);
00511         {
00512           int p2start, p2end;
00513           w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
00514           strip2 = p2->shallow_copy(p2start, p2end);
00515         }
00516         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
00517           overrides_word1.push_back(p1);
00518           overrides_word1_state.push_back(&w_prev->alt_states.get(i));
00519           overrides_word2.push_back(p2);
00520           overrides_word2_state.push_back(&w->alt_states.get(j));
00521           if (overrides_word1.size() == 1 ||
00522               p1->rating() + p2->rating() < best_rating) {
00523             best_rating = p1->rating() + p2->rating();
00524             best_idx = overrides_word1.size() - 1;
00525           }
00526         }
00527       }
00528     }
00529     if (overrides_word1.size() >= 1) {
00530       // Excellent, we have some bigram matches.
00531       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
00532                                             *overrides_word1[best_idx]) &&
00533           EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
00534                                             *overrides_word2[best_idx])) {
00535         if (tessedit_bigram_debug > 1) {
00536           tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
00537                   "model.\n", orig_w1_str.string(), orig_w2_str.string());
00538         }
00539         continue;
00540       }
00541       STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
00542       STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
00543       if (new_w1_str != orig_w1_str) {
00544         w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
00545                                   *overrides_word1_state[best_idx]);
00546       }
00547       if (new_w2_str != orig_w2_str) {
00548         w->ReplaceBestChoice(*overrides_word2[best_idx],
00549                              *overrides_word2_state[best_idx]);
00550       }
00551       if (tessedit_bigram_debug > 0) {
00552         STRING choices_description;
00553         int num_bigram_choices
00554             = overrides_word1.size() * overrides_word2.size();
00555         if (num_bigram_choices == 1) {
00556           choices_description = "This was the unique bigram choice.";
00557         } else {
00558           if (tessedit_bigram_debug > 1) {
00559             STRING bigrams_list;
00560             const int kMaxChoicesToPrint = 20;
00561             for (int i = 0; i < overrides_word1.size() &&
00562                  i < kMaxChoicesToPrint; i++) {
00563               if (i > 0) { bigrams_list += ", "; }
00564               WERD_CHOICE *p1 = overrides_word1[i];
00565               WERD_CHOICE *p2 = overrides_word2[i];
00566               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
00567               if (i == kMaxChoicesToPrint) {
00568                 bigrams_list += " ...";
00569               }
00570             }
00571             choices_description = "There were many choices: {";
00572             choices_description += bigrams_list;
00573             choices_description += "}";
00574           } else {
00575             choices_description.add_str_int("There were ", num_bigram_choices);
00576             choices_description += " compatible bigrams.";
00577           }
00578         }
00579         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
00580                 orig_w1_str.string(), orig_w2_str.string(),
00581                 new_w1_str.string(), new_w2_str.string(),
00582                 choices_description.string());
00583       }
00584     }
00585   }
00586 }
00587
00588 void Tesseract::rejection_passes(PAGE_RES* page_res,
00589                                  ETEXT_DESC* monitor,
00590                                  const TBOX* target_word_box,
00591                                  const char* word_config) {
00592   PAGE_RES_IT page_res_it(page_res);
00593   // ****************** Pass 5 *******************
00594   // Gather statistics on rejects.
00595   int word_index = 0;
00596   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
00597     set_global_loc_code(LOC_MM_ADAPT);
00598     WERD_RES* word = page_res_it.word();
00599     word_index++;
00600     if (monitor != NULL) {
00601       monitor->ocr_alive = TRUE;
00602       monitor->progress = 95 + 5 * word_index / stats_.word_count;
00603     }
00604     if (word->rebuild_word == NULL) {
00605       // Word was not processed by tesseract.
00606       page_res_it.forward();
00607       continue;
00608     }
00609     check_debug_pt(word, 70);
00610
00611     // changed by jetsoft
00612     // specific to its needs to extract one word when need
00613     if (target_word_box &&
00614         !ProcessTargetWord(word->word->bounding_box(),
00615                            *target_word_box, word_config, 4)) {
00616       page_res_it.forward();
00617       continue;
00618     }
00619     // end jetsoft
00620
00621     page_res_it.rej_stat_word();
00622     int chars_in_word = word->reject_map.length();
00623     int rejects_in_word = word->reject_map.reject_count();
00624
00625     int blob_quality = word_blob_quality(word, page_res_it.row()->row);
00626     stats_.doc_blob_quality += blob_quality;
00627     int outline_errs = word_outline_errs(word);
00628     stats_.doc_outline_errs += outline_errs;
00629     inT16 all_char_quality;
00630     inT16 accepted_all_char_quality;
00631     word_char_quality(word, page_res_it.row()->row,
00632                       &all_char_quality, &accepted_all_char_quality);
00633     stats_.doc_char_quality += all_char_quality;
00634     uinT8 permuter_type = word->best_choice->permuter();
00635     if ((permuter_type == SYSTEM_DAWG_PERM) ||
00636         (permuter_type == FREQ_DAWG_PERM) ||
00637         (permuter_type == USER_DAWG_PERM)) {
00638       stats_.good_char_count += chars_in_word - rejects_in_word;
00639       stats_.doc_good_char_quality += accepted_all_char_quality;
00640     }
00641     check_debug_pt(word, 80);
00642     if (tessedit_reject_bad_qual_wds &&
00643         (blob_quality == 0) && (outline_errs >= chars_in_word))
00644       word->reject_map.rej_word_bad_quality();
00645     check_debug_pt(word, 90);
00646     page_res_it.forward();
00647   }
00648
00649   if (tessedit_debug_quality_metrics) {
00650     tprintf
00651       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
00652        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00653       page_res->char_count, page_res->rej_count,
00654       page_res->rej_count / static_cast<float>(page_res->char_count),
00655       stats_.doc_blob_quality,
00656       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
00657       stats_.doc_outline_errs,
00658       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
00659       stats_.doc_char_quality,
00660       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
00661       stats_.doc_good_char_quality,
00662       (stats_.good_char_count > 0) ?
00663       (stats_.doc_good_char_quality /
00664        static_cast<float>(stats_.good_char_count)) : 0.0);
00665   }
00666   BOOL8 good_quality_doc =
00667     ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
00668      quality_rej_pc) &&
00669     (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
00670      quality_blob_pc) &&
00671     (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
00672      quality_outline_pc) &&
00673     (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
00674      quality_char_pc);
00675
00676   // ****************** Pass 6 *******************
00677   // Do whole document or whole block rejection pass
00678   if (!tessedit_test_adaption) {
00679     set_global_loc_code(LOC_DOC_BLK_REJ);
00680     quality_based_rejection(page_res_it, good_quality_doc);
00681   }
00682 }
00683
00684 void Tesseract::blamer_pass(PAGE_RES* page_res) {
00685   if (!wordrec_run_blamer) return;
00686   PAGE_RES_IT page_res_it(page_res);
00687   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00688       page_res_it.forward()) {
00689     WERD_RES *word = page_res_it.word();
00690     if (word->blamer_bundle == NULL) {
00691       word->blamer_bundle = new BlamerBundle();
00692       word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT;
00693       word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason();
00694       word->blamer_bundle->debug += " to blame";
00695     } else if (word->blamer_bundle->incorrect_result_reason ==
00696         IRR_NO_TRUTH) {
00697       word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
00698                                     word->best_choice, wordrec_debug_blamer);
00699     } else {
00700       bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
00701                                      word->blamer_bundle->truth_text);
00702       IncorrectResultReason irr =
00703           word->blamer_bundle->incorrect_result_reason;
00704       if (irr == IRR_CORRECT && !correct) {
00705         STRING debug = "Choice is incorrect after recognition";
00706         word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
00707                                       word->best_choice,
00708                                       wordrec_debug_blamer);
00709       } else if (irr != IRR_CORRECT && correct) {
00710         if (wordrec_debug_blamer) {
00711           tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
00712         }
00713         word->blamer_bundle->incorrect_result_reason = IRR_CORRECT;
00714         word->blamer_bundle->debug = "";
00715       }
00716     }
00717     page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++;
00718   }
00719   tprintf("Blame reasons:\n");
00720   for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
00721     tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(
00722         static_cast<IncorrectResultReason>(bl)),
00723         page_res->blame_reasons[bl]);
00724   }
00725   if (page_res->misadaption_log.length() > 0) {
00726     tprintf("Misadaption log:\n");
00727     for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
00728       tprintf("%s\n", page_res->misadaption_log[i].string());
00729     }
00730   }
00731 }
00732
00733 // Helper returns true if the new_word is better than the word, using a
00734 // simple test of better certainty AND rating (to reduce false positives
00735 // from cube) or a dictionary vs non-dictionary word.
00736 static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
00737   if (new_word.best_choice == NULL) {
00738     return false;  // New one no good.
00739   }
00740   if (word.best_choice == NULL) {
00741     return true;  // Old one no good.
00742   }
00743   if (new_word.best_choice->certainty() > word.best_choice->certainty() &&
00744       new_word.best_choice->rating() < word.best_choice->rating()) {
00745     return true;  // New word has better confidence.
00746   }
00747   if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
00748       Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) {
00749     return true;  // New word is from a dictionary.
00750   }
00751   return false;  // New word is no better.
00752 }
00753
00754 // Helper to recognize the word using the given (language-specific) tesseract.
00755 // Returns true if the result was better than previously.
00756 bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
00757                                   WordRecognizer recognizer) {
00758   if (classify_debug_level || cube_debug_level) {
00759     tprintf("Retrying word using lang %s, oem %d\n",
00760             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
00761   }
00762   // Setup a trial WERD_RES in which to classify.
00763   WERD_RES lang_word;
00764   lang_word.InitForRetryRecognition(*word);
00765   // Run the recognizer on the word.
00766   // Initial version is a bit of a hack based on better certainty and rating
00767   // (to reduce false positives from cube) or a dictionary vs non-dictionary
00768   // word.
00769   (this->*recognizer)(block, row, &lang_word);
00770   bool new_is_better = NewWordBetter(*word, lang_word);
00771   if (classify_debug_level || cube_debug_level) {
00772     if (lang_word.best_choice == NULL) {
00773       tprintf("New result %s better:%s\n",
00774               new_is_better ? "IS" : "NOT");
00775     } else {
00776       tprintf("New result %s better:%s, r=%g, c=%g\n",
00777               new_is_better ? "IS" : "NOT",
00778               lang_word.best_choice->unichar_string().string(),
00779               lang_word.best_choice->rating(),
00780               lang_word.best_choice->certainty());
00781     }
00782   }
00783   if (new_is_better) {
00784     word->ConsumeWordResults(&lang_word);
00785   }
00786   return new_is_better;
00787 }
00788
00789 // Generic function for classifying a word. Can be used either for pass1 or
00790 // pass2 according to the function passed to recognizer.
00791 // word block and row are the current location in the document's PAGE_RES.
00792 // Recognizes in the current language, and if successful that is all.
00793 // If recognition was not successful, tries all available languages until
00794 // it gets a successful result or runs out of languages. Keeps the best result.
00795 void Tesseract::classify_word_and_language(WordRecognizer recognizer,
00796                                            BLOCK* block,
00797                                            ROW *row,
00798                                            WERD_RES *word) {
00799   if (classify_debug_level || cube_debug_level) {
00800     tprintf("Processing word with lang %s at:",
00801             most_recently_used_->lang.string());
00802     word->word->bounding_box().print();
00803   }
00804   const char* result_type = "Initial";
00805   bool initially_done = !word->tess_failed && word->done;
00806   if (initially_done) {
00807     // If done on pass1, we reuse the tesseract that did it, and don't try
00808     // any more. The only need to call the classifier at all is for the
00809     // cube combiner and xheight fixing (which may be bogus on a done word.)
00810     most_recently_used_ = word->tesseract;
00811     result_type = "Already done";
00812   }
00813   (most_recently_used_->*recognizer)(block, row, word);
00814   if (!word->tess_failed && word->tess_accepted)
00815     result_type = "Accepted";
00816   if (classify_debug_level || cube_debug_level) {
00817     tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
00818             result_type,
00819             word->best_choice->unichar_string().string(),
00820             word->best_choice->rating(),
00821             word->best_choice->certainty(),
00822             word->tess_accepted, word->tess_would_adapt);
00823   }
00824   if (word->tess_failed || !word->tess_accepted) {
00825     // Try all the other languages to see if they are any better.
00826     Tesseract* previous_used = most_recently_used_;
00827     if (most_recently_used_ != this) {
00828       if (classify_debug_level) {
00829         tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
00830       }
00831       if (RetryWithLanguage(word, block, row, recognizer)) {
00832         most_recently_used_ = this;
00833         if (!word->tess_failed && word->tess_accepted)
00834           return;  // No need to look at the others.
00835       }
00836     }
00837
00838     for (int i = 0; i < sub_langs_.size(); ++i) {
00839       if (sub_langs_[i] != previous_used) {
00840         if (classify_debug_level) {
00841           tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
00842                   i, sub_langs_[i]->lang.string());
00843         }
00844         if (sub_langs_[i]->RetryWithLanguage(word, block, row, recognizer)) {
00845           most_recently_used_ = sub_langs_[i];
00846           if (!word->tess_failed && word->tess_accepted)
00847             return;  // No need to look at the others.
00848         }
00849       }
00850     }
00851   }
00852 }
00853
00860 void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
00861   // If we only intend to run cube - run it and return.
00862   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
00863     cube_word_pass1(block, row, word);
00864     return;
00865   }
00866
00867   BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
00868   BOOL8 adapt_ok;
00869   const char *rejmap;
00870   inT16 index;
00871   STRING mapstr = "";
00872
00873   check_debug_pt(word, 0);
00874   if (word->SetupForTessRecognition(unicharset, this, BestPix(),
00875                                     classify_bln_numeric_mode,
00876                                     this->textord_use_cjk_fp_model,
00877                                     row, block))
00878     tess_segment_pass1(word, blob_choices);
00879   if (!word->tess_failed) {
00880     /*
00881        The adaption step used to be here. It has been moved to after
00882        make_reject_map so that we know whether the word will be accepted in the
00883        first pass or not.   This move will PREVENT adaption to words containing
00884        double quotes because the word will not be identical to what tess thinks
00885        its best choice is. (See CurrentBestChoiceIs in
00886        stopper.cpp which is used by AdaptableWord in
00887        adaptmatch.cpp)
00888      */
00889
00890     if (!word->word->flag(W_REP_CHAR)) {
00891       // TODO(daria) delete these hacks when replaced by more generic code.
00892       // Convert '' (double single) to " (single double).
00893       word->fix_quotes(blob_choices);
00894       if (tessedit_fix_hyphens)  // turn -- to -
00895         word->fix_hyphens(blob_choices);
00896
00897       word->tess_accepted = tess_acceptable_word(word->best_choice,
00898                                                  word->raw_choice);
00899
00900       word->tess_would_adapt = word->best_choice && word->raw_choice &&
00901           AdaptableWord(word->rebuild_word,
00902                         *word->best_choice,
00903                         *word->raw_choice);
00904                                  // Also sets word->done flag
00905       make_reject_map(word, blob_choices, row, 1);
00906
00907       adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
00908
00909       if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
00910         if (!tessedit_tess_adapt_to_rejmap) {
00911           rejmap = NULL;
00912         } else {
00913           ASSERT_HOST(word->reject_map.length() ==
00914                       word->best_choice->length());
00915
00916           for (index = 0; index < word->reject_map.length(); index++) {
00917             if (adapt_ok || word->reject_map[index].accepted())
00918               mapstr += '1';
00919             else
00920               mapstr += '0';
00921           }
00922           rejmap = mapstr.string();
00923         }
00924         // Send word to adaptive classifier for training.
00925         word->BestChoiceToCorrectText();
00926         set_word_fonts(word, blob_choices);
00927         LearnWord(NULL, rejmap, word);
00928         // Mark misadaptions if running blamer.
00929         if (word->blamer_bundle != NULL &&
00930             word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
00931             !ChoiceIsCorrect(*word->uch_set, word->best_choice,
00932                              word->blamer_bundle->truth_text)) {
00933           word->blamer_bundle->misadaption_debug ="misadapt to word (";
00934           word->blamer_bundle->misadaption_debug +=
00935               word->best_choice->permuter_name();
00936           word->blamer_bundle->misadaption_debug += "): ";
00937           word->blamer_bundle->FillDebugString(
00938               "", word->best_choice, &(word->blamer_bundle->misadaption_debug));
00939           if (wordrec_debug_blamer) {
00940             tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
00941           }
00942         }
00943       }
00944
00945       if (tessedit_enable_doc_dict)
00946         tess_add_doc_word(word->best_choice);
00947     }
00948   }
00949
00950   // Save best choices in the WERD_CHOICE if needed
00951   word->best_choice->set_blob_choices(blob_choices);
00952 }
00953
00954 // Helper to report the result of the xheight fix.
00955 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
00956                                    WERD_RES* word, WERD_RES* new_word) {
00957   tprintf("New XHT Match:%s = %s ",
00958           word->best_choice->unichar_string().string(),
00959           word->best_choice->debug_string().string());
00960   word->reject_map.print(debug_fp);
00961   tprintf(" -> %s = %s ",
00962           new_word->best_choice->unichar_string().string(),
00963           new_word->best_choice->debug_string().string());
00964   new_word->reject_map.print(debug_fp);
00965   tprintf(" %s->%s %s %s\n",
00966           word->guessed_x_ht ? "GUESS" : "CERT",
00967           new_word->guessed_x_ht ? "GUESS" : "CERT",
00968           new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
00969           accept_new_word ? "ACCEPTED" : "");
00970 }
00971
00972 // Run the x-height fix-up, based on min/max top/bottom information in
00973 // unicharset.
00974 // Returns true if the word was changed.
00975 // See the comment in fixxht.cpp for a description of the overall process.
00976 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
00977   bool accept_new_x_ht = false;
00978   int original_misfits = CountMisfitTops(word);
00979   if (original_misfits == 0)
00980     return false;
00981   float new_x_ht = ComputeCompatibleXheight(word);
00982   if (new_x_ht > 0.0f) {
00983     WERD_RES new_x_ht_word(word->word);
00984     if (word->blamer_bundle != NULL) {
00985       new_x_ht_word.blamer_bundle = new BlamerBundle();
00986       new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
00987     }
00988     new_x_ht_word.x_height = new_x_ht;
00989     new_x_ht_word.caps_height = 0.0;
00990     match_word_pass2(&new_x_ht_word, row, block);
00991     if (!new_x_ht_word.tess_failed) {
00992       int new_misfits = CountMisfitTops(&new_x_ht_word);
00993       if (debug_x_ht_level >= 1) {
00994         tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
00995                 original_misfits, word->x_height,
00996                 new_misfits, new_x_ht);
00997         tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
00998                 word->best_choice->rating(), word->best_choice->certainty(),
00999                 new_x_ht_word.best_choice->rating(),
01000                 new_x_ht_word.best_choice->certainty());
01001       }
01002       // The misfits must improve and either the rating or certainty.
01003       accept_new_x_ht = new_misfits < original_misfits &&
01004                         (new_x_ht_word.best_choice->certainty() >
01005                             word->best_choice->certainty() ||
01006                          new_x_ht_word.best_choice->rating() <
01007                             word->best_choice->rating());
01008       if (debug_x_ht_level >= 1) {
01009         ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
01010       }
01011     }
01012     if (accept_new_x_ht) {
01013       word->ConsumeWordResults(&new_x_ht_word);
01014       return true;
01015     }
01016   }
01017   return false;
01018 }
01019
01026 void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
01027   // Return if we do not want to run Tesseract.
01028   if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
01029       tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
01030     return;
01031
01032   bool done_this_pass = false;
01033   set_global_subloc_code(SUBLOC_NORM);
01034   check_debug_pt(word, 30);
01035   if (!word->done || tessedit_training_tess) {
01036     word->caps_height = 0.0;
01037     if (word->x_height == 0.0f)
01038       word->x_height = row->x_height();
01039     match_word_pass2(word, row, block);
01040     done_this_pass = TRUE;
01041     check_debug_pt(word, 40);
01042   }
01043
01044   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
01045     bool accept_new_xht = false;
01046     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight()) {
01047       // Use the tops and bottoms since they are available.
01048       accept_new_xht = TrainedXheightFix(word, block, row);
01049     }
01050     if (accept_new_xht)
01051       done_this_pass = true;
01052     // Test for small caps. Word capheight must be close to block xheight,
01053     // and word must contain no lower case letters, and at least one upper case.
01054     double small_cap_xheight = block->x_height() * kXHeightCapRatio;
01055     double small_cap_delta = (block->x_height() - small_cap_xheight) / 2.0;
01056     if (unicharset.script_has_xheight() &&
01057         small_cap_xheight - small_cap_delta <= word->x_height &&
01058         word->x_height <= small_cap_xheight + small_cap_delta) {
01059       // Scan for upper/lower.
01060       int num_upper = 0;
01061       int num_lower = 0;
01062       for (int i = 0; i < word->best_choice->length(); ++i) {
01063         if (unicharset.get_isupper(word->best_choice->unichar_id(i)))
01064           ++num_upper;
01065         else if (unicharset.get_islower(word->best_choice->unichar_id(i)))
01066           ++num_lower;
01067       }
01068       if (num_upper > 0 && num_lower == 0)
01069         word->small_caps = true;
01070     }
01071     word->SetScriptPositions();
01072
01073     set_global_subloc_code(SUBLOC_NORM);
01074   }
01075 #ifndef GRAPHICS_DISABLED
01076   if (tessedit_display_outwords) {
01077     if (fx_win == NULL)
01078       create_fx_win();
01079     clear_fx_win();
01080     word->rebuild_word->plot(fx_win);
01081     TBOX wbox = word->rebuild_word->bounding_box();
01082     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
01083                             wbox.right(), wbox.bottom());
01084     ScrollView::Update();
01085   }
01086 #endif
01087   set_global_subloc_code(SUBLOC_NORM);
01088   check_debug_pt(word, 50);
01089 }
01090
01091
01098 void Tesseract::match_word_pass2(WERD_RES *word,  //word to do
01099                                  ROW *row,
01100                                  BLOCK* block) {
01101   BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
01102
01103   if (word->SetupForTessRecognition(unicharset, this, BestPix(),
01104                                     classify_bln_numeric_mode,
01105                                     this->textord_use_cjk_fp_model,
01106                                     row, block))
01107     tess_segment_pass2(word, blob_choices);
01108
01109   if (!word->tess_failed) {
01110     if (!word->word->flag (W_REP_CHAR)) {
01111       word->fix_quotes(blob_choices);
01112       if (tessedit_fix_hyphens)
01113         word->fix_hyphens(blob_choices);
01114       /* Dont trust fix_quotes! - though I think I've fixed the bug */
01115       if (word->best_choice->length() != word->box_word->length() ||
01116           word->best_choice->length() != blob_choices->length()) {
01117         tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
01118                 " #Blobs=%d; #Choices=%d\n",
01119                 word->best_choice->debug_string().string(),
01120                 word->best_choice->length(),
01121                 word->box_word->length(), blob_choices->length());
01122
01123       }
01124       word->tess_accepted = tess_acceptable_word(word->best_choice,
01125                                                  word->raw_choice);
01126
01127       make_reject_map (word, blob_choices, row, 2);
01128     }
01129   }
01130
01131   // Save best choices in the WERD_CHOICE if needed
01132   word->best_choice->set_blob_choices(blob_choices);
01133   set_word_fonts(word, blob_choices);
01134
01135   assert (word->raw_choice != NULL);
01136 }
01137
01138 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
01139 // unichar_id, or NULL if there is no match.
01140 static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
01141                                        BLOB_CHOICE_LIST* bc_list) {
01142   // Find the corresponding best BLOB_CHOICE.
01143   BLOB_CHOICE_IT choice_it(bc_list);
01144   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
01145        choice_it.forward()) {
01146     BLOB_CHOICE* choice = choice_it.data();
01147     if (choice->unichar_id() == char_id) {
01148       return choice;
01149     }
01150   }
01151   return NULL;
01152 }
01153
01154 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
01155 // the given char_id, or NULL if none can be found.
01156 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
01157                                            WERD_RES* word_res) {
01158   // Find the corresponding best BLOB_CHOICE from any position in the word_res.
01159   BLOB_CHOICE* best_choice = NULL;
01160   BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices());
01161   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
01162     BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
01163     if (choice != NULL) {
01164       if (best_choice == NULL || choice->rating() < best_choice->rating())
01165         best_choice = choice;
01166     }
01167   }
01168   return best_choice;
01169 }
01170
01171 // Helper to insert blob_choice in each location in the leader word if there is
01172 // no matching BLOB_CHOICE there already, and correct any incorrect results
01173 // in the best_choice.
01174 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
01175                                   WERD_RES* word_res) {
01176   WERD_CHOICE* word = word_res->best_choice;
01177   BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices());
01178   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
01179     BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
01180                                              bc_it.data());
01181     if (choice == NULL) {
01182       BLOB_CHOICE_IT choice_it(bc_it.data());
01183       choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
01184     }
01185   }
01186   // Correct any incorrect results in word.
01187   for (int i = 0; i < word->length(); ++i) {
01188     if (word->unichar_id(i) != blob_choice->unichar_id())
01189       word->set_unichar_id(blob_choice->unichar_id(), i);
01190   }
01191 }
01192
01200 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
01201   WERD_RES *word_res = page_res_it->word();
01202   const WERD_CHOICE &word = *(word_res->best_choice);
01203
01204   // Find the frequency of each unique character in the word.
01205   UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
01206   SortHelper<UNICHAR_ID> rep_ch(word.length());
01207   for (int i = 0; i < word.length(); ++i) {
01208     if (word.unichar_id(i) != space)
01209       rep_ch.Add(word.unichar_id(i), 1);
01210   }
01211
01212   // Find the most frequent result.
01213   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
01214   int max_count = rep_ch.MaxCount(&maxch_id);
01215   // Find the best exemplar of a classifier result for maxch_id.
01216   BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
01217   if (best_choice == NULL) {
01218     tprintf("Failed to find a choice for %s, occurring %d times\n",
01219             word_res->uch_set->debug_str(maxch_id).string(), max_count);
01220     return;
01221   }
01222   word_res->done = TRUE;
01223
01224   // Measure the mean space.
01225   int total_gap = 0;
01226   int gap_count = 0;
01227   WERD* werd = word_res->word;
01228   C_BLOB_IT blob_it(werd->cblob_list());
01229   C_BLOB* prev_blob = blob_it.data();
01230   for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01231     C_BLOB* blob = blob_it.data();
01232     int gap = blob->bounding_box().left();
01233     gap -= prev_blob->bounding_box().right();
01234     total_gap += gap;
01235     ++gap_count;
01236     prev_blob = blob;
01237   }
01238   if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
01239     // Needs spaces between.
01240     ExplodeRepeatedWord(best_choice, page_res_it);
01241   } else {
01242     // Just correct existing classification.
01243     CorrectRepcharChoices(best_choice, word_res);
01244     word_res->reject_map.initialise(word.length());
01245   }
01246 }
01247
01248 // Explode the word at the given iterator location into individual words
01249 // of a single given unichar_id defined by best_choice.
01250 // The original word is deleted, and the replacements copy most of their
01251 // fields from the original.
01252 void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
01253                                     PAGE_RES_IT* page_res_it) {
01254   WERD_RES *word_res = page_res_it->word();
01255   ASSERT_HOST(best_choice != NULL);
01256
01257   // Make a new word for each blob in the original.
01258   WERD* werd = word_res->word;
01259   C_BLOB_IT blob_it(werd->cblob_list());
01260   for (; !blob_it.empty(); blob_it.forward()) {
01261     bool first_blob = blob_it.at_first();
01262     bool last_blob = blob_it.at_last();
01263     WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
01264                                                     blob_it.extract());
01265     // Note that blamer_bundle (truth information) is not copied, which is
01266     // desirable, since the newly inserted words would not have the original
01267     // bounding box corresponding to the one recorded in truth fields.
01268     WERD_RES* rep_word =
01269         page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
01270     // Setup the single char WERD_RES
01271     if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
01272                                           false,
01273                                           this->textord_use_cjk_fp_model,
01274                                           page_res_it->row()->row,
01275                                           page_res_it->block()->block)) {
01276       rep_word->CloneChoppedToRebuild();
01277       BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
01278       rep_word->FakeClassifyWord(1, &blob_choice);
01279     }
01280   }
01281   page_res_it->DeleteCurrentWord();
01282 }
01283
01284 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
01285     const UNICHARSET& char_set, const char *s, const char *lengths) {
01286   int i = 0;
01287   int offset = 0;
01288   int leading_punct_count;
01289   int upper_count = 0;
01290   int hyphen_pos = -1;
01291   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01292
01293   if (strlen (lengths) > 20)
01294     return word_type;
01295
01296   /* Single Leading punctuation char*/
01297
01298   if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
01299     offset += lengths[i++];
01300   leading_punct_count = i;
01301
01302   /* Initial cap */
01303   while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
01304     offset += lengths[i++];
01305     upper_count++;
01306   }
01307   if (upper_count > 1) {
01308     word_type = AC_UPPER_CASE;
01309   } else {
01310     /* Lower case word, possibly with an initial cap */
01311     while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
01312       offset += lengths[i++];
01313     }
01314     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01315       goto not_a_word;
01316     /*
01317     Allow a single hyphen in a lower case word
01318     - dont trust upper case - I've seen several cases of "H" -> "I-I"
01319     */
01320     if (lengths[i] == 1 && s[offset] == '-') {
01321       hyphen_pos = i;
01322       offset += lengths[i++];
01323       if (s[offset] != '\0') {
01324         while ((s[offset] != '\0') &&
01325                char_set.get_islower(s + offset, lengths[i])) {
01326           offset += lengths[i++];
01327         }
01328         if (i < hyphen_pos + 3)
01329           goto not_a_word;
01330       }
01331     } else {
01332       /* Allow "'s" in NON hyphenated lower case words */
01333       if (lengths[i] == 1 && (s[offset] == '\'') &&
01334           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
01335         offset += lengths[i++];
01336         offset += lengths[i++];
01337       }
01338     }
01339     if (upper_count > 0)
01340       word_type = AC_INITIAL_CAP;
01341     else
01342       word_type = AC_LOWER_CASE;
01343   }
01344
01345   /* Up to two different, constrained trailing punctuation chars */
01346   if (lengths[i] == 1 && s[offset] != '\0' &&
01347       STRING(chs_trailing_punct1).contains(s[offset]))
01348     offset += lengths[i++];
01349   if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
01350       s[offset - lengths[i - 1]] != s[offset] &&
01351       STRING(chs_trailing_punct2).contains (s[offset]))
01352     offset += lengths[i++];
01353
01354   if (s[offset] != '\0')
01355     word_type = AC_UNACCEPTABLE;
01356
01357   not_a_word:
01358
01359   if (word_type == AC_UNACCEPTABLE) {
01360     /* Look for abbreviation string */
01361     i = 0;
01362     offset = 0;
01363     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
01364       word_type = AC_UC_ABBREV;
01365       while (s[offset] != '\0' &&
01366              char_set.get_isupper(s + offset, lengths[i]) &&
01367              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01368         offset += lengths[i++];
01369         offset += lengths[i++];
01370       }
01371     }
01372     else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
01373       word_type = AC_LC_ABBREV;
01374       while (s[offset] != '\0' &&
01375              char_set.get_islower(s + offset, lengths[i]) &&
01376              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01377         offset += lengths[i++];
01378         offset += lengths[i++];
01379       }
01380     }
01381     if (s[offset] != '\0')
01382       word_type = AC_UNACCEPTABLE;
01383   }
01384
01385   return word_type;
01386 }
01387
01388 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
01389   BOOL8 show_map_detail = FALSE;
01390   inT16 i;
01391
01392   #ifndef SECURE_NAMES
01393   if (!test_pt)
01394     return FALSE;
01395
01396   tessedit_rejection_debug.set_value (FALSE);
01397   debug_x_ht_level.set_value (0);
01398
01399   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01400     if (location < 0)
01401       return TRUE;               // For breakpoint use
01402     tessedit_rejection_debug.set_value (TRUE);
01403     debug_x_ht_level.set_value (20);
01404     tprintf ("\n\nTESTWD::");
01405     switch (location) {
01406       case 0:
01407         tprintf ("classify_word_pass1 start\n");
01408         word->word->print();
01409         break;
01410       case 10:
01411         tprintf ("make_reject_map: initial map");
01412         break;
01413       case 20:
01414         tprintf ("make_reject_map: after NN");
01415         break;
01416       case 30:
01417         tprintf ("classify_word_pass2 - START");
01418         break;
01419       case 40:
01420         tprintf ("classify_word_pass2 - Pre Xht");
01421         break;
01422       case 50:
01423         tprintf ("classify_word_pass2 - END");
01424         show_map_detail = TRUE;
01425         break;
01426       case 60:
01427         tprintf ("fixspace");
01428         break;
01429       case 70:
01430         tprintf ("MM pass START");
01431         break;
01432       case 80:
01433         tprintf ("MM pass END");
01434         break;
01435       case 90:
01436         tprintf ("After Poor quality rejection");
01437         break;
01438       case 100:
01439         tprintf ("unrej_good_quality_words - START");
01440         break;
01441       case 110:
01442         tprintf ("unrej_good_quality_words - END");
01443         break;
01444       case 120:
01445         tprintf ("Write results pass");
01446         show_map_detail = TRUE;
01447         break;
01448     }
01449     tprintf(" \"%s\" ",
01450             word->best_choice->unichar_string().string());
01451     word->reject_map.print (debug_fp);
01452     tprintf ("\n");
01453     if (show_map_detail) {
01454       tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
01455       for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
01456         tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
01457         word->reject_map[i].full_print(debug_fp);
01458       }
01459     }
01460     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01461     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01462     return TRUE;
01463   }
01464   else
01465   #endif
01466     return FALSE;
01467 }
01468
01474 static void find_modal_font(           //good chars in word
01475                      STATS *fonts,     //font stats
01476                      inT16 *font_out,   //output font
01477                      inT8 *font_count  //output count
01478                     ) {
01479   inT16 font;                     //font index
01480   inT32 count;                   //pile couat
01481
01482   if (fonts->get_total () > 0) {
01483     font = (inT16) fonts->mode ();
01484     *font_out = font;
01485     count = fonts->pile_count (font);
01486     *font_count = count < MAX_INT8 ? count : MAX_INT8;
01487     fonts->add (font, -*font_count);
01488   }
01489   else {
01490     *font_out = -1;
01491     *font_count = 0;
01492   }
01493 }
01494
01500 void Tesseract::set_word_fonts(WERD_RES *word,
01501                                BLOB_CHOICE_LIST_CLIST *blob_choices) {
01502   if (blob_choices == NULL) return;
01503   // Don't try to set the word fonts for a cube word, as the configs
01504   // will be meaningless.
01505   if (word->chopped_word == NULL) return;
01506
01507   inT32 index;                   // char id index
01508                                  // character iterator
01509   BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
01510   BLOB_CHOICE_IT choice_it;      // choice iterator
01511   int fontinfo_size = get_fontinfo_table().size();
01512   int fontset_size = get_fontset_table().size();
01513   if (fontinfo_size == 0 || fontset_size == 0) return;
01514   STATS fonts(0, fontinfo_size);  // font counters
01515
01516   word->italic = 0;
01517   word->bold = 0;
01518   if (!word->best_choice_fontinfo_ids.empty()) {
01519     word->best_choice_fontinfo_ids.clear();
01520   }
01521   // Compute the modal font for the word
01522   for (char_it.mark_cycle_pt(), index = 0;
01523        !char_it.cycled_list(); ++index, char_it.forward()) {
01524     UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
01525     choice_it.set_to_list(char_it.data());
01526     if (tessedit_debug_fonts) {
01527       tprintf("Examining fonts in %s\n",
01528               word->best_choice->debug_string().string());
01529     }
01530     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
01531          choice_it.forward()) {
01532       UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
01533       if (blob_ch_id == word_ch_id) {
01534         if (tessedit_debug_fonts) {
01535           tprintf("%s font %s (%d) font2 %s (%d)\n",
01536                   word->uch_set->id_to_unichar(blob_ch_id),
01537                   choice_it.data()->fontinfo_id() < 0 ? "unknown" :
01538                   fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
01539                   choice_it.data()->fontinfo_id(),
01540                   choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
01541                   fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
01542                   choice_it.data()->fontinfo_id2());
01543         }
01544         // 1st choice font gets 2 pts, 2nd choice 1 pt.
01545         if (choice_it.data()->fontinfo_id() >= 0) {
01546           fonts.add(choice_it.data()->fontinfo_id(), 2);
01547         }
01548         if (choice_it.data()->fontinfo_id2() >= 0) {
01549           fonts.add(choice_it.data()->fontinfo_id2(), 1);
01550         }
01551         break;
01552       }
01553     }
01554   }
01555   inT16 font_id1, font_id2;
01556   find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
01557   find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
01558   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
01559   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
01560   // All the blobs get the word's best choice font.
01561   for (int i = 0; i < word->best_choice->length(); ++i) {
01562     word->best_choice_fontinfo_ids.push_back(font_id1);
01563   }
01564   if (word->fontinfo_id_count > 0) {
01565     FontInfo fi = fontinfo_table_.get(font_id1);
01566     if (tessedit_debug_fonts) {
01567       if (word->fontinfo_id2_count > 0) {
01568         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
01569                 fi.name, word->fontinfo_id_count,
01570                 fontinfo_table_.get(font_id2).name,
01571                 word->fontinfo_id2_count);
01572       } else {
01573         tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
01574                 fi.name, word->fontinfo_id_count);
01575       }
01576     }
01577     // 1st choices got 2 pts, so we need to halve the score for the mode.
01578     word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
01579     word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
01580   }
01581 }
01582
01583
01590 void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
01591   PAGE_RES_IT page_res_it(page_res);
01592   WERD_RES *word;                // current word
01593   STATS doc_fonts(0, font_table_size_);           // font counters
01594
01595   // Gather font id statistics.
01596   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01597        page_res_it.forward()) {
01598     word = page_res_it.word();
01599     if (word->fontinfo != NULL) {
01600       doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
01601     }
01602     if (word->fontinfo2 != NULL) {
01603       doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
01604     }
01605   }
01606   inT16 doc_font;                 // modal font
01607   inT8 doc_font_count;           // modal font
01608   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
01609   if (doc_font_count == 0)
01610     return;
01611   // Get the modal font pointer.
01612   const FontInfo* modal_font = NULL;
01613   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01614        page_res_it.forward()) {
01615     word = page_res_it.word();
01616     if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
01617       modal_font = word->fontinfo;
01618       break;
01619     }
01620     if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
01621       modal_font = word->fontinfo2;
01622       break;
01623     }
01624   }
01625   ASSERT_HOST(modal_font != NULL);
01626
01627   // Assign modal font to weak words.
01628   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01629        page_res_it.forward()) {
01630     word = page_res_it.word();
01631     int length = word->best_choice->length();
01632
01633     // 1st choices got 2 pts, so we need to halve the score for the mode.
01634     int count = (word->fontinfo_id_count + 1) / 2;
01635     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
01636       word->fontinfo = modal_font;
01637       // Counts only get 1 as it came from the doc.
01638       word->fontinfo_id_count = 1;
01639       word->italic = modal_font->is_italic() ? 1 : -1;
01640       word->bold = modal_font->is_bold() ? 1 : -1;
01641     }
01642   }
01643 }
01644
01645 }  // namespace tesseract