tesseract-doc/recogtraining_8cpp_source.html

00001
00002 // File:        recogtraining.cpp
00003 // Description: Functions for ambiguity and parameter training.
00004 // Author:      Daria Antonova
00005 // Created:     Mon Aug 13 11:26:43 PDT 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019
00020 #include "tesseractclass.h"
00021
00022 #include "boxread.h"
00023 #include "control.h"
00024 #include "cutil.h"
00025 #include "host.h"
00026 #include "permute.h"
00027 #include "ratngs.h"
00028 #include "reject.h"
00029 #include "stopper.h"
00030
00031 namespace tesseract {
00032
00033 const inT16 kMaxBoxEdgeDiff = 2;
00034
00035 // Sets flags necessary for recognition in the training mode.
00036 // Opens and returns the pointer to the output file.
00037 FILE *Tesseract::init_recog_training(const STRING &fname) {
00038   if (tessedit_ambigs_training) {
00039     tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
00040     tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
00041     save_blob_choices.set_value(1);              // save individual char choices
00042     getDict().save_raw_choices.set_value(1);     // save raw choices
00043     getDict().permute_only_top.set_value(true);  // use only top choice permuter
00044     tessedit_ok_mode.set_value(0);               // turn off context checking
00045     // Explore all segmentations.
00046     getDict().stopper_no_acceptable_choices.set_value(1);
00047   }
00048
00049   STRING output_fname = fname;
00050   const char *lastdot = strrchr(output_fname.string(), '.');
00051   if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
00052   output_fname += ".txt";
00053   FILE *output_file = open_file(output_fname.string(), "a+");
00054   return output_file;
00055 }
00056
00057 // Copies the bounding box from page_res_it->word() to the given TBOX.
00058 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
00059   while (page_res_it->block() != NULL) {
00060     if (page_res_it->word() != NULL)
00061       break;
00062     page_res_it->forward();
00063   }
00064
00065   if (page_res_it->word() != NULL) {
00066     *tbox = page_res_it->word()->word->bounding_box();
00067     page_res_it->forward();
00068
00069     // If tbox->left() is negative, the training image has vertical text and
00070     // all the coordinates of bounding boxes of page_res are rotated by 90
00071     // degrees in a counterclockwise direction. We need to rotate the TBOX back
00072     // in order to compare with the TBOXes of box files.
00073     if (tbox->left() < 0) {
00074       tbox->rotate(FCOORD(0.0, -1.0));
00075     }
00076
00077     return true;
00078   } else {
00079     return false;
00080   }
00081 }
00082
00083 // This function takes tif/box pair of files and runs recognition on the image,
00084 // while making sure that the word bounds that tesseract identified roughly
00085 // match to those specified by the input box file. For each word (ngram in a
00086 // single bounding box from the input box file) it outputs the ocred result,
00087 // the correct label, rating and certainty.
00088 void Tesseract::recog_training_segmented(const STRING &fname,
00089                                          PAGE_RES *page_res,
00090                                          volatile ETEXT_DESC *monitor,
00091                                          FILE *output_file) {
00092   STRING box_fname = fname;
00093   const char *lastdot = strrchr(box_fname.string(), '.');
00094   if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
00095   box_fname += ".box";
00096   // read_next_box() will close box_file
00097   FILE *box_file = open_file(box_fname.string(), "r");
00098
00099   PAGE_RES_IT page_res_it;
00100   page_res_it.page_res = page_res;
00101   page_res_it.restart_page();
00102   STRING label;
00103
00104   // Process all the words on this page.
00105   TBOX tbox;  // tesseract-identified box
00106   TBOX bbox;  // box from the box file
00107   bool keep_going;
00108   int line_number = 0;
00109   int examined_words = 0;
00110   do {
00111     keep_going = read_t(&page_res_it, &tbox);
00112     keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
00113                               &bbox);
00114     // Align bottom left points of the TBOXes.
00115     while (keep_going &&
00116            !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
00117       keep_going = (bbox.bottom() < tbox.bottom()) ?
00118           read_t(&page_res_it, &tbox) :
00119             ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
00120     }
00121     while (keep_going &&
00122            !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
00123       keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
00124           ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
00125     }
00126     // OCR the word if top right points of the TBOXes are similar.
00127     if (keep_going &&
00128         NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
00129         NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
00130         ambigs_classify_and_output(page_res_it.prev_word(),
00131                                    page_res_it.prev_row(),
00132                                    page_res_it.prev_block(),
00133                                    label.string(), output_file);
00134         examined_words++;
00135     }
00136   } while (keep_going);
00137
00138   // Set up scripts on all of the words that did not get sent to
00139   // ambigs_classify_and_output.  They all should have, but if all the
00140   // werd_res's don't get uch_sets, tesseract will crash when you try
00141   // to iterate over them. :-(
00142   int total_words = 0;
00143   for (page_res_it.restart_page(); page_res_it.block() != NULL;
00144        page_res_it.forward()) {
00145     if (page_res_it.word()) {
00146       if (page_res_it.word()->uch_set == NULL)
00147         page_res_it.word()->SetupFake(unicharset);
00148       total_words++;
00149     }
00150   }
00151   if (examined_words < 0.85 * total_words) {
00152     tprintf("TODO(antonova): clean up recog_training_segmented; "
00153             " It examined only a small fraction of the ambigs image.\n");
00154   }
00155   tprintf("recog_training_segmented: examined %d / %d words.\n",
00156           examined_words, total_words);
00157 }
00158
00159 // Runs classify_word_pass1() on the current word. Outputs Tesseract's
00160 // raw choice as a result of the classification. For words labeled with a
00161 // single unichar also outputs all alternatives from blob_choices of the
00162 // best choice.
00163 void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
00164                                            ROW_RES *row_res,
00165                                            BLOCK_RES *block_res,
00166                                            const char *label,
00167                                            FILE *output_file) {
00168   int offset;
00169   // Classify word.
00170   fflush(stdout);
00171   classify_word_pass1(block_res->block, row_res->row, werd_res);
00172   WERD_CHOICE *best_choice = werd_res->best_choice;
00173   ASSERT_HOST(best_choice != NULL);
00174   ASSERT_HOST(best_choice->blob_choices() != NULL);
00175
00176   // Compute the number of unichars in the label.
00177   int label_num_unichars = 0;
00178   int step = 1;  // should be non-zero on the first iteration
00179   for (offset = 0; label[offset] != '\0' && step > 0;
00180        step = werd_res->uch_set->step(label + offset),
00181        offset += step, ++label_num_unichars);
00182   if (step == 0) {
00183     tprintf("Not outputting illegal unichar %s\n", label);
00184     return;
00185   }
00186
00187   // Output all classifier choices for the unigrams (1->1 classifications).
00188   if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
00189     BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
00190     outer_blob_choice_it.set_to_list(best_choice->blob_choices());
00191     BLOB_CHOICE_IT blob_choice_it;
00192     blob_choice_it.set_to_list(outer_blob_choice_it.data());
00193     for (blob_choice_it.mark_cycle_pt();
00194          !blob_choice_it.cycled_list();
00195          blob_choice_it.forward()) {
00196       BLOB_CHOICE *blob_choice = blob_choice_it.data();
00197       if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
00198         fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
00199                unicharset.id_to_unichar(blob_choice->unichar_id()),
00200                label, blob_choice->rating(), blob_choice->certainty());
00201       }
00202     }
00203   }
00204   // Output raw choices for many->many and 1->many classifications.
00205   getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
00206 }
00207
00208 }  // namespace tesseract