Tesseract
3.02
|
00001 00002 // File: recogtraining.cpp 00003 // Description: Functions for ambiguity and parameter training. 00004 // Author: Daria Antonova 00005 // Created: Mon Aug 13 11:26:43 PDT 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "tesseractclass.h" 00021 00022 #include "boxread.h" 00023 #include "control.h" 00024 #include "cutil.h" 00025 #include "host.h" 00026 #include "permute.h" 00027 #include "ratngs.h" 00028 #include "reject.h" 00029 #include "stopper.h" 00030 00031 namespace tesseract { 00032 00033 const inT16 kMaxBoxEdgeDiff = 2; 00034 00035 // Sets flags necessary for recognition in the training mode. 00036 // Opens and returns the pointer to the output file. 00037 FILE *Tesseract::init_recog_training(const STRING &fname) { 00038 if (tessedit_ambigs_training) { 00039 tessedit_tess_adaption_mode.set_value(0); // turn off adaption 00040 tessedit_enable_doc_dict.set_value(0); // turn off document dictionary 00041 save_blob_choices.set_value(1); // save individual char choices 00042 getDict().save_raw_choices.set_value(1); // save raw choices 00043 getDict().permute_only_top.set_value(true); // use only top choice permuter 00044 tessedit_ok_mode.set_value(0); // turn off context checking 00045 // Explore all segmentations. 00046 getDict().stopper_no_acceptable_choices.set_value(1); 00047 } 00048 00049 STRING output_fname = fname; 00050 const char *lastdot = strrchr(output_fname.string(), '.'); 00051 if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0'; 00052 output_fname += ".txt"; 00053 FILE *output_file = open_file(output_fname.string(), "a+"); 00054 return output_file; 00055 } 00056 00057 // Copies the bounding box from page_res_it->word() to the given TBOX. 00058 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { 00059 while (page_res_it->block() != NULL) { 00060 if (page_res_it->word() != NULL) 00061 break; 00062 page_res_it->forward(); 00063 } 00064 00065 if (page_res_it->word() != NULL) { 00066 *tbox = page_res_it->word()->word->bounding_box(); 00067 page_res_it->forward(); 00068 00069 // If tbox->left() is negative, the training image has vertical text and 00070 // all the coordinates of bounding boxes of page_res are rotated by 90 00071 // degrees in a counterclockwise direction. We need to rotate the TBOX back 00072 // in order to compare with the TBOXes of box files. 00073 if (tbox->left() < 0) { 00074 tbox->rotate(FCOORD(0.0, -1.0)); 00075 } 00076 00077 return true; 00078 } else { 00079 return false; 00080 } 00081 } 00082 00083 // This function takes tif/box pair of files and runs recognition on the image, 00084 // while making sure that the word bounds that tesseract identified roughly 00085 // match to those specified by the input box file. For each word (ngram in a 00086 // single bounding box from the input box file) it outputs the ocred result, 00087 // the correct label, rating and certainty. 00088 void Tesseract::recog_training_segmented(const STRING &fname, 00089 PAGE_RES *page_res, 00090 volatile ETEXT_DESC *monitor, 00091 FILE *output_file) { 00092 STRING box_fname = fname; 00093 const char *lastdot = strrchr(box_fname.string(), '.'); 00094 if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0'; 00095 box_fname += ".box"; 00096 // read_next_box() will close box_file 00097 FILE *box_file = open_file(box_fname.string(), "r"); 00098 00099 PAGE_RES_IT page_res_it; 00100 page_res_it.page_res = page_res; 00101 page_res_it.restart_page(); 00102 STRING label; 00103 00104 // Process all the words on this page. 00105 TBOX tbox; // tesseract-identified box 00106 TBOX bbox; // box from the box file 00107 bool keep_going; 00108 int line_number = 0; 00109 int examined_words = 0; 00110 do { 00111 keep_going = read_t(&page_res_it, &tbox); 00112 keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, 00113 &bbox); 00114 // Align bottom left points of the TBOXes. 00115 while (keep_going && 00116 !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { 00117 keep_going = (bbox.bottom() < tbox.bottom()) ? 00118 read_t(&page_res_it, &tbox) : 00119 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); 00120 } 00121 while (keep_going && 00122 !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { 00123 keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) : 00124 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); 00125 } 00126 // OCR the word if top right points of the TBOXes are similar. 00127 if (keep_going && 00128 NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && 00129 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { 00130 ambigs_classify_and_output(page_res_it.prev_word(), 00131 page_res_it.prev_row(), 00132 page_res_it.prev_block(), 00133 label.string(), output_file); 00134 examined_words++; 00135 } 00136 } while (keep_going); 00137 00138 // Set up scripts on all of the words that did not get sent to 00139 // ambigs_classify_and_output. They all should have, but if all the 00140 // werd_res's don't get uch_sets, tesseract will crash when you try 00141 // to iterate over them. :-( 00142 int total_words = 0; 00143 for (page_res_it.restart_page(); page_res_it.block() != NULL; 00144 page_res_it.forward()) { 00145 if (page_res_it.word()) { 00146 if (page_res_it.word()->uch_set == NULL) 00147 page_res_it.word()->SetupFake(unicharset); 00148 total_words++; 00149 } 00150 } 00151 if (examined_words < 0.85 * total_words) { 00152 tprintf("TODO(antonova): clean up recog_training_segmented; " 00153 " It examined only a small fraction of the ambigs image.\n"); 00154 } 00155 tprintf("recog_training_segmented: examined %d / %d words.\n", 00156 examined_words, total_words); 00157 } 00158 00159 // Runs classify_word_pass1() on the current word. Outputs Tesseract's 00160 // raw choice as a result of the classification. For words labeled with a 00161 // single unichar also outputs all alternatives from blob_choices of the 00162 // best choice. 00163 void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, 00164 ROW_RES *row_res, 00165 BLOCK_RES *block_res, 00166 const char *label, 00167 FILE *output_file) { 00168 int offset; 00169 // Classify word. 00170 fflush(stdout); 00171 classify_word_pass1(block_res->block, row_res->row, werd_res); 00172 WERD_CHOICE *best_choice = werd_res->best_choice; 00173 ASSERT_HOST(best_choice != NULL); 00174 ASSERT_HOST(best_choice->blob_choices() != NULL); 00175 00176 // Compute the number of unichars in the label. 00177 int label_num_unichars = 0; 00178 int step = 1; // should be non-zero on the first iteration 00179 for (offset = 0; label[offset] != '\0' && step > 0; 00180 step = werd_res->uch_set->step(label + offset), 00181 offset += step, ++label_num_unichars); 00182 if (step == 0) { 00183 tprintf("Not outputting illegal unichar %s\n", label); 00184 return; 00185 } 00186 00187 // Output all classifier choices for the unigrams (1->1 classifications). 00188 if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) { 00189 BLOB_CHOICE_LIST_C_IT outer_blob_choice_it; 00190 outer_blob_choice_it.set_to_list(best_choice->blob_choices()); 00191 BLOB_CHOICE_IT blob_choice_it; 00192 blob_choice_it.set_to_list(outer_blob_choice_it.data()); 00193 for (blob_choice_it.mark_cycle_pt(); 00194 !blob_choice_it.cycled_list(); 00195 blob_choice_it.forward()) { 00196 BLOB_CHOICE *blob_choice = blob_choice_it.data(); 00197 if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) { 00198 fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n", 00199 unicharset.id_to_unichar(blob_choice->unichar_id()), 00200 label, blob_choice->rating(), blob_choice->certainty()); 00201 } 00202 } 00203 } 00204 // Output raw choices for many->many and 1->many classifications. 00205 getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars); 00206 } 00207 00208 } // namespace tesseract