Tesseract 3.02
00001 /********************************************************************** 00002 * File: applybox.cpp (Formerly applybox.c) 00003 * Description: Re segment rows according to box file data 00004 * Author: Phil Cheatle 00005 * Created: Wed Nov 24 09:11:23 GMT 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #include "mfcpch.h" 00020 00021 #ifdef _MSC_VER 00022 #pragma warning(disable:4244) // Conversion warnings 00023 #endif 00024 00025 #include <ctype.h> 00026 #include <string.h> 00027 #ifdef __UNIX__ 00028 #include <assert.h> 00029 #include <errno.h> 00030 #endif 00031 #include "allheaders.h" 00032 #include "boxread.h" 00033 #include "chopper.h" 00034 #include "pageres.h" 00035 #include "unichar.h" 00036 #include "unicharset.h" 00037 #include "tesseractclass.h" 00038 #include "genericvector.h" 00039 00040 // Max number of blobs to classify together in FindSegmentation. 00041 const int kMaxGroupSize = 4; 00042 // Max fraction of median allowed as deviation in xheight before switching 00043 // to median. 
00044 const double kMaxXHeightDeviationFraction = 0.125; 00045 00046 /************************************************************************* 00047 * The box file is assumed to contain box definitions, one per line, of the 00048 * following format for blob-level boxes: 00049 * <UTF8 str> <left> <bottom> <right> <top> <page id> 00050 * and for word/line-level boxes: 00051 * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> 00052 * NOTES: 00053 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT. 00054 * 00055 * <page id> is 0-based, and the page number is used for multipage input (tiff). 00056 * 00057 * In the blob-level form, each line represents a recognizable unit, which may 00058 * be several UTF-8 bytes, but there is a bounding box around each recognizable 00059 * unit, and no classifier is needed to train in this mode (bootstrapping.) 00060 * 00061 * In the word/line-level form, the line begins with the literal "WordStr", and 00062 * the bounding box bounds either a whole line or a whole word. The recognizable 00063 * units in the word/line are listed after the # at the end of the line and 00064 * are space delimited, ignoring any original spaces on the line. 00065 * Eg. 00066 * word -> #w o r d 00067 * multi word line -> #m u l t i w o r d l i n e 00068 * The recognizable units must be space-delimited in order to allow multiple 00069 * unicodes to be used for a single recognizable unit, eg Hindi. 00070 * In this mode, the classifier must have been pre-trained with the desired 00071 * character set, or it will not be able to find the character segmentations. 
00072 *************************************************************************/ 00073 00074 namespace tesseract { 00075 00076 static void clear_any_old_text(BLOCK_LIST *block_list) { 00077 BLOCK_IT block_it(block_list); 00078 for (block_it.mark_cycle_pt(); 00079 !block_it.cycled_list(); block_it.forward()) { 00080 ROW_IT row_it(block_it.data()->row_list()); 00081 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00082 WERD_IT word_it(row_it.data()->word_list()); 00083 for (word_it.mark_cycle_pt(); 00084 !word_it.cycled_list(); word_it.forward()) { 00085 word_it.data()->set_text(""); 00086 } 00087 } 00088 } 00089 } 00090 00091 // Applies the box file based on the image name fname, and resegments 00092 // the words in the block_list (page), with: 00093 // blob-mode: one blob per line in the box file, words as input. 00094 // word/line-mode: one blob per space-delimited unit after the #, and one word 00095 // per line in the box file. (See comment above for box file format.) 00096 // If find_segmentation is true, (word/line mode) then the classifier is used 00097 // to re-segment words/lines to match the space-delimited truth string for 00098 // each box. In this case, the input box may be for a word or even a whole 00099 // text line, and the output words will contain multiple blobs corresponding 00100 // to the space-delimited input string. 00101 // With find_segmentation false, no classifier is needed, but the chopper 00102 // can still be used to correctly segment touching characters with the help 00103 // of the input boxes. 00104 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00105 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00106 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00107 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 
00108 // Instead, the correct_text member of WERD_RES is set, and this may be later 00109 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00110 // is not required before calling ApplyBoxTraining. 00111 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, 00112 bool find_segmentation, 00113 BLOCK_LIST *block_list) { 00114 int box_count = 0; 00115 int box_failures = 0; 00116 00117 FILE* box_file = OpenBoxFile(fname); 00118 TBOX box; 00119 GenericVector<TBOX> boxes; 00120 GenericVector<STRING> texts, full_texts; 00121 00122 bool found_box = true; 00123 while (found_box) { 00124 int line_number = 0; // Line number of the box file. 00125 STRING text, full_text; 00126 found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box); 00127 if (found_box) { 00128 ++box_count; 00129 MakeBoxFileStr(text.string(), box, applybox_page, &full_text); 00130 } else { 00131 full_text = ""; 00132 } 00133 boxes.push_back(box); 00134 texts.push_back(text); 00135 full_texts.push_back(full_text); 00136 } 00137 00138 // In word mode, we use the boxes to make a word for each box, but 00139 // in blob mode we use the existing words and maximally chop them first. 00140 PAGE_RES* page_res = find_segmentation ? 00141 NULL : SetupApplyBoxes(boxes, block_list); 00142 clear_any_old_text(block_list); 00143 00144 for (int i = 0; i < boxes.size() - 1; i++) { 00145 bool foundit = false; 00146 if (page_res != NULL) { 00147 if (i == 0) { 00148 foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], 00149 full_texts[i].string()); 00150 } else { 00151 foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], 00152 boxes[i + 1], full_texts[i].string()); 00153 } 00154 } else { 00155 foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], 00156 texts[i].string()); 00157 } 00158 if (!foundit) { 00159 box_failures++; 00160 ReportFailedBox(box_count, boxes[i], texts[i].string(), 00161 "FAILURE! 
Couldn't find a matching blob"); 00162 } 00163 } 00164 00165 if (page_res == NULL) { 00166 // In word/line mode, we now maximally chop all the words and resegment 00167 // them with the classifier. 00168 page_res = SetupApplyBoxes(boxes, block_list); 00169 ReSegmentByClassification(page_res); 00170 } 00171 if (applybox_debug > 0) { 00172 tprintf("APPLY_BOXES:\n"); 00173 tprintf(" Boxes read from boxfile: %6d\n", box_count); 00174 if (box_failures > 0) 00175 tprintf(" Boxes failed resegmentation: %6d\n", box_failures); 00176 } 00177 TidyUp(page_res); 00178 return page_res; 00179 } 00180 00181 // Helper computes median xheight in the image. 00182 static double MedianXHeight(BLOCK_LIST *block_list) { 00183 BLOCK_IT block_it(block_list); 00184 STATS xheights(0, block_it.data()->bounding_box().height()); 00185 for (block_it.mark_cycle_pt(); 00186 !block_it.cycled_list(); block_it.forward()) { 00187 ROW_IT row_it(block_it.data()->row_list()); 00188 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00189 xheights.add(IntCastRounded(row_it.data()->x_height()), 1); 00190 } 00191 } 00192 return xheights.median(); 00193 } 00194 00195 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00196 // All fuzzy spaces are removed, and all the words are maximally chopped. 00197 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00198 BLOCK_LIST *block_list) { 00199 double median_xheight = MedianXHeight(block_list); 00200 double max_deviation = kMaxXHeightDeviationFraction * median_xheight; 00201 // Strip all fuzzy space markers to simplify the PAGE_RES. 
00202 BLOCK_IT b_it(block_list); 00203 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00204 BLOCK* block = b_it.data(); 00205 ROW_IT r_it(block->row_list()); 00206 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00207 ROW* row = r_it.data(); 00208 float diff = fabs(row->x_height() - median_xheight); 00209 if (diff > max_deviation) { 00210 if (applybox_debug) { 00211 tprintf("row xheight=%g, but median xheight = %g\n", 00212 row->x_height(), median_xheight); 00213 } 00214 row->set_x_height(static_cast<float>(median_xheight)); 00215 } 00216 WERD_IT w_it(row->word_list()); 00217 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00218 WERD* word = w_it.data(); 00219 if (word->cblob_list()->empty()) { 00220 delete w_it.extract(); 00221 } else { 00222 word->set_flag(W_FUZZY_SP, false); 00223 word->set_flag(W_FUZZY_NON, false); 00224 } 00225 } 00226 } 00227 } 00228 PAGE_RES* page_res = new PAGE_RES(block_list, NULL); 00229 PAGE_RES_IT pr_it(page_res); 00230 WERD_RES* word_res; 00231 while ((word_res = pr_it.word()) != NULL) { 00232 MaximallyChopWord(boxes, pr_it.block()->block, 00233 pr_it.row()->row, word_res); 00234 pr_it.forward(); 00235 } 00236 return page_res; 00237 } 00238 00239 // Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only 00240 // the top choices. Avoids problems with very long words. 00241 static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, 00242 const UNICHARSET& unicharset, 00243 WERD_CHOICE* word_choice) { 00244 *word_choice = WERD_CHOICE(&unicharset); // clear the word choice. 00245 word_choice->make_bad(); 00246 for (int i = 0; i < char_choices.size(); ++i) { 00247 BLOB_CHOICE_IT it(char_choices[i]); 00248 BLOB_CHOICE* bc = it.data(); 00249 word_choice->append_unichar_id(bc->unichar_id(), 1, 00250 bc->rating(), bc->certainty()); 00251 } 00252 } 00253 00254 // Tests the chopper by exhaustively running chop_one_blob. 
00255 // The word_res will contain filled chopped_word, seam_array, denorm, 00256 // box_word and best_state for the maximally chopped word. 00257 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, 00258 BLOCK* block, ROW* row, 00259 WERD_RES* word_res) { 00260 if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, 00261 this->textord_use_cjk_fp_model, 00262 row, block)) { 00263 word_res->CloneChoppedToRebuild(); 00264 return; 00265 } 00266 if (chop_debug) { 00267 tprintf("Maximally chopping word at:"); 00268 word_res->word->bounding_box().print(); 00269 } 00270 blob_match_table.init_match_table(); 00271 BLOB_CHOICE_LIST *match_result; 00272 BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); 00273 ASSERT_HOST(word_res->chopped_word->blobs != NULL); 00274 float rating = static_cast<float>(MAX_INT8); 00275 for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL; 00276 blob = blob->next) { 00277 // The rating and certainty are not quite arbitrary. Since 00278 // select_blob_to_chop uses the worst certainty to choose, they all have 00279 // to be different, so starting with MAX_INT8, subtract 1/8 for each blob 00280 // in here, and then divide by e each time they are chopped, which 00281 // should guarantee a set of unequal values for the whole tree of blobs 00282 // produced, however much chopping is required. The chops are thus only 00283 // limited by the ability of the chopper to find suitable chop points, 00284 // and not by the value of the certainties. 00285 match_result = fake_classify_blob(0, rating, -rating); 00286 modify_blob_choice(match_result, 0); 00287 ASSERT_HOST(!match_result->empty()); 00288 *char_choices += match_result; 00289 rating -= 0.125f; 00290 } 00291 inT32 blob_number; 00292 int right_chop_index = 0; 00293 if (!assume_fixed_pitch_char_segment) { 00294 // We only chop if the language is not fixed pitch like CJK. 
00295 if (prioritize_division) { 00296 while (chop_one_blob2(boxes, word_res, &word_res->seam_array)); 00297 } else { 00298 while (chop_one_blob(word_res->chopped_word, char_choices, 00299 &blob_number, &word_res->seam_array, 00300 &right_chop_index)); 00301 } 00302 } 00303 MakeWordChoice(*char_choices, unicharset, word_res->best_choice); 00304 MakeWordChoice(*char_choices, unicharset, word_res->raw_choice); 00305 word_res->CloneChoppedToRebuild(); 00306 blob_match_table.end_match_table(); 00307 if (char_choices != NULL) { 00308 char_choices->delete_data_pointers(); 00309 delete char_choices; 00310 } 00311 } 00312 00313 // Helper to compute the dispute resolution metric. 00314 // Disputed blob resolution. The aim is to give the blob to the most 00315 // appropriate boxfile box. Most of the time it is obvious, but if 00316 // two boxfile boxes overlap significantly it is not. If a small boxfile 00317 // box takes most of the blob, and a large boxfile box does too, then 00318 // we want the small boxfile box to get it, but if the small box 00319 // is much smaller than the blob, we don't want it to get it. 00320 // Details of the disputed blob resolution: 00321 // Given a box with area A, and a blob with area B, with overlap area C, 00322 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum 00323 // miss metric gets the blob. 00324 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { 00325 int overlap_area = box1.intersection(box2).area(); 00326 double miss_metric = box1.area()- overlap_area; 00327 miss_metric /= box1.area(); 00328 miss_metric *= box2.area() - overlap_area; 00329 miss_metric /= box2.area(); 00330 return miss_metric; 00331 } 00332 00333 // Gather consecutive blobs that match the given box into the best_state 00334 // and corresponding correct_text. 00335 // Fights over which box owns which blobs are settled by pre-chopping and 00336 // applying the blobs to box or next_box with the least non-overlap. 
00337 // Returns false if the box was in error, which can only be caused by 00338 // failing to find an appropriate blob for a box. 00339 // This means that occasionally, blobs may be incorrectly segmented if the 00340 // chopper fails to find a suitable chop point. 00341 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00342 const TBOX& box, const TBOX& next_box, 00343 const char* correct_text) { 00344 if (applybox_debug > 1) { 00345 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); 00346 } 00347 PAGE_RES_IT page_res_it(page_res); 00348 WERD_RES* word_res; 00349 for (word_res = page_res_it.word(); word_res != NULL; 00350 word_res = page_res_it.forward()) { 00351 if (!word_res->box_word->bounding_box().major_overlap(box)) 00352 continue; 00353 if (applybox_debug > 1) { 00354 tprintf("Checking word box:"); 00355 word_res->box_word->bounding_box().print(); 00356 } 00357 int word_len = word_res->box_word->length(); 00358 for (int i = 0; i < word_len; ++i) { 00359 TBOX char_box = TBOX(); 00360 int blob_count = 0; 00361 for (blob_count = 0; i + blob_count < word_len; ++blob_count) { 00362 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); 00363 if (!blob_box.major_overlap(box)) 00364 break; 00365 if (word_res->correct_text[i + blob_count].length() > 0) 00366 break; // Blob is claimed already. 00367 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00368 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00369 if (applybox_debug > 2) { 00370 tprintf("Checking blob:"); 00371 blob_box.print(); 00372 tprintf("Current miss metric = %g, next = %g\n", 00373 current_box_miss_metric, next_box_miss_metric); 00374 } 00375 if (current_box_miss_metric > next_box_miss_metric) 00376 break; // Blob is a better match for next box. 
00377 char_box += blob_box; 00378 } 00379 if (blob_count > 0) { 00380 if (applybox_debug > 1) { 00381 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); 00382 } 00383 if (!char_box.almost_equal(box, 3) && 00384 (box.x_gap(next_box) < -3 || 00385 (prev_box != NULL && prev_box->x_gap(box) < -3))) { 00386 return false; 00387 } 00388 // We refine just the box_word, best_state and correct_text here. 00389 // The rebuild_word is made in TidyUp. 00390 // blob_count blobs are put together to match the box. Merge the 00391 // box_word boxes, save the blob_count in the state and the text. 00392 word_res->box_word->MergeBoxes(i, i + blob_count); 00393 word_res->best_state[i] = blob_count; 00394 word_res->correct_text[i] = correct_text; 00395 if (applybox_debug > 2) { 00396 tprintf("%d Blobs match: blob box:", blob_count); 00397 word_res->box_word->BlobBox(i).print(); 00398 tprintf("Matches box:"); 00399 box.print(); 00400 tprintf("With next box:"); 00401 next_box.print(); 00402 } 00403 // Eliminated best_state and correct_text entries for the consumed 00404 // blobs. 00405 for (int j = 1; j < blob_count; ++j) { 00406 word_res->best_state.remove(i + 1); 00407 word_res->correct_text.remove(i + 1); 00408 } 00409 // Assume that no box spans multiple source words, so we are done with 00410 // this box. 00411 if (applybox_debug > 1) { 00412 tprintf("Best state = "); 00413 for (int j = 0; j < word_res->best_state.size(); ++j) { 00414 tprintf("%d ", word_res->best_state[j]); 00415 } 00416 tprintf("\n"); 00417 tprintf("Correct text = [[ "); 00418 for (int j = 0; j < word_res->correct_text.size(); ++j) { 00419 tprintf("%s ", word_res->correct_text[j].string()); 00420 } 00421 tprintf("]]\n"); 00422 } 00423 return true; 00424 } 00425 } 00426 } 00427 if (applybox_debug > 0) { 00428 tprintf("FAIL!\n"); 00429 } 00430 return false; // Failure. 
00431 } 00432 00433 // Consume all source blobs that strongly overlap the given box, 00434 // putting them into a new word, with the correct_text label. 00435 // Fights over which box owns which blobs are settled by 00436 // applying the blobs to box or next_box with the least non-overlap. 00437 // Returns false if the box was in error, which can only be caused by 00438 // failing to find an overlapping blob for a box. 00439 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, 00440 const TBOX& box, const TBOX& next_box, 00441 const char* correct_text) { 00442 if (applybox_debug > 1) { 00443 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); 00444 } 00445 WERD* new_word = NULL; 00446 BLOCK_IT b_it(block_list); 00447 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00448 BLOCK* block = b_it.data(); 00449 if (!box.major_overlap(block->bounding_box())) 00450 continue; 00451 ROW_IT r_it(block->row_list()); 00452 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00453 ROW* row = r_it.data(); 00454 if (!box.major_overlap(row->bounding_box())) 00455 continue; 00456 WERD_IT w_it(row->word_list()); 00457 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00458 WERD* word = w_it.data(); 00459 if (applybox_debug > 2) { 00460 tprintf("Checking word:"); 00461 word->bounding_box().print(); 00462 } 00463 if (word->text() != NULL && word->text()[0] != '\0') 00464 continue; // Ignore words that are already done. 
00465 if (!box.major_overlap(word->bounding_box())) 00466 continue; 00467 C_BLOB_IT blob_it(word->cblob_list()); 00468 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); 00469 blob_it.forward()) { 00470 C_BLOB* blob = blob_it.data(); 00471 TBOX blob_box = blob->bounding_box(); 00472 if (!blob_box.major_overlap(box)) 00473 continue; 00474 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00475 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00476 if (applybox_debug > 2) { 00477 tprintf("Checking blob:"); 00478 blob_box.print(); 00479 tprintf("Current miss metric = %g, next = %g\n", 00480 current_box_miss_metric, next_box_miss_metric); 00481 } 00482 if (current_box_miss_metric > next_box_miss_metric) 00483 continue; // Blob is a better match for next box. 00484 if (applybox_debug > 2) { 00485 tprintf("Blob match: blob:"); 00486 blob_box.print(); 00487 tprintf("Matches box:"); 00488 box.print(); 00489 tprintf("With next box:"); 00490 next_box.print(); 00491 } 00492 if (new_word == NULL) { 00493 // Make a new word with a single blob. 00494 new_word = word->shallow_copy(); 00495 new_word->set_text(correct_text); 00496 w_it.add_to_end(new_word); 00497 } 00498 C_BLOB_IT new_blob_it(new_word->cblob_list()); 00499 new_blob_it.add_to_end(blob_it.extract()); 00500 } 00501 } 00502 } 00503 } 00504 if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); 00505 return new_word != NULL; 00506 } 00507 00508 // Resegments the words by running the classifier in an attempt to find the 00509 // correct segmentation that produces the required string. 00510 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) { 00511 PAGE_RES_IT pr_it(page_res); 00512 WERD_RES* word_res; 00513 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00514 WERD* word = word_res->word; 00515 if (word->text() == NULL || word->text()[0] == '\0') 00516 continue; // Ignore words that have no text. 
00517 // Convert the correct text to a vector of UNICHAR_ID 00518 GenericVector<UNICHAR_ID> target_text; 00519 if (!ConvertStringToUnichars(word->text(), &target_text)) { 00520 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", 00521 word->text()); 00522 pr_it.DeleteCurrentWord(); 00523 continue; 00524 } 00525 if (!FindSegmentation(target_text, word_res)) { 00526 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", 00527 word->text()); 00528 pr_it.DeleteCurrentWord(); 00529 continue; 00530 } 00531 } 00532 } 00533 00534 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00535 // Returns false if an invalid UNICHAR_ID is encountered. 00536 bool Tesseract::ConvertStringToUnichars(const char* utf8, 00537 GenericVector<UNICHAR_ID>* class_ids) { 00538 for (int step = 0; *utf8 != '\0'; utf8 += step) { 00539 const char* next_space = strchr(utf8, ' '); 00540 if (next_space == NULL) 00541 next_space = utf8 + strlen(utf8); 00542 step = next_space - utf8; 00543 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); 00544 if (class_id == INVALID_UNICHAR_ID) { 00545 return false; 00546 } 00547 while (utf8[step] == ' ') 00548 ++step; 00549 class_ids->push_back(class_id); 00550 } 00551 return true; 00552 } 00553 00554 // Resegments the word to achieve the target_text from the classifier. 00555 // Returns false if the re-segmentation fails. 00556 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00557 // applies a full search on the classifier results to find the best classified 00558 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00559 // substitutions ARE used. 00560 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00561 WERD_RES* word_res) { 00562 blob_match_table.init_match_table(); 00563 // Classify all required combinations of blobs and save results in choices. 
00564 int word_length = word_res->box_word->length(); 00565 GenericVector<BLOB_CHOICE_LIST*>* choices = 00566 new GenericVector<BLOB_CHOICE_LIST*>[word_length]; 00567 for (int i = 0; i < word_length; ++i) { 00568 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { 00569 BLOB_CHOICE_LIST* match_result = classify_piece( 00570 word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array, 00571 i, i + j - 1, word_res->blamer_bundle); 00572 if (applybox_debug > 2) { 00573 tprintf("%d+%d:", i, j); 00574 print_ratings_list("Segment:", match_result, unicharset); 00575 } 00576 choices[i].push_back(match_result); 00577 } 00578 } 00579 // Search the segmentation graph for the target text. Must be an exact 00580 // match. Using wildcards makes it difficult to find the correct 00581 // segmentation even when it is there. 00582 word_res->best_state.clear(); 00583 GenericVector<int> search_segmentation; 00584 float best_rating = 0.0f; 00585 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, 00586 &search_segmentation, &best_rating, &word_res->best_state); 00587 blob_match_table.end_match_table(); 00588 for (int i = 0; i < word_length; ++i) 00589 choices[i].delete_data_pointers(); 00590 delete [] choices; 00591 if (word_res->best_state.empty()) { 00592 // Build the original segmentation and if it is the same length as the 00593 // truth, assume it will do. 00594 int blob_count = 1; 00595 for (int s = 0; s < array_count(word_res->seam_array); ++s) { 00596 SEAM* seam = 00597 reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s)); 00598 if (seam->split1 == NULL) { 00599 word_res->best_state.push_back(blob_count); 00600 blob_count = 1; 00601 } else { 00602 ++blob_count; 00603 } 00604 } 00605 word_res->best_state.push_back(blob_count); 00606 if (word_res->best_state.size() != target_text.size()) { 00607 word_res->best_state.clear(); // No good. Original segmentation bad size. 
00608 return false; 00609 } 00610 } 00611 word_res->correct_text.clear(); 00612 for (int i = 0; i < target_text.size(); ++i) { 00613 word_res->correct_text.push_back( 00614 STRING(unicharset.id_to_unichar(target_text[i]))); 00615 } 00616 return true; 00617 } 00618 00619 // Recursive helper to find a match to the target_text (from text_index 00620 // position) in the choices (from choices_pos position). 00621 // Choices is an array of GenericVectors, of length choices_length, with each 00622 // element representing a starting position in the word, and the 00623 // GenericVector holding classification results for a sequence of consecutive 00624 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00625 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00626 int choices_pos, int choices_length, 00627 const GenericVector<UNICHAR_ID>& target_text, 00628 int text_index, 00629 float rating, GenericVector<int>* segmentation, 00630 float* best_rating, 00631 GenericVector<int>* best_segmentation) { 00632 const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); 00633 for (int length = 1; length <= choices[choices_pos].size(); ++length) { 00634 // Rating of matching choice or worst choice if no match. 00635 float choice_rating = 0.0f; 00636 // Find the corresponding best BLOB_CHOICE. 00637 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); 00638 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00639 choice_it.forward()) { 00640 BLOB_CHOICE* choice = choice_it.data(); 00641 choice_rating = choice->rating(); 00642 UNICHAR_ID class_id = choice->unichar_id(); 00643 if (class_id == target_text[text_index]) { 00644 break; 00645 } 00646 // Search ambigs table. 
00647 if (class_id < table.size() && table[class_id] != NULL) { 00648 AmbigSpec_IT spec_it(table[class_id]); 00649 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); 00650 spec_it.forward()) { 00651 const AmbigSpec *ambig_spec = spec_it.data(); 00652 // We'll only do 1-1. 00653 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && 00654 ambig_spec->correct_ngram_id == target_text[text_index]) 00655 break; 00656 } 00657 if (!spec_it.cycled_list()) 00658 break; // Found an ambig. 00659 } 00660 } 00661 if (choice_it.cycled_list()) 00662 continue; // No match. 00663 segmentation->push_back(length); 00664 if (choices_pos + length == choices_length && 00665 text_index + 1 == target_text.size()) { 00666 // This is a complete match. If the rating is good record a new best. 00667 if (applybox_debug > 2) { 00668 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", 00669 rating + choice_rating, *best_rating, segmentation->size(), 00670 best_segmentation->size()); 00671 } 00672 if (best_segmentation->empty() || rating + choice_rating < *best_rating) { 00673 *best_segmentation = *segmentation; 00674 *best_rating = rating + choice_rating; 00675 } 00676 } else if (choices_pos + length < choices_length && 00677 text_index + 1 < target_text.size()) { 00678 if (applybox_debug > 3) { 00679 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", 00680 target_text[text_index], 00681 unicharset.id_to_unichar(target_text[text_index]), 00682 choice_it.data()->unichar_id() == target_text[text_index] 00683 ? 
"Match" : "Ambig", 00684 choices_pos, length); 00685 } 00686 SearchForText(choices, choices_pos + length, choices_length, target_text, 00687 text_index + 1, rating + choice_rating, segmentation, 00688 best_rating, best_segmentation); 00689 if (applybox_debug > 3) { 00690 tprintf("End recursion for %d=%s\n", target_text[text_index], 00691 unicharset.id_to_unichar(target_text[text_index])); 00692 } 00693 } 00694 segmentation->truncate(segmentation->size() - 1); 00695 } 00696 } 00697 00698 // Counts up the labelled words and the blobs within. 00699 // Deletes all unused or emptied words, counting the unused ones. 00700 // Resets W_BOL and W_EOL flags correctly. 00701 // Builds the rebuild_word and rebuilds the box_word and the best_choice. 00702 void Tesseract::TidyUp(PAGE_RES* page_res) { 00703 int ok_blob_count = 0; 00704 int bad_blob_count = 0; 00705 int ok_word_count = 0; 00706 int unlabelled_words = 0; 00707 PAGE_RES_IT pr_it(page_res); 00708 WERD_RES* word_res; 00709 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00710 int ok_in_word = 0; 00711 BLOB_CHOICE_LIST_VECTOR char_choices; 00712 for (int i = word_res->correct_text.size() - 1; i >= 0; i--) { 00713 if (word_res->correct_text[i].length() > 0) { 00714 ++ok_in_word; 00715 } 00716 // Since we only need a fake word_res->best_choice, the actual 00717 // unichar_ids do not matter. Which is fortunate, since TidyUp() 00718 // can be called while training Tesseract, at the stage where 00719 // unicharset is not meaningful yet. 
00720 char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0); 00721 } 00722 if (ok_in_word > 0) { 00723 ok_blob_count += ok_in_word; 00724 bad_blob_count += word_res->correct_text.size() - ok_in_word; 00725 MakeWordChoice(char_choices, unicharset, word_res->best_choice); 00726 } else { 00727 ++unlabelled_words; 00728 if (applybox_debug > 0) { 00729 tprintf("APPLY_BOXES: Unlabelled word at :"); 00730 word_res->word->bounding_box().print(); 00731 } 00732 pr_it.DeleteCurrentWord(); 00733 } 00734 char_choices.delete_data_pointers(); 00735 } 00736 pr_it.restart_page(); 00737 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00738 // Denormalize back to a BoxWord. 00739 word_res->RebuildBestState(); 00740 word_res->SetupBoxWord(); 00741 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); 00742 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); 00743 } 00744 if (applybox_debug > 0) { 00745 tprintf(" Found %d good blobs.\n", ok_blob_count); 00746 if (bad_blob_count > 0) { 00747 tprintf(" Leaving %d unlabelled blobs in %d words.\n", 00748 bad_blob_count, ok_word_count); 00749 } 00750 if (unlabelled_words > 0) 00751 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); 00752 } 00753 } 00754 00755 // Logs a bad box by line in the box file and box coords. 00756 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, 00757 const char *box_ch, const char *err_msg) { 00758 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", 00759 boxfile_lineno, box_ch, 00760 box.left(), box.bottom(), box.right(), box.top(), err_msg); 00761 } 00762 00763 // Creates a fake best_choice entry in each WERD_RES with the correct text. 
00764 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { 00765 PAGE_RES_IT pr_it(page_res); 00766 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00767 word_res = pr_it.forward()) { 00768 WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, 00769 word_res->correct_text.size()); 00770 for (int i = 0; i < word_res->correct_text.size(); ++i) { 00771 // The part before the first space is the real ground truth, and the 00772 // rest is the bounding box location and page number. 00773 GenericVector<STRING> tokens; 00774 word_res->correct_text[i].split(' ', &tokens); 00775 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); 00776 choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f); 00777 } 00778 if (word_res->best_choice != NULL) 00779 delete word_res->best_choice; 00780 word_res->best_choice = choice; 00781 } 00782 } 00783 00784 // Calls LearnWord to extract features for labelled blobs within each word. 00785 // Features are written to the given filename. 00786 void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { 00787 PAGE_RES_IT pr_it(page_res); 00788 int word_count = 0; 00789 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00790 word_res = pr_it.forward()) { 00791 LearnWord(filename.string(), NULL, word_res); 00792 ++word_count; 00793 } 00794 tprintf("Generated training data for %d words\n", word_count); 00795 } 00796 00797 00798 } // namespace tesseract