Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: pageres.cpp (Formerly page_res.c) 00003 * Description: Results classes used by control.c 00004 * Author: Phil Cheatle 00005 * Created: Tue Sep 22 08:42:49 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #include "mfcpch.h" 00020 #include <stdlib.h> 00021 #ifdef __UNIX__ 00022 #include <assert.h> 00023 #endif 00024 #include "pageres.h" 00025 #include "blobs.h" 00026 00027 const char kBlameCorrect[] = "corr"; 00028 const char kBlameClassifier[] = "cl"; 00029 const char kBlameChopper[] = "chop"; 00030 const char kBlameClassLMTradeoff[] = "cl/LM"; 00031 const char kBlamePageLayout[] = "pglt"; 00032 const char kBlameSegsearchHeur[] = "ss_heur"; 00033 const char kBlameSegsearchPP[] = "ss_pp"; 00034 const char kBlameClassOldLMTradeoff[] = "cl/old_LM"; 00035 const char kBlameAdaption[] = "adapt"; 00036 const char kBlameNoTruthSplit[] = "no_tr_spl"; 00037 const char kBlameNoTruth[] = "no_tr"; 00038 const char kBlameUnknown[] = "unkn"; 00039 00040 const char * const kIncorrectResultReasonNames[] = { 00041 kBlameCorrect, 00042 kBlameClassifier, 00043 kBlameChopper, 00044 kBlameClassLMTradeoff, 00045 kBlamePageLayout, 00046 kBlameSegsearchHeur, 00047 kBlameSegsearchPP, 00048 kBlameClassOldLMTradeoff, 00049 kBlameAdaption, 00050 kBlameNoTruthSplit, 00051 kBlameNoTruth, 00052 kBlameUnknown 00053 }; 00054 00055 const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) { 00056 return kIncorrectResultReasonNames[irr]; 00057 } 00058 00059 const char *BlamerBundle::IncorrectReason() const { 00060 return kIncorrectResultReasonNames[incorrect_result_reason]; 00061 } 00062 00063 void BlamerBundle::FillDebugString(const STRING &msg, 00064 const WERD_CHOICE *choice, 00065 STRING *debug) { 00066 (*debug) += "Truth "; 00067 for (int i = 0; i < this->truth_text.length(); ++i) { 00068 (*debug) += this->truth_text[i]; 00069 } 00070 if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)"; 00071 if (choice != NULL) { 00072 (*debug) += " Choice "; 00073 STRING choice_str; 00074 choice->string_and_lengths(&choice_str, NULL); 00075 (*debug) += choice_str; 00076 } 00077 if (msg.length() > 0) { 00078 (*debug) += "\n"; 00079 (*debug) += msg; 00080 } 00081 (*debug) += "\n"; 00082 } 00083 00084 ELISTIZE (BLOCK_RES) 00085 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES) 00086 /************************************************************************* 00087 * PAGE_RES::PAGE_RES 00088 * 00089 * Constructor for page results 00090 *************************************************************************/ 00091 PAGE_RES::PAGE_RES( 00092 BLOCK_LIST *the_block_list, 00093 WERD_CHOICE **prev_word_best_choice_ptr) { 00094 Init(); 00095 BLOCK_IT block_it(the_block_list); 00096 BLOCK_RES_IT block_res_it(&block_res_list); 00097 for (block_it.mark_cycle_pt(); 00098 !block_it.cycled_list(); block_it.forward()) { 00099 block_res_it.add_to_end(new BLOCK_RES(block_it.data())); 00100 } 00101 prev_word_best_choice = prev_word_best_choice_ptr; 00102 } 00103 00104 /************************************************************************* 00105 * BLOCK_RES::BLOCK_RES 00106 * 00107 * Constructor for BLOCK results 00108 *************************************************************************/ 00109 00110 BLOCK_RES::BLOCK_RES(BLOCK *the_block) { 00111 ROW_IT row_it (the_block->row_list ()); 00112 ROW_RES_IT row_res_it(&row_res_list); 00113 00114 char_count = 0; 00115 rej_count = 0; 00116 font_class = -1; //not assigned 00117 x_height = -1.0; 00118 font_assigned = FALSE; 00119 bold = FALSE; 00120 italic = FALSE; 00121 row_count = 0; 00122 00123 block = the_block; 00124 00125 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00126 row_res_it.add_to_end(new ROW_RES(row_it.data())); 00127 } 00128 } 00129 00130 00131 /************************************************************************* 00132 * ROW_RES::ROW_RES 00133 * 00134 * Constructor for ROW results 00135 *************************************************************************/ 00136 00137 ROW_RES::ROW_RES(ROW *the_row) { 00138 WERD_IT word_it(the_row->word_list()); 00139 WERD_RES_IT word_res_it(&word_res_list); 00140 WERD_RES *combo = NULL; // current combination of fuzzies 00141 WERD_RES *word_res; // current word 00142 WERD *copy_word; 00143 00144 char_count = 0; 00145 rej_count = 0; 00146 whole_word_rej_count = 0; 00147 00148 row = the_row; 00149 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00150 word_res = new WERD_RES(word_it.data()); 00151 word_res->x_height = the_row->x_height(); 00152 00153 if (word_res->word->flag(W_FUZZY_NON)) { 00154 ASSERT_HOST(combo != NULL); 00155 word_res->part_of_combo = TRUE; 00156 combo->copy_on(word_res); 00157 } 00158 if (word_it.data_relative(1)->flag(W_FUZZY_NON)) { 00159 if (combo == NULL) { 00160 copy_word = new WERD; 00161 //deep copy 00162 *copy_word = *(word_it.data()); 00163 combo = new WERD_RES(copy_word); 00164 combo->x_height = the_row->x_height(); 00165 combo->combination = TRUE; 00166 word_res_it.add_to_end(combo); 00167 } 00168 word_res->part_of_combo = TRUE; 00169 } else { 00170 combo = NULL; 00171 } 00172 word_res_it.add_to_end(word_res); 00173 } 00174 } 00175 00176 00177 WERD_RES& WERD_RES::operator=(const WERD_RES & source) { 00178 this->ELIST_LINK::operator=(source); 00179 Clear(); 00180 if (source.combination) { 00181 word = new WERD; 00182 *word = *(source.word); // deep copy 00183 } else { 00184 word = source.word; // pt to same word 00185 } 00186 if (source.bln_boxes != NULL) 00187 bln_boxes = new tesseract::BoxWord(*source.bln_boxes); 00188 if (source.chopped_word != NULL) 00189 chopped_word = new TWERD(*source.chopped_word); 00190 if (source.rebuild_word != NULL) 00191 rebuild_word = new TWERD(*source.rebuild_word); 00192 // TODO(rays) Do we ever need to copy the seam_array? 00193 denorm = source.denorm; 00194 if (source.box_word != NULL) 00195 box_word = new tesseract::BoxWord(*source.box_word); 00196 best_state = source.best_state; 00197 correct_text = source.correct_text; 00198 00199 if (source.best_choice != NULL) { 00200 best_choice = new WERD_CHOICE(*source.best_choice); 00201 raw_choice = new WERD_CHOICE(*source.raw_choice); 00202 best_choice_fontinfo_ids = source.best_choice_fontinfo_ids; 00203 } 00204 else { 00205 best_choice = NULL; 00206 raw_choice = NULL; 00207 if (!best_choice_fontinfo_ids.empty()) { 00208 best_choice_fontinfo_ids.clear(); 00209 } 00210 } 00211 for (int i = 0; i < source.alt_choices.length(); ++i) { 00212 const WERD_CHOICE *choice = source.alt_choices[i]; 00213 ASSERT_HOST(choice != NULL); 00214 alt_choices.push_back(new WERD_CHOICE(*choice)); 00215 } 00216 alt_states = source.alt_states; 00217 if (source.ep_choice != NULL) { 00218 ep_choice = new WERD_CHOICE(*source.ep_choice); 00219 } else { 00220 ep_choice = NULL; 00221 } 00222 reject_map = source.reject_map; 00223 combination = source.combination; 00224 part_of_combo = source.part_of_combo; 00225 CopySimpleFields(source); 00226 if (source.blamer_bundle != NULL) { 00227 blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); 00228 } 00229 return *this; 00230 } 00231 00232 // Copies basic fields that don't involve pointers that might be useful 00233 // to copy when making one WERD_RES from another. 00234 void WERD_RES::CopySimpleFields(const WERD_RES& source) { 00235 tess_failed = source.tess_failed; 00236 tess_accepted = source.tess_accepted; 00237 tess_would_adapt = source.tess_would_adapt; 00238 done = source.done; 00239 unlv_crunch_mode = source.unlv_crunch_mode; 00240 small_caps = source.small_caps; 00241 italic = source.italic; 00242 bold = source.bold; 00243 fontinfo = source.fontinfo; 00244 fontinfo2 = source.fontinfo2; 00245 fontinfo_id_count = source.fontinfo_id_count; 00246 fontinfo_id2_count = source.fontinfo_id2_count; 00247 x_height = source.x_height; 00248 caps_height = source.caps_height; 00249 guessed_x_ht = source.guessed_x_ht; 00250 guessed_caps_ht = source.guessed_caps_ht; 00251 reject_spaces = source.reject_spaces; 00252 uch_set = source.uch_set; 00253 tesseract = source.tesseract; 00254 } 00255 00256 // Initializes a blank (default constructed) WERD_RES from one that has 00257 // already been recognized. 00258 // Use SetupFor*Recognition afterwards to complete the setup and make 00259 // it ready for a retry recognition. 00260 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) { 00261 word = source.word; 00262 CopySimpleFields(source); 00263 if (source.blamer_bundle != NULL) { 00264 blamer_bundle = new BlamerBundle(); 00265 blamer_bundle->CopyTruth(*source.blamer_bundle); 00266 } 00267 } 00268 00269 // Sets up the members used in recognition: 00270 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00271 // Returns false if the word is empty and sets up fake results. 00272 bool WERD_RES::SetupForTessRecognition(const UNICHARSET& unicharset_in, 00273 tesseract::Tesseract* tess, Pix* pix, 00274 bool numeric_mode, 00275 bool use_body_size, 00276 ROW *row, BLOCK* block) { 00277 tesseract = tess; 00278 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00279 if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) { 00280 // Empty words occur when all the blobs have been moved to the rej_blobs 00281 // list, which seems to occur frequently in junk. 00282 SetupFake(unicharset_in); 00283 word->set_flag(W_REP_CHAR, false); 00284 return false; 00285 } 00286 ClearResults(); 00287 SetupWordScript(unicharset_in); 00288 chopped_word = TWERD::PolygonalCopy(word); 00289 if (use_body_size && row->body_size() > 0.0f) { 00290 chopped_word->SetupBLNormalize(block, row, row->body_size(), 00291 numeric_mode, &denorm); 00292 } else { 00293 chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm); 00294 } 00295 // The image will be 8-bit grey if the input was grey or color. Note that in 00296 // a grey image 0 is black and 255 is white. If the input was binary, then 00297 // the pix will be binary and 0 is white, with 1 being black. 00298 // To tell the difference pixGetDepth() will return 8 or 1. 00299 denorm.set_pix(pix); 00300 // The inverse flag will be true iff the word has been determined to be white 00301 // on black, and is independent of whether the pix is 8 bit or 1 bit. 00302 denorm.set_inverse(word->flag(W_INVERSE)); 00303 chopped_word->Normalize(denorm); 00304 bln_boxes = tesseract::BoxWord::CopyFromNormalized(NULL, chopped_word); 00305 seam_array = start_seam_list(chopped_word->blobs); 00306 best_choice = new WERD_CHOICE(&unicharset_in); 00307 best_choice->make_bad(); 00308 raw_choice = new WERD_CHOICE(&unicharset_in); 00309 raw_choice->make_bad(); 00310 SetupBlamerBundle(); 00311 return true; 00312 } 00313 00314 // Sets up the members used in recognition: 00315 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00316 // Returns false if the word is empty and sets up fake results. 00317 bool WERD_RES::SetupForCubeRecognition(const UNICHARSET& unicharset_in, 00318 tesseract::Tesseract* tess, 00319 const BLOCK* block) { 00320 tesseract = tess; 00321 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00322 if (pb != NULL && !pb->IsText()) { 00323 // Ignore words in graphic regions. 00324 SetupFake(unicharset_in); 00325 word->set_flag(W_REP_CHAR, false); 00326 return false; 00327 } 00328 ClearResults(); 00329 SetupWordScript(unicharset_in); 00330 TBOX word_box = word->bounding_box(); 00331 denorm.SetupNormalization(block, NULL, NULL, NULL, NULL, 0, 00332 word_box.left(), word_box.bottom(), 00333 1.0f, 1.0f, 0.0f, 0.0f); 00334 SetupBlamerBundle(); 00335 return true; 00336 } 00337 00338 // Sets up the members used in recognition for an empty recognition result: 00339 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00340 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) { 00341 ClearResults(); 00342 SetupWordScript(unicharset_in); 00343 chopped_word = new TWERD; 00344 rebuild_word = new TWERD; 00345 bln_boxes = new tesseract::BoxWord; 00346 box_word = new tesseract::BoxWord; 00347 int blob_count = word->cblob_list()->length(); 00348 best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f, 00349 TOP_CHOICE_PERM, unicharset_in); 00350 raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f, 00351 TOP_CHOICE_PERM, unicharset_in); 00352 if (blob_count > 0) { 00353 BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count]; 00354 // For non-text blocks, just pass any blobs through to the box_word 00355 // and call the word failed with a fake classification. 00356 C_BLOB_IT b_it(word->cblob_list()); 00357 int blob_id = 0; 00358 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00359 TBOX box = b_it.data()->bounding_box(); 00360 box_word->InsertBox(box_word->length(), box); 00361 fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f, 00362 -1, -1, -1, 0, 0, false); 00363 } 00364 FakeClassifyWord(blob_count, fake_choices); 00365 delete [] fake_choices; 00366 } 00367 tess_failed = true; 00368 } 00369 00370 void WERD_RES::SetupWordScript(const UNICHARSET& uch) { 00371 uch_set = &uch; 00372 int script = uch.default_sid(); 00373 word->set_script_id(script); 00374 word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); 00375 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); 00376 } 00377 00378 // Sets up the blamer_bundle if it is not null, using the initialized denorm. 00379 void WERD_RES::SetupBlamerBundle() { 00380 if (blamer_bundle != NULL) { 00381 blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale(); 00382 TPOINT topleft; 00383 TPOINT botright; 00384 TPOINT norm_topleft; 00385 TPOINT norm_botright; 00386 for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) { 00387 const TBOX &box = blamer_bundle->truth_word.BlobBox(b); 00388 topleft.x = box.left(); 00389 topleft.y = box.top(); 00390 botright.x = box.right(); 00391 botright.y = box.bottom(); 00392 denorm.NormTransform(topleft, &norm_topleft); 00393 denorm.NormTransform(botright, &norm_botright); 00394 TBOX norm_box(norm_topleft.x, norm_botright.y, 00395 norm_botright.x, norm_topleft.y); 00396 blamer_bundle->norm_truth_word.InsertBox(b, norm_box); 00397 } 00398 } 00399 } 00400 00401 // Simple helper moves the ownership of the pointer data from src to dest, 00402 // first deleting anything in dest, and nulling out src afterwards. 00403 template<class T> static void MovePointerData(T** dest, T**src) { 00404 delete *dest; 00405 *dest = *src; 00406 *src = NULL; 00407 } 00408 00409 // Moves the results fields from word to this. This takes ownership of all 00410 // the data, so src can be destructed. 00411 void WERD_RES::ConsumeWordResults(WERD_RES* word) { 00412 denorm = word->denorm; 00413 MovePointerData(&chopped_word, &word->chopped_word); 00414 MovePointerData(&rebuild_word, &word->rebuild_word); 00415 MovePointerData(&box_word, &word->box_word); 00416 if (seam_array != NULL) 00417 free_seam_list(seam_array); 00418 seam_array = word->seam_array; 00419 word->seam_array = NULL; 00420 best_state.move(&word->best_state); 00421 correct_text.move(&word->correct_text); 00422 MovePointerData(&best_choice, &word->best_choice); 00423 MovePointerData(&raw_choice, &word->raw_choice); 00424 alt_choices.delete_data_pointers(); 00425 alt_choices.move(&word->alt_choices); 00426 alt_states.move(&word->alt_states); 00427 reject_map = word->reject_map; 00428 if (word->blamer_bundle != NULL) { 00429 assert(blamer_bundle != NULL); 00430 blamer_bundle->CopyResults(*(word->blamer_bundle)); 00431 } 00432 CopySimpleFields(*word); 00433 } 00434 00435 // Replace the best choice and rebuild box word. 00436 void WERD_RES::ReplaceBestChoice( 00437 const WERD_CHOICE& choice, 00438 const GenericVector<int>& segmentation_state) { 00439 delete best_choice; 00440 best_choice = new WERD_CHOICE(choice); 00441 best_state = segmentation_state; 00442 RebuildBestState(); 00443 SetupBoxWord(); 00444 // Make up a fake reject map of the right length to keep the 00445 // rejection pass happy. 00446 reject_map.initialise(segmentation_state.length()); 00447 done = tess_accepted = tess_would_adapt = true; 00448 SetScriptPositions(); 00449 } 00450 00451 // Builds the rebuild_word from the chopped_word and the best_state. 00452 void WERD_RES::RebuildBestState() { 00453 if (rebuild_word != NULL) 00454 delete rebuild_word; 00455 rebuild_word = new TWERD; 00456 if (seam_array == NULL) { 00457 seam_array = start_seam_list(chopped_word->blobs); 00458 } 00459 TBLOB* prev_blob = NULL; 00460 int start = 0; 00461 for (int i = 0; i < best_state.size(); ++i) { 00462 int length = best_state[i]; 00463 join_pieces(chopped_word->blobs, seam_array, start, start + length - 1); 00464 TBLOB* blob = chopped_word->blobs; 00465 for (int i = 0; i < start; ++i) 00466 blob = blob->next; 00467 TBLOB* copy_blob = new TBLOB(*blob); 00468 if (prev_blob == NULL) 00469 rebuild_word->blobs = copy_blob; 00470 else 00471 prev_blob->next = copy_blob; 00472 prev_blob = copy_blob; 00473 break_pieces(blob, seam_array, start, start + length - 1); 00474 start += length; 00475 } 00476 } 00477 00478 // Copies the chopped_word to the rebuild_word, faking a best_state as well. 00479 // Also sets up the output box_word. 00480 void WERD_RES::CloneChoppedToRebuild() { 00481 if (rebuild_word != NULL) 00482 delete rebuild_word; 00483 rebuild_word = new TWERD(*chopped_word); 00484 SetupBoxWord(); 00485 int word_len = box_word->length(); 00486 best_state.reserve(word_len); 00487 correct_text.reserve(word_len); 00488 for (int i = 0; i < word_len; ++i) { 00489 best_state.push_back(1); 00490 correct_text.push_back(STRING("")); 00491 } 00492 } 00493 00494 // Sets/replaces the box_word with one made from the rebuild_word. 00495 void WERD_RES::SetupBoxWord() { 00496 if (box_word != NULL) 00497 delete box_word; 00498 rebuild_word->ComputeBoundingBoxes(); 00499 box_word = tesseract::BoxWord::CopyFromNormalized(&denorm, rebuild_word); 00500 box_word->ClipToOriginalWord(denorm.block(), word); 00501 } 00502 00503 // Sets up the script positions in the output boxword using the best_choice 00504 // to get the unichars, and the unicharset to get the target positions. 00505 void WERD_RES::SetScriptPositions() { 00506 box_word->SetScriptPositions(*uch_set, small_caps, rebuild_word, 00507 best_choice); 00508 } 00509 00510 void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const { 00511 int end = best_choice->length(); 00512 while (end > 0 && 00513 uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) && 00514 box_word->BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { 00515 end--; 00516 } 00517 int start = 0; 00518 while (start < end && 00519 uch_set->get_isdigit(best_choice->unichar_ids()[start]) && 00520 box_word->BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { 00521 start++; 00522 } 00523 *pstart = start; 00524 *pend = end; 00525 } 00526 00527 void WERD_RES::WithoutFootnoteSpan( 00528 const WERD_CHOICE &word, const GenericVector<int> &state, 00529 int *pstart, int *pend) const { 00530 int len = word.length(); 00531 *pstart = 0; 00532 *pend = len; 00533 if (len < 2) return; 00534 if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) && 00535 !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return; 00536 00537 // ok, now that we know the word ends in digits, do the expensive bit of 00538 // figuring out if they're superscript. 00539 WERD_RES copy(*this); 00540 copy.ReplaceBestChoice(word, state); 00541 copy.WithoutFootnoteSpan(pstart, pend); 00542 } 00543 00544 // Classifies the word with some already-calculated BLOB_CHOICEs. 00545 // The choices are an array of blob_count pointers to BLOB_CHOICE, 00546 // providing a single classifier result for each blob. 00547 // The BLOB_CHOICEs are consumed and the word takes ownership. 00548 // The number of blobs in the outword must match blob_count. 00549 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { 00550 // Setup the WERD_RES. 00551 ASSERT_HOST(box_word != NULL); 00552 ASSERT_HOST(blob_count == box_word->length()); 00553 ASSERT_HOST(best_choice != NULL); 00554 BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST; 00555 BLOB_CHOICE_LIST_C_IT bc_it(word_choices); 00556 for (int c = 0; c < blob_count; ++c) { 00557 best_choice->append_unichar_id( 00558 choices[c]->unichar_id(), 1, 00559 choices[c]->rating(), choices[c]->certainty()); 00560 BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST; 00561 BLOB_CHOICE_IT choice_it(choice_list); 00562 choice_it.add_after_then_move(choices[c]); 00563 bc_it.add_after_then_move(choice_list); 00564 } 00565 best_choice->set_blob_choices(word_choices); 00566 delete raw_choice; 00567 raw_choice = new WERD_CHOICE(*best_choice); 00568 reject_map.initialise(blob_count); 00569 } 00570 00571 // Copies the best_choice strings to the correct_text for adaption/training. 00572 void WERD_RES::BestChoiceToCorrectText() { 00573 correct_text.clear(); 00574 ASSERT_HOST(best_choice != NULL); 00575 for (int i = 0; i < best_choice->length(); ++i) { 00576 UNICHAR_ID choice_id = best_choice->unichar_id(i); 00577 const char* blob_choice = uch_set->id_to_unichar(choice_id); 00578 correct_text.push_back(STRING(blob_choice)); 00579 } 00580 } 00581 00582 // Merges 2 adjacent blobs in the result if the permanent callback 00583 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent 00584 // callback box_cb is NULL or returns true, setting the merged blob 00585 // result to the class returned from class_cb. 00586 // Returns true if anything was merged. 00587 bool WERD_RES::ConditionalBlobMerge( 00588 TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb, 00589 TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb, 00590 00591 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00592 bool modified = false; 00593 for (int i = 0; i + 1 < best_choice->length(); ++i) { 00594 UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i), 00595 best_choice->unichar_id(i+1)); 00596 if (new_id != INVALID_UNICHAR_ID && 00597 (box_cb == NULL || box_cb->Run(box_word->BlobBox(i), 00598 box_word->BlobBox(i + 1)))) { 00599 if (reject_map.length() == best_choice->length()) 00600 reject_map.remove_pos(i); 00601 best_choice->set_unichar_id(new_id, i); 00602 best_choice->remove_unichar_id(i + 1); 00603 raw_choice->set_unichar_id(new_id, i); 00604 raw_choice->remove_unichar_id(i + 1); 00605 modified = true; 00606 rebuild_word->MergeBlobs(i, i + 2); 00607 box_word->MergeBoxes(i, i + 2); 00608 if (i + 1 < best_state.length()) { 00609 best_state[i] += best_state[i + 1]; 00610 best_state.remove(i + 1); 00611 } 00612 00613 BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices); 00614 for (int j = 0; j < i; ++j) 00615 blob_choices_it.forward(); 00616 BLOB_CHOICE_IT it1(blob_choices_it.data()); // first choices 00617 BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1); 00618 BLOB_CHOICE_IT it2(target_choices); // second choices 00619 float certainty = it2.data()->certainty(); 00620 float rating = it2.data()->rating(); 00621 if (it1.data()->certainty() < certainty) { 00622 certainty = it1.data()->certainty(); 00623 rating = it1.data()->rating(); 00624 target_choices = blob_choices_it.data(); 00625 blob_choices_it.forward(); 00626 } 00627 delete blob_choices_it.extract(); // get rid of spare 00628 // TODO(rays) Fix the choices so they contain the desired result. 00629 // Do we really need to ? Only needed for fix_quotes, which should be 00630 // going away. 00631 } 00632 } 00633 delete class_cb; 00634 delete box_cb; 00635 return modified; 00636 } 00637 00638 // TODO(tkielbus) Decide between keeping this behavior here or modifying the 00639 // training data. 00640 00641 // Utility function for fix_quotes 00642 // Return true if the next character in the string (given the UTF8 length in 00643 // bytes) is a quote character. 00644 static int is_simple_quote(const char* signed_str, int length) { 00645 const unsigned char* str = 00646 reinterpret_cast<const unsigned char*>(signed_str); 00647 // Standard 1 byte quotes. 00648 return (length == 1 && (*str == '\'' || *str == '`')) || 00649 // UTF-8 3 bytes curved quotes. 00650 (length == 3 && ((*str == 0xe2 && 00651 *(str + 1) == 0x80 && 00652 *(str + 2) == 0x98) || 00653 (*str == 0xe2 && 00654 *(str + 1) == 0x80 && 00655 *(str + 2) == 0x99))); 00656 } 00657 00658 // Callback helper for fix_quotes returns a double quote if both 00659 // arguments are quote, otherwise INVALID_UNICHAR_ID. 00660 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { 00661 const char *ch = uch_set->id_to_unichar(id1); 00662 const char *next_ch = uch_set->id_to_unichar(id2); 00663 if (is_simple_quote(ch, strlen(ch)) && 00664 is_simple_quote(next_ch, strlen(next_ch))) 00665 return uch_set->unichar_to_id("\""); 00666 return INVALID_UNICHAR_ID; 00667 } 00668 00669 // Change pairs of quotes to double quotes. 00670 void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) { 00671 if (!uch_set->contains_unichar("\"") || 00672 !uch_set->get_enabled(uch_set->unichar_to_id("\""))) 00673 return; // Don't create it if it is disallowed. 00674 00675 ConditionalBlobMerge( 00676 NewPermanentTessCallback(this, &WERD_RES::BothQuotes), 00677 NULL, 00678 blob_choices); 00679 } 00680 00681 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 00682 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 00683 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { 00684 const char *ch = uch_set->id_to_unichar(id1); 00685 const char *next_ch = uch_set->id_to_unichar(id2); 00686 if (strlen(ch) == 1 && strlen(next_ch) == 1 && 00687 (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) 00688 return uch_set->unichar_to_id("-"); 00689 return INVALID_UNICHAR_ID; 00690 } 00691 00692 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 00693 // (assuming both on the same textline, are in order and a chopped em dash.) 00694 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { 00695 return box1.right() >= box2.left(); 00696 } 00697 00698 // Change pairs of hyphens to a single hyphen if the bounding boxes touch 00699 // Typically a long dash which has been segmented. 00700 void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) { 00701 if (!uch_set->contains_unichar("-") || 00702 !uch_set->get_enabled(uch_set->unichar_to_id("-"))) 00703 return; // Don't create it if it is disallowed. 00704 00705 ConditionalBlobMerge( 00706 NewPermanentTessCallback(this, &WERD_RES::BothHyphens), 00707 NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap), 00708 blob_choices); 00709 } 00710 00711 // Callback helper for merge_tess_fails returns a space if both 00712 // arguments are space, otherwise INVALID_UNICHAR_ID. 00713 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { 00714 if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) 00715 return id1; 00716 else 00717 return INVALID_UNICHAR_ID; 00718 } 00719 00720 // Change pairs of tess failures to a single one 00721 void WERD_RES::merge_tess_fails() { 00722 if (ConditionalBlobMerge( 00723 NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL, 00724 best_choice->blob_choices())) { 00725 int len = best_choice->length(); 00726 ASSERT_HOST(reject_map.length() == len); 00727 ASSERT_HOST(box_word->length() == len); 00728 } 00729 } 00730 00731 // Returns true if the collection of count pieces, starting at start, are all 00732 // natural connected components, ie there are no real chops involved. 00733 bool WERD_RES::PiecesAllNatural(int start, int count) const { 00734 // all seams must have no splits. 00735 for (int index = start; index < start + count - 1; ++index) { 00736 if (index >= 0 && index < array_count(seam_array)) { 00737 SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index)); 00738 if (seam != NULL && seam->split1 != NULL) 00739 return false; 00740 } 00741 } 00742 return true; 00743 } 00744 00745 00746 WERD_RES::~WERD_RES () { 00747 Clear(); 00748 } 00749 00750 void WERD_RES::InitNonPointers() { 00751 tess_failed = FALSE; 00752 tess_accepted = FALSE; 00753 tess_would_adapt = FALSE; 00754 done = FALSE; 00755 unlv_crunch_mode = CR_NONE; 00756 small_caps = false; 00757 italic = FALSE; 00758 bold = FALSE; 00759 // The fontinfos and tesseract count as non-pointers as they point to 00760 // data owned elsewhere. 00761 fontinfo = NULL; 00762 fontinfo2 = NULL; 00763 tesseract = NULL; 00764 fontinfo_id_count = 0; 00765 fontinfo_id2_count = 0; 00766 x_height = 0.0; 00767 caps_height = 0.0; 00768 guessed_x_ht = TRUE; 00769 guessed_caps_ht = TRUE; 00770 combination = FALSE; 00771 part_of_combo = FALSE; 00772 reject_spaces = FALSE; 00773 } 00774 00775 void WERD_RES::InitPointers() { 00776 word = NULL; 00777 bln_boxes = NULL; 00778 uch_set = NULL; 00779 chopped_word = NULL; 00780 rebuild_word = NULL; 00781 box_word = NULL; 00782 seam_array = NULL; 00783 best_choice = NULL; 00784 raw_choice = NULL; 00785 ep_choice = NULL; 00786 blamer_bundle = NULL; 00787 } 00788 00789 void WERD_RES::Clear() { 00790 if (word != NULL && combination) { 00791 delete word; 00792 } 00793 word = NULL; 00794 delete blamer_bundle; 00795 blamer_bundle = NULL; 00796 ClearResults(); 00797 } 00798 00799 void WERD_RES::ClearResults() { 00800 done = false; 00801 fontinfo = NULL; 00802 fontinfo2 = NULL; 00803 fontinfo_id_count = 0; 00804 fontinfo_id2_count = 0; 00805 if (bln_boxes != NULL) { 00806 delete bln_boxes; 00807 bln_boxes = NULL; 00808 } 00809 if (chopped_word != NULL) { 00810 delete chopped_word; 00811 chopped_word = NULL; 00812 } 00813 if (rebuild_word != NULL) { 00814 delete rebuild_word; 00815 rebuild_word = NULL; 00816 } 00817 if (box_word != NULL) { 00818 delete box_word; 00819 box_word = NULL; 00820 } 00821 best_state.clear(); 00822 correct_text.clear(); 00823 if (seam_array != NULL) { 00824 free_seam_list(seam_array); 00825 seam_array = NULL; 00826 } 00827 if (best_choice != NULL) { 00828 delete best_choice; 00829 delete raw_choice; 00830 best_choice = NULL; 00831 raw_choice = NULL; 00832 } 00833 if (!alt_choices.empty()) { 00834 alt_choices.delete_data_pointers(); 00835 alt_choices.clear(); 00836 } 00837 alt_states.clear(); 00838 if (ep_choice != NULL) { 00839 delete ep_choice; 00840 ep_choice = NULL; 00841 } 00842 if (blamer_bundle != NULL) blamer_bundle->ClearResults(); 00843 } 00844 00845 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const { 00846 return word_res == other.word_res && 00847 row_res == other.row_res && 00848 block_res == other.block_res; 00849 } 00850 00851 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { 00852 ASSERT_HOST(page_res == other.page_res); 00853 if (other.block_res == NULL) { 00854 // other points to the end of the page. 00855 if (block_res == NULL) 00856 return 0; 00857 return -1; 00858 } 00859 if (block_res == NULL) { 00860 return 1; // we point to the end of the page. 00861 } 00862 if (block_res == other.block_res) { 00863 if (other.row_res == NULL || row_res == NULL) { 00864 // this should only happen if we hit an image block. 00865 return 0; 00866 } 00867 if (row_res == other.row_res) { 00868 // we point to the same block and row. 00869 ASSERT_HOST(other.word_res != NULL && word_res != NULL); 00870 if (word_res == other.word_res) { 00871 // we point to the same word! 00872 return 0; 00873 } 00874 00875 WERD_RES_IT word_res_it(&row_res->word_res_list); 00876 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 00877 word_res_it.forward()) { 00878 if (word_res_it.data() == word_res) { 00879 return -1; 00880 } else if (word_res_it.data() == other.word_res) { 00881 return 1; 00882 } 00883 } 00884 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 00885 } 00886 00887 // we both point to the same block, but different rows. 00888 ROW_RES_IT row_res_it(&block_res->row_res_list); 00889 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); 00890 row_res_it.forward()) { 00891 if (row_res_it.data() == row_res) { 00892 return -1; 00893 } else if (row_res_it.data() == other.row_res) { 00894 return 1; 00895 } 00896 } 00897 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 00898 } 00899 00900 // We point to different blocks. 00901 BLOCK_RES_IT block_res_it(&page_res->block_res_list); 00902 for (block_res_it.mark_cycle_pt(); 00903 !block_res_it.cycled_list(); block_res_it.forward()) { 00904 if (block_res_it.data() == block_res) { 00905 return -1; 00906 } else if (block_res_it.data() == other.block_res) { 00907 return 1; 00908 } 00909 } 00910 // Shouldn't happen... 00911 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 00912 return 0; 00913 } 00914 00915 // Inserts the new_word and a corresponding WERD_RES before the current 00916 // position. The simple fields of the WERD_RES are copied from clone_res and 00917 // the resulting WERD_RES is returned for further setup with best_choice etc. 00918 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, 00919 WERD* new_word) { 00920 // Insert new_word into the ROW. 00921 WERD_IT w_it(row()->row->word_list()); 00922 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00923 WERD* word = w_it.data(); 00924 if (word == word_res->word) 00925 break; 00926 } 00927 ASSERT_HOST(!w_it.cycled_list()); 00928 w_it.add_before_then_move(new_word); 00929 // Make a WERD_RES for the new_word. 00930 WERD_RES* new_res = new WERD_RES(new_word); 00931 new_res->CopySimpleFields(clone_res); 00932 // Insert into the appropriate place in the ROW_RES. 00933 WERD_RES_IT wr_it(&row()->word_res_list); 00934 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 00935 WERD_RES* word = wr_it.data(); 00936 if (word == word_res) 00937 break; 00938 } 00939 ASSERT_HOST(!wr_it.cycled_list()); 00940 wr_it.add_before_then_move(new_res); 00941 if (wr_it.at_first()) { 00942 // This is the new first word, so reset the member iterator so it 00943 // detects the cycled_list state correctly. 00944 ResetWordIterator(); 00945 } 00946 return new_res; 00947 } 00948 00949 // Deletes the current WERD_RES and its underlying WERD. 00950 void PAGE_RES_IT::DeleteCurrentWord() { 00951 // Check that this word is as we expect. part_of_combos are NEVER iterated 00952 // by the normal iterator, so we should never be trying to delete them. 00953 ASSERT_HOST(!word_res->part_of_combo); 00954 if (!word_res->combination) { 00955 // Combinations own their own word, so we won't find the word on the 00956 // row's word_list, but it is legitimate to try to delete them. 00957 // Delete word from the ROW when not a combination. 00958 WERD_IT w_it(row()->row->word_list()); 00959 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00960 if (w_it.data() == word_res->word) { 00961 break; 00962 } 00963 } 00964 ASSERT_HOST(!w_it.cycled_list()); 00965 delete w_it.extract(); 00966 } 00967 // Remove the WERD_RES for the new_word. 00968 // Remove the WORD_RES from the ROW_RES. 00969 WERD_RES_IT wr_it(&row()->word_res_list); 00970 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 00971 if (wr_it.data() == word_res) { 00972 word_res = NULL; 00973 break; 00974 } 00975 } 00976 ASSERT_HOST(!wr_it.cycled_list()); 00977 delete wr_it.extract(); 00978 ResetWordIterator(); 00979 } 00980 00981 /************************************************************************* 00982 * PAGE_RES_IT::restart_page 00983 * 00984 * Set things up at the start of the page 00985 *************************************************************************/ 00986 00987 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { 00988 block_res_it.set_to_list(&page_res->block_res_list); 00989 block_res_it.mark_cycle_pt(); 00990 prev_block_res = NULL; 00991 prev_row_res = NULL; 00992 prev_word_res = NULL; 00993 block_res = NULL; 00994 row_res = NULL; 00995 word_res = NULL; 00996 next_block_res = NULL; 00997 next_row_res = NULL; 00998 next_word_res = NULL; 00999 internal_forward(true, empty_ok); 01000 return internal_forward(false, empty_ok); 01001 } 01002 01003 // Recovers from operations on the current word, such as in InsertCloneWord 01004 // and DeleteCurrentWord. 01005 // Resets the word_res_it so that it is one past the next_word_res, as 01006 // it should be after internal_forward. If next_row_res != row_res, 01007 // then the next_word_res is in the next row, so there is no need to do 01008 // anything, since operations on the current word will not have disturbed 01009 // the word_res_it. 01010 void PAGE_RES_IT::ResetWordIterator() { 01011 if (row_res == next_row_res) { 01012 // Reset the member iterator so it can move forward and detect the 01013 // cycled_list state correctly. 01014 word_res_it.move_to_first(); 01015 word_res_it.mark_cycle_pt(); 01016 while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) 01017 word_res_it.forward(); 01018 ASSERT_HOST(!word_res_it.cycled_list()); 01019 word_res_it.forward(); 01020 } 01021 } 01022 01023 /************************************************************************* 01024 * PAGE_RES_IT::internal_forward 01025 * 01026 * Find the next word on the page. If empty_ok is true, then non-text blocks 01027 * and text blocks with no text are visited as if they contain a single 01028 * imaginary word in a single imaginary row. (word() and row() both return NULL 01029 * in such a block and the return value is NULL.) 01030 * If empty_ok is false, the old behaviour is maintained. Each real word 01031 * is visited and empty and non-text blocks and rows are skipped. 01032 * new_block is used to initialize the iterators for a new block. 01033 * The iterator maintains pointers to block, row and word for the previous, 01034 * current and next words. These are correct, regardless of block/row 01035 * boundaries. NULL values denote start and end of the page. 01036 *************************************************************************/ 01037 01038 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { 01039 bool new_row = false; 01040 01041 prev_block_res = block_res; 01042 prev_row_res = row_res; 01043 prev_word_res = word_res; 01044 block_res = next_block_res; 01045 row_res = next_row_res; 01046 word_res = next_word_res; 01047 next_block_res = NULL; 01048 next_row_res = NULL; 01049 next_word_res = NULL; 01050 01051 while (!block_res_it.cycled_list()) { 01052 if (new_block) { 01053 new_block = false; 01054 row_res_it.set_to_list(&block_res_it.data()->row_res_list); 01055 row_res_it.mark_cycle_pt(); 01056 if (row_res_it.empty() && empty_ok) { 01057 next_block_res = block_res_it.data(); 01058 break; 01059 } 01060 new_row = true; 01061 } 01062 while (!row_res_it.cycled_list()) { 01063 if (new_row) { 01064 new_row = false; 01065 word_res_it.set_to_list(&row_res_it.data()->word_res_list); 01066 word_res_it.mark_cycle_pt(); 01067 } 01068 // Skip any part_of_combo words. 01069 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) 01070 word_res_it.forward(); 01071 if (!word_res_it.cycled_list()) { 01072 next_block_res = block_res_it.data(); 01073 next_row_res = row_res_it.data(); 01074 next_word_res = word_res_it.data(); 01075 word_res_it.forward(); 01076 goto foundword; 01077 } 01078 // end of row reached 01079 row_res_it.forward(); 01080 new_row = true; 01081 } 01082 // end of block reached 01083 block_res_it.forward(); 01084 new_block = true; 01085 } 01086 foundword: 01087 // Update prev_word_best_choice pointer. 01088 if (page_res != NULL && page_res->prev_word_best_choice != NULL) { 01089 *page_res->prev_word_best_choice = 01090 (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice; 01091 } 01092 return word_res; 01093 } 01094 01095 /************************************************************************* 01096 * PAGE_RES_IT::restart_row() 01097 * 01098 * Move to the beginning (leftmost word) of the current row. 01099 *************************************************************************/ 01100 WERD_RES *PAGE_RES_IT::restart_row() { 01101 ROW_RES *row = this->row(); 01102 if (!row) return NULL; 01103 for (restart_page(); this->row() != row; forward()) { 01104 // pass 01105 } 01106 return word(); 01107 } 01108 01109 /************************************************************************* 01110 * PAGE_RES_IT::forward_paragraph 01111 * 01112 * Move to the beginning of the next paragraph, allowing empty blocks. 01113 *************************************************************************/ 01114 01115 WERD_RES *PAGE_RES_IT::forward_paragraph() { 01116 while (block_res == next_block_res && 01117 (next_row_res != NULL && next_row_res->row != NULL && 01118 row_res->row->para() == next_row_res->row->para())) { 01119 internal_forward(false, true); 01120 } 01121 return internal_forward(false, true); 01122 } 01123 01124 /************************************************************************* 01125 * PAGE_RES_IT::forward_block 01126 * 01127 * Move to the beginning of the next block, allowing empty blocks. 01128 *************************************************************************/ 01129 01130 WERD_RES *PAGE_RES_IT::forward_block() { 01131 while (block_res == next_block_res) { 01132 internal_forward(false, true); 01133 } 01134 return internal_forward(false, true); 01135 } 01136 01137 void PAGE_RES_IT::rej_stat_word() { 01138 inT16 chars_in_word; 01139 inT16 rejects_in_word = 0; 01140 01141 chars_in_word = word_res->reject_map.length (); 01142 page_res->char_count += chars_in_word; 01143 block_res->char_count += chars_in_word; 01144 row_res->char_count += chars_in_word; 01145 01146 rejects_in_word = word_res->reject_map.reject_count (); 01147 01148 page_res->rej_count += rejects_in_word; 01149 block_res->rej_count += rejects_in_word; 01150 row_res->rej_count += rejects_in_word; 01151 if (chars_in_word == rejects_in_word) 01152 row_res->whole_word_rej_count += rejects_in_word; 01153 }