Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: chopper.c (Formerly chopper.c) 00005 * Description: 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 **************************************************************************/ 00025 00026 /*---------------------------------------------------------------------- 00027 I n c l u d e s 00028 ----------------------------------------------------------------------*/ 00029 00030 #include <math.h> 00031 00032 #include "chopper.h" 00033 00034 #include "assert.h" 00035 #include "associate.h" 00036 #include "callcpp.h" 00037 #include "const.h" 00038 #include "findseam.h" 00039 #include "freelist.h" 00040 #include "globals.h" 00041 #include "makechop.h" 00042 #include "render.h" 00043 #include "pageres.h" 00044 #include "permute.h" 00045 #include "seam.h" 00046 #include "stopper.h" 00047 #include "structures.h" 00048 #include "unicharset.h" 00049 #include "wordclass.h" 00050 #include "wordrec.h" 00051 00052 // Include automatically generated configuration file if running autoconf. 00053 #ifdef HAVE_CONFIG_H 00054 #include "config_auto.h" 00055 #endif 00056 00057 /*---------------------------------------------------------------------- 00058 F u n c t i o n s 00059 ----------------------------------------------------------------------*/ 00065 void preserve_outline(EDGEPT *start) { 00066 EDGEPT *srcpt; 00067 00068 if (start == NULL) 00069 return; 00070 srcpt = start; 00071 do { 00072 srcpt->flags[1] = 1; 00073 srcpt = srcpt->next; 00074 } 00075 while (srcpt != start); 00076 srcpt->flags[1] = 2; 00077 } 00078 00079 00080 /**************************************************************************/ 00081 void preserve_outline_tree(TESSLINE *srcline) { 00082 TESSLINE *outline; 00083 00084 for (outline = srcline; outline != NULL; outline = outline->next) { 00085 preserve_outline (outline->loop); 00086 } 00087 } 00088 00089 00095 EDGEPT *restore_outline(EDGEPT *start) { 00096 EDGEPT *srcpt; 00097 EDGEPT *real_start; 00098 EDGEPT *deadpt; 00099 00100 if (start == NULL) 00101 return NULL; 00102 srcpt = start; 00103 do { 00104 if (srcpt->flags[1] == 2) 00105 break; 00106 srcpt = srcpt->next; 00107 } 00108 while (srcpt != start); 00109 real_start = srcpt; 00110 do { 00111 if (srcpt->flags[1] == 0) { 00112 deadpt = srcpt; 00113 srcpt = srcpt->next; 00114 srcpt->prev = deadpt->prev; 00115 deadpt->prev->next = srcpt; 00116 deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x; 00117 deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y; 00118 delete deadpt; 00119 } 00120 else 00121 srcpt = srcpt->next; 00122 } 00123 while (srcpt != real_start); 00124 return real_start; 00125 } 00126 00127 00128 /******************************************************************************/ 00129 void restore_outline_tree(TESSLINE *srcline) { 00130 TESSLINE *outline; 00131 00132 for (outline = srcline; outline != NULL; outline = outline->next) { 00133 outline->loop = restore_outline (outline->loop); 00134 outline->start = outline->loop->pos; 00135 } 00136 } 00137 00138 00145 namespace tesseract { 00146 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, 00147 bool italic_blob, SEAMS seam_list) { 00148 TBLOB *next_blob = blob->next; 00149 TBLOB *other_blob; 00150 SEAM *seam; 00151 00152 if (repair_unchopped_blobs) 00153 preserve_outline_tree (blob->outlines); 00154 other_blob = new TBLOB; /* Make new blob */ 00155 other_blob->next = blob->next; 00156 other_blob->outlines = NULL; 00157 blob->next = other_blob; 00158 00159 seam = NULL; 00160 if (prioritize_division) { 00161 TPOINT location; 00162 if (divisible_blob(blob, italic_blob, &location)) { 00163 seam = new_seam(0.0f, location, NULL, NULL, NULL); 00164 } 00165 } 00166 if (seam == NULL) 00167 seam = pick_good_seam(blob); 00168 if (seam == NULL && word->latin_script) { 00169 // If the blob can simply be divided into outlines, then do that. 00170 TPOINT location; 00171 if (divisible_blob(blob, italic_blob, &location)) { 00172 seam = new_seam(0.0f, location, NULL, NULL, NULL); 00173 } 00174 } 00175 if (chop_debug) { 00176 if (seam != NULL) { 00177 print_seam ("Good seam picked=", seam); 00178 } 00179 else 00180 cprintf ("\n** no seam picked *** \n"); 00181 } 00182 if (seam) { 00183 apply_seam(blob, other_blob, italic_blob, seam); 00184 } 00185 00186 if ((seam == NULL) || 00187 (blob->outlines == NULL) || 00188 (other_blob->outlines == NULL) || 00189 total_containment (blob, other_blob) || 00190 check_blob (other_blob) || 00191 !(check_seam_order (blob, seam) && 00192 check_seam_order (other_blob, seam)) || 00193 any_shared_split_points (seam_list, seam) || 00194 !test_insert_seam(seam_list, blob_number, blob, word->blobs)) { 00195 00196 blob->next = next_blob; 00197 if (seam) { 00198 undo_seam(blob, other_blob, seam); 00199 delete_seam(seam); 00200 #ifndef GRAPHICS_DISABLED 00201 if (chop_debug) { 00202 if (chop_debug >2) 00203 display_blob(blob, Red); 00204 cprintf ("\n** seam being removed ** \n"); 00205 } 00206 #endif 00207 } else { 00208 delete other_blob; 00209 } 00210 00211 if (repair_unchopped_blobs) 00212 restore_outline_tree (blob->outlines); 00213 return (NULL); 00214 } 00215 return (seam); 00216 } 00217 00218 00219 SEAM *Wordrec::chop_numbered_blob(TWERD *word, inT32 blob_number, 00220 bool italic_blob, SEAMS seam_list) { 00221 TBLOB *blob; 00222 inT16 x; 00223 00224 blob = word->blobs; 00225 for (x = 0; x < blob_number; x++) 00226 blob = blob->next; 00227 00228 return attempt_blob_chop(word, blob, blob_number, 00229 italic_blob, seam_list); 00230 } 00231 00232 00233 SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes, 00234 WERD_RES *word_res, inT32 *blob_number, 00235 bool italic_blob, SEAMS seam_list) { 00236 TWERD *word = word_res->chopped_word; 00237 TBLOB *blob; 00238 00239 *blob_number = 0; 00240 blob = word->blobs; 00241 while (blob != NULL) { 00242 TPOINT topleft, botright; 00243 topleft.x = blob->bounding_box().left(); 00244 topleft.y = blob->bounding_box().top(); 00245 botright.x = blob->bounding_box().right(); 00246 botright.y = blob->bounding_box().bottom(); 00247 00248 TPOINT original_topleft, original_botright; 00249 word_res->denorm.DenormTransform(topleft, &original_topleft); 00250 word_res->denorm.DenormTransform(botright, &original_botright); 00251 00252 TBOX original_box = TBOX(original_topleft.x, original_botright.y, 00253 original_botright.x, original_topleft.y); 00254 00255 bool almost_equal_box = false; 00256 int num_overlap = 0; 00257 for (int i = 0; i < boxes.size(); i++) { 00258 if (original_box.overlap_fraction(boxes[i]) > 0.125) 00259 num_overlap++; 00260 if (original_box.almost_equal(boxes[i], 3)) 00261 almost_equal_box = true; 00262 } 00263 00264 TPOINT location; 00265 if (divisible_blob(blob, italic_blob, &location) || 00266 (!almost_equal_box && num_overlap > 1)) { 00267 SEAM *seam = attempt_blob_chop(word, blob, *blob_number, 00268 italic_blob, seam_list); 00269 if (seam != NULL) 00270 return seam; 00271 } 00272 00273 *blob_number = *blob_number + 1; 00274 blob = blob->next; 00275 } 00276 00277 *blob_number = -1; 00278 return NULL; 00279 } 00280 00281 } // namespace tesseract 00282 00283 00289 int any_shared_split_points(SEAMS seam_list, SEAM *seam) { 00290 int length; 00291 int index; 00292 00293 length = array_count (seam_list); 00294 for (index = 0; index < length; index++) 00295 if (shared_split_points ((SEAM *) array_value (seam_list, index), seam)) 00296 return TRUE; 00297 return FALSE; 00298 } 00299 00300 00306 int check_blob(TBLOB *blob) { 00307 TESSLINE *outline; 00308 EDGEPT *edgept; 00309 00310 for (outline = blob->outlines; outline != NULL; outline = outline->next) { 00311 edgept = outline->loop; 00312 do { 00313 if (edgept == NULL) 00314 break; 00315 edgept = edgept->next; 00316 } 00317 while (edgept != outline->loop); 00318 if (edgept == NULL) 00319 return 1; 00320 } 00321 return 0; 00322 } 00323 00324 00325 namespace tesseract { 00332 bool Wordrec::improve_one_blob(WERD_RES *word_res, 00333 BLOB_CHOICE_LIST_VECTOR *char_choices, 00334 inT32 *blob_number, 00335 SEAMS *seam_list, 00336 DANGERR *fixpt, 00337 bool split_next_to_fragment, 00338 BlamerBundle *blamer_bundle) { 00339 TWERD* word = word_res->chopped_word; 00340 TBLOB *blob; 00341 inT16 x = 0; 00342 float rating_ceiling = MAX_FLOAT32; 00343 BLOB_CHOICE_LIST *answer; 00344 BLOB_CHOICE_IT answer_it; 00345 SEAM *seam; 00346 00347 do { 00348 *blob_number = select_blob_to_split_from_fixpt(fixpt); 00349 bool split_point_from_dict = (*blob_number != -1); 00350 if (split_point_from_dict) { 00351 fixpt->clear(); 00352 } else { 00353 *blob_number = select_blob_to_split(*char_choices, rating_ceiling, 00354 split_next_to_fragment); 00355 } 00356 if (chop_debug) 00357 cprintf("blob_number = %d\n", *blob_number); 00358 if (*blob_number == -1) 00359 return false; 00360 00361 // TODO(rays) it may eventually help to allow italic_blob to be true, 00362 seam = chop_numbered_blob(word, *blob_number, false, *seam_list); 00363 if (seam != NULL) 00364 break; 00365 /* Must split null blobs */ 00366 answer = char_choices->get(*blob_number); 00367 if (answer == NULL) 00368 return false; 00369 answer_it.set_to_list(answer); 00370 if (!split_point_from_dict) { 00371 // We chopped the worst rated blob, try something else next time. 00372 rating_ceiling = answer_it.data()->rating(); 00373 } 00374 } while (true); 00375 /* Split OK */ 00376 for (blob = word->blobs; x < *blob_number; x++) { 00377 blob = blob->next; 00378 } 00379 00380 *seam_list = 00381 insert_seam (*seam_list, *blob_number, seam, blob, word->blobs); 00382 00383 delete char_choices->get(*blob_number); 00384 00385 answer = classify_blob(blob, word_res->denorm, "improve 1:", Red, 00386 blamer_bundle); 00387 char_choices->insert(answer, *blob_number); 00388 00389 answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow, 00390 blamer_bundle); 00391 char_choices->set(answer, *blob_number + 1); 00392 00393 return true; 00394 } 00395 00403 void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer, 00404 int chop_index) { 00405 char chop_index_string[2]; 00406 if (chop_index <= 9) { 00407 snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index); 00408 } else { 00409 chop_index_string[0] = static_cast<char>('A' - 10 + chop_index); 00410 chop_index_string[1] = '\0'; 00411 } 00412 UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string); 00413 if (unichar_id == INVALID_UNICHAR_ID) { 00414 // If the word is very long, we might exhaust the possibilities. 00415 unichar_id = 1; 00416 } 00417 BLOB_CHOICE_IT answer_it(answer); 00418 BLOB_CHOICE *modified_blob = 00419 new BLOB_CHOICE(unichar_id, 00420 answer_it.data()->rating(), 00421 answer_it.data()->certainty(), 00422 answer_it.data()->fontinfo_id(), 00423 answer_it.data()->fontinfo_id2(), 00424 answer_it.data()->script_id(), 00425 answer_it.data()->min_xheight(), 00426 answer_it.data()->max_xheight(), 00427 answer_it.data()->adapted()); 00428 answer->clear(); 00429 answer_it.set_to_list(answer); 00430 answer_it.add_after_then_move(modified_blob); 00431 } 00432 00433 00441 bool Wordrec::chop_one_blob(TWERD *word, 00442 BLOB_CHOICE_LIST_VECTOR *char_choices, 00443 inT32 *blob_number, 00444 SEAMS *seam_list, 00445 int *right_chop_index) { 00446 TBLOB *blob; 00447 inT16 x = 0; 00448 float rating_ceiling = MAX_FLOAT32; 00449 BLOB_CHOICE_LIST *answer; 00450 BLOB_CHOICE_IT answer_it; 00451 SEAM *seam; 00452 UNICHAR_ID unichar_id = 0; 00453 int left_chop_index = 0; 00454 00455 do { 00456 *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false); 00457 if (chop_debug) 00458 cprintf("blob_number = %d\n", *blob_number); 00459 if (*blob_number == -1) 00460 return false; 00461 seam = chop_numbered_blob(word, *blob_number, true, *seam_list); 00462 if (seam != NULL) 00463 break; 00464 /* Must split null blobs */ 00465 answer = char_choices->get(*blob_number); 00466 if (answer == NULL) 00467 return false; 00468 answer_it.set_to_list(answer); 00469 rating_ceiling = answer_it.data()->rating(); // try a different blob 00470 } while (true); 00471 /* Split OK */ 00472 for (blob = word->blobs; x < *blob_number; x++) { 00473 blob = blob->next; 00474 } 00475 if (chop_debug) { 00476 tprintf("Chop made blob1:"); 00477 blob->bounding_box().print(); 00478 tprintf("and blob2:"); 00479 blob->next->bounding_box().print(); 00480 } 00481 *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs); 00482 00483 answer = char_choices->get(*blob_number); 00484 answer_it.set_to_list(answer); 00485 unichar_id = answer_it.data()->unichar_id(); 00486 float rating = answer_it.data()->rating() / exp(1.0); 00487 left_chop_index = atoi(unicharset.id_to_unichar(unichar_id)); 00488 00489 delete char_choices->get(*blob_number); 00490 // combine confidence w/ serial # 00491 answer = fake_classify_blob(0, rating, -rating); 00492 modify_blob_choice(answer, left_chop_index); 00493 char_choices->insert(answer, *blob_number); 00494 00495 answer = fake_classify_blob(0, rating - 0.125f, -rating); 00496 modify_blob_choice(answer, ++*right_chop_index); 00497 char_choices->set(answer, *blob_number + 1); 00498 return true; 00499 } 00500 00501 00502 bool Wordrec::chop_one_blob2(const GenericVector<TBOX>& boxes, 00503 WERD_RES *word_res, 00504 SEAMS *seam_list) { 00505 inT32 blob_number; 00506 inT16 x = 0; 00507 TBLOB *blob; 00508 SEAM *seam; 00509 00510 seam = chop_overlapping_blob(boxes, word_res, &blob_number, 00511 true, *seam_list); 00512 if (seam == NULL) 00513 return false; 00514 00515 /* Split OK */ 00516 for (blob = word_res->chopped_word->blobs; x < blob_number; x++) { 00517 blob = blob->next; 00518 } 00519 if (chop_debug) { 00520 tprintf("Chop made blob1:"); 00521 blob->bounding_box().print(); 00522 tprintf("and blob2:"); 00523 blob->next->bounding_box().print(); 00524 } 00525 *seam_list = insert_seam(*seam_list, blob_number, seam, blob, 00526 word_res->chopped_word->blobs); 00527 return true; 00528 } 00529 } // namespace tesseract 00530 00539 inT16 check_seam_order(TBLOB *blob, SEAM *seam) { 00540 TESSLINE *outline; 00541 TESSLINE *last_outline; 00542 inT8 found_em[3]; 00543 00544 if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL) 00545 return (TRUE); 00546 00547 found_em[0] = found_em[1] = found_em[2] = FALSE; 00548 00549 for (outline = blob->outlines; outline; outline = outline->next) { 00550 if (!found_em[0] && 00551 ((seam->split1 == NULL) || 00552 is_split_outline (outline, seam->split1))) { 00553 found_em[0] = TRUE; 00554 } 00555 if (!found_em[1] && 00556 ((seam->split2 == NULL) || 00557 is_split_outline (outline, seam->split2))) { 00558 found_em[1] = TRUE; 00559 } 00560 if (!found_em[2] && 00561 ((seam->split3 == NULL) || 00562 is_split_outline (outline, seam->split3))) { 00563 found_em[2] = TRUE; 00564 } 00565 last_outline = outline; 00566 } 00567 00568 if (!found_em[0] || !found_em[1] || !found_em[2]) 00569 return (FALSE); 00570 else 00571 return (TRUE); 00572 } 00573 00574 namespace tesseract { 00583 BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) { 00584 TBLOB *blob; 00585 int index; 00586 int did_chopping; 00587 STATE state; 00588 BLOB_CHOICE_LIST *match_result; 00589 MATRIX *ratings = NULL; 00590 DANGERR fixpt; /*dangerous ambig */ 00591 inT32 bit_count; //no of bits 00592 00593 BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); 00594 BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR(); 00595 00596 did_chopping = 0; 00597 for (blob = word->chopped_word->blobs, index = 0; 00598 blob != NULL; blob = blob->next, index++) { 00599 match_result = classify_blob(blob, word->denorm, "chop_word:", Green, 00600 word->blamer_bundle); 00601 if (match_result == NULL) 00602 cprintf("Null classifier output!\n"); 00603 *char_choices += match_result; 00604 } 00605 bit_count = index - 1; 00606 set_n_ones(&state, char_choices->length() - 1); 00607 bool acceptable = false; 00608 bool replaced = false; 00609 bool best_choice_updated = 00610 getDict().permute_characters(*char_choices, word->best_choice, 00611 word->raw_choice); 00612 if (best_choice_updated && 00613 getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt, 00614 CHOPPER_CALLER, &replaced)) { 00615 acceptable = true; 00616 } 00617 if (replaced) 00618 update_blob_classifications(word->chopped_word, *char_choices); 00619 CopyCharChoices(*char_choices, best_char_choices); 00620 if (!acceptable) { // do more work to find a better choice 00621 did_chopping = 1; 00622 00623 bool best_choice_acceptable = false; 00624 if (chop_enable) 00625 improve_by_chopping(word, 00626 char_choices, 00627 &state, 00628 best_char_choices, 00629 &fixpt, 00630 &best_choice_acceptable); 00631 if (chop_debug) 00632 print_seams ("Final seam list:", word->seam_array); 00633 00634 if (word->blamer_bundle != NULL && 00635 !ChoiceIsCorrect(*word->uch_set, word->best_choice, 00636 word->blamer_bundle->truth_text)) { 00637 set_chopper_blame(word); 00638 } 00639 00640 // The force_word_assoc is almost redundant to enable_assoc. However, 00641 // it is not conditioned on the dict behavior. For CJK, we need to force 00642 // the associator to be invoked. When we figure out the exact behavior 00643 // of dict on CJK, we can remove the flag if it turns out to be redundant. 00644 if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) { 00645 ratings = word_associator(false, word, &state, best_char_choices, 00646 &fixpt, &state); 00647 } 00648 } 00649 best_char_choices = rebuild_current_state(word, &state, best_char_choices, 00650 ratings); 00651 00652 // If after running only the chopper best_choice is incorrect and no blame 00653 // has been yet set, blame the classifier if best_choice is classifier's 00654 // top choice and is a dictionary word (i.e. language model could not have 00655 // helped). Otherwise blame the tradeoff between the classifier and 00656 // the old language model (permuters). 00657 if (word->blamer_bundle != NULL && 00658 word->blamer_bundle->incorrect_result_reason == IRR_CORRECT && 00659 ratings == NULL && // only the chopper was run 00660 !ChoiceIsCorrect(*word->uch_set, word->best_choice, 00661 word->blamer_bundle->truth_text)) { 00662 if (word->best_choice != NULL && 00663 Dict::valid_word_permuter(word->best_choice->permuter(), false)) { 00664 // Find out whether best choice is a top choice. 00665 word->blamer_bundle->best_choice_is_dict_and_top_choice = true; 00666 for (int i = 0; i < word->best_choice->length(); ++i) { 00667 BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i)); 00668 ASSERT_HOST(!blob_choice_it.empty()); 00669 BLOB_CHOICE *first_choice = NULL; 00670 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 00671 blob_choice_it.forward()) { // find first non-fragment choice 00672 if (!(getDict().getUnicharset().get_fragment( 00673 blob_choice_it.data()->unichar_id()))) { 00674 first_choice = blob_choice_it.data(); 00675 break; 00676 } 00677 } 00678 ASSERT_HOST(first_choice != NULL); 00679 if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) { 00680 word->blamer_bundle->best_choice_is_dict_and_top_choice = false; 00681 break; 00682 } 00683 } 00684 } 00685 STRING debug; 00686 if (word->blamer_bundle->best_choice_is_dict_and_top_choice) { 00687 debug = "Best choice is: incorrect, top choice, dictionary word"; 00688 debug += " with permuter "; 00689 debug += word->best_choice->permuter_name(); 00690 } else { 00691 debug = "Classifier/Old LM tradeoff is to blame"; 00692 } 00693 word->blamer_bundle->SetBlame( 00694 word->blamer_bundle->best_choice_is_dict_and_top_choice ? 00695 IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF, 00696 debug, word->best_choice, wordrec_debug_blamer); 00697 } 00698 00699 if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) { 00700 if (ratings == NULL) { 00701 ratings = word_associator(true, word, NULL, NULL, NULL, NULL); 00702 } 00703 CallFillLattice(*ratings, getDict().getBestChoices(), 00704 *word->uch_set, word->blamer_bundle); 00705 } 00706 if (ratings != NULL) { 00707 if (wordrec_debug_level > 0) { 00708 tprintf("Final Ratings Matrix:\n"); 00709 ratings->print(getDict().getUnicharset()); 00710 } 00711 ratings->delete_matrix_pointers(); 00712 delete ratings; 00713 } 00714 getDict().FilterWordChoices(); 00715 // TODO(antonova, eger): check that FilterWordChoices() does not filter 00716 // out anything useful for word bigram or phrase search. 00717 // TODO(antonova, eger): when implementing word bigram and phrase search 00718 // we will need to think carefully about how to replace a word with its 00719 // alternative choice. 00720 // In particular it might be required to save the segmentation state 00721 // associated with the word, so that best_char_choices could be updated 00722 // by rebuild_current_state() correctly. 00723 if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word); 00724 char_choices->delete_data_pointers(); 00725 delete char_choices; 00726 00727 return best_char_choices; 00728 } 00729 00730 00731 00741 void Wordrec::improve_by_chopping(WERD_RES *word, 00742 BLOB_CHOICE_LIST_VECTOR *char_choices, 00743 STATE *best_state, 00744 BLOB_CHOICE_LIST_VECTOR *best_char_choices, 00745 DANGERR *fixpt, 00746 bool *best_choice_acceptable) { 00747 inT32 blob_number; 00748 float old_best; 00749 bool updated_best_choice = false; 00750 00751 while (1) { // improvement loop 00752 old_best = word->best_choice->rating(); 00753 if (improve_one_blob(word, char_choices, 00754 &blob_number, &word->seam_array, 00755 fixpt, (fragments_guide_chopper && 00756 word->best_choice->fragment_mark()), 00757 word->blamer_bundle)) { 00758 getDict().LogNewSplit(blob_number); 00759 updated_best_choice = 00760 getDict().permute_characters(*char_choices, word->best_choice, 00761 word->raw_choice); 00762 00763 if (old_best > word->best_choice->rating()) { 00764 set_n_ones(best_state, char_choices->length() - 1); 00765 } else { 00766 insert_new_chunk(best_state, blob_number, char_choices->length() - 2); 00767 fixpt->clear(); 00768 } 00769 00770 if (chop_debug) 00771 print_state("best state = ", 00772 best_state, count_blobs(word->chopped_word->blobs) - 1); 00773 } else { 00774 break; 00775 } 00776 00777 // Check if we should break from the loop. 00778 bool done = false; 00779 bool replaced = false; 00780 if ((updated_best_choice && 00781 (*best_choice_acceptable = 00782 getDict().AcceptableChoice(char_choices, word->best_choice, 00783 fixpt, CHOPPER_CALLER, &replaced))) || 00784 char_choices->length() >= MAX_NUM_CHUNKS) { 00785 done = true; 00786 } 00787 if (replaced) update_blob_classifications(word->chopped_word, 00788 *char_choices); 00789 if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices); 00790 if (done) break; 00791 } 00792 } 00793 00794 00795 /********************************************************************** 00796 * select_blob_to_split 00797 * 00798 * These are the results of the last classification. Find a likely 00799 * place to apply splits. If none, return -1. 00800 **********************************************************************/ 00801 inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00802 float rating_ceiling, 00803 bool split_next_to_fragment) { 00804 BLOB_CHOICE_IT blob_choice_it; 00805 BLOB_CHOICE *blob_choice; 00806 BLOB_CHOICE_IT temp_it; 00807 int x; 00808 float worst = -MAX_FLOAT32; 00809 int worst_index = -1; 00810 float worst_near_fragment = -MAX_FLOAT32; 00811 int worst_index_near_fragment = -1; 00812 const CHAR_FRAGMENT **fragments = NULL; 00813 00814 if (chop_debug) { 00815 if (rating_ceiling < MAX_FLOAT32) 00816 cprintf("rating_ceiling = %8.4f\n", rating_ceiling); 00817 else 00818 cprintf("rating_ceiling = No Limit\n"); 00819 } 00820 00821 if (split_next_to_fragment && char_choices.length() > 0) { 00822 fragments = new const CHAR_FRAGMENT *[char_choices.length()]; 00823 if (char_choices.get(0) != NULL) { 00824 temp_it.set_to_list(char_choices.get(0)); 00825 fragments[0] = getDict().getUnicharset().get_fragment( 00826 temp_it.data()->unichar_id()); 00827 } else { 00828 fragments[0] = NULL; 00829 } 00830 } 00831 00832 for (x = 0; x < char_choices.length(); ++x) { 00833 if (char_choices.get(x) == NULL) { 00834 if (fragments != NULL) { 00835 delete[] fragments; 00836 } 00837 return x; 00838 } else { 00839 blob_choice_it.set_to_list(char_choices.get(x)); 00840 blob_choice = blob_choice_it.data(); 00841 // Populate fragments for the following position. 00842 if (split_next_to_fragment && x+1 < char_choices.length()) { 00843 if (char_choices.get(x+1) != NULL) { 00844 temp_it.set_to_list(char_choices.get(x+1)); 00845 fragments[x+1] = getDict().getUnicharset().get_fragment( 00846 temp_it.data()->unichar_id()); 00847 } else { 00848 fragments[x+1] = NULL; 00849 } 00850 } 00851 if (blob_choice->rating() < rating_ceiling && 00852 blob_choice->certainty() < tessedit_certainty_threshold) { 00853 // Update worst and worst_index. 00854 if (blob_choice->rating() > worst) { 00855 worst_index = x; 00856 worst = blob_choice->rating(); 00857 } 00858 if (split_next_to_fragment) { 00859 // Update worst_near_fragment and worst_index_near_fragment. 00860 bool expand_following_fragment = 00861 (x + 1 < char_choices.length() && 00862 fragments[x+1] != NULL && !fragments[x+1]->is_beginning()); 00863 bool expand_preceding_fragment = 00864 (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending()); 00865 if ((expand_following_fragment || expand_preceding_fragment) && 00866 blob_choice->rating() > worst_near_fragment) { 00867 worst_index_near_fragment = x; 00868 worst_near_fragment = blob_choice->rating(); 00869 if (chop_debug) { 00870 cprintf("worst_index_near_fragment=%d" 00871 " expand_following_fragment=%d" 00872 " expand_preceding_fragment=%d\n", 00873 worst_index_near_fragment, 00874 expand_following_fragment, 00875 expand_preceding_fragment); 00876 } 00877 } 00878 } 00879 } 00880 } 00881 } 00882 if (fragments != NULL) { 00883 delete[] fragments; 00884 } 00885 // TODO(daria): maybe a threshold of badness for 00886 // worst_near_fragment would be useful. 00887 return worst_index_near_fragment != -1 ? 00888 worst_index_near_fragment : worst_index; 00889 } 00890 00891 /********************************************************************** 00892 * select_blob_to_split_from_fixpt 00893 * 00894 * Given the fix point from a dictionary search, if there is a single 00895 * dangerous blob that maps to multiple characters, return that blob 00896 * index as a place we need to split. If none, return -1. 00897 **********************************************************************/ 00898 inT16 Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) { 00899 if (!fixpt) 00900 return -1; 00901 for (int i = 0; i < fixpt->size(); i++) { 00902 if ((*fixpt)[i].begin == (*fixpt)[i].end && 00903 (*fixpt)[i].dangerous && 00904 (*fixpt)[i].correct_is_ngram) { 00905 return (*fixpt)[i].begin; 00906 } 00907 } 00908 return -1; 00909 } 00910 00911 /********************************************************************** 00912 * set_chopper_blame 00913 * 00914 * Check whether chops were made at all the character bounding box boundaries 00915 * in word->truth_word. If not - blame the chopper for an incorrect answer. 00916 **********************************************************************/ 00917 void Wordrec::set_chopper_blame(WERD_RES *word) { 00918 BlamerBundle *blamer_bundle = word->blamer_bundle; 00919 assert(blamer_bundle != NULL); 00920 if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) || 00921 word->chopped_word->blobs == NULL) { 00922 return; 00923 } 00924 STRING debug; 00925 bool missing_chop = false; 00926 TBLOB * curr_blob = word->chopped_word->blobs; 00927 int b = 0; 00928 inT16 truth_x; 00929 while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) { 00930 truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right(); 00931 if (curr_blob->bounding_box().right() < 00932 (truth_x - blamer_bundle->norm_box_tolerance)) { 00933 curr_blob = curr_blob->next; 00934 continue; // encountered an extra chop, keep looking 00935 } else if (curr_blob->bounding_box().right() > 00936 (truth_x + blamer_bundle->norm_box_tolerance)) { 00937 missing_chop = true; 00938 break; 00939 } else { 00940 curr_blob = curr_blob->next; 00941 ++b; 00942 } 00943 } 00944 if (missing_chop || b < blamer_bundle->norm_truth_word.length()) { 00945 STRING debug; 00946 char debug_buffer[256]; 00947 if (missing_chop) { 00948 sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ", 00949 blamer_bundle->norm_box_tolerance); 00950 debug += debug_buffer; 00951 curr_blob->bounding_box().append_debug(&debug); 00952 debug.add_str_int("\nNo chop for truth at x=", truth_x); 00953 } else { 00954 debug.add_str_int("Missing chops for last ", 00955 blamer_bundle->norm_truth_word.length()-b); 00956 debug += " truth box(es)"; 00957 } 00958 debug += "\nMaximally chopped word boxes:\n"; 00959 for (curr_blob = word->chopped_word->blobs; curr_blob != NULL; 00960 curr_blob = curr_blob->next) { 00961 const TBOX &tbox = curr_blob->bounding_box(); 00962 sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n", 00963 tbox.left(), tbox.bottom(), tbox.right(), tbox.top()); 00964 debug += debug_buffer; 00965 } 00966 debug += "Truth bounding boxes:\n"; 00967 for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) { 00968 const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b); 00969 sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n", 00970 tbox.left(), tbox.bottom(), tbox.right(), tbox.top()); 00971 debug += debug_buffer; 00972 } 00973 blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice, 00974 wordrec_debug_blamer); 00975 } 00976 } 00977 00978 /********************************************************************** 00979 * word_associator 00980 * 00981 * Reassociate and classify the blobs in a word. Continue this process 00982 * until a good answer is found or all the possibilities have been tried. 00983 **********************************************************************/ 00984 MATRIX *Wordrec::word_associator(bool only_create_ratings_matrix, 00985 WERD_RES *word, 00986 STATE *state, 00987 BLOB_CHOICE_LIST_VECTOR *best_char_choices, 00988 DANGERR *fixpt, 00989 STATE *best_state) { 00990 CHUNKS_RECORD chunks_record; 00991 BLOB_WEIGHTS blob_weights; 00992 int x; 00993 int num_chunks; 00994 BLOB_CHOICE_IT blob_choice_it; 00995 00996 num_chunks = array_count(word->seam_array) + 1; 00997 00998 TBLOB* blobs = word->chopped_word->blobs; 00999 chunks_record.ratings = record_piece_ratings(blobs); 01000 chunks_record.chunks = blobs; 01001 chunks_record.word_res = word; 01002 chunks_record.splits = word->seam_array; 01003 chunks_record.chunk_widths = blobs_widths(blobs); 01004 chunks_record.char_widths = blobs_widths(blobs); 01005 /* Save chunk weights */ 01006 for (x = 0; x < num_chunks; x++) { 01007 BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings, blobs, 01008 chunks_record.word_res->denorm, 01009 word->seam_array, x, x, 01010 word->blamer_bundle); 01011 blob_choice_it.set_to_list(choices); 01012 //This is done by Jetsoft. Divide by zero is possible. 01013 if (blob_choice_it.data()->certainty() == 0) { 01014 blob_weights[x]=0; 01015 } else { 01016 blob_weights[x] = 01017 -(inT16) (10 * blob_choice_it.data()->rating() / 01018 blob_choice_it.data()->certainty()); 01019 } 01020 } 01021 chunks_record.weights = blob_weights; 01022 01023 if (chop_debug) 01024 chunks_record.ratings->print(getDict().getUnicharset()); 01025 01026 if (!only_create_ratings_matrix) { 01027 if (enable_new_segsearch) { 01028 SegSearch(&chunks_record, word->best_choice, 01029 best_char_choices, word->raw_choice, 01030 state, word->blamer_bundle); 01031 } else { 01032 best_first_search(&chunks_record, best_char_choices, word, 01033 state, fixpt, best_state); 01034 } 01035 } 01036 01037 free_widths(chunks_record.chunk_widths); 01038 free_widths(chunks_record.char_widths); 01039 return chunks_record.ratings; 01040 } 01041 } // namespace tesseract 01042 01043 01044 /********************************************************************** 01045 * total_containment 01046 * 01047 * Check to see if one of these outlines is totally contained within 01048 * the bounding box of the other. 01049 **********************************************************************/ 01050 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) { 01051 TBOX box1 = blob1->bounding_box(); 01052 TBOX box2 = blob2->bounding_box(); 01053 return box1.contains(box2) || box2.contains(box1); 01054 }