tesseract-doc/chopper_8cpp_source.html

00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        chopper.c  (Formerly chopper.c)
00005  * Description:
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  **************************************************************************/
00025
00026 /*----------------------------------------------------------------------
00027           I n c l u d e s
00028 ----------------------------------------------------------------------*/
00029
00030 #include <math.h>
00031
00032 #include "chopper.h"
00033
00034 #include "assert.h"
00035 #include "associate.h"
00036 #include "callcpp.h"
00037 #include "const.h"
00038 #include "findseam.h"
00039 #include "freelist.h"
00040 #include "globals.h"
00041 #include "makechop.h"
00042 #include "render.h"
00043 #include "pageres.h"
00044 #include "permute.h"
00045 #include "seam.h"
00046 #include "stopper.h"
00047 #include "structures.h"
00048 #include "unicharset.h"
00049 #include "wordclass.h"
00050 #include "wordrec.h"
00051
00052 // Include automatically generated configuration file if running autoconf.
00053 #ifdef HAVE_CONFIG_H
00054 #include "config_auto.h"
00055 #endif
00056
00057 /*----------------------------------------------------------------------
00058           F u n c t i o n s
00059 ----------------------------------------------------------------------*/
00065 void preserve_outline(EDGEPT *start) {
00066   EDGEPT *srcpt;
00067
00068   if (start == NULL)
00069     return;
00070   srcpt = start;
00071   do {
00072     srcpt->flags[1] = 1;
00073     srcpt = srcpt->next;
00074   }
00075   while (srcpt != start);
00076   srcpt->flags[1] = 2;
00077 }
00078
00079
00080 /**************************************************************************/
00081 void preserve_outline_tree(TESSLINE *srcline) {
00082   TESSLINE *outline;
00083
00084   for (outline = srcline; outline != NULL; outline = outline->next) {
00085     preserve_outline (outline->loop);
00086   }
00087 }
00088
00089
00095 EDGEPT *restore_outline(EDGEPT *start) {
00096   EDGEPT *srcpt;
00097   EDGEPT *real_start;
00098   EDGEPT *deadpt;
00099
00100   if (start == NULL)
00101     return NULL;
00102   srcpt = start;
00103   do {
00104     if (srcpt->flags[1] == 2)
00105       break;
00106     srcpt = srcpt->next;
00107   }
00108   while (srcpt != start);
00109   real_start = srcpt;
00110   do {
00111     if (srcpt->flags[1] == 0) {
00112       deadpt = srcpt;
00113       srcpt = srcpt->next;
00114       srcpt->prev = deadpt->prev;
00115       deadpt->prev->next = srcpt;
00116       deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
00117       deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
00118       delete deadpt;
00119     }
00120     else
00121       srcpt = srcpt->next;
00122   }
00123   while (srcpt != real_start);
00124   return real_start;
00125 }
00126
00127
00128 /******************************************************************************/
00129 void restore_outline_tree(TESSLINE *srcline) {
00130   TESSLINE *outline;
00131
00132   for (outline = srcline; outline != NULL; outline = outline->next) {
00133     outline->loop = restore_outline (outline->loop);
00134     outline->start = outline->loop->pos;
00135   }
00136 }
00137
00138
00145 namespace tesseract {
00146 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
00147                                  bool italic_blob, SEAMS seam_list) {
00148   TBLOB *next_blob = blob->next;
00149   TBLOB *other_blob;
00150   SEAM *seam;
00151
00152   if (repair_unchopped_blobs)
00153     preserve_outline_tree (blob->outlines);
00154   other_blob = new TBLOB;       /* Make new blob */
00155   other_blob->next = blob->next;
00156   other_blob->outlines = NULL;
00157   blob->next = other_blob;
00158
00159   seam = NULL;
00160   if (prioritize_division) {
00161     TPOINT location;
00162     if (divisible_blob(blob, italic_blob, &location)) {
00163       seam = new_seam(0.0f, location, NULL, NULL, NULL);
00164     }
00165   }
00166   if (seam == NULL)
00167     seam = pick_good_seam(blob);
00168   if (seam == NULL && word->latin_script) {
00169     // If the blob can simply be divided into outlines, then do that.
00170     TPOINT location;
00171     if (divisible_blob(blob, italic_blob, &location)) {
00172       seam = new_seam(0.0f, location, NULL, NULL, NULL);
00173     }
00174   }
00175   if (chop_debug) {
00176     if (seam != NULL) {
00177       print_seam ("Good seam picked=", seam);
00178     }
00179     else
00180       cprintf ("\n** no seam picked *** \n");
00181   }
00182   if (seam) {
00183     apply_seam(blob, other_blob, italic_blob, seam);
00184   }
00185
00186   if ((seam == NULL) ||
00187     (blob->outlines == NULL) ||
00188     (other_blob->outlines == NULL) ||
00189     total_containment (blob, other_blob) ||
00190     check_blob (other_blob) ||
00191     !(check_seam_order (blob, seam) &&
00192     check_seam_order (other_blob, seam)) ||
00193     any_shared_split_points (seam_list, seam) ||
00194     !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
00195
00196     blob->next = next_blob;
00197     if (seam) {
00198       undo_seam(blob, other_blob, seam);
00199       delete_seam(seam);
00200 #ifndef GRAPHICS_DISABLED
00201       if (chop_debug) {
00202         if (chop_debug >2)
00203           display_blob(blob, Red);
00204         cprintf ("\n** seam being removed ** \n");
00205       }
00206 #endif
00207     } else {
00208       delete other_blob;
00209     }
00210
00211     if (repair_unchopped_blobs)
00212       restore_outline_tree (blob->outlines);
00213     return (NULL);
00214   }
00215   return (seam);
00216 }
00217
00218
00219 SEAM *Wordrec::chop_numbered_blob(TWERD *word, inT32 blob_number,
00220                                   bool italic_blob, SEAMS seam_list) {
00221   TBLOB *blob;
00222   inT16 x;
00223
00224   blob = word->blobs;
00225   for (x = 0; x < blob_number; x++)
00226     blob = blob->next;
00227
00228   return attempt_blob_chop(word, blob, blob_number,
00229                            italic_blob, seam_list);
00230 }
00231
00232
00233 SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
00234                                      WERD_RES *word_res, inT32 *blob_number,
00235                                      bool italic_blob, SEAMS seam_list) {
00236   TWERD *word = word_res->chopped_word;
00237   TBLOB *blob;
00238
00239   *blob_number = 0;
00240   blob = word->blobs;
00241   while (blob != NULL) {
00242     TPOINT topleft, botright;
00243     topleft.x = blob->bounding_box().left();
00244     topleft.y = blob->bounding_box().top();
00245     botright.x = blob->bounding_box().right();
00246     botright.y = blob->bounding_box().bottom();
00247
00248     TPOINT original_topleft, original_botright;
00249     word_res->denorm.DenormTransform(topleft, &original_topleft);
00250     word_res->denorm.DenormTransform(botright, &original_botright);
00251
00252     TBOX original_box = TBOX(original_topleft.x, original_botright.y,
00253                              original_botright.x, original_topleft.y);
00254
00255     bool almost_equal_box = false;
00256     int num_overlap = 0;
00257     for (int i = 0; i < boxes.size(); i++) {
00258       if (original_box.overlap_fraction(boxes[i]) > 0.125)
00259         num_overlap++;
00260       if (original_box.almost_equal(boxes[i], 3))
00261         almost_equal_box = true;
00262     }
00263
00264     TPOINT location;
00265     if (divisible_blob(blob, italic_blob, &location) ||
00266         (!almost_equal_box && num_overlap > 1)) {
00267       SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
00268                                      italic_blob, seam_list);
00269       if (seam != NULL)
00270         return seam;
00271     }
00272
00273     *blob_number = *blob_number + 1;
00274     blob = blob->next;
00275   }
00276
00277   *blob_number = -1;
00278   return NULL;
00279 }
00280
00281 }  // namespace tesseract
00282
00283
00289 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
00290   int length;
00291   int index;
00292
00293   length = array_count (seam_list);
00294   for (index = 0; index < length; index++)
00295     if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
00296       return TRUE;
00297   return FALSE;
00298 }
00299
00300
00306 int check_blob(TBLOB *blob) {
00307   TESSLINE *outline;
00308   EDGEPT *edgept;
00309
00310   for (outline = blob->outlines; outline != NULL; outline = outline->next) {
00311     edgept = outline->loop;
00312     do {
00313       if (edgept == NULL)
00314         break;
00315       edgept = edgept->next;
00316     }
00317     while (edgept != outline->loop);
00318     if (edgept == NULL)
00319       return 1;
00320   }
00321   return 0;
00322 }
00323
00324
00325 namespace tesseract {
00332 bool Wordrec::improve_one_blob(WERD_RES *word_res,
00333                                BLOB_CHOICE_LIST_VECTOR *char_choices,
00334                                inT32 *blob_number,
00335                                SEAMS *seam_list,
00336                                DANGERR *fixpt,
00337                                bool split_next_to_fragment,
00338                                BlamerBundle *blamer_bundle) {
00339   TWERD* word = word_res->chopped_word;
00340   TBLOB *blob;
00341   inT16 x = 0;
00342   float rating_ceiling = MAX_FLOAT32;
00343   BLOB_CHOICE_LIST *answer;
00344   BLOB_CHOICE_IT answer_it;
00345   SEAM *seam;
00346
00347   do {
00348     *blob_number = select_blob_to_split_from_fixpt(fixpt);
00349     bool split_point_from_dict = (*blob_number != -1);
00350     if (split_point_from_dict) {
00351       fixpt->clear();
00352     } else {
00353       *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
00354                                           split_next_to_fragment);
00355     }
00356     if (chop_debug)
00357       cprintf("blob_number = %d\n", *blob_number);
00358     if (*blob_number == -1)
00359       return false;
00360
00361     // TODO(rays) it may eventually help to allow italic_blob to be true,
00362     seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
00363     if (seam != NULL)
00364       break;
00365     /* Must split null blobs */
00366     answer = char_choices->get(*blob_number);
00367     if (answer == NULL)
00368       return false;
00369     answer_it.set_to_list(answer);
00370     if (!split_point_from_dict) {
00371       // We chopped the worst rated blob, try something else next time.
00372       rating_ceiling = answer_it.data()->rating();
00373     }
00374   } while (true);
00375   /* Split OK */
00376   for (blob = word->blobs; x < *blob_number; x++) {
00377     blob = blob->next;
00378   }
00379
00380   *seam_list =
00381     insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
00382
00383   delete char_choices->get(*blob_number);
00384
00385   answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
00386                          blamer_bundle);
00387   char_choices->insert(answer, *blob_number);
00388
00389   answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
00390                          blamer_bundle);
00391   char_choices->set(answer, *blob_number + 1);
00392
00393   return true;
00394 }
00395
00403 void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer,
00404                         int chop_index) {
00405   char chop_index_string[2];
00406   if (chop_index <= 9) {
00407     snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
00408   } else {
00409     chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
00410     chop_index_string[1] = '\0';
00411   }
00412   UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
00413   if (unichar_id == INVALID_UNICHAR_ID) {
00414     // If the word is very long, we might exhaust the possibilities.
00415     unichar_id = 1;
00416   }
00417   BLOB_CHOICE_IT answer_it(answer);
00418   BLOB_CHOICE *modified_blob =
00419       new BLOB_CHOICE(unichar_id,
00420                       answer_it.data()->rating(),
00421                       answer_it.data()->certainty(),
00422                       answer_it.data()->fontinfo_id(),
00423                       answer_it.data()->fontinfo_id2(),
00424                       answer_it.data()->script_id(),
00425                       answer_it.data()->min_xheight(),
00426                       answer_it.data()->max_xheight(),
00427                       answer_it.data()->adapted());
00428   answer->clear();
00429   answer_it.set_to_list(answer);
00430   answer_it.add_after_then_move(modified_blob);
00431 }
00432
00433
00441 bool Wordrec::chop_one_blob(TWERD *word,
00442                             BLOB_CHOICE_LIST_VECTOR *char_choices,
00443                             inT32 *blob_number,
00444                             SEAMS *seam_list,
00445                             int *right_chop_index) {
00446   TBLOB *blob;
00447   inT16 x = 0;
00448   float rating_ceiling = MAX_FLOAT32;
00449   BLOB_CHOICE_LIST *answer;
00450   BLOB_CHOICE_IT answer_it;
00451   SEAM *seam;
00452   UNICHAR_ID unichar_id = 0;
00453   int left_chop_index = 0;
00454
00455   do {
00456     *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false);
00457     if (chop_debug)
00458       cprintf("blob_number = %d\n", *blob_number);
00459     if (*blob_number == -1)
00460       return false;
00461     seam = chop_numbered_blob(word, *blob_number, true, *seam_list);
00462     if (seam != NULL)
00463       break;
00464     /* Must split null blobs */
00465     answer = char_choices->get(*blob_number);
00466     if (answer == NULL)
00467       return false;
00468     answer_it.set_to_list(answer);
00469     rating_ceiling = answer_it.data()->rating();  // try a different blob
00470   } while (true);
00471   /* Split OK */
00472   for (blob = word->blobs; x < *blob_number; x++) {
00473     blob = blob->next;
00474   }
00475   if (chop_debug) {
00476     tprintf("Chop made blob1:");
00477     blob->bounding_box().print();
00478     tprintf("and blob2:");
00479     blob->next->bounding_box().print();
00480   }
00481   *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
00482
00483   answer = char_choices->get(*blob_number);
00484   answer_it.set_to_list(answer);
00485   unichar_id = answer_it.data()->unichar_id();
00486   float rating = answer_it.data()->rating() / exp(1.0);
00487   left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
00488
00489   delete char_choices->get(*blob_number);
00490   // combine confidence w/ serial #
00491   answer = fake_classify_blob(0, rating, -rating);
00492   modify_blob_choice(answer, left_chop_index);
00493   char_choices->insert(answer, *blob_number);
00494
00495   answer = fake_classify_blob(0, rating - 0.125f, -rating);
00496   modify_blob_choice(answer, ++*right_chop_index);
00497   char_choices->set(answer, *blob_number + 1);
00498   return true;
00499 }
00500
00501
00502 bool Wordrec::chop_one_blob2(const GenericVector<TBOX>& boxes,
00503                              WERD_RES *word_res,
00504                              SEAMS *seam_list) {
00505   inT32 blob_number;
00506   inT16 x = 0;
00507   TBLOB *blob;
00508   SEAM *seam;
00509
00510   seam = chop_overlapping_blob(boxes, word_res, &blob_number,
00511                                true, *seam_list);
00512   if (seam == NULL)
00513     return false;
00514
00515   /* Split OK */
00516   for (blob = word_res->chopped_word->blobs; x < blob_number; x++) {
00517     blob = blob->next;
00518   }
00519   if (chop_debug) {
00520     tprintf("Chop made blob1:");
00521     blob->bounding_box().print();
00522     tprintf("and blob2:");
00523     blob->next->bounding_box().print();
00524   }
00525   *seam_list = insert_seam(*seam_list, blob_number, seam, blob,
00526                            word_res->chopped_word->blobs);
00527   return true;
00528 }
00529 }  // namespace tesseract
00530
00539 inT16 check_seam_order(TBLOB *blob, SEAM *seam) {
00540   TESSLINE *outline;
00541   TESSLINE *last_outline;
00542   inT8 found_em[3];
00543
00544   if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
00545     return (TRUE);
00546
00547   found_em[0] = found_em[1] = found_em[2] = FALSE;
00548
00549   for (outline = blob->outlines; outline; outline = outline->next) {
00550     if (!found_em[0] &&
00551       ((seam->split1 == NULL) ||
00552     is_split_outline (outline, seam->split1))) {
00553       found_em[0] = TRUE;
00554     }
00555     if (!found_em[1] &&
00556       ((seam->split2 == NULL) ||
00557     is_split_outline (outline, seam->split2))) {
00558       found_em[1] = TRUE;
00559     }
00560     if (!found_em[2] &&
00561       ((seam->split3 == NULL) ||
00562     is_split_outline (outline, seam->split3))) {
00563       found_em[2] = TRUE;
00564     }
00565     last_outline = outline;
00566   }
00567
00568   if (!found_em[0] || !found_em[1] || !found_em[2])
00569     return (FALSE);
00570   else
00571     return (TRUE);
00572 }
00573
00574 namespace tesseract {
00583 BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
00584   TBLOB *blob;
00585   int index;
00586   int did_chopping;
00587   STATE state;
00588   BLOB_CHOICE_LIST *match_result;
00589   MATRIX *ratings = NULL;
00590   DANGERR fixpt;                 /*dangerous ambig */
00591   inT32 bit_count;               //no of bits
00592
00593   BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
00594   BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();
00595
00596   did_chopping = 0;
00597   for (blob = word->chopped_word->blobs, index = 0;
00598        blob != NULL; blob = blob->next, index++) {
00599     match_result = classify_blob(blob, word->denorm, "chop_word:", Green,
00600                                  word->blamer_bundle);
00601     if (match_result == NULL)
00602       cprintf("Null classifier output!\n");
00603     *char_choices += match_result;
00604   }
00605   bit_count = index - 1;
00606   set_n_ones(&state, char_choices->length() - 1);
00607   bool acceptable = false;
00608   bool replaced = false;
00609   bool best_choice_updated =
00610     getDict().permute_characters(*char_choices, word->best_choice,
00611                                  word->raw_choice);
00612   if (best_choice_updated &&
00613       getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
00614                                  CHOPPER_CALLER, &replaced)) {
00615     acceptable = true;
00616   }
00617   if (replaced)
00618     update_blob_classifications(word->chopped_word, *char_choices);
00619   CopyCharChoices(*char_choices, best_char_choices);
00620   if (!acceptable) {  // do more work to find a better choice
00621     did_chopping = 1;
00622
00623     bool best_choice_acceptable = false;
00624     if (chop_enable)
00625       improve_by_chopping(word,
00626                           char_choices,
00627                           &state,
00628                           best_char_choices,
00629                           &fixpt,
00630                           &best_choice_acceptable);
00631     if (chop_debug)
00632       print_seams ("Final seam list:", word->seam_array);
00633
00634     if (word->blamer_bundle != NULL &&
00635         !ChoiceIsCorrect(*word->uch_set, word->best_choice,
00636                          word->blamer_bundle->truth_text)) {
00637       set_chopper_blame(word);
00638     }
00639
00640     // The force_word_assoc is almost redundant to enable_assoc.  However,
00641     // it is not conditioned on the dict behavior.  For CJK, we need to force
00642     // the associator to be invoked.  When we figure out the exact behavior
00643     // of dict on CJK, we can remove the flag if it turns out to be redundant.
00644     if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
00645       ratings = word_associator(false, word, &state, best_char_choices,
00646                                 &fixpt, &state);
00647     }
00648   }
00649   best_char_choices = rebuild_current_state(word, &state, best_char_choices,
00650                                             ratings);
00651
00652   // If after running only the chopper best_choice is incorrect and no blame
00653   // has been yet set, blame the classifier if best_choice is classifier's
00654   // top choice and is a dictionary word (i.e. language model could not have
00655   // helped). Otherwise blame the tradeoff between the classifier and
00656   // the old language model (permuters).
00657   if (word->blamer_bundle != NULL &&
00658       word->blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
00659       ratings == NULL &&  // only the chopper was run
00660       !ChoiceIsCorrect(*word->uch_set, word->best_choice,
00661                        word->blamer_bundle->truth_text)) {
00662     if (word->best_choice != NULL &&
00663         Dict::valid_word_permuter(word->best_choice->permuter(), false)) {
00664       // Find out whether best choice is a top choice.
00665       word->blamer_bundle->best_choice_is_dict_and_top_choice = true;
00666       for (int i = 0; i < word->best_choice->length(); ++i) {
00667         BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i));
00668         ASSERT_HOST(!blob_choice_it.empty());
00669         BLOB_CHOICE *first_choice = NULL;
00670         for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
00671              blob_choice_it.forward()) {  // find first non-fragment choice
00672           if (!(getDict().getUnicharset().get_fragment(
00673                 blob_choice_it.data()->unichar_id()))) {
00674             first_choice = blob_choice_it.data();
00675             break;
00676           }
00677         }
00678         ASSERT_HOST(first_choice != NULL);
00679         if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
00680           word->blamer_bundle->best_choice_is_dict_and_top_choice = false;
00681           break;
00682         }
00683       }
00684     }
00685     STRING debug;
00686     if (word->blamer_bundle->best_choice_is_dict_and_top_choice) {
00687       debug = "Best choice is: incorrect, top choice, dictionary word";
00688       debug += " with permuter ";
00689       debug += word->best_choice->permuter_name();
00690     } else {
00691       debug = "Classifier/Old LM tradeoff is to blame";
00692     }
00693     word->blamer_bundle->SetBlame(
00694         word->blamer_bundle->best_choice_is_dict_and_top_choice ?
00695             IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF,
00696         debug, word->best_choice, wordrec_debug_blamer);
00697   }
00698
00699   if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
00700     if (ratings == NULL) {
00701       ratings = word_associator(true, word, NULL, NULL, NULL, NULL);
00702     }
00703     CallFillLattice(*ratings, getDict().getBestChoices(),
00704                     *word->uch_set, word->blamer_bundle);
00705   }
00706   if (ratings != NULL) {
00707     if (wordrec_debug_level > 0) {
00708       tprintf("Final Ratings Matrix:\n");
00709       ratings->print(getDict().getUnicharset());
00710     }
00711     ratings->delete_matrix_pointers();
00712     delete ratings;
00713   }
00714   getDict().FilterWordChoices();
00715   // TODO(antonova, eger): check that FilterWordChoices() does not filter
00716   // out anything useful for word bigram or phrase search.
00717   // TODO(antonova, eger): when implementing word bigram and phrase search
00718   // we will need to think carefully about how to replace a word with its
00719   // alternative choice.
00720   // In particular it might be required to save the segmentation state
00721   // associated with the word, so that best_char_choices could be updated
00722   // by rebuild_current_state() correctly.
00723   if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word);
00724   char_choices->delete_data_pointers();
00725   delete char_choices;
00726
00727   return best_char_choices;
00728 }
00729
00730
00731
00741 void Wordrec::improve_by_chopping(WERD_RES *word,
00742                                   BLOB_CHOICE_LIST_VECTOR *char_choices,
00743                                   STATE *best_state,
00744                                   BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00745                                   DANGERR *fixpt,
00746                                   bool *best_choice_acceptable) {
00747   inT32 blob_number;
00748   float old_best;
00749   bool updated_best_choice = false;
00750
00751   while (1) {  // improvement loop
00752     old_best = word->best_choice->rating();
00753     if (improve_one_blob(word, char_choices,
00754                          &blob_number, &word->seam_array,
00755                          fixpt, (fragments_guide_chopper &&
00756                                  word->best_choice->fragment_mark()),
00757                                  word->blamer_bundle)) {
00758       getDict().LogNewSplit(blob_number);
00759       updated_best_choice =
00760         getDict().permute_characters(*char_choices, word->best_choice,
00761                                      word->raw_choice);
00762
00763       if (old_best > word->best_choice->rating()) {
00764         set_n_ones(best_state, char_choices->length() - 1);
00765       } else {
00766         insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
00767         fixpt->clear();
00768       }
00769
00770       if (chop_debug)
00771         print_state("best state = ",
00772           best_state, count_blobs(word->chopped_word->blobs) - 1);
00773     } else {
00774       break;
00775     }
00776
00777     // Check if we should break from the loop.
00778     bool done = false;
00779     bool replaced = false;
00780     if ((updated_best_choice &&
00781          (*best_choice_acceptable =
00782           getDict().AcceptableChoice(char_choices, word->best_choice,
00783                                      fixpt, CHOPPER_CALLER, &replaced))) ||
00784         char_choices->length() >= MAX_NUM_CHUNKS) {
00785       done = true;
00786     }
00787     if (replaced) update_blob_classifications(word->chopped_word,
00788                                               *char_choices);
00789     if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
00790     if (done) break;
00791   }
00792 }
00793
00794
00795 /**********************************************************************
00796  * select_blob_to_split
00797  *
00798  * These are the results of the last classification.  Find a likely
00799  * place to apply splits.  If none, return -1.
00800  **********************************************************************/
00801 inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00802                                     float rating_ceiling,
00803                                     bool split_next_to_fragment) {
00804   BLOB_CHOICE_IT blob_choice_it;
00805   BLOB_CHOICE *blob_choice;
00806   BLOB_CHOICE_IT temp_it;
00807   int x;
00808   float worst = -MAX_FLOAT32;
00809   int worst_index = -1;
00810   float worst_near_fragment = -MAX_FLOAT32;
00811   int worst_index_near_fragment = -1;
00812   const CHAR_FRAGMENT **fragments = NULL;
00813
00814   if (chop_debug) {
00815     if (rating_ceiling < MAX_FLOAT32)
00816       cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
00817     else
00818       cprintf("rating_ceiling = No Limit\n");
00819   }
00820
00821   if (split_next_to_fragment && char_choices.length() > 0) {
00822     fragments = new const CHAR_FRAGMENT *[char_choices.length()];
00823     if (char_choices.get(0) != NULL) {
00824       temp_it.set_to_list(char_choices.get(0));
00825       fragments[0] = getDict().getUnicharset().get_fragment(
00826           temp_it.data()->unichar_id());
00827     } else {
00828       fragments[0] = NULL;
00829     }
00830   }
00831
00832   for (x = 0; x < char_choices.length(); ++x) {
00833     if (char_choices.get(x) == NULL) {
00834       if (fragments != NULL) {
00835         delete[] fragments;
00836       }
00837       return x;
00838     } else {
00839       blob_choice_it.set_to_list(char_choices.get(x));
00840       blob_choice = blob_choice_it.data();
00841       // Populate fragments for the following position.
00842       if (split_next_to_fragment && x+1 < char_choices.length()) {
00843         if (char_choices.get(x+1) != NULL) {
00844           temp_it.set_to_list(char_choices.get(x+1));
00845           fragments[x+1] = getDict().getUnicharset().get_fragment(
00846               temp_it.data()->unichar_id());
00847         } else {
00848           fragments[x+1] = NULL;
00849         }
00850       }
00851       if (blob_choice->rating() < rating_ceiling &&
00852           blob_choice->certainty() < tessedit_certainty_threshold) {
00853         // Update worst and worst_index.
00854         if (blob_choice->rating() > worst) {
00855           worst_index = x;
00856           worst = blob_choice->rating();
00857         }
00858         if (split_next_to_fragment) {
00859           // Update worst_near_fragment and worst_index_near_fragment.
00860           bool expand_following_fragment =
00861             (x + 1 < char_choices.length() &&
00862              fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
00863           bool expand_preceding_fragment =
00864             (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
00865           if ((expand_following_fragment || expand_preceding_fragment) &&
00866               blob_choice->rating() > worst_near_fragment) {
00867             worst_index_near_fragment = x;
00868             worst_near_fragment = blob_choice->rating();
00869             if (chop_debug) {
00870               cprintf("worst_index_near_fragment=%d"
00871                       " expand_following_fragment=%d"
00872                       " expand_preceding_fragment=%d\n",
00873                       worst_index_near_fragment,
00874                       expand_following_fragment,
00875                       expand_preceding_fragment);
00876             }
00877           }
00878         }
00879       }
00880     }
00881   }
00882   if (fragments != NULL) {
00883     delete[] fragments;
00884   }
00885   // TODO(daria): maybe a threshold of badness for
00886   // worst_near_fragment would be useful.
00887   return worst_index_near_fragment != -1 ?
00888     worst_index_near_fragment : worst_index;
00889 }
00890
00891 /**********************************************************************
00892  * select_blob_to_split_from_fixpt
00893  *
00894  * Given the fix point from a dictionary search, if there is a single
00895  * dangerous blob that maps to multiple characters, return that blob
00896  * index as a place we need to split.  If none, return -1.
00897  **********************************************************************/
00898 inT16 Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
00899   if (!fixpt)
00900     return -1;
00901   for (int i = 0; i < fixpt->size(); i++) {
00902     if ((*fixpt)[i].begin == (*fixpt)[i].end &&
00903         (*fixpt)[i].dangerous &&
00904         (*fixpt)[i].correct_is_ngram) {
00905       return (*fixpt)[i].begin;
00906     }
00907   }
00908   return -1;
00909 }
00910
00911 /**********************************************************************
00912  * set_chopper_blame
00913  *
00914  * Check whether chops were made at all the character bounding box boundaries
00915  * in word->truth_word. If not - blame the chopper for an incorrect answer.
00916  **********************************************************************/
00917 void Wordrec::set_chopper_blame(WERD_RES *word) {
00918   BlamerBundle *blamer_bundle = word->blamer_bundle;
00919   assert(blamer_bundle != NULL);
00920   if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
00921       word->chopped_word->blobs == NULL) {
00922     return;
00923   }
00924   STRING debug;
00925   bool missing_chop = false;
00926   TBLOB * curr_blob = word->chopped_word->blobs;
00927   int b = 0;
00928   inT16 truth_x;
00929   while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
00930     truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
00931     if (curr_blob->bounding_box().right() <
00932         (truth_x - blamer_bundle->norm_box_tolerance)) {
00933       curr_blob = curr_blob->next;
00934       continue;  // encountered an extra chop, keep looking
00935     } else if (curr_blob->bounding_box().right() >
00936                 (truth_x + blamer_bundle->norm_box_tolerance)) {
00937       missing_chop = true;
00938       break;
00939     } else {
00940       curr_blob = curr_blob->next;
00941       ++b;
00942     }
00943   }
00944   if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
00945     STRING debug;
00946     char debug_buffer[256];
00947     if (missing_chop) {
00948       sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
00949               blamer_bundle->norm_box_tolerance);
00950       debug += debug_buffer;
00951       curr_blob->bounding_box().append_debug(&debug);
00952       debug.add_str_int("\nNo chop for truth at x=", truth_x);
00953     } else {
00954       debug.add_str_int("Missing chops for last ",
00955                         blamer_bundle->norm_truth_word.length()-b);
00956       debug += " truth box(es)";
00957     }
00958     debug += "\nMaximally chopped word boxes:\n";
00959     for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
00960         curr_blob = curr_blob->next) {
00961       const TBOX &tbox = curr_blob->bounding_box();
00962       sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
00963               tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
00964       debug += debug_buffer;
00965     }
00966     debug += "Truth  bounding  boxes:\n";
00967     for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
00968       const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
00969       sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
00970               tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
00971       debug += debug_buffer;
00972     }
00973     blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
00974                             wordrec_debug_blamer);
00975   }
00976 }
00977
00978 /**********************************************************************
00979  * word_associator
00980  *
00981  * Reassociate and classify the blobs in a word.  Continue this process
00982  * until a good answer is found or all the possibilities have been tried.
00983  **********************************************************************/
00984 MATRIX *Wordrec::word_associator(bool only_create_ratings_matrix,
00985                                  WERD_RES *word,
00986                                  STATE *state,
00987                                  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00988                                  DANGERR *fixpt,
00989                                  STATE *best_state) {
00990   CHUNKS_RECORD chunks_record;
00991   BLOB_WEIGHTS blob_weights;
00992   int x;
00993   int num_chunks;
00994   BLOB_CHOICE_IT blob_choice_it;
00995
00996   num_chunks = array_count(word->seam_array) + 1;
00997
00998   TBLOB* blobs = word->chopped_word->blobs;
00999   chunks_record.ratings = record_piece_ratings(blobs);
01000   chunks_record.chunks = blobs;
01001   chunks_record.word_res = word;
01002   chunks_record.splits = word->seam_array;
01003   chunks_record.chunk_widths = blobs_widths(blobs);
01004   chunks_record.char_widths = blobs_widths(blobs);
01005   /* Save chunk weights */
01006   for (x = 0; x < num_chunks; x++) {
01007     BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings, blobs,
01008                                                  chunks_record.word_res->denorm,
01009                                                  word->seam_array, x, x,
01010                                                  word->blamer_bundle);
01011     blob_choice_it.set_to_list(choices);
01012     //This is done by Jetsoft. Divide by zero is possible.
01013     if (blob_choice_it.data()->certainty() == 0) {
01014       blob_weights[x]=0;
01015     } else {
01016       blob_weights[x] =
01017         -(inT16) (10 * blob_choice_it.data()->rating() /
01018                   blob_choice_it.data()->certainty());
01019     }
01020   }
01021   chunks_record.weights = blob_weights;
01022
01023   if (chop_debug)
01024     chunks_record.ratings->print(getDict().getUnicharset());
01025
01026   if (!only_create_ratings_matrix) {
01027     if (enable_new_segsearch) {
01028       SegSearch(&chunks_record, word->best_choice,
01029                 best_char_choices, word->raw_choice,
01030                 state, word->blamer_bundle);
01031     } else {
01032       best_first_search(&chunks_record, best_char_choices, word,
01033                         state, fixpt, best_state);
01034     }
01035   }
01036
01037   free_widths(chunks_record.chunk_widths);
01038   free_widths(chunks_record.char_widths);
01039   return chunks_record.ratings;
01040 }
01041 }  // namespace tesseract
01042
01043
01044 /**********************************************************************
01045  * total_containment
01046  *
01047  * Check to see if one of these outlines is totally contained within
01048  * the bounding box of the other.
01049  **********************************************************************/
01050 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
01051   TBOX box1 = blob1->bounding_box();
01052   TBOX box2 = blob2->bounding_box();
01053   return box1.contains(box2) || box2.contains(box1);
01054 }