tesseract-doc/tfacepp_8cpp_source.html

00001 /**********************************************************************
00002  * File:        tfacepp.cpp  (Formerly tface++.c)
00003  * Description: C++ side of the C/C++ Tess/Editor interface.
00004  * Author:                  Ray Smith
00005  * Created:                 Thu Apr 23 15:39:23 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #pragma warning(disable:4305)  // int/float warnings
00023 #pragma warning(disable:4800)  // int/bool warnings
00024 #endif
00025
00026 #include <math.h>
00027
00028 #include "mfcpch.h"
00029 #ifdef __UNIX__
00030 #include          <assert.h>
00031 #endif
00032 #include          "errcode.h"
00033 #include          "ratngs.h"
00034 #include          "reject.h"
00035 #include          "werd.h"
00036 #include          "tfacep.h"
00037 #include          "tfacepp.h"
00038 #include          "tessvars.h"
00039 #include          "globals.h"
00040 #include          "reject.h"
00041 #include          "tesseractclass.h"
00042
00043 #define MAX_UNDIVIDED_LENGTH 24
00044
00045
00046
00047 /**********************************************************************
00048  * recog_word
00049  *
00050  * Recognize the word via recog_word_recursive, build its box_word, assert that
00051  * best_choice, box_word and blob_choices agree in length, and flag empty results.
00052  **********************************************************************/
00053 namespace tesseract {
00054 void Tesseract::recog_word(WERD_RES *word,  // in/out: word to recognize; results are stored in it
00055                            BLOB_CHOICE_LIST_CLIST *blob_choices) {  // asserted below to match best_choice in length
00056   ASSERT_HOST(word->chopped_word->blobs != NULL);  // the chopped word must have blobs
00057   recog_word_recursive(word, blob_choices);  // splits words with too many blobs
00058   word->SetupBoxWord();
00059   if ((word->best_choice->length() != word->box_word->length()) ||  // log the details before the asserts below fire
00060       (word->best_choice->length() != blob_choices->length())) {
00061     tprintf("recog_word ASSERT FAIL String:\"%s\"; "
00062             "Strlen=%d; #Blobs=%d; #Choices=%d\n",
00063             word->best_choice->debug_string().string(),
00064             word->best_choice->length(), word->box_word->length(),
00065             blob_choices->length());
00066   }
00067   ASSERT_HOST(word->best_choice->length() == word->box_word->length());  // same two conditions as logged above
00068   ASSERT_HOST(word->best_choice->length() == blob_choices->length());
00069   if (tessedit_override_permuter) {
00070     /* Override the permuter type if a straight dictionary check disagrees. */
00071     uinT8 perm_type = word->best_choice->permuter();  // kept to report a change below
00072     if ((perm_type != SYSTEM_DAWG_PERM) &&  // only re-check when not already one of these three
00073         (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
00074       uinT8 real_dict_perm_type = dict_word(*word->best_choice);  // dict_word's result is used as a permuter type
00075       if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
00076            (real_dict_perm_type == FREQ_DAWG_PERM) ||
00077            (real_dict_perm_type == USER_DAWG_PERM)) &&
00078           (alpha_count(word->best_choice->unichar_string().string(),  // presumably needs >= 1 alphabetic char - confirm alpha_count
00079                        word->best_choice->unichar_lengths().string()) > 0)) {
00080         word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
00081       }
00082     }
00083     if (tessedit_rejection_debug &&  // report only when the permuter actually changed
00084         perm_type != word->best_choice->permuter()) {
00085       tprintf("Permuter Type Flipped from %d to %d\n",
00086               perm_type, word->best_choice->permuter());
00087     }
00088   }
00089   // Factored out from control.cpp: decide whether tess failed on this word.
00090   ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));  // both NULL or both set
00091   if (word->best_choice == NULL || word->best_choice->length() == 0 ||  // NOTE(review): best_choice is already dereferenced above this NULL test
00092       strspn(word->best_choice->unichar_string().string(), " ") ==  // length of the leading run of spaces...
00093         word->best_choice->length()) {  // ...equals the choice length, i.e. it is all spaces
00094     word->tess_failed = true;
00095     word->reject_map.initialise(word->box_word->length());  // reject map sized to the box word
00096     word->reject_map.rej_word_tess_failure();
00097   } else {
00098     word->tess_failed = false;
00099   }
00100 }
00101
00102
00103 /**********************************************************************
00104  * recog_word_recursive
00105  *
00106  * Recognize a word, handing words with more than MAX_UNDIVIDED_LENGTH blobs to
00107  * split_and_recog_word; otherwise run cc_recog and pad/repair the choices.
00108  **********************************************************************/
00109 void Tesseract::recog_word_recursive(WERD_RES *word,  // in/out: word to recognize
00110                                      BLOB_CHOICE_LIST_CLIST *blob_choices) {  // in/out: new lists are appended at the end
00111   int word_length = word->chopped_word->NumBlobs();  // no of blobs
00112   if (word_length > MAX_UNDIVIDED_LENGTH) {  // too many blobs: recognize it in two pieces instead
00113     return split_and_recog_word(word, blob_choices);
00114   }
00115   int initial_blob_choice_len = blob_choices->length();  // list may already hold entries (split_and_recog_word reuses it)
00116   BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);  // the vector itself is deleted below
00117
00118   // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
00119   BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
00120   for (int i = 0; i < tess_ratings->length(); ++i) {
00121     blob_choices_it.add_to_end(tess_ratings->get(i));
00122   }
00123   delete tess_ratings;
00124
00125   word_length = word->rebuild_word->NumBlobs();  // No of blobs in output.
00126   // Pad raw_choice with spaces if needed.
00127   if (word->raw_choice->length() < word_length) {
00128     UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
00129     while (word->raw_choice->length() < word_length) {  // one space per missing blob
00130       word->raw_choice->append_unichar_id(space_id, 1, 0.0,
00131                                           word->raw_choice->certainty());
00132     }
00133   }
00134
00135   // Do sanity checks and minor fixes on best_choice.
00136   if (word->best_choice->length() > word_length) {
00137     word->best_choice->make_bad();  // should never happen
00138     tprintf("recog_word: Discarded long string \"%s\""
00139             " (%d characters vs %d blobs)\n",
00140             word->best_choice->unichar_string().string(),
00141             word->best_choice->length(), word_length);
00142     tprintf("Word is at:");
00143     word->word->bounding_box().print();
00144   }
00145   if (blob_choices->length() - initial_blob_choice_len != word_length) {  // count only the lists added by this call
00146     word->best_choice->make_bad();  // force rejection
00147     tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
00148             blob_choices->length(), word_length);
00149     blob_choices_it.set_to_list(blob_choices);  // list of lists
00150     while (blob_choices->length() - initial_blob_choice_len < word_length) {  // too few: pad with empty lists
00151       blob_choices_it.add_to_end(new BLOB_CHOICE_LIST());  // add a fake one
00152       tprintf("recog_word: Added dummy choice list\n");
00153     }
00154     while (blob_choices->length() - initial_blob_choice_len > word_length) {  // too many: drop from the end
00155       blob_choices_it.move_to_last(); // should never happen
00156       delete blob_choices_it.extract();
00157       tprintf("recog_word: Deleted choice list\n");
00158     }
00159   }
00160   if (word->best_choice->length() < word_length) {  // pad best_choice with spaces, as done for raw_choice above
00161     UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
00162     while (word->best_choice->length() < word_length) {
00163       word->best_choice->append_unichar_id(space_id, 1, 0.0,
00164                                            word->best_choice->certainty());
00165     }
00166   }
00167 }
00168
00169
00170 /**********************************************************************
00171  * split_and_recog_word
00172  *
00173  * Split the word in 2 at the largest horizontal gap between adjacent chopped blobs.
00174  * Recognize both pieces, then merge blobs, seams, choices and blame back into word.
00175  **********************************************************************/
00176
00177 void Tesseract::split_and_recog_word(WERD_RES *word,  // in/out: holds the first piece during recognition, the merged result at exit
00178                                      BLOB_CHOICE_LIST_CLIST *blob_choices) {  // in/out: shared by both recursive calls
00179   // Find the biggest blob gap in the chopped_word.
00180   int bestgap = -MAX_INT32;  // gap below may be negative, so start from the lowest value
00181   TPOINT best_split_pt;
00182   TBLOB* best_end = NULL;  // becomes the last blob of the first piece
00183   TBLOB* prev_blob = NULL;
00184   for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
00185        blob = blob->next) {
00186     if (prev_blob != NULL) {
00187       TBOX prev_box = prev_blob->bounding_box();
00188       TBOX blob_box = blob->bounding_box();
00189       int gap = blob_box.left() - prev_box.right();  // negative when the two boxes overlap in x
00190       if (gap > bestgap) {
00191         bestgap = gap;
00192         best_end = prev_blob;
00193         best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;  // middle of the gap in x
00194         best_split_pt.y = (prev_box.top() + prev_box.bottom() +  // mean of the four top/bottom edges in y
00195                            blob_box.top() + blob_box.bottom()) / 4;
00196       }
00197     }
00198     prev_blob = blob;
00199   }
00200   ASSERT_HOST(best_end != NULL);  // only set when the word has at least 2 blobs
00201   ASSERT_HOST(best_end->next != NULL);
00202
00203   // Make a copy of the word to put the 2nd half in.
00204   WERD_RES* word2 = new WERD_RES(*word);  // deleted at the end of this function
00205   // Blow away the copied chopped_word, as we want to work with the blobs
00206   // from the input chopped_word so the seam_arrays can be merged.
00207   delete word2->chopped_word;
00208   word2->chopped_word = new TWERD;
00209   word2->chopped_word->blobs = best_end->next;  // second piece starts right after best_end
00210   best_end->next = NULL;  // first piece now ends at best_end
00211   // Make a new seamarray on both words.
00212   free_seam_list(word->seam_array);
00213   word->seam_array = start_seam_list(word->chopped_word->blobs);
00214   word2->seam_array = start_seam_list(word2->chopped_word->blobs);  // NOTE(review): old word2->seam_array is not freed - confirm the copy leaves it empty
00215   BlamerBundle *orig_bb = word->blamer_bundle;  // saved here, restored near the end
00216   STRING blamer_debug;
00217   // Try to adjust truth information.
00218   if (orig_bb != NULL) {
00219     // Find truth boxes that correspond to the split in the blobs.
00220     int b;
00221     int begin2_truth_index = -1;  // index of the first truth box of the second piece; -1 = not found
00222     if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
00223         orig_bb->truth_has_char_boxes) {
00224       int end1_x = best_end->bounding_box().right();  // right edge of the first piece
00225       int begin2_x = word2->chopped_word->blobs->bounding_box().left();  // left edge of the second piece
00226       blamer_debug = "Looking for truth split at";
00227       blamer_debug.add_str_int(" end1_x ", end1_x);
00228       blamer_debug.add_str_int(" begin2_x ", begin2_x);
00229       blamer_debug += "\nnorm_truth_word boxes:\n";
00230       if (orig_bb->norm_truth_word.length() > 1) {  // a split needs at least 2 truth boxes
00231         orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
00232         for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {  // starts at 1, so a hit is always >= 1
00233           orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
00234           if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <  // both edges must match within tolerance
00235               orig_bb->norm_box_tolerance) &&
00236               (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
00237               orig_bb->norm_box_tolerance)) {
00238             begin2_truth_index = b;
00239             blamer_debug += "Split found\n";
00240             break;
00241           }
00242         }
00243       }
00244     }
00245     // Populate truth information in word and word2 with the first and second
00246     // part of the original truth.
00247     word->blamer_bundle = new BlamerBundle();  // temporary; deleted when orig_bb is restored
00248     word2->blamer_bundle = new BlamerBundle();  // NOTE(review): leaks if the WERD_RES copy gave word2 its own bundle - confirm
00249     if (begin2_truth_index > 0) {  // a truth split was found
00250       word->blamer_bundle->truth_has_char_boxes = true;
00251       word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
00252       word2->blamer_bundle->truth_has_char_boxes = true;
00253       word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
00254       BlamerBundle *curr_bb = word->blamer_bundle;
00255       for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
00256         if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;  // from here on fill the second piece
00257         curr_bb->norm_truth_word.InsertBox(  // NOTE(review): index b is not rebased for word2 - confirm InsertBox accepts an index past the end
00258             b, orig_bb->norm_truth_word.BlobBox(b));
00259         curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
00260         curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
00261       }
00262     } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {  // no truth at all: both pieces inherit that
00263       word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
00264       word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
00265     } else {  // truth exists but no matching split position was found
00266       blamer_debug += "Truth split not found";
00267       blamer_debug += orig_bb->truth_has_char_boxes ?
00268           "\n" : " (no truth char boxes)\n";
00269       word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
00270                                     NULL, wordrec_debug_blamer);
00271       word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
00272                                      NULL, wordrec_debug_blamer);
00273     }
00274   }
00275
00276   // Recognize the first part of the word.
00277   recog_word_recursive(word, blob_choices);
00278   // Recognize the second part of the word.
00279   recog_word_recursive(word2, blob_choices);
00280   // Tack the word2 outputs onto the end of the word outputs.
00281   // New blobs might have appeared on the end of word1.
00282   for (best_end = word->chopped_word->blobs; best_end->next != NULL;
00283        best_end = best_end->next);  // empty body: just walks to the last chopped blob
00284   best_end->next = word2->chopped_word->blobs;
00285   TBLOB* blob;
00286   for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);  // empty body: walks to the last rebuilt blob
00287   blob->next = word2->rebuild_word->blobs;
00288   word2->chopped_word->blobs = NULL;  // blobs are now linked into word; presumably keeps delete word2 from freeing them
00289   word2->rebuild_word->blobs = NULL;
00290   // Copy the seams onto the end of the word1 seam_array.
00291   // Since the seam list is one element short, an empty seam marking the
00292   // end of the last blob in the first word is needed first.
00293   word->seam_array = add_seam(word->seam_array,
00294                               new_seam(0.0, best_split_pt, NULL, NULL, NULL));
00295   for (int i = 0; i < array_count(word2->seam_array); ++i) {
00296     SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
00297     array_value(word2->seam_array, i) = NULL;  // clear the source slot once the pointer is taken
00298     word->seam_array = add_seam(word->seam_array, seam);
00299   }
00300   word->best_state += word2->best_state;
00301   // Append the word choices.
00302   *word->best_choice += *word2->best_choice;
00303   *word->raw_choice += *word2->raw_choice;
00304
00305   // How many alt choices from each should we try to get?
00306   const int kAltsPerPiece = 2;
00307   // When do we start throwing away extra alt choices?
00308   const int kTooManyAltChoices = 100;
00309
00310   if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
00311     // Construct the cartesian product of the alt choices of word(1) and word2.
00312     int num_first_alt_choices = word->alt_choices.size();  // count before any products are appended
00313     // Nota Bene: For the main loop here, we leave in place word1-only
00314     // alt_choices in
00315     //   word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
00316     // These will get fused with the best choices for word2 below.
00317     for (int j = 1; j < word2->alt_choices.size() &&  // j starts at 1: word2's alt 0 is appended in the last loop
00318          (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);  // go past kAltsPerPiece only while under the cap
00319          j++) {
00320       for (int i = 0; i < num_first_alt_choices &&
00321            (i <= kAltsPerPiece ||
00322             word->alt_choices.size() < kTooManyAltChoices);
00323            i++) {
00324         WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);  // copy, so alt_choices[i] stays word1-only for now
00325         *wc += *word2->alt_choices[j];
00326         word->alt_choices.push_back(wc);
00327
00328         word->alt_states.push_back(GenericVector<int>());
00329         GenericVector<int> &alt_state = word->alt_states.back();
00330         alt_state += word->alt_states[i];  // new state = word state i followed by word2 state j
00331         alt_state += word2->alt_states[j];
00332       }
00333     }
00334     // Now that we've filled in as many alternates as we want, paste the best
00335     // choice for word2 onto the original word alt_choices.
00336     for (int i = 0; i < num_first_alt_choices; i++) {
00337       *word->alt_choices[i] += *word2->alt_choices[0];
00338       word->alt_states[i] += word2->alt_states[0];
00339     }
00340   }
00341
00342   // Restore the pointer to original blamer bundle and combine blamer
00343   // information recorded in the splits.
00344   if (orig_bb != NULL) {
00345     IncorrectResultReason irr = orig_bb->incorrect_result_reason;
00346     if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";  // keep the split-search text only for IRR_NO_TRUTH_SPLIT
00347     if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&  // part 1 recorded a specific blame
00348         word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
00349         word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
00350       blamer_debug += "Blame from part 1: ";
00351       blamer_debug += word->blamer_bundle->debug;
00352       irr = word->blamer_bundle->incorrect_result_reason;
00353     }
00354     if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&  // part 2 recorded a specific blame
00355         word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
00356         word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
00357       blamer_debug += "Blame from part 2: ";
00358       blamer_debug += word2->blamer_bundle->debug;
00359       if (irr == IRR_CORRECT) {
00360         irr = word2->blamer_bundle->incorrect_result_reason;
00361       } else if (irr != word2->blamer_bundle->incorrect_result_reason) {  // a different reason from part 2 collapses to unknown
00362         irr = IRR_UNKNOWN;
00363       }
00364     }
00365     delete word->blamer_bundle;  // drop the temporary first-piece bundle
00366     word->blamer_bundle = orig_bb;
00367     word->blamer_bundle->incorrect_result_reason = irr;
00368     if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
00369       word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
00370                                     wordrec_debug_blamer);
00371     }
00372   }
00373   delete word2;  // its blob pointers were set to NULL above
00374 }
00375
00376 }  // namespace tesseract