Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: tfacepp.cpp (Formerly tface++.c) 00003 * Description: C++ side of the C/C++ Tess/Editor interface. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 15:39:23 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #pragma warning(disable:4305) // int/float warnings 00023 #pragma warning(disable:4800) // int/bool warnings 00024 #endif 00025 00026 #include <math.h> 00027 00028 #include "mfcpch.h" 00029 #ifdef __UNIX__ 00030 #include <assert.h> 00031 #endif 00032 #include "errcode.h" 00033 #include "ratngs.h" 00034 #include "reject.h" 00035 #include "werd.h" 00036 #include "tfacep.h" 00037 #include "tfacepp.h" 00038 #include "tessvars.h" 00039 #include "globals.h" 00040 #include "reject.h" 00041 #include "tesseractclass.h" 00042 00043 #define MAX_UNDIVIDED_LENGTH 24 00044 00045 00046 00047 /********************************************************************** 00048 * recog_word 00049 * 00050 * Convert the word to tess form and pass it to the tess segmenter. 00051 * Convert the output back to editor form. 00052 **********************************************************************/ 00053 namespace tesseract { 00054 void Tesseract::recog_word(WERD_RES *word, 00055 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00056 ASSERT_HOST(word->chopped_word->blobs != NULL); 00057 recog_word_recursive(word, blob_choices); 00058 word->SetupBoxWord(); 00059 if ((word->best_choice->length() != word->box_word->length()) || 00060 (word->best_choice->length() != blob_choices->length())) { 00061 tprintf("recog_word ASSERT FAIL String:\"%s\"; " 00062 "Strlen=%d; #Blobs=%d; #Choices=%d\n", 00063 word->best_choice->debug_string().string(), 00064 word->best_choice->length(), word->box_word->length(), 00065 blob_choices->length()); 00066 } 00067 ASSERT_HOST(word->best_choice->length() == word->box_word->length()); 00068 ASSERT_HOST(word->best_choice->length() == blob_choices->length()); 00069 if (tessedit_override_permuter) { 00070 /* Override the permuter type if a straight dictionary check disagrees. */ 00071 uinT8 perm_type = word->best_choice->permuter(); 00072 if ((perm_type != SYSTEM_DAWG_PERM) && 00073 (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) { 00074 uinT8 real_dict_perm_type = dict_word(*word->best_choice); 00075 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || 00076 (real_dict_perm_type == FREQ_DAWG_PERM) || 00077 (real_dict_perm_type == USER_DAWG_PERM)) && 00078 (alpha_count(word->best_choice->unichar_string().string(), 00079 word->best_choice->unichar_lengths().string()) > 0)) { 00080 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm 00081 } 00082 } 00083 if (tessedit_rejection_debug && 00084 perm_type != word->best_choice->permuter()) { 00085 tprintf("Permuter Type Flipped from %d to %d\n", 00086 perm_type, word->best_choice->permuter()); 00087 } 00088 } 00089 // Factored out from control.cpp 00090 ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL)); 00091 if (word->best_choice == NULL || word->best_choice->length() == 0 || 00092 strspn(word->best_choice->unichar_string().string(), " ") == 00093 word->best_choice->length()) { 00094 word->tess_failed = true; 00095 word->reject_map.initialise(word->box_word->length()); 00096 word->reject_map.rej_word_tess_failure(); 00097 } else { 00098 word->tess_failed = false; 00099 } 00100 } 00101 00102 00103 /********************************************************************** 00104 * recog_word_recursive 00105 * 00106 * Convert the word to tess form and pass it to the tess segmenter. 00107 * Convert the output back to editor form. 00108 **********************************************************************/ 00109 void Tesseract::recog_word_recursive(WERD_RES *word, 00110 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00111 int word_length = word->chopped_word->NumBlobs(); // no of blobs 00112 if (word_length > MAX_UNDIVIDED_LENGTH) { 00113 return split_and_recog_word(word, blob_choices); 00114 } 00115 int initial_blob_choice_len = blob_choices->length(); 00116 BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word); 00117 00118 // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices. 00119 BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices); 00120 for (int i = 0; i < tess_ratings->length(); ++i) { 00121 blob_choices_it.add_to_end(tess_ratings->get(i)); 00122 } 00123 delete tess_ratings; 00124 00125 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output. 00126 // Pad raw_choice with spaces if needed. 00127 if (word->raw_choice->length() < word_length) { 00128 UNICHAR_ID space_id = unicharset.unichar_to_id(" "); 00129 while (word->raw_choice->length() < word_length) { 00130 word->raw_choice->append_unichar_id(space_id, 1, 0.0, 00131 word->raw_choice->certainty()); 00132 } 00133 } 00134 00135 // Do sanity checks and minor fixes on best_choice. 00136 if (word->best_choice->length() > word_length) { 00137 word->best_choice->make_bad(); // should never happen 00138 tprintf("recog_word: Discarded long string \"%s\"" 00139 " (%d characters vs %d blobs)\n", 00140 word->best_choice->unichar_string().string(), 00141 word->best_choice->length(), word_length); 00142 tprintf("Word is at:"); 00143 word->word->bounding_box().print(); 00144 } 00145 if (blob_choices->length() - initial_blob_choice_len != word_length) { 00146 word->best_choice->make_bad(); // force rejection 00147 tprintf("recog_word: Choices list len:%d; blob lists len:%d\n", 00148 blob_choices->length(), word_length); 00149 blob_choices_it.set_to_list(blob_choices); // list of lists 00150 while (blob_choices->length() - initial_blob_choice_len < word_length) { 00151 blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one 00152 tprintf("recog_word: Added dummy choice list\n"); 00153 } 00154 while (blob_choices->length() - initial_blob_choice_len > word_length) { 00155 blob_choices_it.move_to_last(); // should never happen 00156 delete blob_choices_it.extract(); 00157 tprintf("recog_word: Deleted choice list\n"); 00158 } 00159 } 00160 if (word->best_choice->length() < word_length) { 00161 UNICHAR_ID space_id = unicharset.unichar_to_id(" "); 00162 while (word->best_choice->length() < word_length) { 00163 word->best_choice->append_unichar_id(space_id, 1, 0.0, 00164 word->best_choice->certainty()); 00165 } 00166 } 00167 } 00168 00169 00170 /********************************************************************** 00171 * split_and_recog_word 00172 * 00173 * Split the word into 2 smaller pieces at the largest gap. 00174 * Recognize the pieces and stick the results back together. 00175 **********************************************************************/ 00176 00177 void Tesseract::split_and_recog_word(WERD_RES *word, 00178 BLOB_CHOICE_LIST_CLIST *blob_choices) { 00179 // Find the biggest blob gap in the chopped_word. 00180 int bestgap = -MAX_INT32; 00181 TPOINT best_split_pt; 00182 TBLOB* best_end = NULL; 00183 TBLOB* prev_blob = NULL; 00184 for (TBLOB* blob = word->chopped_word->blobs; blob != NULL; 00185 blob = blob->next) { 00186 if (prev_blob != NULL) { 00187 TBOX prev_box = prev_blob->bounding_box(); 00188 TBOX blob_box = blob->bounding_box(); 00189 int gap = blob_box.left() - prev_box.right(); 00190 if (gap > bestgap) { 00191 bestgap = gap; 00192 best_end = prev_blob; 00193 best_split_pt.x = (prev_box.right() + blob_box.left()) / 2; 00194 best_split_pt.y = (prev_box.top() + prev_box.bottom() + 00195 blob_box.top() + blob_box.bottom()) / 4; 00196 } 00197 } 00198 prev_blob = blob; 00199 } 00200 ASSERT_HOST(best_end != NULL); 00201 ASSERT_HOST(best_end->next != NULL); 00202 00203 // Make a copy of the word to put the 2nd half in. 00204 WERD_RES* word2 = new WERD_RES(*word); 00205 // Blow away the copied chopped_word, as we want to work with the blobs 00206 // from the input chopped_word so the seam_arrays can be merged. 00207 delete word2->chopped_word; 00208 word2->chopped_word = new TWERD; 00209 word2->chopped_word->blobs = best_end->next; 00210 best_end->next = NULL; 00211 // Make a new seamarray on both words. 00212 free_seam_list(word->seam_array); 00213 word->seam_array = start_seam_list(word->chopped_word->blobs); 00214 word2->seam_array = start_seam_list(word2->chopped_word->blobs); 00215 BlamerBundle *orig_bb = word->blamer_bundle; 00216 STRING blamer_debug; 00217 // Try to adjust truth information. 00218 if (orig_bb != NULL) { 00219 // Find truth boxes that correspond to the split in the blobs. 00220 int b; 00221 int begin2_truth_index = -1; 00222 if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH && 00223 orig_bb->truth_has_char_boxes) { 00224 int end1_x = best_end->bounding_box().right(); 00225 int begin2_x = word2->chopped_word->blobs->bounding_box().left(); 00226 blamer_debug = "Looking for truth split at"; 00227 blamer_debug.add_str_int(" end1_x ", end1_x); 00228 blamer_debug.add_str_int(" begin2_x ", begin2_x); 00229 blamer_debug += "\nnorm_truth_word boxes:\n"; 00230 if (orig_bb->norm_truth_word.length() > 1) { 00231 orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug); 00232 for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) { 00233 orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug); 00234 if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) < 00235 orig_bb->norm_box_tolerance) && 00236 (abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) < 00237 orig_bb->norm_box_tolerance)) { 00238 begin2_truth_index = b; 00239 blamer_debug += "Split found\n"; 00240 break; 00241 } 00242 } 00243 } 00244 } 00245 // Populate truth information in word and word2 with the first and second 00246 // part of the original truth. 00247 word->blamer_bundle = new BlamerBundle(); 00248 word2->blamer_bundle = new BlamerBundle(); 00249 if (begin2_truth_index > 0) { 00250 word->blamer_bundle->truth_has_char_boxes = true; 00251 word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; 00252 word2->blamer_bundle->truth_has_char_boxes = true; 00253 word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance; 00254 BlamerBundle *curr_bb = word->blamer_bundle; 00255 for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) { 00256 if (b == begin2_truth_index) curr_bb = word2->blamer_bundle; 00257 curr_bb->norm_truth_word.InsertBox( 00258 b, orig_bb->norm_truth_word.BlobBox(b)); 00259 curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b)); 00260 curr_bb->truth_text.push_back(orig_bb->truth_text[b]); 00261 } 00262 } else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) { 00263 word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; 00264 word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH; 00265 } else { 00266 blamer_debug += "Truth split not found"; 00267 blamer_debug += orig_bb->truth_has_char_boxes ? 00268 "\n" : " (no truth char boxes)\n"; 00269 word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, 00270 NULL, wordrec_debug_blamer); 00271 word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug, 00272 NULL, wordrec_debug_blamer); 00273 } 00274 } 00275 00276 // Recognize the first part of the word. 00277 recog_word_recursive(word, blob_choices); 00278 // Recognize the second part of the word. 00279 recog_word_recursive(word2, blob_choices); 00280 // Tack the word2 outputs onto the end of the word outputs. 00281 // New blobs might have appeared on the end of word1. 00282 for (best_end = word->chopped_word->blobs; best_end->next != NULL; 00283 best_end = best_end->next); 00284 best_end->next = word2->chopped_word->blobs; 00285 TBLOB* blob; 00286 for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next); 00287 blob->next = word2->rebuild_word->blobs; 00288 word2->chopped_word->blobs = NULL; 00289 word2->rebuild_word->blobs = NULL; 00290 // Copy the seams onto the end of the word1 seam_array. 00291 // Since the seam list is one element short, an empty seam marking the 00292 // end of the last blob in the first word is needed first. 00293 word->seam_array = add_seam(word->seam_array, 00294 new_seam(0.0, best_split_pt, NULL, NULL, NULL)); 00295 for (int i = 0; i < array_count(word2->seam_array); ++i) { 00296 SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i)); 00297 array_value(word2->seam_array, i) = NULL; 00298 word->seam_array = add_seam(word->seam_array, seam); 00299 } 00300 word->best_state += word2->best_state; 00301 // Append the word choices. 00302 *word->best_choice += *word2->best_choice; 00303 *word->raw_choice += *word2->raw_choice; 00304 00305 // How many alt choices from each should we try to get? 00306 const int kAltsPerPiece = 2; 00307 // When do we start throwing away extra alt choices? 00308 const int kTooManyAltChoices = 100; 00309 00310 if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) { 00311 // Construct the cartesian product of the alt choices of word(1) and word2. 00312 int num_first_alt_choices = word->alt_choices.size(); 00313 // Nota Bene: For the main loop here, we leave in place word1-only 00314 // alt_choices in 00315 // word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1] 00316 // These will get fused with the best choices for word2 below. 00317 for (int j = 1; j < word2->alt_choices.size() && 00318 (j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices); 00319 j++) { 00320 for (int i = 0; i < num_first_alt_choices && 00321 (i <= kAltsPerPiece || 00322 word->alt_choices.size() < kTooManyAltChoices); 00323 i++) { 00324 WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]); 00325 *wc += *word2->alt_choices[j]; 00326 word->alt_choices.push_back(wc); 00327 00328 word->alt_states.push_back(GenericVector<int>()); 00329 GenericVector<int> &alt_state = word->alt_states.back(); 00330 alt_state += word->alt_states[i]; 00331 alt_state += word2->alt_states[j]; 00332 } 00333 } 00334 // Now that we've filled in as many alternates as we want, paste the best 00335 // choice for word2 onto the original word alt_choices. 00336 for (int i = 0; i < num_first_alt_choices; i++) { 00337 *word->alt_choices[i] += *word2->alt_choices[0]; 00338 word->alt_states[i] += word2->alt_states[0]; 00339 } 00340 } 00341 00342 // Restore the pointer to original blamer bundle and combine blamer 00343 // information recorded in the splits. 00344 if (orig_bb != NULL) { 00345 IncorrectResultReason irr = orig_bb->incorrect_result_reason; 00346 if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = ""; 00347 if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT && 00348 word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && 00349 word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { 00350 blamer_debug += "Blame from part 1: "; 00351 blamer_debug += word->blamer_bundle->debug; 00352 irr = word->blamer_bundle->incorrect_result_reason; 00353 } 00354 if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT && 00355 word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH && 00356 word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) { 00357 blamer_debug += "Blame from part 2: "; 00358 blamer_debug += word2->blamer_bundle->debug; 00359 if (irr == IRR_CORRECT) { 00360 irr = word2->blamer_bundle->incorrect_result_reason; 00361 } else if (irr != word2->blamer_bundle->incorrect_result_reason) { 00362 irr = IRR_UNKNOWN; 00363 } 00364 } 00365 delete word->blamer_bundle; 00366 word->blamer_bundle = orig_bb; 00367 word->blamer_bundle->incorrect_result_reason = irr; 00368 if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) { 00369 word->blamer_bundle->SetBlame(irr, blamer_debug, NULL, 00370 wordrec_debug_blamer); 00371 } 00372 } 00373 delete word2; 00374 } 00375 00376 } // namespace tesseract