// Tesseract 3.02 -- source listing of wordrec/bestfirst.cpp (best-first
// segmentation search). The lines below were recovered from a generated
// source-browser page; original in-page line numbers have been removed.
/* -*-C-*-
 ********************************************************************************
 *
 * File:        bestfirst.c  (Formerly bestfirst.c)
 * Description: Best first search functions
 * Author:      Mark Seaman, OCR Technology
 * Created:     Mon May 14 11:23:29 1990
 * Modified:    Tue Jul 30 16:08:47 1991 (Mark Seaman) marks@hpgrlt
 * Language:    C
 * Package:     N/A
 * Status:      Experimental (Do Not Distribute)
 *
 * (c) Copyright 1990, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 ***************************************************************************/

/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/

#include <assert.h>

#include "associate.h"
#include "bestfirst.h"
#include "baseline.h"
#include "bitvec.h"
#include "dict.h"
#include "freelist.h"
#include "globals.h"
#include "helpers.h"
#include "pageres.h"
#include "permute.h"
#include "plotseg.h"
#include "ratngs.h"
#include "states.h"
#include "stopper.h"
#include "structures.h"
#include "unicharset.h"
#include "wordclass.h"
#include "wordrec.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

// NOTE(review): declared but never referenced in this file -- presumably a
// leftover; confirm against the rest of the project before removing.
void call_caller();

// Prints a one-line debug trace of the given segmentation state:
// "<message> [<printable segmentation>]".
static void log_state(const char * message,
                      int num_joints,
                      STATE *state) {
  STRING segstate;
  print_state(state, num_joints, &segstate);
  tprintf("%20s [%40s]\n", message, segstate.string());
}

// Overload of the above that also reports the state's search priority.
static void log_state(const char * message,
                      int num_joints,
                      STATE *state,
                      float priority) {
  STRING segstate;
  print_state(state, num_joints, &segstate);
  tprintf("%20s [%40s], priority %8.3f\n", message,
          segstate.string(), priority);
}


/*----------------------------------------------------------------------
          F u n c t i o n s
----------------------------------------------------------------------*/

namespace tesseract {
/**
 * best_first_search
 *
 * Runs a best-first search over word segmentations. Starting from the
 * given initial state, states are popped from a priority heap
 * (the_search->open_states), evaluated with evaluate_state(), recorded in
 * a closed-set hash, and expanded with expand_node() until either an
 * acceptable choice stops the search (keep_going == 0), the state budget
 * wordrec_num_seg_states is exceeded, or the heap is exhausted.
 * On return, *state holds the best segmentation found; the best word
 * choices are accumulated into word->best_choice / word->raw_choice and
 * best_char_choices via the SEARCH_RECORD set up by new_search().
 */
void Wordrec::best_first_search(CHUNKS_RECORD *chunks_record,
                                BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                WERD_RES *word,
                                STATE *state,
                                DANGERR *fixpt,
                                STATE *best_state) {
  SEARCH_RECORD *the_search;
  inT16 keep_going;
  STATE guided_state;   // not used

  int num_joints = chunks_record->ratings->dimension() - 1;
  the_search = new_search(chunks_record, num_joints, best_char_choices,
                          word->best_choice, word->raw_choice, state);

  // The default state is initialized as the best choice.  In order to apply
  // segmentation adjustment, or any other contextual processing in permute,
  // we give the best choice a poor rating to force the processed raw choice
  // to be promoted to best choice.
  the_search->best_choice->set_rating(WERD_CHOICE::kBadRating);
  evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle);
  if (wordrec_debug_level > 1) {
    tprintf("\n\n\n =========== BestFirstSearch ==============\n");
    word->best_choice->print("**Initial BestChoice**");
  }

  // Admission threshold for pushing new states: twice the priority of the
  // initial state, but no tighter than wordrec_worst_state.
  FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search);
  if (worst_priority < wordrec_worst_state)
    worst_priority = wordrec_worst_state;
  if (wordrec_debug_level > 1) {
    log_state("BestFirstSearch", num_joints, best_state);
  }

  guided_state = *state;
  do {
    /* Look for answer */
    STATE orig_state = *the_search->this_state;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      guided_state = *(the_search->this_state);
      keep_going = evaluate_state(chunks_record, the_search, fixpt,
                                  word->blamer_bundle);
      hash_add (the_search->closed_states, the_search->this_state);

      if (!keep_going ||
          (the_search->num_states > wordrec_num_seg_states)) {
        if (wordrec_debug_level > 1)
          tprintf("Breaking best_first_search on keep_going %s numstates %d\n",
                  ((keep_going) ? "T" :"F"), the_search->num_states);
        free_state (the_search->this_state);
        break;
      }

      FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record,
                                                           the_search);
      if (new_worst_priority < worst_priority) {
        if (wordrec_debug_level > 1)
          tprintf("Lowering WorstPriority %f --> %f\n",
                  worst_priority, new_worst_priority);
        // Tighten the threshold for admitting new paths as better search
        // candidates are found. After lowering this threshold, we can safely
        // popout everything that is worse than this score also.
        worst_priority = new_worst_priority;
      }
      expand_node(worst_priority, chunks_record, the_search);
    }

    if (wordrec_debug_level > 1) {
      log_state("Done with", the_search->num_joints, &orig_state);
    }
    free_state (the_search->this_state);
    num_popped++;
    the_search->this_state = pop_queue (the_search->open_states);
    if (wordrec_debug_level > 1 && !the_search->this_state)
      tprintf("No more states to evalaute after %d evals", num_popped);
  } while (the_search->this_state);

  // Copy the winning segmentation back to the caller's state.
  state->part1 = the_search->best_state->part1;
  state->part2 = the_search->best_state->part2;
  if (wordrec_debug_level > 1) {
    tprintf("\n\n\n =========== BestFirstSearch ==============\n");
    // best_choice->debug_string().string());
    word->best_choice->print("**Final BestChoice**");
  }
  // save the best_state stats
  delete_search(the_search);
}

/**
 * delete_search
 *
 * Frees everything owned by a SEARCH_RECORD: the first/best states, the
 * closed-set hash table, the open-states heap (freeing each queued state),
 * and the record itself.  The best_choice/raw_choice/best_char_choices
 * pointers are caller-owned and are not released here.
 */
void Wordrec::delete_search(SEARCH_RECORD *the_search) {
  float closeness;

  // NOTE(review): closeness (hamming distance between first and best state,
  // normalized by joint count) is computed but never used here -- presumably
  // a remnant of removed statistics gathering.
  closeness = (the_search->num_joints ?
    (hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state),
                      reinterpret_cast<uinT32*>(the_search->best_state), 2) /
     (float) the_search->num_joints) : 0.0f);

  free_state (the_search->first_state);
  free_state (the_search->best_state);

  free_hash_table(the_search->closed_states);
  FreeHeapData (the_search->open_states, (void_dest) free_state);

  memfree(the_search);
}


/**
 * evaluate_chunks
 *
 * Classifies (or fetches cached ratings for) each chunk group described by
 * search_state and collects the resulting BLOB_CHOICE_LISTs into a new
 * vector.  Also records per-piece certainty/match/width/gap statistics in
 * last_segmentation.  Returns NULL (after deleting the partial vector) if
 * any piece fails to produce ratings.  Caller owns the returned vector.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::evaluate_chunks(CHUNKS_RECORD *chunks_record,
                                                  SEARCH_STATE search_state,
                                                  BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST *blob_choices;
  BLOB_CHOICE_IT blob_choice_it;
  int i;
  int x = 0;  // First chunk index of the current piece.
  int y;      // Last chunk index of the current piece (inclusive).

  // Iterate sub-paths.
  for (i = 1; i <= search_state[0] + 1; i++) {
    if (i > search_state[0])
      y = count_blobs (chunks_record->chunks) - 1;
    else
      y = x + search_state[i];

    // Process one square.

    // Classify if needed.
    blob_choices = get_piece_rating(chunks_record->ratings,
                                    chunks_record->chunks,
                                    chunks_record->word_res->denorm,
                                    chunks_record->splits,
                                    x, y, blamer_bundle);

    if (blob_choices == NULL) {
      delete char_choices;
      return (NULL);
    }

    // Add permuted ratings.
    blob_choice_it.set_to_list(blob_choices);
    last_segmentation[i - 1].certainty = blob_choice_it.data()->certainty();
    last_segmentation[i - 1].match = blob_choice_it.data()->rating();

    last_segmentation[i - 1].width =
      AssociateUtils::GetChunksWidth(chunks_record->chunk_widths, x, y);
    last_segmentation[i - 1].gap =
      AssociateUtils::GetChunksGap(chunks_record->chunk_widths, y);

    *char_choices += blob_choices;
    x = y + 1;
  }
  return (char_choices);
}

/**
 * evaluate_state
 *
 * Evaluates the search's current state: classifies its chunk grouping via
 * evaluate_chunks(), permutes the character choices to possibly update
 * best_choice/raw_choice, and, when the best choice improved, snapshots the
 * state into best_state and refreshes the cached character widths.
 * Returns nonzero to continue searching; returns 0 (keep_going == false)
 * once an acceptable choice is found.
 */
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt,
                              BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices;
  SEARCH_STATE chunk_groups;
  float rating_limit = the_search->best_choice->rating();
  bool keep_going = true;
  PIECES_STATE widths;

  the_search->num_states++;
  chunk_groups = bin_to_chunks(the_search->this_state,
                               the_search->num_joints);
  bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
  if (wordrec_debug_level > 1) {
    log_state("Evaluating state", the_search->num_joints,
              the_search->this_state);
  }
  getDict().LogNewSegmentation(widths);

  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
  // -1.0f disables the segmentation-cost adjustment until it is recomputed.
  getDict().SetWordsegRatingAdjustFactor(-1.0f);
  bool updated_best_choice = false;
  if (char_choices != NULL && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias);
    updated_best_choice =
      getDict().permute_characters(*char_choices,
                                   the_search->best_choice,
                                   the_search->raw_choice);
    bool replaced = false;
    if (updated_best_choice) {
      if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                     NULL, ASSOCIATOR_CALLER, &replaced)) {
        keep_going = false;
      }
      CopyCharChoices(*char_choices, the_search->best_char_choices);
    }
  }
  getDict().SetWordsegRatingAdjustFactor(-1.0f);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation (chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  // A changed rating implies (and must coincide with) an updated best choice.
  if (rating_limit != the_search->best_choice->rating()) {
    ASSERT_HOST(updated_best_choice);
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else {
    ASSERT_HOST(!updated_best_choice);
    if (char_choices != NULL) fixpt->clear();
  }

  if (char_choices != NULL) delete char_choices;
  memfree(chunk_groups);

  return (keep_going);
}


/**
 * rebuild_current_state
 *
 * Rebuilds word->rebuild_word (and word->best_state) from the given
 * segmentation state, reusing classifier results from old_choices and the
 * ratings matrix where possible, re-classifying merged blobs otherwise.
 * Handles character fragments by combining fragment chunks into single
 * characters (see rebuild_fragments).  Takes ownership of old_choices
 * (deleted before return) and returns a newly allocated choices vector,
 * one BLOB_CHOICE_LIST per character of the final word.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    WERD_RES *word,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  // Walk forward so that x/y end up at the LAST chunk group; the main loop
  // below then walks backwards through the word.
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    // No fragment info: one entry per chunk group, each of length 1.
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed form original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list of for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward());
      choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return char_choices;
}

/**
 * expand_node
 *
 * Generates the successors of the search's current state by toggling each
 * joint bit in turn (mask ^ old_state), and pushes each unseen successor
 * whose priority beats worst_priority onto the open-states heap.  The
 * state is split across two words: the first loop flips bits in part1
 * (presumably joints above 32 -- note the num_joints - 1 - 32 mask), the
 * second flips bits in part2.  this_state is restored piecewise via
 * old_state as the loops run.
 */
void Wordrec::expand_node(FLOAT32 worst_priority,
                          CHUNKS_RECORD *chunks_record,
                          SEARCH_RECORD *the_search) {
  STATE old_state;
  int x;
  // NOTE(review): when num_joints <= 32 this shift amount is negative, but
  // the part1 loop below is then skipped; confirm mask is never consumed in
  // that case.
  uinT32 mask = 1 << (the_search->num_joints - 1 - 32);

  old_state.part1 = the_search->this_state->part1;
  old_state.part2 = the_search->this_state->part2;

  // We need to expand the search more intelligently, or we get stuck
  // with a bad starting segmentation in a long word sequence as in CJK.
  // Expand a child node only if it is within the global bound, and no
  // worse than 2x of its parent.
  // TODO(dsl): There is some redudency here in recomputing the priority,
  // and in filtering of old_merit and worst_priority.
  the_search->this_state->part2 = old_state.part2;
  for (x = the_search->num_joints; x > 32; x--) {
    the_search->this_state->part1 = mask ^ old_state.part1;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (new_merit < worst_priority) {
        if (wordrec_debug_level > 1)
          log_state("Pushing segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit, wordrec_debug_level > 1);
      } else {
        if (wordrec_debug_level > 1)
          log_state("Ignore weak segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
      }
    }
    mask >>= 1;
  }

  if (the_search->num_joints > 32) {
    mask = 1 << 31;
  }
  else {
    mask = 1 << (the_search->num_joints - 1);
  }

  the_search->this_state->part1 = old_state.part1;
  // x now holds min(num_joints, 32): flip each remaining joint in part2.
  while (x--) {
    the_search->this_state->part2 = mask ^ old_state.part2;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (new_merit < worst_priority) {
        if (wordrec_debug_level > 1)
          log_state("Pushing segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit, wordrec_debug_level > 1);
      } else {
        if (wordrec_debug_level > 1)
          log_state("Ignoring weak segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
      }
    }
    mask >>= 1;
  }
}

/**
 * new_search
 *
 * Allocates and initializes a SEARCH_RECORD: an empty open-states heap
 * sized from wordrec_num_seg_states, a fresh closed-set hash table, and
 * copies of the initial state for this_state/first_state/best_state.
 * best_choice, raw_choice and best_char_choices are borrowed from the
 * caller, not copied.  Caller must release the record with delete_search().
 */
SEARCH_RECORD *Wordrec::new_search(CHUNKS_RECORD *chunks_record,
                                   int num_joints,
                                   BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                   WERD_CHOICE *best_choice,
                                   WERD_CHOICE *raw_choice,
                                   STATE *state) {
  SEARCH_RECORD *this_search;

  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));

  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
  this_search->closed_states = new_hash_table();

  if (state)
    this_search->this_state = new_state (state);
  else
    cprintf ("error: bad initial state in new_search\n");

  this_search->first_state = new_state (this_search->this_state);
  this_search->best_state = new_state (this_search->this_state);

  this_search->best_choice = best_choice;
  this_search->raw_choice = raw_choice;
  this_search->best_char_choices = best_char_choices;

  this_search->num_joints = num_joints;
  this_search->num_states = 0;
  this_search->before_best = 0;
  this_search->segcost_bias = 0;

  return (this_search);
}

/**
 * pop_queue
 *
 * Removes and returns the lowest-key state from the open-states heap, or
 * NULL if the heap is empty.  Ownership of the returned STATE passes to
 * the caller (the search loop frees it with free_state).
 */
STATE *Wordrec::pop_queue(HEAP *queue) {
  HEAPENTRY entry;

  if (GetTopOfHeap (queue, &entry) == TESS_HEAP_OK) {
#ifndef GRAPHICS_DISABLED
    if (wordrec_display_segmentations) {
      cprintf ("eval state: %8.3f ", entry.Key);
      print_state ("", (STATE *) entry.Data, num_joints);
    }
#endif
    return ((STATE *) entry.Data);
  }
  else {
    return (NULL);
  }
}

/**
 * push_queue
 *
 * Pushes a COPY of state onto the heap keyed by priority, but only when
 * priority beats worst_priority and the heap has room.  When the heap is
 * full the state is silently dropped (with a debug message if requested).
 */
void Wordrec::push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
                         FLOAT32 priority, bool debug) {
  HEAPENTRY entry;

  if (priority < worst_priority) {
    if (SizeOfHeap (queue) >= MaxSizeOfHeap(queue)) {
      if (debug) tprintf("Heap is Full\n");
      return;
    }
    entry.Data = (char *) new_state (state);
    num_pushed++;
    entry.Key = priority;
    HeapStore(queue, &entry);
  }
}

/**
 * replace_char_widths
 *
 * Replaces chunks_record->char_widths with a freshly allocated
 * WIDTH_RECORD built from the widths/gaps recorded in last_segmentation
 * for the given chunk grouping.  The previous record is freed first.
 */
void Wordrec::replace_char_widths(CHUNKS_RECORD *chunks_record,
                                  SEARCH_STATE state) {
  WIDTH_RECORD *width_record;
  int num_blobs;
  int i;

  free_widths (chunks_record->char_widths);

  num_blobs = state[0] + 1;
  // Layout: num_chars followed by interleaved width/gap pairs -- hence
  // 2 * num_blobs ints.
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

  for (i = 0; i < num_blobs; i++) {
    width_record->widths[2 * i] = last_segmentation[i].width;

    if (i + 1 < num_blobs)
      width_record->widths[2 * i + 1] = last_segmentation[i].gap;
  }
  chunks_record->char_widths = width_record;
}

// Creates a fake blob choice from the combination of the given fragments.
// unichar is the class to be made from the combination,
// expanded_fragment_lengths[choice_index] is the number of fragments to use.
// old_choices[choice_index] has the classifier output for each fragment.
// choice index initially indexes the last fragment and should be decremented
// expanded_fragment_lengths[choice_index] times to get the earlier fragments.
// Guarantees to return something non-null, or abort!
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  // Accumulated over all fragments: summed rating, worst (minimum)
  // certainty, and the intersection of the fragments' x-height ranges.
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}

// Creates a joined copy of the blobs between x and y (inclusive) and
// inserts as the first blob at word->rebuild_word->blobs.
// Returns a deep copy of the classifier results for the blob.
BLOB_CHOICE_LIST *Wordrec::join_blobs_and_classify(
    WERD_RES* word, int x, int y, int choice_index, MATRIX *ratings,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  // Join parts to make the blob if needed.
  if (x != y)
    join_pieces(word->chopped_word->blobs, word->seam_array, x, y);
  TBLOB *blob = word->chopped_word->blobs;
  for (int i = 0; i < x; i++) {
    blob = blob->next;
  }
  // Deep copy this blob into the output word.
  TBLOB* copy_blob = new TBLOB(*blob);
  copy_blob->next = word->rebuild_word->blobs;
  word->rebuild_word->blobs = copy_blob;

  BLOB_CHOICE_LIST *choices = NULL;
  // First check to see if we can look up the classificaiton
  // in old_choices (if there is no need to merge blobs).
  if (choice_index >= 0 && old_choices != NULL) {
    choices = old_choices->get(choice_index);
    old_choices->set(NULL, choice_index);
  }
  // The ratings matrix filled in by the associator will contain the next most
  // up-to-date classification info. Thus we look up the classification there
  // next, and only call classify_blob() if the classification is not found.
  if (choices == NULL && ratings != NULL) {
    choices = ratings->get(x, y);
    if (choices != NOT_CLASSIFIED) {
      ratings->put(x, y, NULL);
    }
  }
  // Get the choices for the blob by classification if necessary.
  if (choices == NULL) {
    choices = classify_blob(blob, word->denorm, "rebuild", Orange,
                            word->blamer_bundle);
  }
  // Undo join_pieces to restore the chopped word to its fully chopped state.
  if (x != y)
    break_pieces(blob, word->seam_array, x, y);
  return choices;
}

}  // namespace tesseract