// Tesseract 3.02 -- source listing of wordrec/bestfirst.cpp (best-first
// segmentation search). The lines below were recovered from a generated
// source-browser page; original in-page line numbers have been removed.
/* -*-C-*-
 ********************************************************************************
 *
 * File:        bestfirst.c  (Formerly bestfirst.c)
 * Description: Best first search functions
 * Author:      Mark Seaman, OCR Technology
 * Created:     Mon May 14 11:23:29 1990
 * Modified:    Tue Jul 30 16:08:47 1991 (Mark Seaman) marks@hpgrlt
 * Language:    C
 * Package:     N/A
 * Status:      Experimental (Do Not Distribute)
 *
 * (c) Copyright 1990, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 ***************************************************************************/

/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/

#include <assert.h>

#include "associate.h"
#include "bestfirst.h"
#include "baseline.h"
#include "bitvec.h"
#include "dict.h"
#include "freelist.h"
#include "globals.h"
#include "helpers.h"
#include "pageres.h"
#include "permute.h"
#include "plotseg.h"
#include "ratngs.h"
#include "states.h"
#include "stopper.h"
#include "structures.h"
#include "unicharset.h"
#include "wordclass.h"
#include "wordrec.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

// NOTE(review): declared but never referenced in this file -- presumably a
// leftover; confirm against the rest of the project before removing.
void call_caller();

// Prints a one-line debug trace of the given segmentation state:
// "<message> [<printable segmentation>]".
static void log_state(const char * message,
                      int num_joints,
                      STATE *state) {
  STRING segstate;
  print_state(state, num_joints, &segstate);
  tprintf("%20s [%40s]\n", message, segstate.string());
}

// Overload of the above that also reports the state's search priority.
static void log_state(const char * message,
                      int num_joints,
                      STATE *state,
                      float priority) {
  STRING segstate;
  print_state(state, num_joints, &segstate);
  tprintf("%20s [%40s], priority %8.3f\n", message,
          segstate.string(), priority);
}


/*----------------------------------------------------------------------
          F u n c t i o n s
----------------------------------------------------------------------*/

namespace tesseract {
/**
 * best_first_search
 *
 * Runs a best-first search over word segmentations. Starting from the
 * given initial state, states are popped from a priority heap
 * (the_search->open_states), evaluated with evaluate_state(), recorded in
 * a closed-set hash, and expanded with expand_node() until either an
 * acceptable choice stops the search (keep_going == 0), the state budget
 * wordrec_num_seg_states is exceeded, or the heap is exhausted.
 * On return, *state holds the best segmentation found; the best word
 * choices are accumulated into word->best_choice / word->raw_choice and
 * best_char_choices via the SEARCH_RECORD set up by new_search().
 */
void Wordrec::best_first_search(CHUNKS_RECORD *chunks_record,
                                BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                WERD_RES *word,
                                STATE *state,
                                DANGERR *fixpt,
                                STATE *best_state) {
  SEARCH_RECORD *the_search;
  inT16 keep_going;
  STATE guided_state;   // not used

  int num_joints = chunks_record->ratings->dimension() - 1;
  the_search = new_search(chunks_record, num_joints, best_char_choices,
                          word->best_choice, word->raw_choice, state);

  // The default state is initialized as the best choice.  In order to apply
  // segmentation adjustment, or any other contextual processing in permute,
  // we give the best choice a poor rating to force the processed raw choice
  // to be promoted to best choice.
  the_search->best_choice->set_rating(WERD_CHOICE::kBadRating);
  evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle);
  if (wordrec_debug_level > 1) {
    tprintf("\n\n\n =========== BestFirstSearch ==============\n");
    word->best_choice->print("**Initial BestChoice**");
  }

  // Admission threshold for pushing new states: twice the priority of the
  // initial state, but no tighter than wordrec_worst_state.
  FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search);
  if (worst_priority < wordrec_worst_state)
    worst_priority = wordrec_worst_state;
  if (wordrec_debug_level > 1) {
    log_state("BestFirstSearch", num_joints, best_state);
  }

  guided_state = *state;
  do {
    /* Look for answer */
    STATE orig_state = *the_search->this_state;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      guided_state = *(the_search->this_state);
      keep_going = evaluate_state(chunks_record, the_search, fixpt,
                                  word->blamer_bundle);
      hash_add (the_search->closed_states, the_search->this_state);

      if (!keep_going ||
          (the_search->num_states > wordrec_num_seg_states)) {
        if (wordrec_debug_level > 1)
          tprintf("Breaking best_first_search on keep_going %s numstates %d\n",
                  ((keep_going) ? "T" :"F"), the_search->num_states);
        free_state (the_search->this_state);
        break;
      }

      FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record,
                                                           the_search);
      if (new_worst_priority < worst_priority) {
        if (wordrec_debug_level > 1)
          tprintf("Lowering WorstPriority %f --> %f\n",
                  worst_priority, new_worst_priority);
        // Tighten the threshold for admitting new paths as better search
        // candidates are found. After lowering this threshold, we can safely
        // popout everything that is worse than this score also.
        worst_priority = new_worst_priority;
      }
      expand_node(worst_priority, chunks_record, the_search);
    }

    if (wordrec_debug_level > 1) {
      log_state("Done with", the_search->num_joints, &orig_state);
    }
    free_state (the_search->this_state);
    num_popped++;
    the_search->this_state = pop_queue (the_search->open_states);
    if (wordrec_debug_level > 1 && !the_search->this_state)
      tprintf("No more states to evalaute after %d evals", num_popped);
  } while (the_search->this_state);

  // Copy the winning segmentation back to the caller's state.
  state->part1 = the_search->best_state->part1;
  state->part2 = the_search->best_state->part2;
  if (wordrec_debug_level > 1) {
    tprintf("\n\n\n =========== BestFirstSearch ==============\n");
    // best_choice->debug_string().string());
    word->best_choice->print("**Final BestChoice**");
  }
  // save the best_state stats
  delete_search(the_search);
}

/**
 * delete_search
 *
 * Frees everything owned by a SEARCH_RECORD: the first/best states, the
 * closed-set hash table, the open-states heap (freeing each queued state),
 * and the record itself.  The best_choice/raw_choice/best_char_choices
 * pointers are caller-owned and are not released here.
 */
void Wordrec::delete_search(SEARCH_RECORD *the_search) {
  float closeness;

  // NOTE(review): closeness (hamming distance between first and best state,
  // normalized by joint count) is computed but never used here -- presumably
  // a remnant of removed statistics gathering.
  closeness = (the_search->num_joints ?
    (hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state),
                      reinterpret_cast<uinT32*>(the_search->best_state), 2) /
     (float) the_search->num_joints) : 0.0f);

  free_state (the_search->first_state);
  free_state (the_search->best_state);

  free_hash_table(the_search->closed_states);
  FreeHeapData (the_search->open_states, (void_dest) free_state);

  memfree(the_search);
}


/**
 * evaluate_chunks
 *
 * Classifies (or fetches cached ratings for) each chunk group described by
 * search_state and collects the resulting BLOB_CHOICE_LISTs into a new
 * vector.  Also records per-piece certainty/match/width/gap statistics in
 * last_segmentation.  Returns NULL (after deleting the partial vector) if
 * any piece fails to produce ratings.  Caller owns the returned vector.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::evaluate_chunks(CHUNKS_RECORD *chunks_record,
                                                  SEARCH_STATE search_state,
                                                  BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST *blob_choices;
  BLOB_CHOICE_IT blob_choice_it;
  int i;
  int x = 0;  // First chunk index of the current piece.
  int y;      // Last chunk index of the current piece (inclusive).

  // Iterate sub-paths.
  for (i = 1; i <= search_state[0] + 1; i++) {
    if (i > search_state[0])
      y = count_blobs (chunks_record->chunks) - 1;
    else
      y = x + search_state[i];

    // Process one square.

    // Classify if needed.
    blob_choices = get_piece_rating(chunks_record->ratings,
                                    chunks_record->chunks,
                                    chunks_record->word_res->denorm,
                                    chunks_record->splits,
                                    x, y, blamer_bundle);

    if (blob_choices == NULL) {
      delete char_choices;
      return (NULL);
    }

    // Add permuted ratings.
    blob_choice_it.set_to_list(blob_choices);
    last_segmentation[i - 1].certainty = blob_choice_it.data()->certainty();
    last_segmentation[i - 1].match = blob_choice_it.data()->rating();

    last_segmentation[i - 1].width =
      AssociateUtils::GetChunksWidth(chunks_record->chunk_widths, x, y);
    last_segmentation[i - 1].gap =
      AssociateUtils::GetChunksGap(chunks_record->chunk_widths, y);

    *char_choices += blob_choices;
    x = y + 1;
  }
  return (char_choices);
}

/**
 * evaluate_state
 *
 * Evaluates the search's current state: classifies its chunk grouping via
 * evaluate_chunks(), permutes the character choices to possibly update
 * best_choice/raw_choice, and, when the best choice improved, snapshots the
 * state into best_state and refreshes the cached character widths.
 * Returns nonzero to continue searching; returns 0 (keep_going == false)
 * once an acceptable choice is found.
 */
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt,
                              BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices;
  SEARCH_STATE chunk_groups;
  float rating_limit = the_search->best_choice->rating();
  bool keep_going = true;
  PIECES_STATE widths;

  the_search->num_states++;
  chunk_groups = bin_to_chunks(the_search->this_state,
                               the_search->num_joints);
  bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
  if (wordrec_debug_level > 1) {
    log_state("Evaluating state", the_search->num_joints,
              the_search->this_state);
  }
  getDict().LogNewSegmentation(widths);

  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
  // -1.0f disables the segmentation-cost adjustment until it is recomputed.
  getDict().SetWordsegRatingAdjustFactor(-1.0f);
  bool updated_best_choice = false;
  if (char_choices != NULL && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias);
    updated_best_choice =
      getDict().permute_characters(*char_choices,
                                   the_search->best_choice,
                                   the_search->raw_choice);
    bool replaced = false;
    if (updated_best_choice) {
      if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                     NULL, ASSOCIATOR_CALLER, &replaced)) {
        keep_going = false;
      }
      CopyCharChoices(*char_choices, the_search->best_char_choices);
    }
  }
  getDict().SetWordsegRatingAdjustFactor(-1.0f);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation (chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  // A changed rating implies (and must coincide with) an updated best choice.
  if (rating_limit != the_search->best_choice->rating()) {
    ASSERT_HOST(updated_best_choice);
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else {
    ASSERT_HOST(!updated_best_choice);
    if (char_choices != NULL) fixpt->clear();
  }

  if (char_choices != NULL) delete char_choices;
  memfree(chunk_groups);

  return (keep_going);
}


/**
 * rebuild_current_state
 *
 * Rebuilds word->rebuild_word (and word->best_state) from the given
 * segmentation state, reusing classifier results from old_choices and the
 * ratings matrix where possible, re-classifying merged blobs otherwise.
 * Handles character fragments by combining fragment chunks into single
 * characters (see rebuild_fragments).  Takes ownership of old_choices
 * (deleted before return) and returns a newly allocated choices vector,
 * one BLOB_CHOICE_LIST per character of the final word.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    WERD_RES *word,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  // Walk forward so that x/y end up at the LAST chunk group; the main loop
  // below then walks backwards through the word.
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    // No fragment info: one entry per chunk group, each of length 1.
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed form original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list of for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward());
      choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return char_choices;
}

/**
 * expand_node
 *
 * Generates the successors of the search's current state by toggling each
 * joint bit in turn (mask ^ old_state), and pushes each unseen successor
 * whose priority beats worst_priority onto the open-states heap.  The
 * state is split across two words: the first loop flips bits in part1
 * (presumably joints above 32 -- note the num_joints - 1 - 32 mask), the
 * second flips bits in part2.  this_state is restored piecewise via
 * old_state as the loops run.
 */
void Wordrec::expand_node(FLOAT32 worst_priority,
                          CHUNKS_RECORD *chunks_record,
                          SEARCH_RECORD *the_search) {
  STATE old_state;
  int x;
  // NOTE(review): when num_joints <= 32 this shift amount is negative, but
  // the part1 loop below is then skipped; confirm mask is never consumed in
  // that case.
  uinT32 mask = 1 << (the_search->num_joints - 1 - 32);

  old_state.part1 = the_search->this_state->part1;
  old_state.part2 = the_search->this_state->part2;

  // We need to expand the search more intelligently, or we get stuck
  // with a bad starting segmentation in a long word sequence as in CJK.
  // Expand a child node only if it is within the global bound, and no
  // worse than 2x of its parent.
  // TODO(dsl): There is some redudency here in recomputing the priority,
  // and in filtering of old_merit and worst_priority.
  the_search->this_state->part2 = old_state.part2;
  for (x = the_search->num_joints; x > 32; x--) {
    the_search->this_state->part1 = mask ^ old_state.part1;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (new_merit < worst_priority) {
        if (wordrec_debug_level > 1)
          log_state("Pushing segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit, wordrec_debug_level > 1);
      } else {
        if (wordrec_debug_level > 1)
          log_state("Ignore weak segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
      }
    }
    mask >>= 1;
  }

  if (the_search->num_joints > 32) {
    mask = 1 << 31;
  }
  else {
    mask = 1 << (the_search->num_joints - 1);
  }

  the_search->this_state->part1 = old_state.part1;
  // x now holds min(num_joints, 32): flip each remaining joint in part2.
  while (x--) {
    the_search->this_state->part2 = mask ^ old_state.part2;
    if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (new_merit < worst_priority) {
        if (wordrec_debug_level > 1)
          log_state("Pushing segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit, wordrec_debug_level > 1);
      } else {
        if (wordrec_debug_level > 1)
          log_state("Ignoring weak segstate", the_search->num_joints,
                    the_search->this_state, new_merit);
      }
    }
    mask >>= 1;
  }
}

/**
 * new_search
 *
 * Allocates and initializes a SEARCH_RECORD: an empty open-states heap
 * sized from wordrec_num_seg_states, a fresh closed-set hash table, and
 * copies of the initial state for this_state/first_state/best_state.
 * best_choice, raw_choice and best_char_choices are borrowed from the
 * caller, not copied.  Caller must release the record with delete_search().
 */
SEARCH_RECORD *Wordrec::new_search(CHUNKS_RECORD *chunks_record,
                                   int num_joints,
                                   BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                   WERD_CHOICE *best_choice,
                                   WERD_CHOICE *raw_choice,
                                   STATE *state) {
  SEARCH_RECORD *this_search;

  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));

  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
  this_search->closed_states = new_hash_table();

  if (state)
    this_search->this_state = new_state (state);
  else
    cprintf ("error: bad initial state in new_search\n");

  this_search->first_state = new_state (this_search->this_state);
  this_search->best_state = new_state (this_search->this_state);

  this_search->best_choice = best_choice;
  this_search->raw_choice = raw_choice;
  this_search->best_char_choices = best_char_choices;

  this_search->num_joints = num_joints;
  this_search->num_states = 0;
  this_search->before_best = 0;
  this_search->segcost_bias = 0;

  return (this_search);
}

/**
 * pop_queue
 *
 * Removes and returns the lowest-key state from the open-states heap, or
 * NULL if the heap is empty.  Ownership of the returned STATE passes to
 * the caller (the search loop frees it with free_state).
 */
STATE *Wordrec::pop_queue(HEAP *queue) {
  HEAPENTRY entry;

  if (GetTopOfHeap (queue, &entry) == TESS_HEAP_OK) {
#ifndef GRAPHICS_DISABLED
    if (wordrec_display_segmentations) {
      cprintf ("eval state: %8.3f ", entry.Key);
      print_state ("", (STATE *) entry.Data, num_joints);
    }
#endif
    return ((STATE *) entry.Data);
  }
  else {
    return (NULL);
  }
}

/**
 * push_queue
 *
 * Pushes a COPY of state onto the heap keyed by priority, but only when
 * priority beats worst_priority and the heap has room.  When the heap is
 * full the state is silently dropped (with a debug message if requested).
 */
void Wordrec::push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
                         FLOAT32 priority, bool debug) {
  HEAPENTRY entry;

  if (priority < worst_priority) {
    if (SizeOfHeap (queue) >= MaxSizeOfHeap(queue)) {
      if (debug) tprintf("Heap is Full\n");
      return;
    }
    entry.Data = (char *) new_state (state);
    num_pushed++;
    entry.Key = priority;
    HeapStore(queue, &entry);
  }
}

/**
 * replace_char_widths
 *
 * Replaces chunks_record->char_widths with a freshly allocated
 * WIDTH_RECORD built from the widths/gaps recorded in last_segmentation
 * for the given chunk grouping.  The previous record is freed first.
 */
void Wordrec::replace_char_widths(CHUNKS_RECORD *chunks_record,
                                  SEARCH_STATE state) {
  WIDTH_RECORD *width_record;
  int num_blobs;
  int i;

  free_widths (chunks_record->char_widths);

  num_blobs = state[0] + 1;
  // Layout: num_chars followed by interleaved width/gap pairs -- hence
  // 2 * num_blobs ints.
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

  for (i = 0; i < num_blobs; i++) {
    width_record->widths[2 * i] = last_segmentation[i].width;

    if (i + 1 < num_blobs)
      width_record->widths[2 * i + 1] = last_segmentation[i].gap;
  }
  chunks_record->char_widths = width_record;
}

// Creates a fake blob choice from the combination of the given fragments.
// unichar is the class to be made from the combination,
// expanded_fragment_lengths[choice_index] is the number of fragments to use.
// old_choices[choice_index] has the classifier output for each fragment.
// choice index initially indexes the last fragment and should be decremented
// expanded_fragment_lengths[choice_index] times to get the earlier fragments.
// Guarantees to return something non-null, or abort!
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  // Accumulated over all fragments: summed rating, worst (minimum)
  // certainty, and the intersection of the fragments' x-height ranges.
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}

// Creates a joined copy of the blobs between x and y (inclusive) and
// inserts as the first blob at word->rebuild_word->blobs.
// Returns a deep copy of the classifier results for the blob.
BLOB_CHOICE_LIST *Wordrec::join_blobs_and_classify(
    WERD_RES* word, int x, int y, int choice_index, MATRIX *ratings,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  // Join parts to make the blob if needed.
  if (x != y)
    join_pieces(word->chopped_word->blobs, word->seam_array, x, y);
  TBLOB *blob = word->chopped_word->blobs;
  for (int i = 0; i < x; i++) {
    blob = blob->next;
  }
  // Deep copy this blob into the output word.
  TBLOB* copy_blob = new TBLOB(*blob);
  copy_blob->next = word->rebuild_word->blobs;
  word->rebuild_word->blobs = copy_blob;

  BLOB_CHOICE_LIST *choices = NULL;
  // First check to see if we can look up the classificaiton
  // in old_choices (if there is no need to merge blobs).
  if (choice_index >= 0 && old_choices != NULL) {
    choices = old_choices->get(choice_index);
    old_choices->set(NULL, choice_index);
  }
  // The ratings matrix filled in by the associator will contain the next most
  // up-to-date classification info. Thus we look up the classification there
  // next, and only call classify_blob() if the classification is not found.
  if (choices == NULL && ratings != NULL) {
    choices = ratings->get(x, y);
    if (choices != NOT_CLASSIFIED) {
      ratings->put(x, y, NULL);
    }
  }
  // Get the choices for the blob by classification if necessary.
  if (choices == NULL) {
    choices = classify_blob(blob, word->denorm, "rebuild", Orange,
                            word->blamer_bundle);
  }
  // Undo join_pieces to restore the chopped word to its fully chopped state.
  if (x != y)
    break_pieces(blob, word->seam_array, x, y);
  return choices;
}

}  // namespace tesseract