tesseract-doc/permute_8cpp_source.html

00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        permute.c  (Formerly permute.c)
00005  * Description:  Choose OCR text given character-probability maps
00006  *               for sequences of glyph fragments and a dictionary provided as
00007  *               a Dual Acyclic Word Graph.
00008  *               In this file, "permute" should be read "combine."
00009  * Author:       Mark Seaman, OCR Technology
00010  * Created:      Fri Sep 22 14:05:51 1989
00011  * Modified:     Thu Jan  3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
00012  * Language:     C
00013  * Package:      N/A
00014  * Status:       Experimental (Do Not Distribute)
00015  *
00016  * (c) Copyright 1989, Hewlett-Packard Company.
00017  ** Licensed under the Apache License, Version 2.0 (the "License");
00018  ** you may not use this file except in compliance with the License.
00019  ** You may obtain a copy of the License at
00020  ** http://www.apache.org/licenses/LICENSE-2.0
00021  ** Unless required by applicable law or agreed to in writing, software
00022  ** distributed under the License is distributed on an "AS IS" BASIS,
00023  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00024  ** See the License for the specific language governing permissions and
00025  ** limitations under the License.
00026  *
00027  *********************************************************************************/
00028 /*----------------------------------------------------------------------
00029             I n c l u d e s
00030 ---------------------------------------------------------------------*/
00031
00032 #ifdef _MSC_VER
00033 #pragma warning(disable:4244)  // Conversion warnings
00034 #pragma warning(disable:4800)  // int/bool warnings
00035 #endif
00036
00037 #include <assert.h>
00038 #include <math.h>
00039
00040 #include "const.h"
00041
00042 #include "permute.h"
00043
00044 #include "callcpp.h"
00045 #include "ccutil.h"
00046 #include "dict.h"
00047 #include "freelist.h"
00048 #include "helpers.h"
00049 #include "image.h"
00050 #include "globals.h"
00051 #include "ndminx.h"
00052 #include "ratngs.h"
00053 #include "stopper.h"
00054 #include "tprintf.h"
00055 #include "trie.h"
00056 #include "params.h"
00057 #include "unicharset.h"
00058
00059
00060 namespace tesseract {
00061
00062 /*----------------------------------------------------------------------
00063               F u n c t i o n s
00064 ----------------------------------------------------------------------*/
00065
00074 WERD_CHOICE *get_best_delete_other(WERD_CHOICE *choice1,
00075                                    WERD_CHOICE *choice2) {
00076   if (!choice1) return choice2;
00077   if (!choice2) return choice1;
00078   if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
00079     delete choice2;
00080     return choice1;
00081   } else {
00082     delete choice1;
00083     return choice2;
00084   }
00085 }
00086
00091 BLOB_CHOICE* get_nth_choice(BLOB_CHOICE_LIST* blob_list, int n) {
00092   BLOB_CHOICE_IT c_it(blob_list);
00093   while (n-- > 0 && !c_it.at_last())
00094     c_it.forward();
00095   return c_it.data();
00096 }
00097
00099 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
00100   if (!blob_list) return INVALID_UNICHAR_ID;
00101   BLOB_CHOICE_IT blob_choice_it(blob_list);
00102   return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
00103                                  : INVALID_UNICHAR_ID;
00104 }
00105
00110 int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
00111   BLOB_CHOICE_IT c_it(blob_list);
00112   int pos = 0;
00113   while (1) {
00114     if (c_it.data()->unichar_id() == target_uid) return pos;
00115     if (c_it.at_last()) break;
00116     c_it.forward();
00117     pos++;
00118   }
00119   return -1;
00120 }
00121
00129 WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset,
00130                                     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00131                                     int start_pos,
00132                                     const char* pos_str,
00133                                     float *certainties) {
00134   int pos_str_len = strlen(pos_str);
00135   WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
00136   if (start_pos + pos_str_len > char_choices.length()) {
00137     wchoice->make_bad();
00138     return wchoice;
00139   }
00140   for (int x = 0; x < pos_str_len; x++) {
00141     int pos = pos_str[x]-'0';
00142     if (pos < 0) pos = 0;   // use the top choice by default, eg. '.'
00143     if (pos >= 10)
00144       tprintf("PosStr[%d](%d)=%c  %d\n", x, pos_str_len, pos_str[x], pos);
00145     ASSERT_HOST(pos < 10);
00146     BLOB_CHOICE* blob_it = get_nth_choice(char_choices.get(start_pos+x), pos);
00147     wchoice->set_permuter(NO_PERM);
00148     wchoice->append_unichar_id(blob_it->unichar_id(), 1,
00149                                blob_it->rating(),
00150                                blob_it->certainty());
00151     if (certainties != NULL) certainties[x] = blob_it->certainty();
00152   }
00153   return wchoice;
00154 }
00155
00161 void get_posstr_from_choice(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00162                             WERD_CHOICE* word_choice,
00163                             int start_pos,
00164                             char* pos_str) {
00165   for (int i = 0; i < word_choice->length(); i++) {
00166     UNICHAR_ID target_id = word_choice->unichar_id(i);
00167     BLOB_CHOICE_LIST* blob_choice_list = char_choices.get(start_pos + i);
00168     int pos = find_choice_by_uid(blob_choice_list, target_id);
00169     if (pos < 0) pos = 0;
00170     pos_str[i] = pos + '0';
00171   }
00172   pos_str[word_choice->length()] = '\0';
00173 }
00174
00181 BLOB_CHOICE* find_choice_by_type(
00182     BLOB_CHOICE_LIST *blob_choices,
00183     char target_type,
00184     const UNICHARSET &unicharset) {
00185   BLOB_CHOICE_IT c_it(blob_choices);
00186   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00187     if (c_it.data() &&
00188         unicharset.get_chartype(c_it.data()->unichar_id()) == target_type)
00189       return c_it.data();
00190   }
00191   return NULL;
00192 }
00193
00206 BLOB_CHOICE* find_choice_by_script(
00207     BLOB_CHOICE_LIST *blob_choices,
00208     int target_sid,
00209     int backup_sid,
00210     int secondary_sid) {
00211   BLOB_CHOICE_IT c_it(blob_choices);
00212   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00213     bool found = false;
00214     if (c_it.data()->script_id() == 0) continue;
00215     if (c_it.data()->script_id() == target_sid) found = true;
00216     if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true;
00217     if (found) return c_it.data();
00218   }
00219   if (secondary_sid > 0) {
00220     c_it.set_to_list(blob_choices);
00221     for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00222       if (c_it.data()->script_id() == 0) continue;
00223       if (c_it.data()->script_id() == secondary_sid)
00224         return c_it.data();
00225     }
00226   }
00227   return NULL;
00228 }
00229
00230
00231 PermuterState::PermuterState() {
00232   unicharset_ = NULL;
00233   char_choices_ = NULL;
00234   adjust_factor_ = 1.0f;
00235   allow_collision_ = false;
00236   word_length_ = 0;
00237   debug_ = false;
00238 }
00239
00240 void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
00241                          const UNICHARSET& unicharset,
00242                          float default_bias,
00243                          bool debug) {
00244   ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
00245   unicharset_ = &unicharset;
00246   char_choices_ = &char_choices;
00247   word_length_ = char_choices.length();
00248   for (int i = 0; i < word_length_; ++i)
00249     perm_state_[i] = kPosFree;
00250   perm_state_[word_length_] = '\0';
00251   // Skip fragment choice and the mark positions so they remain unchanged.
00252   for (int i = 0; i < word_length_; ++i) {
00253     UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(i));
00254     if (unicharset.get_fragment(unichar_id) != NULL)
00255       perm_state_[i] = '1';
00256   }
00257   adjust_factor_ = default_bias;
00258   allow_collision_ = false;
00259   debug_ = debug;
00260 }
00261
00262 // Promote char positions specified in pos_str with given weight.
00263 // For example, AddPreference(5, "234", 0.95f)
00264 // would promote the 3rd, 4th and 5th choice for character 5, 6, 7
00265 // (starting at 0) in the word with a rating adjustment of 0.95.
00266 void PermuterState::AddPreference(int start_pos, char* pos_str, float weight) {
00267   ASSERT_HOST(char_choices_ != NULL);
00268   ASSERT_HOST(start_pos + strlen(pos_str) - 1 < word_length_);
00269   if (debug_) {
00270     tprintf("Copy over %s -> %s @ %d ", pos_str, perm_state_, start_pos);
00271   }
00272   // copy over preferred position without the terminating null
00273   if (!allow_collision_) {
00274     int len = strlen(pos_str);
00275     for (int i = 0; i < len; ++i)
00276       if (position_marked(start_pos + i)) return;
00277   }
00278   strncpy(&perm_state_[start_pos], pos_str, strlen(pos_str));
00279   adjust_factor_ *= weight;
00280   if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_);
00281 }
00282
00283 // Promote the input blob_choice at specified position with given weight.
00284 void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
00285                                   float weight) {
00286   ASSERT_HOST(char_choices_ != NULL);
00287   ASSERT_HOST(char_pos < word_length_);
00288   // avoid collision (but this doesn't work if the first choice is favored!
00289   if (!allow_collision_ && position_marked(char_pos)) return;
00290
00291   if (debug_) {
00292     tprintf("Set UID %d -> %s @ %d ",
00293             blob_choice->unichar_id(), perm_state_, char_pos);
00294   }
00295   int pos = find_choice_by_uid(char_choices_->get(char_pos),
00296                                blob_choice->unichar_id());
00297   perm_state_[char_pos] = pos + '0';
00298   adjust_factor_ *= weight;
00299   if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_);
00300 }
00301
00302 // Get the best word permutation so far.  Caller should destroy WERD_CHOICE.
00303 WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties,
00304                                             float *adjust_factor) {
00305   ASSERT_HOST(char_choices_ != NULL);
00306   WERD_CHOICE *word_choice = get_choice_from_posstr(
00307       unicharset_, *char_choices_, 0, perm_state_, certainties);
00308   float rating = word_choice->rating() * adjust_factor_;
00309   word_choice->set_rating(rating);
00310   *adjust_factor = adjust_factor_;
00311   return word_choice;
00312 }
00313
00314
00315 /**********************************************************************
00316  * permute_all
00317  *
00318  * Permute all the characters together using all of the different types
00319  * of permuters/selectors available.  Each of the characters must have
00320  * a non-NULL choice list.
00321  *
00322  * Note: order of applying permuters does matter, since the latter
00323  * permuter will be recorded if the resulting word ratings are the same.
00324  *
00325  * Note: the function assumes that best_choice and raw_choice are not NULL.
00326  *
00327  * Note: Since permuter_all maybe called recursively (through permuter_
00328  * compound_words), there must be a separate instance of permuter_state
00329  * for each invocation.
00330  **********************************************************************/
00331 WERD_CHOICE *Dict::permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00332                                const WERD_CHOICE *best_choice,
00333                                WERD_CHOICE *raw_choice) {
00334   WERD_CHOICE *result1 = NULL;
00335   WERD_CHOICE *result2 = NULL;
00336   BOOL8 any_alpha;
00337   float top_choice_rating_limit = best_choice->rating();
00338   int word_script_id = get_top_word_script(char_choices, getUnicharset());
00339
00340   PermuterState permuter_state;
00341   if (getUnicharset().han_sid() != getUnicharset().null_sid() &&
00342       word_script_id == getUnicharset().han_sid()) {
00343     permuter_state.Init(char_choices, getUnicharset(), 1.0f, permute_debug);
00344
00345     result1 = get_top_choice_word(char_choices);
00346
00347     // Note that we no longer need the returned word from these permuters,
00348     // except to delete the memory.  The word choice from all permutations
00349     // is returned by permuter_state.GetpermutedWord() at the end.
00350     if (permute_fixed_length_dawg) {
00351       result2 = permute_fixed_length_words(char_choices, &permuter_state);
00352       delete result2;
00353     }
00354     if (permute_chartype_word) {
00355       result2 = permute_chartype_words(char_choices, &permuter_state);
00356       delete result2;
00357     }
00358     if (permute_script_word) {
00359       result2 = permute_script_words(char_choices, &permuter_state);
00360       delete result2;
00361     }
00362
00363     float certainties[MAX_PERM_LENGTH];
00364     float adjust_factor;
00365     result2 = permuter_state.GetPermutedWord(certainties, &adjust_factor);
00366     LogNewChoice(adjust_factor, certainties, false, result2);
00367     result1 = get_best_delete_other(result1, result2);
00368
00369     if (segment_segcost_rating) incorporate_segcost(result1);
00370   } else {
00371     result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
00372                                  raw_choice, &any_alpha);
00373     if (result1 == NULL)
00374       return (NULL);
00375     if (permute_only_top)
00376       return result1;
00377
00378     if (permute_chartype_word) {
00379       permuter_state.Init(char_choices, getUnicharset(),
00380                           segment_penalty_garbage, permute_debug);
00381       result2 = permute_chartype_words(char_choices, &permuter_state);
00382       result1 = get_best_delete_other(result1, result2);
00383     }
00384
00385     // Permute character fragments if necessary.
00386     if (result1 == NULL || result1->fragment_mark()) {
00387       result2 = top_fragments_permute_and_select(char_choices,
00388                                                  top_choice_rating_limit);
00389       result1 = get_best_delete_other(result1, result2);
00390     }
00391
00392     result2 = dawg_permute_and_select(char_choices, best_choice->rating());
00393     result1 = get_best_delete_other(result1, result2);
00394
00395     result2 = permute_compound_words(char_choices, best_choice->rating());
00396     result1 = get_best_delete_other(result1, result2);
00397   }
00398   return result1;
00399 }
00400
00409 void Dict::incorporate_segcost(WERD_CHOICE *word) {
00410   if (!word || wordseg_rating_adjust_factor_ <= 0) return;
00411
00412   float old_rating = word->rating();
00413   float new_rating = old_rating * wordseg_rating_adjust_factor_;
00414   word->set_rating(new_rating);
00415   if (permute_debug)
00416     tprintf("Permute segadjust %f * %f --> %f\n",
00417             old_rating, wordseg_rating_adjust_factor_, new_rating);
00418 }
00419
00420
00430 WERD_CHOICE* Dict::permute_fixed_length_words(
00431     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00432     PermuterState *permuter_state) {
00433   if (permute_debug)
00434     print_char_choices_list("\n\nPermute FixedLength Word",
00435                             char_choices, getUnicharset(), false);
00436   WERD_CHOICE* best_choice =
00437       new WERD_CHOICE(&getUnicharset(), char_choices.length());
00438   const int max_dict_len = max_fixed_length_dawgs_wdlen_;
00439   const int min_dict_len = 2;
00440   char posstr[256];
00441   int match_score = 0;
00442   int anchor_pos = 0;
00443   while (anchor_pos < char_choices.length()) {
00444      // search from longest phrase to shortest, stop when we find a match
00445      WERD_CHOICE* part_choice = NULL;
00446      int step = max_dict_len;
00447      while (step >= min_dict_len) {
00448        int end_pos = anchor_pos + step - 1;
00449        if (end_pos < char_choices.length()) {
00450          part_choice = dawg_permute_and_select(char_choices,
00451                                                200.0,   // rate limit
00452                                                step,
00453                                                anchor_pos);
00454          if (part_choice->length() == step) {
00455            if (permute_debug)
00456              tprintf("match found at pos=%d len=%d\n%s\n", anchor_pos, step,
00457                      part_choice->unichar_string().string());
00458            break;
00459          }
00460          delete part_choice;
00461          part_choice = NULL;
00462        }
00463        step--;
00464      }
00465
00466      if (part_choice && step > 1) {   // found lexicon match
00467        get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
00468        float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
00469        if (permuter_state)
00470          permuter_state->AddPreference(anchor_pos, posstr, adjust_factor);
00471        match_score += step - 1;   // single chars don't count
00472        if (permute_debug)
00473          tprintf("Promote word rating %d-len%d\n%s\n", anchor_pos, step,
00474                  part_choice->unichar_string().string());
00475      } else {     // no lexicon match
00476        step = 1;
00477        part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
00478                                             anchor_pos, "0", NULL);
00479        if (permute_debug)
00480          tprintf("Single char %d %s\n", anchor_pos,
00481                  part_choice->unichar_string().string());
00482      }
00483      if (part_choice && part_choice->length() > 0)
00484        (*best_choice) += (*part_choice);
00485      if (part_choice) delete part_choice;
00486      anchor_pos += step;
00487   }
00488
00489   if (match_score > 0) {
00490     float adjust_factor = pow(0.8,    // 1.0/segment_penalty_dict_nonword,
00491                               match_score * 2.0 / char_choices.length());
00492     float adjusted_score = best_choice->rating() * adjust_factor;
00493     if (permute_debug)
00494       tprintf("Adjusting score %f @ %d -> %f\n",
00495               best_choice->rating(), match_score, adjusted_score);
00496     best_choice->set_rating(adjusted_score);
00497   }
00498   if (permute_debug)
00499     tprintf("Found Best CJK word %f: %s\n",
00500             best_choice->rating(), best_choice->unichar_string().string());
00501   return best_choice;
00502 }
00503
00504
00505 /**********************************************************************
00506  * Returns the dominant chartype for the word.  Only the "main" chartype
00507  * of each character is used, and a consistent chartype is defined by
00508  * the majority chartype from non-ambiguous character positions.
00509  * If pos_chartypes is not NULL, then the "main" chartype at each pos
00510  * is also returned.  The caller must allocate and deallocate the space.
00511  **********************************************************************/
00512 char Dict::top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00513                              char* pos_chartypes) {
00514   const UNICHARSET &unicharset = getUnicharset();
00515   const int hist_size = 128;   // to contain 'A','a','0','x','p'
00516   int chprop[hist_size];
00517   int x;
00518   for (x = 0; x < hist_size; x++) chprop[x] = 0;
00519   for (x = 0; x < char_choices.length(); ++x) {
00520     UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(x));
00521     char ctype = unicharset.get_chartype(unichar_id);
00522     if (pos_chartypes) pos_chartypes[x] = ctype;
00523     if (ctype == 0 || ctype == 'p') continue;
00524     if (getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id) != NULL) continue;
00525     chprop[ctype]++;
00526     if (x == 0 && ctype == 'A')  // first-cap also counts as lower
00527       chprop['a']++;
00528   }
00529   int max_prop = 0;
00530   for (x = 1; x < hist_size; x++)
00531     if (chprop[x] >= chprop[max_prop]) max_prop = x;
00532   return (chprop[max_prop] > 0) ? max_prop : 0;
00533 }
00534
00535 /**********************************************************************
00536  * Promote consistent character type based on neighboring characters.
00537  * For each character position, if the top choice property is inconsistent
00538  * with that of the word or previous character, then its likely
00539  * subsitutions, as defined by DangAmbigs, will be examined and the one
00540  * with a matching property will be selected.
00541  **********************************************************************/
00542 WERD_CHOICE* Dict::permute_chartype_words(
00543     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00544     PermuterState *permuter_state) {
00545
00546   if (char_choices.length() >= MAX_PERM_LENGTH)
00547     return NULL;
00548   // Store main character property of top choice at every position
00549   char pos_chartypes[MAX_PERM_LENGTH];
00550   char word_type = top_word_chartype(char_choices, pos_chartypes);
00551   if (word_type == 0 || word_type == 'p')
00552     return NULL;     // skip if word type is punctuation or unknown
00553   if (permute_debug) {
00554     tprintf("\n\nPermuteCharType[%c]\n", word_type);
00555     print_char_choices_list("", char_choices, getUnicharset(), true);
00556   }
00557
00558   WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
00559   BLOB_CHOICE_IT blob_choice_it;
00560   const UNICHARSET& unicharset = getUnicharset();
00561   bool replaced = false;        // has any character choice been replaced
00562   int prev_unambig_type = 0;    // the last chartype of an unambiguous char
00563   float certainties[MAX_PERM_LENGTH + 1];
00564   for (int x = 0; x < char_choices.length(); ++x) {
00565     BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
00566     UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
00567     if (unichar_id == 0) {
00568       delete current_word;
00569       return NULL;
00570     }
00571     blob_choice_it.set_to_list(pos_choice);
00572     BLOB_CHOICE *first_choice = blob_choice_it.data();
00573     ASSERT_HOST(first_choice != NULL);
00574
00575     const UnicharIdVector* ambig_uids =
00576         getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id);
00577     bool is_ambiguous = (ambig_uids != NULL);
00578     bool is_punct = unicharset.get_ispunctuation(unichar_id);
00579     bool is_consistent = is_punct ||
00580         unicharset.get_chartype(unichar_id) == prev_unambig_type ||
00581         unicharset.get_chartype(unichar_id) == word_type;
00582     bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
00583     if (permute_debug)
00584       tprintf("char[%d]:%s is_ambig %c   is_punct %c  is_consistent %c\n",
00585               x, unicharset.id_to_unichar(unichar_id),
00586               is_ambiguous?'T':'F', is_punct?'T':'F', is_consistent?'T':'F');
00587
00588     if (is_fragment) {
00589       // Ignore any fragmented characters by skipping them to next choice
00590       // (originally first choice).
00591       first_choice = get_nth_choice(pos_choice, 1);
00592       ASSERT_HOST(first_choice != NULL);
00593     } else if (is_ambiguous && !is_consistent) {
00594       // Check every confusable blob choice where the top choice is inconsistent
00595       // with the character type of the previous unambiguous character.
00596       if (permute_debug) {
00597         tprintf("Checking %s r%g  PrevCharType %c\n",
00598                 unicharset.id_to_unichar(unichar_id),
00599                 first_choice->rating(), prev_unambig_type);
00600         print_ratings_list("\t", pos_choice, getUnicharset());
00601       }
00602       BLOB_CHOICE* c_it = NULL;
00603       if (c_it == NULL) {
00604         c_it = find_choice_by_type(pos_choice, word_type, unicharset);
00605       }
00606
00607       // Prefer a character choice whose type is the same as the previous
00608       // unambiguous character and the confusion appears in the ambig list.
00609       if (c_it == NULL && prev_unambig_type > 0) {
00610         c_it = find_choice_by_type(pos_choice, prev_unambig_type, unicharset);
00611         if (c_it &&
00612             UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
00613           c_it = NULL;
00614       }
00615
00616       // Otherwise, perfer a punctuation
00617       if (c_it == NULL) {
00618         c_it = find_choice_by_type(pos_choice, 'p', unicharset);
00619         if (c_it &&
00620             UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
00621           c_it = NULL;
00622       }
00623
00624       // save any preference other than the top choice
00625       if (c_it != NULL) {
00626         if (permute_debug) {
00627           tprintf("Replacing %s r%g ==> %s r%g\n",
00628                   unicharset.id_to_unichar(unichar_id), first_choice->rating(),
00629                   unicharset.id_to_unichar(c_it->unichar_id()), c_it->rating());
00630           tprintf("\n\nPermuteCharType[%c]\n", word_type);
00631           print_char_choices_list("", char_choices, getUnicharset(), false);
00632         }
00633         if (permuter_state)
00634           permuter_state->AddPreference(x, c_it, segment_reward_chartype);
00635         first_choice = c_it;
00636         replaced = true;
00637       }
00638     } else if (!is_ambiguous && !is_punct) {
00639       // keep the last unambiguous character type
00640       prev_unambig_type = pos_chartypes[x];
00641     }
00642     current_word->append_unichar_id(first_choice->unichar_id(), 1,
00643                                     first_choice->rating(),
00644                                     first_choice->certainty());
00645     certainties[x] = first_choice->certainty();
00646   }
00647   // All permuter choices should go through adjust_non_word so the choice
00648   // rating would be adjusted on the same scale.
00649   adjust_non_word(current_word, certainties, permute_debug);
00650   if (replaced) {
00651     // Apply a reward multiplier on rating if an chartype permutation is made.
00652     float rating = current_word->rating();
00653     current_word->set_rating(rating * segment_reward_chartype);
00654     if (permute_debug)
00655       current_word->print("<== permute_chartype_word **");
00656   }
00657   return current_word;
00658 }
00659
00669 WERD_CHOICE* Dict::permute_script_words(
00670     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00671     PermuterState *permuter_state) {
00672   if (char_choices.length() >= MAX_WERD_LENGTH)
00673     return NULL;
00674
00675   int word_sid = get_top_word_script(char_choices, getUnicharset());
00676   if (word_sid == getUnicharset().null_sid())
00677     return NULL;
00678
00679   if (permute_debug) {
00680     tprintf("\n\nPermuteScript %s\n",
00681             getUnicharset().get_script_from_script_id(word_sid));
00682     print_char_choices_list("", char_choices, getUnicharset(),
00683                             permute_debug > 1);
00684   }
00685
00686   WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
00687   BLOB_CHOICE_IT blob_choice_it;
00688   bool replaced = false;
00689   bool prev_is_consistent = false;
00690   float certainties[MAX_PERM_LENGTH + 1];
00691   for (int x = 0; x < char_choices.length(); ++x) {
00692     blob_choice_it.set_to_list(char_choices.get(x));
00693     BLOB_CHOICE *first_choice = blob_choice_it.data();
00694     if (!first_choice) {
00695       delete current_word;
00696       return NULL;
00697     }
00698     UNICHAR_ID unichar_id = first_choice->unichar_id();
00699     if (unichar_id == 0) {
00700       delete current_word;
00701       return NULL;
00702     }
00703     bool sid_consistent = (getUnicharset().get_script(unichar_id) == word_sid);
00704     bool this_is_punct = getUnicharset().get_chartype(unichar_id) == 'p';
00705     bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
00706
00707     if (is_fragment) {
00708       // Ignore any fragmented characters by skipping them to next choice
00709       // (originally first choice).
00710       first_choice = get_nth_choice(char_choices.get(x), 1);
00711       ASSERT_HOST(first_choice != NULL);
00712     } else if (!sid_consistent && !this_is_punct && prev_is_consistent) {
00713       // If the previous char is CJK, we prefer a cjk over non-cjk char
00714       if (permute_debug) {
00715         tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
00716                                      first_choice->rating());
00717         print_ratings_list("\t", char_choices.get(x), getUnicharset());
00718       }
00719       // prefer a script consistent choice
00720       BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
00721                                                 word_sid, 0, 0);
00722       // otherwise, prefer a punctuation
00723       if (c_it == NULL)
00724         c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());
00725
00726       if (c_it != NULL) {
00727         if (permute_debug)
00728           tprintf("Replacing %s r%g ==> %s r%g\n",
00729                   getUnicharset().id_to_unichar(unichar_id),
00730                   first_choice->rating(),
00731                   getUnicharset().id_to_unichar(c_it->unichar_id()),
00732                   c_it->rating());
00733         if (permuter_state)
00734           permuter_state->AddPreference(x, c_it, segment_reward_script);
00735         first_choice = c_it;
00736         replaced = true;
00737       }
00738     }
00739     current_word->append_unichar_id(first_choice->unichar_id(), 1,
00740                                     first_choice->rating(),
00741                                     first_choice->certainty());
00742     certainties[x] = first_choice->certainty();
00743     prev_is_consistent = sid_consistent;
00744   }
00745   // All permuter choices should go through adjust_non_word so the choice
00746   // rating would be adjusted on the same scale.
00747   adjust_non_word(current_word, certainties, permute_debug);
00748   if (replaced) {
00749     // Apply a reward multiplier on rating if an script permutation is made.
00750     float rating = current_word->rating();
00751     current_word->set_rating(rating * segment_reward_script);
00752     if (permute_debug)
00753       current_word->print("<== permute_script_word **");
00754   }
00755   return current_word;
00756 }
00757
00765 bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00766                               WERD_CHOICE *best_choice,
00767                               WERD_CHOICE *raw_choice) {
00768   if (permute_debug) {
00769     tprintf("\n\n\n##### Permute_Characters #######\n");
00770     print_char_choices_list("\n==> Input CharChoices", char_choices,
00771                             getUnicharset(), segment_debug > 1);
00772     tprintf("\n");
00773   }
00774
00775   if (char_choices.length() == 1 &&
00776       get_top_choice_uid(char_choices.get(0)) == 0) return false;
00777   WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
00778
00779   if (this_choice && this_choice->rating() < best_choice->rating()) {
00780     *best_choice = *this_choice;
00781
00782     if (permute_debug) {
00783       best_choice->print("\n**** Populate BestChoice");
00784       cprintf("populate best_choice\n\t%s\n",
00785               best_choice->debug_string().string());
00786     }
00787     delete this_choice;
00788     return true;
00789   }
00790   delete this_choice;
00791   return false;
00792 }
00793
00799 WERD_CHOICE *Dict::permute_compound_words(
00800     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00801     float rating_limit) {
00802   BLOB_CHOICE *first_choice;
00803   WERD_CHOICE *best_choice = NULL;
00804   WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
00805   int first_index = 0;
00806   int x;
00807   BLOB_CHOICE_IT blob_choice_it;
00808
00809   if (char_choices.length() > MAX_WERD_LENGTH) {
00810     WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
00811     bad_word_choice->make_bad();
00812     return bad_word_choice;
00813   }
00814
00815   UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
00816   UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
00817   for (x = 0; x < char_choices.length(); ++x) {
00818     blob_choice_it.set_to_list(char_choices.get(x));
00819     first_choice = blob_choice_it.data();
00820     if (first_choice->unichar_id() == slash ||
00821         first_choice->unichar_id() == dash) {
00822       if (x > first_index) {
00823         if (segment_debug)
00824           cprintf ("Hyphenated word found\n");
00825         permute_subword(char_choices, rating_limit, first_index,
00826                         x - 1, &current_word);
00827         if (current_word.rating() > rating_limit)
00828           break;
00829       }
00830       // Append hyphen/slash separator to current_word.
00831       current_word.append_unichar_id_space_allocated(
00832           first_choice->unichar_id(), 1,
00833           first_choice->rating(), first_choice->certainty());
00834
00835       first_index = x + 1;  // update first_index
00836     }
00837   }
00838
00839   if (first_index > 0 && first_index < x &&
00840       current_word.rating() <= rating_limit) {
00841     permute_subword(char_choices, rating_limit, first_index,
00842                     x - 1, &current_word);
00843     best_choice = new WERD_CHOICE(current_word);
00844     best_choice->set_permuter(COMPOUND_PERM);
00845   }
00846   return (best_choice);
00847 }
00848
00849
00859 void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00860                            float rating_limit,
00861                            int start,
00862                            int end,
00863                            WERD_CHOICE *current_word) {
00864   int x;
00865   BLOB_CHOICE_LIST_VECTOR subchoices;
00866   WERD_CHOICE *best_choice = NULL;
00867   WERD_CHOICE raw_choice(&getUnicharset());
00868   raw_choice.make_bad();
00869
00870   DisableChoiceAccum();
00871
00872   for (x = start; x <= end; x++) {
00873     if (char_choices.get(x) != NULL) {
00874       subchoices += char_choices.get(x);
00875     }
00876   }
00877
00878   if (!subchoices.empty()) {
00879     WERD_CHOICE initial_choice(&getUnicharset());
00880     initial_choice.make_bad();
00881     initial_choice.set_rating(rating_limit);
00882
00883     best_choice = permute_all(subchoices, &initial_choice, &raw_choice);
00884
00885     if (best_choice && best_choice->length() > 0) {
00886       *current_word += *best_choice;
00887     } else {
00888       current_word->set_rating(MAX_FLOAT32);
00889     }
00890   } else {
00891     current_word->set_rating(MAX_FLOAT32);
00892   }
00893
00894   if (best_choice)
00895     delete best_choice;
00896
00897   if (segment_debug && current_word->rating() < MAX_FLOAT32) {
00898     cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
00899              current_word->debug_string().string(),
00900              current_word->rating(), current_word->certainty());
00901   }
00902   EnableChoiceAccum();
00903 }
00904
00908 WERD_CHOICE *Dict::get_top_choice_word(
00909     const BLOB_CHOICE_LIST_VECTOR &char_choices) {
00910   WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH);
00911   float certainties[MAX_PERM_LENGTH];
00912   top_word->set_permuter(TOP_CHOICE_PERM);
00913   for (int x = 0; x < char_choices.length(); x++) {
00914     BLOB_CHOICE_IT blob_choice_it;
00915     blob_choice_it.set_to_list(char_choices.get(x));
00916     BLOB_CHOICE *top_choice = blob_choice_it.data();
00917     top_word->append_unichar_id_space_allocated(top_choice->unichar_id(), 1,
00918                                                 top_choice->rating(),
00919                                                 top_choice->certainty());
00920     certainties[x] = top_choice->certainty();
00921   }
00922   LogNewChoice(1.0, certainties, true, top_word);
00923   return top_word;
00924 }
00925
00934 WERD_CHOICE *Dict::permute_top_choice(
00935     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00936     float* rating_limit,
00937     WERD_CHOICE *raw_choice,
00938     BOOL8 *any_alpha) {
00939   BLOB_CHOICE *first_choice;
00940   const char *first_char;             //first choice
00941   const char *second_char;            //second choice
00942   const char *third_char;             //third choice
00943   char prev_char[UNICHAR_LEN + 1];    //prev in word
00944   const char *next_char = "";         //next in word
00945   const char *next_next_char = "";    //after next next in word
00946
00947   WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
00948   word.set_permuter(TOP_CHOICE_PERM);
00949   WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
00950   capital_word.set_permuter(UPPER_CASE_PERM);
00951   WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
00952   lower_word.set_permuter(LOWER_CASE_PERM);
00953
00954   int x;
00955   BOOL8 char_alpha;
00956   float first_rating = 0;
00957
00958   float certainties[MAX_PERM_LENGTH + 1];
00959   float lower_certainties[MAX_PERM_LENGTH + 1];
00960   float upper_certainties[MAX_PERM_LENGTH + 1];
00961
00962   BLOB_CHOICE_IT blob_choice_it;
00963   UNICHAR_ID temp_id;
00964   UNICHAR_ID unichar_id;
00965   UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
00966   register const char* ch;
00967   register inT8 lower_done;
00968   register inT8 upper_done;
00969
00970   prev_char[0] = '\0';
00971
00972   if (any_alpha != NULL)
00973     *any_alpha = FALSE;
00974
00975   if (char_choices.length() > MAX_PERM_LENGTH) {
00976     return (NULL);
00977   }
00978
00979   for (x = 0; x < char_choices.length(); ++x) {
00980     if (x + 1 < char_choices.length()) {
00981       unichar_id = get_top_choice_uid(char_choices.get(x+1));
00982       next_char = unichar_id != INVALID_UNICHAR_ID ?
00983         getUnicharset().id_to_unichar(unichar_id) : "";
00984     } else {
00985       next_char = "";
00986     }
00987
00988     if (x + 2 < char_choices.length()) {
00989       unichar_id = get_top_choice_uid(char_choices.get(x+2));
00990       next_next_char = unichar_id != INVALID_UNICHAR_ID ?
00991         getUnicharset().id_to_unichar(unichar_id) : "";
00992     } else {
00993       next_next_char = "";
00994     }
00995
00996     blob_choice_it.set_to_list(char_choices.get(x));
00997     ASSERT_HOST(!blob_choice_it.empty());
00998     first_choice = NULL;
00999     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
01000          blob_choice_it.forward()) {  // find the best non-fragment char choice
01001       temp_id = blob_choice_it.data()->unichar_id();
01002       if (!(getUnicharset().get_fragment(temp_id))) {
01003         first_choice = blob_choice_it.data();
01004         break;
01005       } else if (char_choices.length() > 1) {
01006         word.set_fragment_mark(true);
01007         capital_word.set_fragment_mark(true);
01008         lower_word.set_fragment_mark(true);
01009       }
01010     }
01011     if (first_choice == NULL) {
01012       cprintf("Permuter found only fragments for"
01013               " character at position %d; word=%s\n",
01014               x, word.debug_string().string());
01015     }
01016     ASSERT_HOST(first_choice != NULL);
01017
01018     unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
01019       first_choice->unichar_id() : space;
01020     first_char = getUnicharset().id_to_unichar(unichar_id);
01021     first_rating = first_choice->rating();
01022     word.append_unichar_id_space_allocated(
01023         unichar_id, 1, first_choice->rating(), first_choice->certainty());
01024     capital_word.append_unichar_id_space_allocated(
01025         unichar_id, 1, first_choice->rating(), first_choice->certainty());
01026     lower_word.append_unichar_id_space_allocated(
01027         unichar_id, 1, first_choice->rating(), first_choice->certainty());
01028
01029     certainties[x] = first_choice->certainty();
01030     lower_certainties[x] = first_choice->certainty();
01031     upper_certainties[x] = first_choice->certainty();
01032
01033     lower_done = FALSE;
01034     upper_done = FALSE;
01035     char_alpha = FALSE;
01036     second_char = "";
01037     third_char = "";
01038     for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
01039       unichar_id = blob_choice_it.data()->unichar_id();
01040       if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
01041           blob_choice_it.data_relative(1)->rating() == first_rating) {
01042         temp_id = blob_choice_it.data_relative(1)->unichar_id();
01043         if (getUnicharset().eq(temp_id, "1") ||
01044             getUnicharset().eq(temp_id, "I")) {
01045           second_char = getUnicharset().id_to_unichar(temp_id);
01046           blob_choice_it.forward();
01047           if (!blob_choice_it.at_last() &&
01048               blob_choice_it.data_relative(1)->rating() == first_rating) {
01049             temp_id = blob_choice_it.data_relative(1)->unichar_id();
01050             if (getUnicharset().eq(temp_id, "1") ||
01051                 getUnicharset().eq(temp_id, "I")) {
01052               third_char = getUnicharset().id_to_unichar(temp_id);
01053               blob_choice_it.forward();
01054             }
01055           }
01056           ch = choose_il1 (first_char, second_char, third_char,
01057             prev_char, next_char, next_next_char);
01058           unichar_id = (ch != NULL && *ch != '\0') ?
01059             getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
01060           if (strcmp(ch, "l") != 0 &&
01061               getUnicharset().eq(word.unichar_id(x), "l")) {
01062             word.set_unichar_id(unichar_id, x);
01063             lower_word.set_unichar_id(unichar_id, x);
01064             capital_word.set_unichar_id(unichar_id, x);
01065           }
01066         }
01067       }
01068       if (unichar_id != INVALID_UNICHAR_ID) {
01069         /* Find lower case */
01070         if (!lower_done &&
01071             (getUnicharset().get_islower(unichar_id) ||
01072              (getUnicharset().get_isupper(unichar_id) && x == 0))) {
01073           lower_word.set_unichar_id(unichar_id, x);
01074           lower_word.set_rating(lower_word.rating() -
01075             first_choice->rating() + blob_choice_it.data()->rating());
01076           if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
01077             lower_word.set_certainty(blob_choice_it.data()->certainty());
01078           }
01079           lower_certainties[x] = blob_choice_it.data()->certainty();
01080           lower_done = TRUE;
01081         }
01082         /* Find upper case */
01083         if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
01084           capital_word.set_unichar_id(unichar_id, x);
01085           capital_word.set_rating(capital_word.rating() -
01086             first_choice->rating() + blob_choice_it.data()->rating());
01087           if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
01088             capital_word.set_certainty(blob_choice_it.data()->certainty());
01089           }
01090           upper_certainties[x] = blob_choice_it.data()->certainty();
01091           upper_done = TRUE;
01092         }
01093         if (!char_alpha) {
01094           const CHAR_FRAGMENT *fragment =
01095             getUnicharset().get_fragment(unichar_id);
01096           temp_id = !fragment ? unichar_id :
01097             getUnicharset().unichar_to_id(fragment->get_unichar());
01098           if (getUnicharset().get_isalpha(temp_id)) {
01099             char_alpha = TRUE;
01100           }
01101         }
01102         if (lower_done && upper_done)
01103           break;
01104       }
01105     }
01106     if (char_alpha && any_alpha != NULL)
01107       *any_alpha = TRUE;
01108
01109     if (word.rating() > bestrate_pruning_factor * *rating_limit) {
01110       if (permute_debug)
01111         tprintf("\n***** Aborting high-cost word: %g > limit %g\n",
01112                 word.rating(), bestrate_pruning_factor * *rating_limit);
01113       return (NULL);
01114     }
01115
01116     *prev_char = '\0';
01117     temp_id = word.unichar_id(word.length()-1);
01118     if (temp_id != INVALID_UNICHAR_ID) {
01119       strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
01120     }
01121   }
01122
01123   if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
01124     *raw_choice = word;
01125     LogNewChoice(1.0, certainties, true, raw_choice);
01126   }
01127   float rating = word.rating();
01128   adjust_non_word(&word, certainties, permute_debug);
01129
01130   float lower_rating = lower_word.rating();
01131   adjust_non_word(&lower_word, lower_certainties, permute_debug);
01132
01133   float upper_rating = capital_word.rating();
01134   adjust_non_word(&capital_word, upper_certainties, permute_debug);
01135
01136   WERD_CHOICE *best_choice = &word;
01137   *rating_limit = rating;
01138   if (lower_word.rating() < best_choice->rating()) {
01139     best_choice = &lower_word;
01140     *rating_limit = lower_rating;
01141   }
01142   if (capital_word.rating() < best_choice->rating()) {
01143     best_choice = &capital_word;
01144     *rating_limit = upper_rating;
01145   }
01146   return new WERD_CHOICE(*best_choice);
01147 }
01148
01149
01161 const char* Dict::choose_il1(const char *first_char,
01162                              const char *second_char,
01163                              const char *third_char,
01164                              const char *prev_char,
01165                              const char *next_char,
01166                              const char *next_next_char) {
01167   inT32 type1;                   //1/I/l type of first choice
01168   inT32 type2;                   //1/I/l type of second choice
01169   inT32 type3;                   //1/I/l type of third choice
01170
01171   int first_char_length = strlen(first_char);
01172   int prev_char_length = strlen(prev_char);
01173   int next_char_length = strlen(next_char);
01174   int next_next_char_length = strlen(next_next_char);
01175
01176   if (*first_char == 'l' && *second_char != '\0') {
01177     if (*second_char == 'I'
01178         && (((prev_char_length != 0 &&
01179             getUnicharset().get_isupper (prev_char, prev_char_length)) &&
01180             (next_char_length == 0 ||
01181              !getUnicharset().get_islower (next_char, next_char_length)) &&
01182             (next_char_length == 0 ||
01183              !getUnicharset().get_isdigit (next_char, next_char_length))) ||
01184             ((next_char_length != 0 &&
01185              getUnicharset().get_isupper (next_char, next_char_length)) &&
01186             (prev_char_length == 0 ||
01187              !getUnicharset().get_islower (prev_char, prev_char_length)) &&
01188             (prev_char_length == 0 ||
01189              !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
01190       first_char = second_char;  //override
01191     else if (*second_char == '1' || *third_char == '1') {
01192       if ((next_char_length != 0 &&
01193            getUnicharset().get_isdigit (next_char, next_char_length)) ||
01194           (prev_char_length != 0 &&
01195            getUnicharset().get_isdigit (prev_char, prev_char_length))
01196           || (*next_char == 'l' &&
01197           (next_next_char_length != 0 &&
01198            getUnicharset().get_isdigit (next_next_char,
01199                                         next_next_char_length)))) {
01200         first_char = "1";
01201         first_char_length = 1;
01202       }
01203       else if ((prev_char_length == 0 ||
01204                 !getUnicharset().get_islower (prev_char, prev_char_length)) &&
01205                ((next_char_length == 0 ||
01206                  !getUnicharset().get_islower (next_char, next_char_length)) ||
01207                 (*next_char == 's' &&
01208                 *next_next_char == 't'))) {
01209         if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
01210             && ((*next_char != '\'' && *next_char != '`')
01211                 || *prev_char != '\0')) {
01212           first_char = "1";
01213           first_char_length = 1;
01214         }
01215       }
01216     }
01217     if (*first_char == 'l' && *next_char != '\0' &&
01218         (prev_char_length == 0 ||
01219          !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
01220       type1 = 2;
01221
01222       if (*second_char == '1')
01223         type2 = 0;
01224       else if (*second_char == 'I')
01225         type2 = 1;
01226       else if (*second_char == 'l')
01227         type2 = 2;
01228       else
01229         type2 = type1;
01230
01231       if (*third_char == '1')
01232         type3 = 0;
01233       else if (*third_char == 'I')
01234         type3 = 1;
01235       else if (*third_char == 'l')
01236         type3 = 2;
01237       else
01238         type3 = type1;
01239
01240 #if 0
01241       if (bigram_counts[*next_char][type2] >
01242       bigram_counts[*next_char][type1]) {
01243         first_char = second_char;
01244         type1 = type2;
01245       }
01246       if (bigram_counts[*next_char][type3] >
01247       bigram_counts[*next_char][type1]) {
01248         first_char = third_char;
01249       }
01250 #endif
01251     }
01252   }
01253   return first_char;
01254 }
01255
01281 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
01282                                float curr_rating, float curr_certainty,
01283                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
01284                                const char *debug, int word_ending,
01285                                CHAR_FRAGMENT_INFO *char_frag_info) {
01286   const CHAR_FRAGMENT *this_fragment =
01287     getUnicharset().get_fragment(curr_unichar_id);
01288   const CHAR_FRAGMENT *prev_fragment =
01289     prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
01290
01291   // Print debug info for fragments.
01292   if (debug && (prev_fragment || this_fragment)) {
01293     cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
01294             getUnicharset().debug_str(curr_unichar_id).string(),
01295             word_ending);
01296     if (prev_fragment) {
01297       cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
01298     }
01299     if (this_fragment) {
01300       cprintf("this_fragment %s\n", this_fragment->to_string().string());
01301     }
01302   }
01303
01304   char_frag_info->unichar_id = curr_unichar_id;
01305   char_frag_info->fragment = this_fragment;
01306   char_frag_info->rating = curr_rating;
01307   char_frag_info->certainty = curr_certainty;
01308   char_frag_info->num_fragments = 1;
01309   if (prev_fragment && !this_fragment) {
01310     if (debug) tprintf("Skip choice with incomplete fragment\n");
01311     return false;
01312   }
01313   if (this_fragment) {
01314     // We are dealing with a fragment.
01315     char_frag_info->unichar_id = INVALID_UNICHAR_ID;
01316     if (prev_fragment) {
01317       if (!this_fragment->is_continuation_of(prev_fragment)) {
01318         if (debug) tprintf("Non-matching fragment piece\n");
01319         return false;
01320       }
01321       if (this_fragment->is_ending()) {
01322         char_frag_info->unichar_id =
01323           getUnicharset().unichar_to_id(this_fragment->get_unichar());
01324         char_frag_info->fragment = NULL;
01325         if (debug) {
01326           tprintf("Built character %s from fragments\n",
01327                   getUnicharset().debug_str(
01328                       char_frag_info->unichar_id).string());
01329         }
01330       } else {
01331         if (debug) tprintf("Record fragment continuation\n");
01332         char_frag_info->fragment = this_fragment;
01333       }
01334       // Update certainty and rating.
01335       char_frag_info->rating =
01336         prev_char_frag_info->rating + curr_rating;
01337       char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
01338       char_frag_info->certainty =
01339         MIN(curr_certainty, prev_char_frag_info->certainty);
01340     } else {
01341       if (this_fragment->is_beginning()) {
01342         if (debug) cprintf("Record fragment beginning\n");
01343       } else {
01344         if (debug) {
01345           tprintf("Non-starting fragment piece with no prev_fragment\n");
01346         }
01347         return false;
01348       }
01349     }
01350   }
01351   if (word_ending && char_frag_info->fragment) {
01352     if (debug) tprintf("Word can not end with a fragment\n");
01353     return false;
01354   }
01355   return true;
01356 }
01365 WERD_CHOICE *Dict::top_fragments_permute_and_select(
01366     const BLOB_CHOICE_LIST_VECTOR &char_choices,
01367     float rating_limit) {
01368   if (char_choices.length() <= 1 ||
01369       char_choices.length() > MAX_PERM_LENGTH) {
01370     return NULL;
01371   }
01372   // See it would be possible to benefit from permuting fragments.
01373   int x;
01374   float min_rating = 0.0;
01375   BLOB_CHOICE_IT blob_choice_it;
01376   for (x = 0; x < char_choices.length(); ++x) {
01377     blob_choice_it.set_to_list(char_choices.get(x));
01378     if (blob_choice_it.data()) {
01379       min_rating += blob_choice_it.data()->rating();
01380     }
01381     if (min_rating >= rating_limit) {
01382       return NULL;
01383     }
01384   }
01385   if (fragments_debug > 1) {
01386     tprintf("A choice with fragment beats top choice\n");
01387     tprintf("Running fragment permuter...\n");
01388   }
01389
01390   // Construct a modified choices list that contains (for each position):
01391   // the best choice, all fragments and at least one choice for
01392   // a non-fragmented character.
01393   BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
01394   for (x = 0; x < char_choices.length(); ++x) {
01395     bool need_nonfrag_char = true;
01396     BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
01397     BLOB_CHOICE_IT frag_choices_it;
01398     frag_choices_it.set_to_list(frag_choices);
01399     blob_choice_it.set_to_list(char_choices.get(x));
01400     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
01401          blob_choice_it.forward()) {
01402       if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
01403         frag_choices_it.add_after_then_move(
01404             new BLOB_CHOICE(*(blob_choice_it.data())));
01405       } else if (need_nonfrag_char) {
01406         frag_choices_it.add_after_then_move(
01407             new BLOB_CHOICE(*(blob_choice_it.data())));
01408         need_nonfrag_char = false;
01409       }
01410     }
01411     frag_char_choices += frag_choices;
01412   }
01413
01414   WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
01415   best_choice->make_bad();
01416   WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
01417   word.set_permuter(TOP_CHOICE_PERM);
01418   float certainties[MAX_PERM_LENGTH];
01419   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
01420   int attempts_left = max_permuter_attempts;
01421   permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
01422                   frag_char_choices, 0, NULL, &word, certainties,
01423                   &rating_limit, best_choice, &attempts_left, NULL);
01424
01425   frag_char_choices.delete_data_pointers();
01426   return best_choice;
01427 }
01428
01435 void Dict::permute_choices(
01436     const char *debug,
01437     const BLOB_CHOICE_LIST_VECTOR &char_choices,
01438     int char_choice_index,
01439     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
01440     WERD_CHOICE *word,
01441     float certainties[],
01442     float *limit,
01443     WERD_CHOICE *best_choice,
01444     int *attempts_left,
01445     void *more_args) {
01446   if (debug) {
01447     tprintf("%s permute_choices: char_choice_index=%d"
01448             " limit=%g rating=%g, certainty=%g word=%s\n",
01449             debug, char_choice_index, *limit, word->rating(),
01450             word->certainty(), word->debug_string().string());
01451   }
01452   if (char_choice_index < char_choices.length()) {
01453     BLOB_CHOICE_IT blob_choice_it;
01454     blob_choice_it.set_to_list(char_choices.get(char_choice_index));
01455     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
01456          blob_choice_it.forward()) {
01457       (*attempts_left)--;
01458       append_choices(debug, char_choices, *(blob_choice_it.data()),
01459                      char_choice_index, prev_char_frag_info, word,
01460                      certainties, limit, best_choice, attempts_left, more_args);
01461       if (*attempts_left <= 0) {
01462         if (debug) tprintf("permute_choices(): attempts_left is 0\n");
01463         break;
01464       }
01465     }
01466   }
01467 }
01468
01477 void Dict::append_choices(
01478     const char *debug,
01479     const BLOB_CHOICE_LIST_VECTOR &char_choices,
01480     const BLOB_CHOICE &blob_choice,
01481     int char_choice_index,
01482     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
01483     WERD_CHOICE *word,
01484     float certainties[],
01485     float *limit,
01486     WERD_CHOICE *best_choice,
01487     int *attempts_left,
01488     void *more_args) {
01489   int word_ending =
01490     (char_choice_index == char_choices.length() - 1) ? true : false;
01491
01492   // Deal with fragments.
01493   CHAR_FRAGMENT_INFO char_frag_info;
01494   if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
01495                            blob_choice.certainty(), prev_char_frag_info, debug,
01496                            word_ending, &char_frag_info)) {
01497     return;  // blob_choice must be an invalid fragment
01498   }
01499   // Search the next letter if this character is a fragment.
01500   if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
01501     permute_choices(debug, char_choices, char_choice_index + 1,
01502                     &char_frag_info, word, certainties, limit,
01503                     best_choice, attempts_left, more_args);
01504     return;
01505   }
01506
01507   // Add the next unichar.
01508   float old_rating = word->rating();
01509   float old_certainty = word->certainty();
01510   uinT8 old_permuter = word->permuter();
01511   certainties[word->length()] = char_frag_info.certainty;
01512   word->append_unichar_id_space_allocated(
01513       char_frag_info.unichar_id, char_frag_info.num_fragments,
01514       char_frag_info.rating, char_frag_info.certainty);
01515
01516   // Explore the next unichar.
01517   (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
01518                           &char_frag_info, word_ending, word, certainties,
01519                           limit, best_choice, attempts_left, more_args);
01520
01521   // Remove the unichar we added to explore other choices in it's place.
01522   word->remove_last_unichar_id();
01523   word->set_rating(old_rating);
01524   word->set_certainty(old_certainty);
01525   word->set_permuter(old_permuter);
01526 }
01527
01536 void Dict::go_deeper_top_fragments_fxn(
01537     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
01538     int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
01539     bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
01540     WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
01541   if (word->rating() < *limit) {
01542     if (word_ending) {
01543       if (fragments_debug > 1) {
01544         tprintf("fragments_debug new choice = %s\n",
01545                 word->debug_string().string());
01546       }
01547       *limit = word->rating();
01548       adjust_non_word(word, certainties, permute_debug);
01549       update_best_choice(*word, best_choice);
01550     } else {  // search the next letter
01551       permute_choices(debug, char_choices, char_choice_index + 1,
01552                       prev_char_frag_info, word, certainties, limit,
01553                       best_choice, attempts_left, more_args);
01554     }
01555   } else {
01556     if (fragments_debug > 1) {
01557       tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
01558               word->debug_string().string(), word->rating(), *limit);
01559     }
01560   }
01561 }
01562
01563 }  // namespace tesseract