tesseract-doc/permdawg_8cpp_source.html

00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        permdawg.c  (Formerly permdawg.c)
00005  * Description:  Scale word choices by a dictionary
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Tue Jul  9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028
00029 #include "cutil.h"
00030 #include "dawg.h"
00031 #include "freelist.h"
00032 #include "globals.h"
00033 #include "ndminx.h"
00034 #include "permute.h"
00035 #include "stopper.h"
00036 #include "tprintf.h"
00037 #include "params.h"
00038
00039 #include <ctype.h>
00040 #include "dict.h"
00041 #include "image.h"
00042
00043 /*----------------------------------------------------------------------
00044               F u n c t i o n s
00045 ----------------------------------------------------------------------*/
00046 namespace tesseract {
00047
00048 static const float kPermDawgRatingPad = 5.0;
00049
00067 void Dict::go_deeper_dawg_fxn(
00068     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00069     int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00070     bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
00071     WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
00072   DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
00073   word_ending = (char_choice_index == more_args->end_char_choice_index);
00074   int word_index = word->length() - 1;
00075
00076   if (ambigs_mode(*limit)) {
00077     if (best_choice->rating() < *limit) return;
00078   } else {
00079     // Prune bad subwords
00080     if (more_args->rating_array[word_index] == NO_RATING) {
00081       more_args->rating_array[word_index] = word->rating();
00082     } else {
00083       float permdawg_limit = more_args->rating_array[word_index] *
00084         more_args->rating_margin + kPermDawgRatingPad;
00085       if (permdawg_limit < word->rating()) {
00086         if (permute_debug && dawg_debug_level) {
00087           tprintf("early pruned word rating=%4.2f,"
00088                   " permdawg_limit=%4.2f, word=%s\n", word->rating(),
00089                   permdawg_limit, word->debug_string().string());
00090         }
00091         return;
00092       }
00093     }
00094   }
00095   // Deal with hyphens
00096   if (word_ending && more_args->sought_word_length == kAnyWordLength &&
00097       has_hyphen_end(*word) && !ambigs_mode(*limit)) {
00098     // Copy more_args->active_dawgs to clean_active_dawgs removing
00099     // dawgs of type DAWG_TYPE_PATTERN.
00100     DawgInfoVector clean_active_dawgs;
00101     const DawgInfoVector &active_dawgs = *(more_args->active_dawgs);
00102     for (int i = 0; i < active_dawgs.size(); ++i) {
00103       if (dawgs_[active_dawgs[i].dawg_index]->type() != DAWG_TYPE_PATTERN) {
00104         clean_active_dawgs += active_dawgs[i];
00105       }
00106     }
00107     if (clean_active_dawgs.size() > 0) {
00108       if (permute_debug && dawg_debug_level)
00109         tprintf("new hyphen choice = %s\n", word->debug_string().string());
00110       word->set_permuter(more_args->permuter);
00111       adjust_word(word, certainties, permute_debug);
00112       set_hyphen_word(*word, *(more_args->active_dawgs),
00113                       *(more_args->constraints));
00114       update_best_choice(*word, best_choice);
00115     }
00116   } else {  // Look up char in DAWG
00117     // TODO(daria): update the rest of the code that specifies alternative
00118     // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
00119     // multi-byte unichars and/or unichar ids.
00120
00121     // If the current unichar is an ngram first try calling
00122     // letter_is_okay() for each unigram it contains separately.
00123     UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
00124     bool checked_unigrams = false;
00125     if (getUnicharset().get_isngram(orig_uch_id)) {
00126       if (permute_debug && dawg_debug_level) {
00127         tprintf("checking unigrams in an ngram %s\n",
00128                 getUnicharset().debug_str(orig_uch_id).string());
00129       }
00130       int orig_num_fragments = word->fragment_length(word_index);
00131       int num_unigrams = 0;
00132       word->remove_last_unichar_id();
00133       const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
00134       const char *ngram_str_end = ngram_str + strlen(ngram_str);
00135       const char *ngram_ptr = ngram_str;
00136       bool unigrams_ok = true;
00137       // Construct DawgArgs that reflect the current state.
00138       DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
00139       DawgInfoVector unigram_constraints = *(more_args->constraints);
00140       DawgInfoVector unigram_updated_active_dawgs;
00141       DawgInfoVector unigram_updated_constraints;
00142       DawgArgs unigram_dawg_args(&unigram_active_dawgs,
00143                                  &unigram_constraints,
00144                                  &unigram_updated_active_dawgs,
00145                                  &unigram_updated_constraints, 0.0,
00146                                  more_args->permuter,
00147                                  more_args->sought_word_length,
00148                                  more_args->end_char_choice_index);
00149       // Check unigrams in the ngram with letter_is_okay().
00150       while (unigrams_ok && ngram_ptr < ngram_str_end) {
00151         int step = getUnicharset().step(ngram_ptr);
00152         UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
00153             getUnicharset().unichar_to_id(ngram_ptr, step);
00154         ngram_ptr += step;
00155         ++num_unigrams;
00156         word->append_unichar_id(uch_id, 1, 0.0, 0.0);
00157         unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
00158             &unigram_dawg_args,
00159             word->unichar_id(word_index+num_unigrams-1),
00160             word_ending && (ngram_ptr == ngram_str_end));
00161         (*unigram_dawg_args.active_dawgs) =
00162           *(unigram_dawg_args.updated_active_dawgs);
00163         (*unigram_dawg_args.constraints) =
00164           *(unigram_dawg_args.updated_constraints);
00165         if (permute_debug && dawg_debug_level) {
00166           tprintf("unigram %s is %s\n",
00167                   getUnicharset().debug_str(uch_id).string(),
00168                   unigrams_ok ? "OK" : "not OK");
00169         }
00170       }
00171       // Restore the word and copy the updated dawg state if needed.
00172       while (num_unigrams-- > 0) word->remove_last_unichar_id();
00173       word->append_unichar_id_space_allocated(
00174           orig_uch_id, orig_num_fragments, 0.0, 0.0);
00175       if (unigrams_ok) {
00176         checked_unigrams = true;
00177         more_args->permuter = unigram_dawg_args.permuter;
00178         *(more_args->updated_active_dawgs) =
00179           *(unigram_dawg_args.updated_active_dawgs);
00180         *(more_args->updated_constraints) =
00181           *(unigram_dawg_args.updated_constraints);
00182       }
00183     }
00184
00185     // Check which dawgs from the dawgs_ vector contain the word
00186     // up to and including the current unichar.
00187     if (checked_unigrams || (this->*letter_is_okay_)(
00188         more_args, word->unichar_id(word_index), word_ending)) {
00189       // Add a new word choice
00190       if (word_ending) {
00191         if (permute_debug && dawg_debug_level) {
00192           tprintf("found word = %s\n", word->debug_string().string());
00193         }
00194         if (ambigs_mode(*limit) &&
00195             strcmp(output_ambig_words_file.string(), "") != 0) {
00196           if (output_ambig_words_file_ == NULL) {
00197             output_ambig_words_file_ =
00198                 fopen(output_ambig_words_file.string(), "wb+");
00199             if (output_ambig_words_file_ == NULL) {
00200               tprintf("Failed to open output_ambig_words_file %s\n",
00201                       output_ambig_words_file.string());
00202               exit(1);
00203             }
00204           }
00205           STRING word_str;
00206           word->string_and_lengths(&word_str, NULL);
00207           word_str += " ";
00208           fprintf(output_ambig_words_file_, word_str.string());
00209         }
00210         WERD_CHOICE *adjusted_word = word;
00211         WERD_CHOICE hyphen_tail_word(&getUnicharset());
00212         if (hyphen_base_size() > 0) {
00213           hyphen_tail_word = *word;
00214           remove_hyphen_head(&hyphen_tail_word);
00215           adjusted_word = &hyphen_tail_word;
00216         }
00217         adjusted_word->set_permuter(more_args->permuter);
00218         if (!ambigs_mode(*limit)) {
00219           adjust_word(adjusted_word, &certainties[hyphen_base_size()],
00220                       permute_debug);
00221         }
00222         update_best_choice(*adjusted_word, best_choice);
00223       } else {  // search the next letter
00224         // Make updated_* point to the next entries in the DawgInfoVector
00225         // arrays (that were originally created in dawg_permute_and_select)
00226         ++(more_args->updated_active_dawgs);
00227         ++(more_args->updated_constraints);
00228         // Make active_dawgs and constraints point to the updated ones.
00229         ++(more_args->active_dawgs);
00230         ++(more_args->constraints);
00231         permute_choices(debug, char_choices, char_choice_index + 1,
00232                         prev_char_frag_info, word, certainties, limit,
00233                         best_choice, attempts_left, more_args);
00234         // Restore previous state to explore another letter in this position.
00235         --(more_args->updated_active_dawgs);
00236         --(more_args->updated_constraints);
00237         --(more_args->active_dawgs);
00238         --(more_args->constraints);
00239       }
00240     } else {
00241       if (permute_debug && dawg_debug_level) {
00242         tprintf("last unichar not OK at index %d in %s\n",
00243                 word_index, word->debug_string().string());
00244       }
00245     }
00246   }
00247 }
00248
00263 WERD_CHOICE *Dict::dawg_permute_and_select(
00264     const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
00265     int sought_word_length, int start_char_choice_index) {
00266   WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
00267   best_choice->make_bad();
00268   best_choice->set_rating(rating_limit);
00269   if (char_choices.length() == 0) return best_choice;
00270   DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
00271   DawgInfoVector *constraints =  new DawgInfoVector[char_choices.length() + 1];
00272   init_active_dawgs(sought_word_length, &(active_dawgs[0]),
00273                     ambigs_mode(rating_limit));
00274   init_constraints(&(constraints[0]));
00275   int end_char_choice_index = (sought_word_length == kAnyWordLength) ?
00276     char_choices.length()-1 : start_char_choice_index+sought_word_length-1;
00277   // Need to skip accumulating word choices if we are only searching a part of
00278   // the word (e.g. for the phrase search in non-space delimited languages).
00279   // Also need to skip accumulating choices if char_choices are expanded
00280   // with ambiguities.
00281   bool re_enable_choice_accum = ChoiceAccumEnabled();
00282   if (sought_word_length != kAnyWordLength ||
00283       ambigs_mode(rating_limit)) DisableChoiceAccum();
00284   DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
00285                      &(active_dawgs[1]), &(constraints[1]),
00286                      (segment_penalty_dict_case_bad /
00287                       segment_penalty_dict_case_ok),
00288                      NO_PERM, sought_word_length, end_char_choice_index);
00289   WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
00290   copy_hyphen_info(&word);
00291   // Discard rating and certainty of the hyphen base (if any).
00292   word.set_rating(0.0);
00293   word.set_certainty(0.0);
00294   if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
00295     delete[] active_dawgs;
00296     delete[] constraints;
00297     return best_choice;  // the word is too long to permute
00298   }
00299   float certainties[MAX_WERD_LENGTH];
00300   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
00301   int attempts_left = max_permuter_attempts;
00302   permute_choices((permute_debug && dawg_debug_level) ?
00303                   "permute_dawg_debug" : NULL,
00304                   char_choices, start_char_choice_index, NULL, &word,
00305                   certainties, &rating_limit, best_choice, &attempts_left,
00306                   &dawg_args);
00307   delete[] active_dawgs;
00308   delete[] constraints;
00309   if (re_enable_choice_accum) EnableChoiceAccum();
00310   return best_choice;
00311 }
00312
00313 }  // namespace tesseract