Tesseract  3.02
tesseract-ocr/dict/permute.h
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        permute.h  (Formerly permute.h)
00005  * Description:  Permute choices together
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Sep 22 14:05:51 1989
00008  * Modified:     Mon May 20 16:32:04 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Experimental (Do Not Distribute)
00012  *
00013  * (c) Copyright 1989, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  ********************************************************************************/
00025 #ifndef PERMUTE_H
00026 #define PERMUTE_H
00027 
00028 /*----------------------------------------------------------------------
00029               I n c l u d e s
00030 ----------------------------------------------------------------------*/
00031 
00032 #include "ratngs.h"
00033 #include "params.h"
00034 #include "unicharset.h"
00035 
00036 #define MAX_PERM_LENGTH 128
00037 
00038 /*----------------------------------------------------------------------
00039               V a r i a b l e s
00040 ----------------------------------------------------------------------*/
00041 extern INT_VAR_H(fragments_debug, 0, "Debug character fragments");
00042 extern INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
00043 extern BOOL_VAR_H(permute_debug, 0, "char permutation debug");
00044 // Debug flags are declared above; permuter switches and tuning values follow.
00045 extern BOOL_VAR_H(permute_script_word, 0,
00046                   "Turn on word script consistency permuter");
00047 
00048 extern BOOL_VAR_H(permute_fixed_length_dawg, 0,
00049                   "Turn on fixed-length phrasebook search permuter");
00050 
00051 extern BOOL_VAR_H(segment_segcost_rating, 0,
00052                   "incorporate segmentation cost in word rating?");
00053 
00054 extern double_VAR_H(segment_reward_script, 0.95,
00055                     "Score multiplier for script consistency within a word. "
00056                     "Being a 'reward' factor, it should be <= 1. "
00057                     "Smaller value implies bigger reward.");
00058 
00059 extern BOOL_VAR_H(permute_chartype_word, 0,
00060          "Turn on character type (property) consistency permuter");
00061 extern double_VAR_H(segment_reward_chartype, 0.97,
00062            "Score multiplier for char type consistency within a word. ");
00063 
00064 extern double_VAR_H(segment_reward_ngram_best_choice, 0.99,
00065                     "Score multiplier for ngram permuter's best choice"
00066                     " (only used in the Han script path).");
00067 // Adjacent string literals are concatenated: each continuation needs its own leading space.
00068 extern INT_VAR_H(max_permuter_attempts, 100000,
00069                  "Maximum number of different character choices to consider"
00070                  " during permutation. This limit is especially useful when"
00071                  " user patterns are specified, since overly generic patterns"
00072                  " can result in dawg search exploring an overly large number"
00073                  " of options.");
00074 // Plain int, not a *_VAR_H declaration. NOTE(review): meaning not visible here - confirm at its definition.
00075 extern int permute_only_top;
00076 
00077 /*----------------------------------------------------------------------
00078               F u n c t i o n s
00079 ----------------------------------------------------------------------*/
00080 void adjust_non_word(const char *word, const char *word_lengths,
00081                      float rating, float *new_rating, float *adjust_factor);  //new_rating, adjust_factor: non-const pointers, presumably outputs - confirm
00082 // Takes three candidate strings plus neighbouring context; NOTE(review): presumably resolves an 'i'/'l'/'1' ambiguity - confirm in the definition.
00083 const char* choose_il1(const char *first_char,   //first choice
00084                        const char *second_char,  //second choice
00085                        const char *third_char,   //third choice
00086                        const char *prev_char,    //prev in word
00087                        const char *next_char,    //next in word
00088                        const char *next_next_char);  //presumably the one after next_char - confirm
00089 
00090 namespace tesseract {
00091 
00092 // This is an awkward solution to allow "compounding" of permuter effects.
00093 // Right now, each permuter generates a WERD_CHOICE with some modified
00094 // rating which is compared to the current best choice, and the winner
00095 // is saved.  Therefore, independent permuter improvements, eg. from script
00096 // consistency, dictionary check, and punctuation promoting, override each
00097 // other and can not be combined.
00098 // We need a trellis and some way to modify the path cost.  Instead, we
00099 // approximate by saving a permutation string, which records the preferred
00100 // char choice [0-9] at each position [0..#chunks], and a cumulative reward
00101 // factor.  Non-conflicting changes can be accumulated and the combined
00102 // result will be returned.
00103 // Default_bias is the initial value for the base multiplier.  In other words,
00104 // it is the multiplier for raw choice rating if nothing is modified.
00105 // This would be 1.0 when used with reward-based permuters in CJK-path,
00106 // but it could be > 1 (eg. segment_penalty_garbage) to be compatible with
00107 // penalty-based permuters in the Latin path.
00108 // Note this class does not handle fragmented characters.  It does so by
00109 // setting the preferred position of fragmented characters to '1' at Init,
00110 // which effectively skips the fragment choice.  However, it can still be
00111 // overridden if collision is allowed.  It is the responsibility of the
00112 // permuters to avoid permuting fragmented characters.
00113 class PermuterState {
00114  public:
00115   PermuterState();
00116   // Sets up the state for the given choice lists; default_bias is the initial base multiplier.
00117   void Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
00118             const UNICHARSET &unicharset,
00119             float default_bias,
00120             bool debug);
00121   // Records the preferences in pos_str from start_pos on. NOTE(review): weight presumably scales the reward - confirm.
00122   void AddPreference(int start_pos, char* pos_str, float weight);
00123   // Same, for the single position char_pos, with the choice given as a BLOB_CHOICE.
00124   void AddPreference(int char_pos, BLOB_CHOICE* blob_choice, float weight);
00125   // NOTE(review): presumably builds the word from the recorded preferences; result ownership is not visible here.
00126   WERD_CHOICE* GetPermutedWord(float *certainties, float *adjust_factor);
00127   // Inline accessors. position_marked() does no bounds check: pos must be a valid index into perm_state_.
00128   void set_allow_collision(bool flag) { allow_collision_ = flag; }
00129   void set_adjust_factor(float factor) { adjust_factor_ = factor; }
00130   void set_debug(bool debug) { debug_ = debug; }
00131   bool position_marked(int pos) const { return perm_state_[pos] != kPosFree; }  // read-only query, hence const
00132 
00133  private:
00134   static const char kPosFree = '.';  // perm_state_ value of a position that is not marked
00135 
00136   const UNICHARSET *unicharset_;
00137 
00138   const BLOB_CHOICE_LIST_VECTOR *char_choices_;   // reference pointer only
00139                             // does not need to be allocated or freed
00140   char perm_state_[MAX_PERM_LENGTH];   // handles up to MAX_PERM_LENGTH-1 states
00141                             // stores preferred char choices, '0'..'9', or '.'
00142   int word_length_;         // the number of char positions in the word
00143   bool allow_collision_;    // can previously set preference to be overwritten?
00144   float adjust_factor_;     // multiplying factor for rating adjustment
00145   bool debug_;              // whether debug statements should be printed
00146 };
00147 
00148 }  // namespace tesseract
00149 
00150 #endif