// Tesseract 3.02
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: permute.h (Formerly permute.h) 00005 * Description: Permute choices together 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Sep 22 14:05:51 1989 00008 * Modified: Mon May 20 16:32:04 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Experimental (Do Not Distribute) 00012 * 00013 * (c) Copyright 1989, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 
00023 * 00024 ********************************************************************************/ 00025 #ifndef PERMUTE_H 00026 #define PERMUTE_H 00027 00028 /*---------------------------------------------------------------------- 00029 I n c l u d e s 00030 ----------------------------------------------------------------------*/ 00031 00032 #include "ratngs.h" 00033 #include "params.h" 00034 #include "unicharset.h" 00035 00036 #define MAX_PERM_LENGTH 128 00037 00038 /*---------------------------------------------------------------------- 00039 V a r i a b l e s 00040 ----------------------------------------------------------------------*/ 00041 extern INT_VAR_H(fragments_debug, 0, "Debug character fragments"); 00042 extern INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); 00043 extern BOOL_VAR_H(permute_debug, 0, "char permutation debug"); 00044 00045 extern BOOL_VAR_H(permute_script_word, 0, 00046 "Turn on word script consistency permuter"); 00047 00048 extern BOOL_VAR_H(permute_fixed_length_dawg, 0, 00049 "Turn on fixed-length phrasebook search permuter"); 00050 00051 extern BOOL_VAR_H(segment_segcost_rating, 0, 00052 "incorporate segmentation cost in word rating?"); 00053 00054 extern double_VAR_H(segment_reward_script, 0.95, 00055 "Score multipler for script consistency within a word. " 00056 "Being a 'reward' factor, it should be <= 1. " 00057 "Smaller value implies bigger reward."); 00058 00059 extern BOOL_VAR_H(permute_chartype_word, 0, 00060 "Turn on character type (property) consistency permuter"); 00061 extern double_VAR_H(segment_reward_chartype, 0.97, 00062 "Score multipler for char type consistency within a word. 
"); 00063 00064 extern double_VAR_H(segment_reward_ngram_best_choice, 0.99, 00065 "Score multipler for ngram permuter's best choice" 00066 " (only used in the Han script path)."); 00067 00068 extern INT_VAR_H(max_permuter_attempts, 100000, 00069 "Maximum number of different character choices to consider" 00070 " during permutation. This limit is especially useful when" 00071 " user patterns are specified, since overly generic patterns" 00072 " can result in dawg search exploring an overly large number" 00073 "of options."); 00074 00075 extern int permute_only_top; 00076 00077 /*---------------------------------------------------------------------- 00078 F u n c t i o n s 00079 ----------------------------------------------------------------------*/ 00080 void adjust_non_word(const char *word, const char *word_lengths, 00081 float rating, float *new_rating, float *adjust_factor); 00082 00083 const char* choose_il1(const char *first_char, //first choice 00084 const char *second_char, //second choice 00085 const char *third_char, //third choice 00086 const char *prev_char, //prev in word 00087 const char *next_char, //next in word 00088 const char *next_next_char); 00089 00090 namespace tesseract { 00091 00092 // This is an awkward solution to allow "compounding" of permuter effects. 00093 // Right now, each permuter generates a WERD_CHOICE with some modified 00094 // rating which is compared to the current best choice, and the winner 00095 // is saved. Therefore, independent permuter improvements, eg. from script 00096 // consistency, dictionary check, and punctuation promoting, override each 00097 // other and can not be combined. 00098 // We need a trellis and someway to modify the path cost. Instead, we 00099 // approximate by saving a permutation string, which records the preferred 00100 // char choice [0-9] at each position [0..#chunks], and a cumulative reward 00101 // factor. 
Non-conflicting changes can be accumulated and the combined 00102 // result will be returned. 00103 // Default_bias is the initial value for the base multiplier. In other words, 00104 // it is the multiplier for raw choice rating if nothing is modified. 00105 // This would be 1.0 when used with reward-based permuters in CJK-path, 00106 // but it could be > 1 (eg. segment_penalty_garbage) to be compatible with 00107 // penalty-based permuters in the Latin path. 00108 // Note this class does not handle fragmented characters. It does so by 00109 // setting the preferred position of fragmented characters to '1' at Init, 00110 // which effectively skips the fragment choice. However, it can still be 00111 // overridden if collision is allowed. It is the responsibility of the 00112 // permuters to avoid permuting fragmented characters. 00113 class PermuterState { 00114 public: 00115 PermuterState(); 00116 00117 void Init(const BLOB_CHOICE_LIST_VECTOR& char_choices, 00118 const UNICHARSET &unicharset, 00119 float default_bias, 00120 bool debug); 00121 00122 void AddPreference(int start_pos, char* pos_str, float weight); 00123 00124 void AddPreference(int char_pos, BLOB_CHOICE* blob_choice, float weight); 00125 00126 WERD_CHOICE* GetPermutedWord(float *certainties, float *adjust_factor); 00127 00128 void set_allow_collision(bool flag) { allow_collision_ = flag; } 00129 void set_adjust_factor(float factor) { adjust_factor_ = factor; } 00130 void set_debug(bool debug) { debug_ = debug; } 00131 bool position_marked(int pos) { return perm_state_[pos] != kPosFree; } 00132 00133 private: 00134 static const char kPosFree = '.'; 00135 00136 const UNICHARSET *unicharset_; 00137 00138 const BLOB_CHOICE_LIST_VECTOR *char_choices_; // reference pointer only 00139 // does not need to be allocated or freed 00140 char perm_state_[MAX_PERM_LENGTH]; // handles upto MAX_PERM_LENGTH-1 states 00141 // stores preferred char choices, '0'..'9', or '.' 
00142 int word_length_; // the number of char positions in the word 00143 bool allow_collision_; // can previously set preference to be overwritten? 00144 float adjust_factor_; // multiplying factor for rating adjustment 00145 bool debug_; // whether debug statements should be printed 00146 }; 00147 00148 } // namespace tesseract 00149 00150 #endif