Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: rejctmap.h (Formerly rejmap.h) 00003 * Description: REJ and REJMAP class functions. 00004 * Author: Phil Cheatle 00005 * Created: Thu Jun 9 13:46:38 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 00019 This module may look unneccessarily verbose, but here's the philosophy... 00020 00021 ALL processing of the reject map is done in this module. There are lots of 00022 separate calls to set reject/accept flags. These have DELIBERATELY been kept 00023 distinct so that this module can decide what to do. 00024 00025 Basically, there is a flag for each sort of rejection or acceptance. This 00026 provides a history of what has happened to EACH character. 00027 00028 Determining whether a character is CURRENTLY rejected depends on implicit 00029 understanding of the SEQUENCE of possible calls. The flags are defined and 00030 grouped in the REJ_FLAGS enum. These groupings are used in determining a 00031 characters CURRENT rejection status. Basically, a character is ACCEPTED if 00032 00033 none of the permanent rej flags are set 00034 AND ( the character has never been rejected 00035 OR an accept flag is set which is LATER than the latest reject flag ) 00036 00037 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE 00038 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! 00039 **********************************************************************/ 00040 00041 #ifndef REJCTMAP_H 00042 #define REJCTMAP_H 00043 00044 #ifdef __UNIX__ 00045 #include <assert.h> 00046 #endif 00047 #include "memry.h" 00048 #include "bits16.h" 00049 #include "params.h" 00050 #include "notdll.h" 00051 00052 enum REJ_FLAGS 00053 { 00054 /* Reject modes which are NEVER overridden */ 00055 R_TESS_FAILURE, // PERM Tess didnt classify 00056 R_SMALL_XHT, // PERM Xht too small 00057 R_EDGE_CHAR, // PERM Too close to edge of image 00058 R_1IL_CONFLICT, // PERM 1Il confusion 00059 R_POSTNN_1IL, // PERM 1Il unrejected by NN 00060 R_REJ_CBLOB, // PERM Odd blob 00061 R_MM_REJECT, // PERM Matrix match rejection (m's) 00062 R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend 00063 00064 /* Initial reject modes (pre NN_ACCEPT) */ 00065 R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) 00066 R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD 00067 R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD 00068 R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD 00069 00070 /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ 00071 R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop 00072 R_DUBIOUS, // TEMP Post NN dodgy chars 00073 R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN 00074 R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest 00075 R_XHT_FIXUP, // TEMP Xht tests unsure 00076 00077 /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ 00078 R_BAD_QUALITY, // TEMP Quality metrics bad for WERD 00079 00080 /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ 00081 R_DOC_REJ, // TEMP Document rejection 00082 R_BLOCK_REJ, // TEMP Block rejection 00083 R_ROW_REJ, // TEMP Row rejection 00084 R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space 00085 00086 /* Accept modes which occur inbetween the above rejection groups */ 00087 R_NN_ACCEPT, //NN acceptance 00088 R_HYPHEN_ACCEPT, //Hyphen acceptance 00089 R_MM_ACCEPT, //Matrix match acceptance 00090 R_QUALITY_ACCEPT, //Accept word in good quality doc 00091 R_MINIMAL_REJ_ACCEPT //Accept EVERYTHING except tess failures 00092 }; 00093 00094 /* REJECT MAP VALUES */ 00095 00096 #define MAP_ACCEPT '1' 00097 #define MAP_REJECT_PERM '0' 00098 #define MAP_REJECT_TEMP '2' 00099 #define MAP_REJECT_POTENTIAL '3' 00100 00101 class REJ 00102 { 00103 BITS16 flags1; 00104 BITS16 flags2; 00105 00106 void set_flag(REJ_FLAGS rej_flag) { 00107 if (rej_flag < 16) 00108 flags1.turn_on_bit (rej_flag); 00109 else 00110 flags2.turn_on_bit (rej_flag - 16); 00111 } 00112 00113 BOOL8 rej_before_nn_accept(); 00114 BOOL8 rej_between_nn_and_mm(); 00115 BOOL8 rej_between_mm_and_quality_accept(); 00116 BOOL8 rej_between_quality_and_minimal_rej_accept(); 00117 BOOL8 rej_before_mm_accept(); 00118 BOOL8 rej_before_quality_accept(); 00119 00120 public: 00121 REJ() { //constructor 00122 } 00123 00124 REJ( //classwise copy 00125 const REJ &source) { 00126 flags1 = source.flags1; 00127 flags2 = source.flags2; 00128 } 00129 00130 REJ & operator= ( //assign REJ 00131 const REJ & source) { //from this 00132 flags1 = source.flags1; 00133 flags2 = source.flags2; 00134 return *this; 00135 } 00136 00137 BOOL8 flag(REJ_FLAGS rej_flag) { 00138 if (rej_flag < 16) 00139 return flags1.bit (rej_flag); 00140 else 00141 return flags2.bit (rej_flag - 16); 00142 } 00143 00144 char display_char() { 00145 if (perm_rejected ()) 00146 return MAP_REJECT_PERM; 00147 else if (accept_if_good_quality ()) 00148 return MAP_REJECT_POTENTIAL; 00149 else if (rejected ()) 00150 return MAP_REJECT_TEMP; 00151 else 00152 return MAP_ACCEPT; 00153 } 00154 00155 BOOL8 perm_rejected(); //Is char perm reject? 00156 00157 BOOL8 rejected(); //Is char rejected? 00158 00159 BOOL8 accepted() { //Is char accepted? 00160 return !rejected (); 00161 } 00162 00163 //potential rej? 00164 BOOL8 accept_if_good_quality(); 00165 00166 BOOL8 recoverable() { 00167 return (rejected () && !perm_rejected ()); 00168 } 00169 00170 void setrej_tess_failure(); //Tess generated blank 00171 void setrej_small_xht(); //Small xht char/wd 00172 void setrej_edge_char(); //Close to image edge 00173 void setrej_1Il_conflict(); //Initial reject map 00174 void setrej_postNN_1Il(); //1Il after NN 00175 void setrej_rej_cblob(); //Insert duff blob 00176 void setrej_mm_reject(); //Matrix matcher 00177 //Odd repeated char 00178 void setrej_bad_repetition(); 00179 void setrej_poor_match(); //Failed Rays heuristic 00180 //TEMP reject_word 00181 void setrej_not_tess_accepted(); 00182 //TEMP reject_word 00183 void setrej_contains_blanks(); 00184 void setrej_bad_permuter(); //POTENTIAL reject_word 00185 void setrej_hyphen(); //PostNN dubious hyph or . 00186 void setrej_dubious(); //PostNN dubious limit 00187 void setrej_no_alphanums(); //TEMP reject_word 00188 void setrej_mostly_rej(); //TEMP reject_word 00189 void setrej_xht_fixup(); //xht fixup 00190 void setrej_bad_quality(); //TEMP reject_word 00191 void setrej_doc_rej(); //TEMP reject_word 00192 void setrej_block_rej(); //TEMP reject_word 00193 void setrej_row_rej(); //TEMP reject_word 00194 void setrej_unlv_rej(); //TEMP reject_word 00195 void setrej_nn_accept(); //NN Flipped a char 00196 void setrej_hyphen_accept(); //Good aspect ratio 00197 void setrej_mm_accept(); //Matrix matcher 00198 //Quality flip a char 00199 void setrej_quality_accept(); 00200 //Accept all except blank 00201 void setrej_minimal_rej_accept(); 00202 00203 void full_print(FILE *fp); 00204 }; 00205 00206 class REJMAP 00207 { 00208 REJ *ptr; //ptr to the chars 00209 inT16 len; //Number of chars 00210 00211 public: 00212 REJMAP() { //constructor 00213 ptr = NULL; 00214 len = 0; 00215 } 00216 00217 REJMAP( //classwise copy 00218 const REJMAP &rejmap); 00219 00220 REJMAP & operator= ( //assign REJMAP 00221 const REJMAP & source); //from this 00222 00223 ~REJMAP () { //destructor 00224 if (ptr != NULL) 00225 free_struct (ptr, len * sizeof (REJ), "REJ"); 00226 } 00227 00228 void initialise( //Redefine map 00229 inT16 length); 00230 00231 REJ & operator[]( //access function 00232 inT16 index) const //map index 00233 { 00234 ASSERT_HOST (index < len); 00235 return ptr[index]; //no bounds checks 00236 } 00237 00238 inT32 length() const { //map length 00239 return len; 00240 } 00241 00242 inT16 accept_count(); //How many accepted? 00243 00244 inT16 reject_count() { //How many rejects? 00245 return len - accept_count (); 00246 } 00247 00248 void remove_pos( //Cut out an element 00249 inT16 pos); //element to remove 00250 00251 void print(FILE *fp); 00252 00253 void full_print(FILE *fp); 00254 00255 BOOL8 recoverable_rejects(); //Any non perm rejs? 00256 00257 BOOL8 quality_recoverable_rejects(); 00258 //Any potential rejs? 00259 00260 void rej_word_small_xht(); //Reject whole word 00261 //Reject whole word 00262 void rej_word_tess_failure(); 00263 void rej_word_not_tess_accepted(); 00264 //Reject whole word 00265 //Reject whole word 00266 void rej_word_contains_blanks(); 00267 //Reject whole word 00268 void rej_word_bad_permuter(); 00269 void rej_word_xht_fixup(); //Reject whole word 00270 //Reject whole word 00271 void rej_word_no_alphanums(); 00272 void rej_word_mostly_rej(); //Reject whole word 00273 void rej_word_bad_quality(); //Reject whole word 00274 void rej_word_doc_rej(); //Reject whole word 00275 void rej_word_block_rej(); //Reject whole word 00276 void rej_word_row_rej(); //Reject whole word 00277 }; 00278 #endif