Tesseract  3.02
tesseract-ocr/ccstruct/rejctmap.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        rejctmap.h  (Formerly rejmap.h)
00003  * Description: REJ and REJMAP class functions.
00004  * Author:              Phil Cheatle
00005  * Created:             Thu Jun  9 13:46:38 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018 
00019 This module may look unnecessarily verbose, but here's the philosophy...
00020 
00021 ALL processing of the reject map is done in this module. There are lots of
00022 separate calls to set reject/accept flags. These have DELIBERATELY been kept
00023 distinct so that this module can decide what to do.
00024 
00025 Basically, there is a flag for each sort of rejection or acceptance. This
00026 provides a history of what has happened to EACH character.
00027 
00028 Determining whether a character is CURRENTLY rejected depends on implicit
00029 understanding of the SEQUENCE of possible calls. The flags are defined and
00030 grouped in the REJ_FLAGS enum. These groupings are used in determining a
00031 character's CURRENT rejection status. Basically, a character is ACCEPTED if
00032 
00033     none of the permanent rej flags are set
00034   AND (    the character has never been rejected
00035       OR an accept flag is set which is LATER than the latest reject flag )
00036 
00037 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
00038 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
00039 **********************************************************************/
00040 
00041 #ifndef           REJCTMAP_H
00042 #define           REJCTMAP_H
00043 
00044 #ifdef __UNIX__
00045 #include          <assert.h>
00046 #endif
00047 #include          "memry.h"
00048 #include          "bits16.h"
00049 #include                   "params.h"
00050 #include          "notdll.h"
00051 
/**
 * History flags recorded for each character of a reject map.
 *
 * An enumerator's value is its bit position inside a REJ: values below 16 are
 * kept in REJ::flags1 and the remaining ones in REJ::flags2 at (value - 16).
 * The enumerators are grouped by processing stage.  The file header states
 * that a character's CURRENT rejection status is derived from these groupings
 * and from the implied temporal ordering of the flags, so neither the grouping
 * nor the relative order should be changed casually.
 */
enum REJ_FLAGS
{
  /* Reject modes which are NEVER overridden */
  R_TESS_FAILURE = 0,            // PERM Tess didn't classify
  R_SMALL_XHT = 1,               // PERM Xht too small
  R_EDGE_CHAR = 2,               // PERM Too close to edge of image
  R_1IL_CONFLICT = 3,            // PERM 1Il confusion
  R_POSTNN_1IL = 4,              // PERM 1Il unrejected by NN
  R_REJ_CBLOB = 5,               // PERM Odd blob
  R_MM_REJECT = 6,               // PERM Matrix match rejection (m's)
  // NOTE(review): tagged TEMP although it sits in the never-overridden
  // group -- confirm which treatment perm_rejected() actually gives it.
  R_BAD_REPETITION = 7,          // TEMP Repeated char which doesn't match trend

  /* Initial reject modes (pre NN_ACCEPT) */
  R_POOR_MATCH = 8,              // TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED = 9,       // TEMP Tess didn't accept WERD
  R_CONTAINS_BLANKS = 10,        // TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER = 11,           // POTENTIAL Bad permuter for WERD

  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
  R_HYPHEN = 12,                 // TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS = 13,                // TEMP Post NN dodgy chars
  R_NO_ALPHANUMS = 14,           // TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ = 15,             // TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP = 16,              // TEMP Xht tests unsure

  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
  R_BAD_QUALITY = 17,            // TEMP Quality metrics bad for WERD

  /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ_ACCEPT */
  R_DOC_REJ = 18,                // TEMP Document rejection
  R_BLOCK_REJ = 19,              // TEMP Block rejection
  R_ROW_REJ = 20,                // TEMP Row rejection
  R_UNLV_REJ = 21,               // TEMP ~ turned to - or ^ turned to space

  /* Accept modes which occur in between the above rejection groups */
  R_NN_ACCEPT = 22,              // NN acceptance
  R_HYPHEN_ACCEPT = 23,          // Hyphen acceptance
  R_MM_ACCEPT = 24,              // Matrix match acceptance
  R_QUALITY_ACCEPT = 25,         // Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT = 26      // Accept EVERYTHING except tess failures
};
00093 
/*
 * REJECT MAP VALUES
 *
 * One display character per rejection state, as returned by
 * REJ::display_char().  That function tests the states in the order
 * PERM, POTENTIAL, TEMP and falls back to ACCEPT.
 */
#define MAP_ACCEPT           '1'  // none of the reject tests fired
#define MAP_REJECT_PERM      '0'  // perm_rejected() is true
#define MAP_REJECT_TEMP      '2'  // rejected() is true
#define MAP_REJECT_POTENTIAL '3'  // accept_if_good_quality() is true
00100 
00101 class REJ
00102 {
00103   BITS16 flags1;
00104   BITS16 flags2;
00105 
00106   void set_flag(REJ_FLAGS rej_flag) {
00107     if (rej_flag < 16)
00108       flags1.turn_on_bit (rej_flag);
00109     else
00110       flags2.turn_on_bit (rej_flag - 16);
00111   }
00112 
00113   BOOL8 rej_before_nn_accept();
00114   BOOL8 rej_between_nn_and_mm();
00115   BOOL8 rej_between_mm_and_quality_accept();
00116   BOOL8 rej_between_quality_and_minimal_rej_accept();
00117   BOOL8 rej_before_mm_accept();
00118   BOOL8 rej_before_quality_accept();
00119 
00120   public:
00121     REJ() {  //constructor
00122     }
00123 
00124     REJ(  //classwise copy
00125         const REJ &source) {
00126       flags1 = source.flags1;
00127       flags2 = source.flags2;
00128     }
00129 
00130     REJ & operator= (            //assign REJ
00131     const REJ & source) {        //from this
00132       flags1 = source.flags1;
00133       flags2 = source.flags2;
00134       return *this;
00135     }
00136 
00137     BOOL8 flag(REJ_FLAGS rej_flag) {
00138       if (rej_flag < 16)
00139         return flags1.bit (rej_flag);
00140       else
00141         return flags2.bit (rej_flag - 16);
00142     }
00143 
00144     char display_char() {
00145       if (perm_rejected ())
00146         return MAP_REJECT_PERM;
00147       else if (accept_if_good_quality ())
00148         return MAP_REJECT_POTENTIAL;
00149       else if (rejected ())
00150         return MAP_REJECT_TEMP;
00151       else
00152         return MAP_ACCEPT;
00153     }
00154 
00155     BOOL8 perm_rejected();  //Is char perm reject?
00156 
00157     BOOL8 rejected();  //Is char rejected?
00158 
00159     BOOL8 accepted() {  //Is char accepted?
00160       return !rejected ();
00161     }
00162 
00163                                  //potential rej?
00164     BOOL8 accept_if_good_quality();
00165 
00166     BOOL8 recoverable() {
00167       return (rejected () && !perm_rejected ());
00168     }
00169 
00170     void setrej_tess_failure();  //Tess generated blank
00171     void setrej_small_xht();  //Small xht char/wd
00172     void setrej_edge_char();  //Close to image edge
00173     void setrej_1Il_conflict();  //Initial reject map
00174     void setrej_postNN_1Il();  //1Il after NN
00175     void setrej_rej_cblob();  //Insert duff blob
00176     void setrej_mm_reject();  //Matrix matcher
00177                                  //Odd repeated char
00178     void setrej_bad_repetition();
00179     void setrej_poor_match();  //Failed Rays heuristic
00180                                  //TEMP reject_word
00181     void setrej_not_tess_accepted();
00182                                  //TEMP reject_word
00183     void setrej_contains_blanks();
00184     void setrej_bad_permuter();  //POTENTIAL reject_word
00185     void setrej_hyphen();  //PostNN dubious hyph or .
00186     void setrej_dubious();  //PostNN dubious limit
00187     void setrej_no_alphanums();  //TEMP reject_word
00188     void setrej_mostly_rej();  //TEMP reject_word
00189     void setrej_xht_fixup();  //xht fixup
00190     void setrej_bad_quality();  //TEMP reject_word
00191     void setrej_doc_rej();  //TEMP reject_word
00192     void setrej_block_rej();  //TEMP reject_word
00193     void setrej_row_rej();  //TEMP reject_word
00194     void setrej_unlv_rej();  //TEMP reject_word
00195     void setrej_nn_accept();  //NN Flipped a char
00196     void setrej_hyphen_accept();  //Good aspect ratio
00197     void setrej_mm_accept();  //Matrix matcher
00198                                  //Quality flip a char
00199     void setrej_quality_accept();
00200                                  //Accept all except blank
00201     void setrej_minimal_rej_accept();
00202 
00203     void full_print(FILE *fp);
00204 };
00205 
00206 class REJMAP
00207 {
00208   REJ *ptr;                      //ptr to the chars
00209   inT16 len;                     //Number of chars
00210 
00211   public:
00212     REJMAP() {  //constructor
00213       ptr = NULL;
00214       len = 0;
00215     }
00216 
00217     REJMAP(  //classwise copy
00218            const REJMAP &rejmap);
00219 
00220     REJMAP & operator= (         //assign REJMAP
00221       const REJMAP & source);    //from this
00222 
00223     ~REJMAP () {                 //destructor
00224       if (ptr != NULL)
00225         free_struct (ptr, len * sizeof (REJ), "REJ");
00226     }
00227 
00228     void initialise(  //Redefine map
00229                     inT16 length);
00230 
00231     REJ & operator[](            //access function
00232       inT16 index) const         //map index
00233     {
00234       ASSERT_HOST (index < len);
00235       return ptr[index];         //no bounds checks
00236     }
00237 
00238     inT32 length() const {  //map length
00239       return len;
00240     }
00241 
00242     inT16 accept_count();  //How many accepted?
00243 
00244     inT16 reject_count() {  //How many rejects?
00245       return len - accept_count ();
00246     }
00247 
00248     void remove_pos(             //Cut out an element
00249                     inT16 pos);  //element to remove
00250 
00251     void print(FILE *fp);
00252 
00253     void full_print(FILE *fp);
00254 
00255     BOOL8 recoverable_rejects();  //Any non perm rejs?
00256 
00257     BOOL8 quality_recoverable_rejects();
00258     //Any potential rejs?
00259 
00260     void rej_word_small_xht();  //Reject whole word
00261                                  //Reject whole word
00262     void rej_word_tess_failure();
00263     void rej_word_not_tess_accepted();
00264     //Reject whole word
00265                                  //Reject whole word
00266     void rej_word_contains_blanks();
00267                                  //Reject whole word
00268     void rej_word_bad_permuter();
00269     void rej_word_xht_fixup();  //Reject whole word
00270                                  //Reject whole word
00271     void rej_word_no_alphanums();
00272     void rej_word_mostly_rej();  //Reject whole word
00273     void rej_word_bad_quality();  //Reject whole word
00274     void rej_word_doc_rej();  //Reject whole word
00275     void rej_word_block_rej();  //Reject whole word
00276     void rej_word_row_rej();  //Reject whole word
00277 };
00278 #endif