Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: ratngs.h (Formerly ratings.h) 00003 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 11:40:38 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef RATNGS_H 00021 #define RATNGS_H 00022 00023 #include <assert.h> 00024 00025 #include "clst.h" 00026 #include "genericvector.h" 00027 #include "notdll.h" 00028 #include "unichar.h" 00029 #include "unicharset.h" 00030 #include "werd.h" 00031 00032 class BLOB_CHOICE: public ELIST_LINK 00033 { 00034 public: 00035 BLOB_CHOICE() { 00036 unichar_id_ = INVALID_UNICHAR_ID; 00037 fontinfo_id_ = -1; 00038 fontinfo_id2_ = -1; 00039 rating_ = MAX_FLOAT32; 00040 certainty_ = -MAX_FLOAT32; 00041 script_id_ = -1; 00042 language_model_state_ = NULL; 00043 min_xheight_ = 0; 00044 max_xheight_ = 0; 00045 adapted_ = false; 00046 } 00047 BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 00048 float src_rating, // rating 00049 float src_cert, // certainty 00050 inT16 src_fontinfo_id, // font 00051 inT16 src_fontinfo_id2, // 2nd choice font 00052 int script_id, // script 00053 inT16 min_xheight, // min xheight in image pixel units 00054 inT16 max_xheight, // max xheight allowed by this char 00055 bool adapted); // adapted match or not 00056 BLOB_CHOICE(const BLOB_CHOICE &other); 00057 ~BLOB_CHOICE() {} 00058 00059 UNICHAR_ID unichar_id() const { 00060 return unichar_id_; 00061 } 00062 float rating() const { 00063 return rating_; 00064 } 00065 float certainty() const { 00066 return certainty_; 00067 } 00068 inT16 fontinfo_id() const { 00069 return fontinfo_id_; 00070 } 00071 inT16 fontinfo_id2() const { 00072 return fontinfo_id2_; 00073 } 00074 int script_id() const { 00075 return script_id_; 00076 } 00077 void *language_model_state() { 00078 return language_model_state_; 00079 } 00080 inT16 xgap_before() const { 00081 return xgap_before_; 00082 } 00083 inT16 xgap_after() const { 00084 return xgap_after_; 00085 } 00086 inT16 min_xheight() const { 00087 return min_xheight_; 00088 } 00089 inT16 max_xheight() const { 00090 return max_xheight_; 00091 } 00092 bool adapted() const { 00093 return adapted_; 00094 } 00095 00096 void set_unichar_id(UNICHAR_ID newunichar_id) { 00097 unichar_id_ = newunichar_id; 00098 } 00099 void set_rating(float newrat) { 00100 rating_ = newrat; 00101 } 00102 void set_certainty(float newrat) { 00103 certainty_ = newrat; 00104 } 00105 void set_fontinfo_id(inT16 newfont) { 00106 fontinfo_id_ = newfont; 00107 } 00108 void set_fontinfo_id2(inT16 newfont) { 00109 fontinfo_id2_ = newfont; 00110 } 00111 void set_script(int newscript_id) { 00112 script_id_ = newscript_id; 00113 } 00114 void set_language_model_state(void *language_model_state) { 00115 language_model_state_ = language_model_state; 00116 } 00117 void set_xgap_before(inT16 gap) { 00118 xgap_before_ = gap; 00119 } 00120 void set_xgap_after(inT16 gap) { 00121 xgap_after_ = gap; 00122 } 00123 void set_adapted(bool adapted) { 00124 adapted_ = adapted; 00125 } 00126 static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) { 00127 BLOB_CHOICE* choice = new BLOB_CHOICE; 00128 *choice = *src; 00129 return choice; 00130 } 00131 void print(const UNICHARSET *unicharset) { 00132 tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_, 00133 (unicharset == NULL) ? "" : 00134 unicharset->debug_str(unichar_id_).string()); 00135 } 00136 00137 private: 00138 UNICHAR_ID unichar_id_; // unichar id 00139 inT16 fontinfo_id_; // char font information 00140 inT16 fontinfo_id2_; // 2nd choice font information 00141 float rating_; // size related 00142 float certainty_; // absolute 00143 int script_id_; 00144 // Stores language model information about this BLOB_CHOICE. Used during 00145 // the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are 00146 // recorded in the ratings matrix. 00147 // The pointer is owned/managed by the segmentation search. 00148 void *language_model_state_; 00149 inT16 xgap_before_; 00150 inT16 xgap_after_; 00151 // X-height range (in image pixels) that this classification supports. 00152 inT16 min_xheight_; 00153 inT16 max_xheight_; 00154 bool adapted_; // true if this is a match from adapted templates 00155 }; 00156 00157 // Make BLOB_CHOICE listable. 00158 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST) 00159 00160 // Permuter codes used in WERD_CHOICEs. 00161 enum PermuterType { 00162 NO_PERM, // 0 00163 PUNC_PERM, // 1 00164 TOP_CHOICE_PERM, // 2 00165 LOWER_CASE_PERM, // 3 00166 UPPER_CASE_PERM, // 4 00167 NGRAM_PERM, // 5 00168 NUMBER_PERM, // 6 00169 USER_PATTERN_PERM, // 7 00170 SYSTEM_DAWG_PERM, // 8 00171 DOC_DAWG_PERM, // 9 00172 USER_DAWG_PERM, // 10 00173 FREQ_DAWG_PERM, // 11 00174 COMPOUND_PERM, // 12 00175 }; 00176 00177 class WERD_CHOICE { 00178 public: 00179 static const float kBadRating; 00180 00181 WERD_CHOICE(const UNICHARSET *unicharset) 00182 : unicharset_(unicharset) { this->init(8); } 00183 WERD_CHOICE(const UNICHARSET *unicharset, int reserved) 00184 : unicharset_(unicharset) { this->init(reserved); } 00185 WERD_CHOICE(const char *src_string, 00186 const char *src_lengths, 00187 float src_rating, 00188 float src_certainty, 00189 uinT8 src_permuter, 00190 const UNICHARSET &unicharset) 00191 : unicharset_(&unicharset) { 00192 this->init(src_string, src_lengths, src_rating, 00193 src_certainty, src_permuter); 00194 } 00195 WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); 00196 WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) { 00197 this->init(word.length()); 00198 this->operator=(word); 00199 } 00200 ~WERD_CHOICE(); 00201 00202 const UNICHARSET *unicharset() const { 00203 return unicharset_; 00204 } 00205 inline int length() const { 00206 return length_; 00207 } 00208 inline const UNICHAR_ID *unichar_ids() const { 00209 return unichar_ids_; 00210 } 00211 inline const UNICHAR_ID unichar_id(int index) const { 00212 assert(index < length_); 00213 return unichar_ids_[index]; 00214 } 00215 inline const char *fragment_lengths() const { 00216 return fragment_lengths_; 00217 } 00218 inline const char fragment_length(int index) const { 00219 assert(index < length_); 00220 return fragment_lengths_[index]; 00221 } 00222 inline float rating() const { 00223 return rating_; 00224 } 00225 inline float certainty() const { 00226 return certainty_; 00227 } 00228 inline uinT8 permuter() const { 00229 return permuter_; 00230 } 00231 const char *permuter_name() const; 00232 inline bool fragment_mark() const { 00233 return fragment_mark_; 00234 } 00235 inline BLOB_CHOICE_LIST_CLIST* blob_choices() { 00236 return blob_choices_; 00237 } 00238 inline void set_unichar_id(UNICHAR_ID unichar_id, int index) { 00239 assert(index < length_); 00240 unichar_ids_[index] = unichar_id; 00241 } 00242 inline void set_fragment_length(char flen, int index) { 00243 assert(index < length_); 00244 fragment_lengths_[index] = flen; 00245 } 00246 inline void set_rating(float new_val) { 00247 rating_ = new_val; 00248 } 00249 inline void set_certainty(float new_val) { 00250 certainty_ = new_val; 00251 } 00252 inline void set_permuter(uinT8 perm) { 00253 permuter_ = perm; 00254 } 00255 inline void set_fragment_mark(bool new_fragment_mark) { 00256 fragment_mark_ = new_fragment_mark; 00257 } 00258 // Note: this function should only be used if all the fields 00259 // are populated manually with set_* functions (rather than 00260 // (copy)constructors and append_* functions). 00261 inline void set_length(int len) { 00262 ASSERT_HOST(reserved_ >= len); 00263 length_ = len; 00264 } 00265 void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices); 00266 00268 inline void double_the_size() { 00269 if (reserved_ > 0) { 00270 unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy( 00271 reserved_, unichar_ids_); 00272 fragment_lengths_ = GenericVector<char>::double_the_size_memcpy( 00273 reserved_, fragment_lengths_); 00274 reserved_ *= 2; 00275 } else { 00276 unichar_ids_ = new UNICHAR_ID[1]; 00277 fragment_lengths_ = new char[1]; 00278 reserved_ = 1; 00279 } 00280 } 00281 00284 inline void init(int reserved) { 00285 reserved_ = reserved; 00286 if (reserved > 0) { 00287 unichar_ids_ = new UNICHAR_ID[reserved]; 00288 fragment_lengths_ = new char[reserved]; 00289 } else { 00290 unichar_ids_ = NULL; 00291 fragment_lengths_ = NULL; 00292 } 00293 length_ = 0; 00294 rating_ = 0.0; 00295 certainty_ = MAX_FLOAT32; 00296 permuter_ = NO_PERM; 00297 fragment_mark_ = false; 00298 blob_choices_ = NULL; 00299 unichars_in_script_order_ = false; // Tesseract is strict left-to-right. 00300 } 00301 00307 void init(const char *src_string, const char *src_lengths, 00308 float src_rating, float src_certainty, 00309 uinT8 src_permuter); 00310 00312 inline void make_bad() { 00313 length_ = 0; 00314 rating_ = kBadRating; 00315 certainty_ = -MAX_FLOAT32; 00316 fragment_mark_ = false; 00317 } 00318 00322 inline void append_unichar_id_space_allocated( 00323 UNICHAR_ID unichar_id, char fragment_length, 00324 float rating, float certainty) { 00325 assert(reserved_ > length_); 00326 length_++; 00327 this->set_unichar_id(unichar_id, fragment_length, 00328 rating, certainty, length_-1); 00329 } 00330 00331 void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, 00332 float rating, float certainty); 00333 00334 inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length, 00335 float rating, float certainty, int index) { 00336 assert(index < length_); 00337 unichar_ids_[index] = unichar_id; 00338 fragment_lengths_[index] = fragment_length; 00339 rating_ += rating; 00340 if (certainty < certainty_) { 00341 certainty_ = certainty; 00342 } 00343 } 00344 00345 bool contains_unichar_id(UNICHAR_ID unichar_id) const; 00346 void remove_unichar_ids(int index, int num); 00347 inline void remove_last_unichar_id() { --length_; } 00348 inline void remove_unichar_id(int index) { 00349 this->remove_unichar_ids(index, 1); 00350 } 00351 bool has_rtl_unichar_id() const; 00352 void reverse_and_mirror_unichar_ids(); 00353 00354 // Returns the half-open interval of unichar_id indices [start, end) which 00355 // enclose the core portion of this word -- the part after stripping 00356 // punctuation from the left and right. 00357 void punct_stripped(int *start_core, int *end_core) const; 00358 00359 // Return a copy of this WERD_CHOICE with the choices [start, end). 00360 // The result is useful only for checking against a dictionary. 00361 WERD_CHOICE shallow_copy(int start, int end) const; 00362 00363 void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const; 00364 const STRING debug_string() const { 00365 STRING word_str; 00366 for (int i = 0; i < length_; ++i) { 00367 word_str += unicharset_->debug_str(unichar_ids_[i]); 00368 word_str += " "; 00369 } 00370 return word_str; 00371 } 00372 00373 // Call this to override the default (strict left to right graphemes) 00374 // with the fact that some engine produces a "reading order" set of 00375 // Graphemes for each word. 00376 bool set_unichars_in_script_order(bool in_script_order) { 00377 return unichars_in_script_order_ = in_script_order; 00378 } 00379 00380 bool unichars_in_script_order() const { 00381 return unichars_in_script_order_; 00382 } 00383 00384 // Returns a UTF-8 string equivalent to the current choice 00385 // of UNICHAR IDs. 00386 const STRING &unichar_string() const { 00387 this->string_and_lengths(&unichar_string_, &unichar_lengths_); 00388 return unichar_string_; 00389 } 00390 00391 // Returns the lengths, one byte each, representing the number of bytes 00392 // required in the unichar_string for each UNICHAR_ID. 00393 const STRING &unichar_lengths() const { 00394 this->string_and_lengths(&unichar_string_, &unichar_lengths_); 00395 return unichar_lengths_; 00396 } 00397 const void print() const { this->print(""); } 00398 const void print(const char *msg) const; 00399 00400 WERD_CHOICE& operator+= ( // concatanate 00401 const WERD_CHOICE & second);// second on first 00402 00403 WERD_CHOICE& operator= (const WERD_CHOICE& source); 00404 00405 private: 00406 const UNICHARSET *unicharset_; 00407 UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word 00408 char *fragment_lengths_; // number of fragments in each unichar 00409 int reserved_; // size of the above arrays 00410 int length_; // word length 00411 float rating_; // size related 00412 float certainty_; // absolute 00413 uinT8 permuter_; // permuter code 00414 bool fragment_mark_; // if true, indicates that this choice 00415 // was chosen over a better one that 00416 // contained a fragment 00417 BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob 00418 00419 // Normally, the blob_choices_ represent the recognition results in order 00420 // from left-to-right. However, some engines (say Cube) may return 00421 // recognition results in the order of the script's major reading direction 00422 // (for Arabic, that is right-to-left). 00423 bool unichars_in_script_order_; 00424 00425 // The following variables are populated and passed by reference any 00426 // time unichar_string() or unichar_lengths() are called. 00427 mutable STRING unichar_string_; 00428 mutable STRING unichar_lengths_; 00429 00430 bool unichar_info_present; 00431 00432 private: 00433 void delete_blob_choices(); 00434 }; 00435 00436 // Make WERD_CHOICE listable. 00437 ELISTIZEH (WERD_CHOICE) 00438 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR; 00439 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR; 00440 00441 // Utilities for comparing WERD_CHOICEs 00442 00443 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, 00444 const WERD_CHOICE &word2); 00445 00446 // Utilities for debug printing. 00447 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings); 00448 void print_ratings_list( 00449 const char *msg, // intro message 00450 BLOB_CHOICE_LIST *ratings, // list of results 00451 const UNICHARSET ¤t_unicharset // unicharset that can be used 00452 // for id-to-unichar conversion 00453 ); 00454 void print_ratings_info( 00455 FILE *fp, // file to use 00456 BLOB_CHOICE_LIST *ratings, // list of results 00457 const UNICHARSET ¤t_unicharset // unicharset that can be used 00458 // for id-to-unichar conversion 00459 ); 00460 void print_char_choices_list( 00461 const char *msg, 00462 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00463 const UNICHARSET ¤t_unicharset, 00464 BOOL8 detailed 00465 ); 00466 void print_word_alternates_list( 00467 WERD_CHOICE *word, 00468 GenericVector<WERD_CHOICE *> *alternates); 00469 00470 #endif