Tesseract  3.02
tesseract-ocr/ccstruct/ratngs.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        ratngs.h  (Formerly ratings.h)
00003  * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
00004  * Author:      Ray Smith
00005  * Created:     Thu Apr 23 11:40:38 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef           RATNGS_H
00021 #define           RATNGS_H
00022 
00023 #include <assert.h>
00024 
00025 #include "clst.h"
00026 #include "genericvector.h"
00027 #include "notdll.h"
00028 #include "unichar.h"
00029 #include "unicharset.h"
00030 #include "werd.h"
00031 
00032 class BLOB_CHOICE: public ELIST_LINK
00033 {
00034   public:
00035     BLOB_CHOICE() {
00036       unichar_id_ = INVALID_UNICHAR_ID;
00037       fontinfo_id_ = -1;
00038       fontinfo_id2_ = -1;
00039       rating_ = MAX_FLOAT32;
00040       certainty_ = -MAX_FLOAT32;
00041       script_id_ = -1;
00042       language_model_state_ = NULL;
00043       min_xheight_ = 0;
00044       max_xheight_ = 0;
00045       adapted_ = false;
00046     }
00047     BLOB_CHOICE(UNICHAR_ID src_unichar_id,  // character id
00048                 float src_rating,          // rating
00049                 float src_cert,            // certainty
00050                 inT16 src_fontinfo_id,      // font
00051                 inT16 src_fontinfo_id2,     // 2nd choice font
00052                 int script_id,             // script
00053                 inT16 min_xheight,         // min xheight in image pixel units
00054                 inT16 max_xheight,         // max xheight allowed by this char
00055                 bool adapted);             // adapted match or not
00056     BLOB_CHOICE(const BLOB_CHOICE &other);
00057     ~BLOB_CHOICE() {}
00058 
00059     UNICHAR_ID unichar_id() const {
00060       return unichar_id_;
00061     }
00062     float rating() const {
00063       return rating_;
00064     }
00065     float certainty() const {
00066       return certainty_;
00067     }
00068     inT16 fontinfo_id() const {
00069       return fontinfo_id_;
00070     }
00071     inT16 fontinfo_id2() const {
00072       return fontinfo_id2_;
00073     }
00074     int script_id() const {
00075       return script_id_;
00076     }
00077     void *language_model_state() {
00078       return language_model_state_;
00079     }
00080     inT16 xgap_before() const {
00081       return xgap_before_;
00082     }
00083     inT16 xgap_after() const {
00084       return xgap_after_;
00085     }
00086     inT16 min_xheight() const {
00087       return min_xheight_;
00088     }
00089     inT16 max_xheight() const {
00090       return max_xheight_;
00091     }
00092     bool adapted() const {
00093       return adapted_;
00094     }
00095 
00096     void set_unichar_id(UNICHAR_ID newunichar_id) {
00097       unichar_id_ = newunichar_id;
00098     }
00099     void set_rating(float newrat) {
00100       rating_ = newrat;
00101     }
00102     void set_certainty(float newrat) {
00103       certainty_ = newrat;
00104     }
00105     void set_fontinfo_id(inT16 newfont) {
00106       fontinfo_id_ = newfont;
00107     }
00108     void set_fontinfo_id2(inT16 newfont) {
00109       fontinfo_id2_ = newfont;
00110     }
00111     void set_script(int newscript_id) {
00112       script_id_ = newscript_id;
00113     }
00114     void set_language_model_state(void *language_model_state) {
00115       language_model_state_ = language_model_state;
00116     }
00117     void set_xgap_before(inT16 gap) {
00118       xgap_before_ = gap;
00119     }
00120     void set_xgap_after(inT16 gap) {
00121       xgap_after_ = gap;
00122     }
00123     void set_adapted(bool adapted) {
00124       adapted_ = adapted;
00125     }
00126     static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
00127       BLOB_CHOICE* choice = new BLOB_CHOICE;
00128       *choice = *src;
00129       return choice;
00130     }
00131     void print(const UNICHARSET *unicharset) {
00132       tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
00133               (unicharset == NULL) ? "" :
00134               unicharset->debug_str(unichar_id_).string());
00135     }
00136 
00137  private:
00138   UNICHAR_ID unichar_id_;          // unichar id
00139   inT16 fontinfo_id_;              // char font information
00140   inT16 fontinfo_id2_;             // 2nd choice font information
00141   float rating_;                  // size related
00142   float certainty_;               // absolute
00143   int script_id_;
00144   // Stores language model information about this BLOB_CHOICE. Used during
00145   // the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
00146   // recorded in the ratings matrix.
00147   // The pointer is owned/managed by the segmentation search.
00148   void *language_model_state_;
00149   inT16 xgap_before_;
00150   inT16 xgap_after_;
00151   // X-height range (in image pixels) that this classification supports.
00152   inT16 min_xheight_;
00153   inT16 max_xheight_;
00154   bool adapted_;  // true if this is a match from adapted templates
00155 };
00156 
00157 // Make BLOB_CHOICE listable.
00158 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
00159 
// Permuter codes used in WERD_CHOICEs.  The numeric values are spelled out
// because WERD_CHOICE stores the code in a uinT8.
enum PermuterType {
  NO_PERM = 0,
  PUNC_PERM = 1,
  TOP_CHOICE_PERM = 2,
  LOWER_CASE_PERM = 3,
  UPPER_CASE_PERM = 4,
  NGRAM_PERM = 5,
  NUMBER_PERM = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM = 8,
  DOC_DAWG_PERM = 9,
  USER_DAWG_PERM = 10,
  FREQ_DAWG_PERM = 11,
  COMPOUND_PERM = 12
};
00176 
00177 class WERD_CHOICE {
00178  public:
00179   static const float kBadRating;
00180 
00181   WERD_CHOICE(const UNICHARSET *unicharset)
00182     : unicharset_(unicharset) { this->init(8); }
00183   WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
00184     : unicharset_(unicharset) { this->init(reserved); }
00185   WERD_CHOICE(const char *src_string,
00186               const char *src_lengths,
00187               float src_rating,
00188               float src_certainty,
00189               uinT8 src_permuter,
00190               const UNICHARSET &unicharset)
00191     : unicharset_(&unicharset) {
00192     this->init(src_string, src_lengths, src_rating,
00193                src_certainty, src_permuter);
00194   }
00195   WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
00196   WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) {
00197     this->init(word.length());
00198     this->operator=(word);
00199   }
00200   ~WERD_CHOICE();
00201 
00202   const UNICHARSET *unicharset() const {
00203     return unicharset_;
00204   }
00205   inline int length() const {
00206     return length_;
00207   }
00208   inline const UNICHAR_ID *unichar_ids() const {
00209     return unichar_ids_;
00210   }
00211   inline const UNICHAR_ID unichar_id(int index) const {
00212     assert(index < length_);
00213     return unichar_ids_[index];
00214   }
00215   inline const char *fragment_lengths() const {
00216     return fragment_lengths_;
00217   }
00218   inline const char fragment_length(int index) const {
00219     assert(index < length_);
00220     return fragment_lengths_[index];
00221   }
00222   inline float rating() const {
00223     return rating_;
00224   }
00225   inline float certainty() const {
00226     return certainty_;
00227   }
00228   inline uinT8 permuter() const {
00229     return permuter_;
00230   }
00231   const char *permuter_name() const;
00232   inline bool fragment_mark() const {
00233     return fragment_mark_;
00234   }
00235   inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
00236     return blob_choices_;
00237   }
00238   inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
00239     assert(index < length_);
00240     unichar_ids_[index] = unichar_id;
00241   }
00242   inline void set_fragment_length(char flen, int index) {
00243     assert(index < length_);
00244     fragment_lengths_[index] = flen;
00245   }
00246   inline void set_rating(float new_val) {
00247     rating_ = new_val;
00248   }
00249   inline void set_certainty(float new_val) {
00250     certainty_ = new_val;
00251   }
00252   inline void set_permuter(uinT8 perm) {
00253     permuter_ = perm;
00254   }
00255   inline void set_fragment_mark(bool new_fragment_mark) {
00256     fragment_mark_ = new_fragment_mark;
00257   }
00258   // Note: this function should only be used if all the fields
00259   // are populated manually with set_* functions (rather than
00260   // (copy)constructors and append_* functions).
00261   inline void set_length(int len) {
00262     ASSERT_HOST(reserved_ >= len);
00263     length_ = len;
00264   }
00265   void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
00266 
00268   inline void double_the_size() {
00269     if (reserved_ > 0) {
00270       unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
00271           reserved_, unichar_ids_);
00272       fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
00273           reserved_, fragment_lengths_);
00274       reserved_ *= 2;
00275     } else {
00276       unichar_ids_ = new UNICHAR_ID[1];
00277       fragment_lengths_ = new char[1];
00278       reserved_ = 1;
00279     }
00280   }
00281 
00284   inline void init(int reserved) {
00285     reserved_ = reserved;
00286     if (reserved > 0) {
00287       unichar_ids_ = new UNICHAR_ID[reserved];
00288       fragment_lengths_ = new char[reserved];
00289     } else {
00290       unichar_ids_ = NULL;
00291       fragment_lengths_ = NULL;
00292     }
00293     length_ = 0;
00294     rating_ = 0.0;
00295     certainty_ = MAX_FLOAT32;
00296     permuter_ = NO_PERM;
00297     fragment_mark_ = false;
00298     blob_choices_ = NULL;
00299     unichars_in_script_order_ = false;  // Tesseract is strict left-to-right.
00300   }
00301 
00307   void init(const char *src_string, const char *src_lengths,
00308             float src_rating, float src_certainty,
00309             uinT8 src_permuter);
00310 
00312   inline void make_bad() {
00313     length_ = 0;
00314     rating_ = kBadRating;
00315     certainty_ = -MAX_FLOAT32;
00316     fragment_mark_ = false;
00317   }
00318 
00322   inline void append_unichar_id_space_allocated(
00323       UNICHAR_ID unichar_id, char fragment_length,
00324       float rating, float certainty) {
00325     assert(reserved_ > length_);
00326     length_++;
00327     this->set_unichar_id(unichar_id, fragment_length,
00328                          rating, certainty, length_-1);
00329   }
00330 
00331   void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00332                          float rating, float certainty);
00333 
00334   inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00335                              float rating, float certainty, int index) {
00336     assert(index < length_);
00337     unichar_ids_[index] = unichar_id;
00338     fragment_lengths_[index] = fragment_length;
00339     rating_ += rating;
00340     if (certainty < certainty_) {
00341       certainty_ = certainty;
00342     }
00343   }
00344 
00345   bool contains_unichar_id(UNICHAR_ID unichar_id) const;
00346   void remove_unichar_ids(int index, int num);
00347   inline void remove_last_unichar_id() { --length_; }
00348   inline void remove_unichar_id(int index) {
00349     this->remove_unichar_ids(index, 1);
00350   }
00351   bool has_rtl_unichar_id() const;
00352   void reverse_and_mirror_unichar_ids();
00353 
00354   // Returns the half-open interval of unichar_id indices [start, end) which
00355   // enclose the core portion of this word -- the part after stripping
00356   // punctuation from the left and right.
00357   void punct_stripped(int *start_core, int *end_core) const;
00358 
00359   // Return a copy of this WERD_CHOICE with the choices [start, end).
00360   // The result is useful only for checking against a dictionary.
00361   WERD_CHOICE shallow_copy(int start, int end) const;
00362 
00363   void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
00364   const STRING debug_string() const {
00365     STRING word_str;
00366     for (int i = 0; i < length_; ++i) {
00367       word_str += unicharset_->debug_str(unichar_ids_[i]);
00368       word_str += " ";
00369     }
00370     return word_str;
00371   }
00372 
00373   // Call this to override the default (strict left to right graphemes)
00374   // with the fact that some engine produces a "reading order" set of
00375   // Graphemes for each word.
00376   bool set_unichars_in_script_order(bool in_script_order) {
00377     return unichars_in_script_order_ = in_script_order;
00378   }
00379 
00380   bool unichars_in_script_order() const {
00381     return unichars_in_script_order_;
00382   }
00383 
00384   // Returns a UTF-8 string equivalent to the current choice
00385   // of UNICHAR IDs.
00386   const STRING &unichar_string() const {
00387     this->string_and_lengths(&unichar_string_, &unichar_lengths_);
00388     return unichar_string_;
00389   }
00390 
00391   // Returns the lengths, one byte each, representing the number of bytes
00392   // required in the unichar_string for each UNICHAR_ID.
00393   const STRING &unichar_lengths() const {
00394     this->string_and_lengths(&unichar_string_, &unichar_lengths_);
00395     return unichar_lengths_;
00396   }
00397   const void print() const { this->print(""); }
00398   const void print(const char *msg) const;
00399 
00400   WERD_CHOICE& operator+= (     // concatanate
00401     const WERD_CHOICE & second);// second on first
00402 
00403   WERD_CHOICE& operator= (const WERD_CHOICE& source);
00404 
00405  private:
00406   const UNICHARSET *unicharset_;
00407   UNICHAR_ID *unichar_ids_;  // unichar ids that represent the text of the word
00408   char *fragment_lengths_;   // number of fragments in each unichar
00409   int reserved_;             // size of the above arrays
00410   int length_;               // word length
00411   float rating_;             // size related
00412   float certainty_;          // absolute
00413   uinT8 permuter_;           // permuter code
00414   bool fragment_mark_;       // if true, indicates that this choice
00415                              // was chosen over a better one that
00416                              // contained a fragment
00417   BLOB_CHOICE_LIST_CLIST *blob_choices_;  // best choices for each blob
00418 
00419   // Normally, the blob_choices_ represent the recognition results in order
00420   // from left-to-right.  However, some engines (say Cube) may return
00421   // recognition results in the order of the script's major reading direction
00422   // (for Arabic, that is right-to-left).
00423   bool unichars_in_script_order_;
00424 
00425   // The following variables are populated and passed by reference any
00426   // time unichar_string() or unichar_lengths() are called.
00427   mutable STRING unichar_string_;
00428   mutable STRING unichar_lengths_;
00429 
00430   bool unichar_info_present;
00431 
00432  private:
00433   void delete_blob_choices();
00434 };
00435 
00436 // Make WERD_CHOICE listable.
00437 ELISTIZEH (WERD_CHOICE)
00438 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
00439 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
00440 
00441 // Utilities for comparing WERD_CHOICEs
00442 
00443 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
00444                                        const WERD_CHOICE &word2);
00445 
00446 // Utilities for debug printing.
00447 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
00448 void print_ratings_list(
00449     const char *msg,                      // intro message
00450     BLOB_CHOICE_LIST *ratings,            // list of results
00451     const UNICHARSET &current_unicharset  // unicharset that can be used
00452                                           // for id-to-unichar conversion
00453     );
00454 void print_ratings_info(
00455     FILE *fp,                             // file to use
00456     BLOB_CHOICE_LIST *ratings,            // list of results
00457     const UNICHARSET &current_unicharset  // unicharset that can be used
00458                                           // for id-to-unichar conversion
00459     );
00460 void print_char_choices_list(
00461     const char *msg,
00462     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00463     const UNICHARSET &current_unicharset,
00464     BOOL8 detailed
00465     );
00466 void print_word_alternates_list(
00467     WERD_CHOICE *word,
00468     GenericVector<WERD_CHOICE *> *alternates);
00469 
00470 #endif