Tesseract
3.02
|
00001 00002 // File: ambigs.h 00003 // Description: Constants, flags, functions for dealing with 00004 // ambiguities (training and recognition). 00005 // Author: Daria Antonova 00006 // Created: Mon Aug 23 11:26:43 PDT 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCUTIL_AMBIGS_H_ 00022 #define TESSERACT_CCUTIL_AMBIGS_H_ 00023 00024 #include "elst.h" 00025 #include "tprintf.h" 00026 #include "unichar.h" 00027 #include "unicharset.h" 00028 #include "genericvector.h" 00029 00030 #define MAX_AMBIG_SIZE 10 00031 00032 namespace tesseract { 00033 00034 typedef GenericVector<UNICHAR_ID> UnicharIdVector; 00035 00036 static const int kUnigramAmbigsBufferSize = 1000; 00037 static const char kAmbigNgramSeparator[] = { ' ', '\0' }; 00038 static const char kAmbigDelimiters[] = "\t "; 00039 static const char kIllegalMsg[] = 00040 "Illegal ambiguity specification on line %d\n"; 00041 static const char kIllegalUnicharMsg[] = 00042 "Illegal unichar %s in ambiguity specification\n"; 00043 00044 enum AmbigType { 00045 NOT_AMBIG, // the ngram pair is not ambiguous 00046 REPLACE_AMBIG, // ocred ngram should always be substituted with correct 00047 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1) 00048 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1) 00049 CASE_AMBIG, // this is a case ambiguity (1-1) 00050 00051 AMBIG_TYPE_COUNT // number of enum entries 00052 }; 00053 00054 // A collection of utility functions for arrays of UNICHAR_IDs that are 00055 // terminated by INVALID_UNICHAR_ID. 00056 class UnicharIdArrayUtils { 00057 public: 00058 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is 00059 // less than length of array2, if any array1[i] is less than array2[i]. 00060 // Returns 0 if the arrays are equal, 1 otherwise. 00061 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. 00062 static inline int compare(const UNICHAR_ID array1[], 00063 const UNICHAR_ID array2[]) { 00064 const UNICHAR_ID *ptr1 = array1; 00065 const UNICHAR_ID *ptr2 = array2; 00066 while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) { 00067 if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1; 00068 ++ptr1; 00069 ++ptr2; 00070 } 00071 if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0; 00072 return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1; 00073 } 00074 00075 // Look uid in the vector of uids. If found, the index of the matched 00076 // element is returned. Otherwise, it returns -1. 00077 static inline int find_in(const UnicharIdVector& uid_vec, 00078 const UNICHAR_ID uid) { 00079 for (int i = 0; i < uid_vec.size(); ++i) 00080 if (uid_vec[i] == uid) return i; 00081 return -1; 00082 } 00083 00084 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied. 00085 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID 00086 // and that dst has enough space for all the elements from src. 00087 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { 00088 int i = 0; 00089 do { 00090 dst[i] = src[i]; 00091 } while (dst[i++] != INVALID_UNICHAR_ID); 00092 return i - 1; 00093 } 00094 00095 // Prints unichars corresponding to the unichar_ids in the given array. 00096 // The function assumes that array is terminated by INVALID_UNICHAR_ID. 00097 static inline void print(const UNICHAR_ID array[], 00098 const UNICHARSET &unicharset) { 00099 const UNICHAR_ID *ptr = array; 00100 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]"); 00101 while (*ptr != INVALID_UNICHAR_ID) { 00102 tprintf("%s ", unicharset.id_to_unichar(*ptr++)); 00103 } 00104 tprintf("( "); 00105 ptr = array; 00106 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++); 00107 tprintf(")\n"); 00108 } 00109 }; 00110 00111 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that 00112 // start with the same unichar (e.g. r->t rn->m rr1->m). 00113 class AmbigSpec : public ELIST_LINK { 00114 public: 00115 AmbigSpec(); 00116 ~AmbigSpec() {} 00117 00118 // Comparator function for sorting AmbigSpec_LISTs. The lists will 00119 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors 00120 // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. 00121 static int compare_ambig_specs(const void *spec1, const void *spec2) { 00122 const AmbigSpec *s1 = 00123 *reinterpret_cast<const AmbigSpec * const *>(spec1); 00124 const AmbigSpec *s2 = 00125 *reinterpret_cast<const AmbigSpec * const *>(spec2); 00126 return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); 00127 } 00128 00129 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; 00130 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; 00131 UNICHAR_ID correct_ngram_id; 00132 AmbigType type; 00133 int wrong_ngram_size; 00134 }; 00135 ELISTIZEH(AmbigSpec); 00136 00137 // AMBIG_TABLE[i] stores a set of ambiguities whose 00138 // wrong ngram starts with unichar id i. 00139 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector; 00140 00141 class UnicharAmbigs { 00142 public: 00143 UnicharAmbigs() {} 00144 ~UnicharAmbigs() { 00145 replace_ambigs_.delete_data_pointers(); 00146 dang_ambigs_.delete_data_pointers(); 00147 one_to_one_definite_ambigs_.delete_data_pointers(); 00148 } 00149 00150 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; } 00151 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; } 00152 00153 // Fills in two ambiguity tables (replaceable and dangerous) with information 00154 // read from the ambigs file. An ambiguity table is an array of lists. 00155 // The array is indexed by a class id. Each entry in the table provides 00156 // a list of potential ambiguities which can start with the corresponding 00157 // character. For example the ambiguity "rn -> m", would be located in the 00158 // table at index of unicharset.unichar_to_id('r'). 00159 // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in 00160 // one_to_one_definite_ambigs_. This vector is also indexed by the class id 00161 // of the wrong part of the ambiguity and each entry contains a vector of 00162 // unichar ids that are ambiguous to it. 00163 void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level, 00164 bool use_ambigs_for_adaption, UNICHARSET *unicharset); 00165 00166 // Returns definite 1-1 ambigs for the given unichar id. 00167 inline const UnicharIdVector *OneToOneDefiniteAmbigs( 00168 UNICHAR_ID unichar_id) const { 00169 if (one_to_one_definite_ambigs_.empty()) return NULL; 00170 return one_to_one_definite_ambigs_[unichar_id]; 00171 } 00172 00173 // Returns a pointer to the vector with all unichar ids that appear in the 00174 // 'correct' part of the ambiguity pair when the given unichar id appears 00175 // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of 00176 // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of 00177 // m will return a pointer to a vector with unichar ids of r,n,i. 00178 inline const UnicharIdVector *AmbigsForAdaption( 00179 UNICHAR_ID unichar_id) const { 00180 if (ambigs_for_adaption_.empty()) return NULL; 00181 return ambigs_for_adaption_[unichar_id]; 00182 } 00183 00184 // Similar to the above, but return the vector of unichar ids for which 00185 // the given unichar_id is an ambiguity (appears in the 'wrong' part of 00186 // some ambiguity pair). 00187 inline const UnicharIdVector *ReverseAmbigsForAdaption( 00188 UNICHAR_ID unichar_id) const { 00189 if (reverse_ambigs_for_adaption_.empty()) return NULL; 00190 return reverse_ambigs_for_adaption_[unichar_id]; 00191 } 00192 00193 private: 00194 00195 bool ParseAmbiguityLine(int line_num, int version, int debug_level, 00196 const UNICHARSET &unicharset, char *buffer, 00197 int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, 00198 int *ReplacementAmbigPartSize, 00199 char *ReplacementString, int *type); 00200 void InsertIntoTable(UnicharAmbigsVector &table, 00201 int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, 00202 int ReplacementAmbigPartSize, 00203 const char *ReplacementString, int type, 00204 AmbigSpec *ambig_spec, UNICHARSET *unicharset); 00205 UnicharAmbigsVector dang_ambigs_; 00206 UnicharAmbigsVector replace_ambigs_; 00207 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_; 00208 GenericVector<UnicharIdVector *> ambigs_for_adaption_; 00209 GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_; 00210 }; 00211 00212 } // namespace tesseract 00213 00214 #endif // TESSERACT_CCUTIL_AMBIGS_H_