Tesseract 3.02
///////////////////////////////////////////////////////////////////////
// File:        ambigs.cc
// Description: Functions for dealing with ambiguities
//              (training and recognition).
// Author:      Daria Antonova
// Created:     Mon Feb 5 11:26:43 PDT 2009
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "ambigs.h"
#include "helpers.h"

#ifdef _WIN32
#ifndef __GNUC__
#define strtok_r strtok_s
#else
#include "strtok_r.h"
#endif  /* __GNUC__ */
#endif  /* _WIN32 */

namespace tesseract {

AmbigSpec::AmbigSpec() {
  wrong_ngram[0] = INVALID_UNICHAR_ID;
  correct_fragments[0] = INVALID_UNICHAR_ID;
  correct_ngram_id = INVALID_UNICHAR_ID;
  type = NOT_AMBIG;
  wrong_ngram_size = 0;
}

ELISTIZE(AmbigSpec);

void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
                                      inT64 end_offset,
                                      int debug_level,
                                      bool use_ambigs_for_adaption,
                                      UNICHARSET *unicharset) {
  int i, j;
  UnicharIdVector *adaption_ambigs_entry;
  for (i = 0; i < unicharset->size(); ++i) {
    replace_ambigs_.push_back(NULL);
    dang_ambigs_.push_back(NULL);
    one_to_one_definite_ambigs_.push_back(NULL);
    if (use_ambigs_for_adaption) {
      ambigs_for_adaption_.push_back(NULL);
      reverse_ambigs_for_adaption_.push_back(NULL);
    }
  }
  if (debug_level) tprintf("Reading ambiguities\n");

  int TestAmbigPartSize;
  int ReplacementAmbigPartSize;
  // Maximum line size:
  //   10 for sizes of ambigs, tabs, ambig type and newline
  //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char ReplacementString[kMaxAmbigStringSize];
  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    rewind(AmbigFile);
  }
  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
    chomp_string(buffer);
    if (debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
                            buffer, &TestAmbigPartSize, TestUnicharIds,
                            &ReplacementAmbigPartSize,
                            ReplacementString, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
                    TestAmbigPartSize, TestUnicharIds,
                    ReplacementAmbigPartSize, ReplacementString, type,
                    ambig_spec, unicharset);

    // Update one_to_one_definite_ambigs_.
    if (TestAmbigPartSize == 1 &&
        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
        one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
    // Update ambigs_for_adaption_.
    if (use_ambigs_for_adaption) {
      for (i = 0; i < TestAmbigPartSize; ++i) {
        if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
          ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
        }
        adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
        const char *tmp_ptr = ReplacementString;
        const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
        int step = unicharset->step(tmp_ptr);
        while (step > 0) {
          UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
          ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
          // Add the new unichar id to adaption_ambigs_entry (only if the
          // vector does not already contain it) keeping it in sorted order.
          for (j = 0; j < adaption_ambigs_entry->size() &&
               (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
          if (j < adaption_ambigs_entry->size()) {
            if ((*adaption_ambigs_entry)[j] != id_to_insert) {
              adaption_ambigs_entry->insert(id_to_insert, j);
            }
          } else {
            adaption_ambigs_entry->push_back(id_to_insert);
          }
          // Update tmp_ptr and step.
          tmp_ptr += step;
          step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
        }
      }
    }
  }
  delete[] buffer;

  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
  if (use_ambigs_for_adaption) {
    for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
      adaption_ambigs_entry = ambigs_for_adaption_[i];
      if (adaption_ambigs_entry == NULL) continue;
      for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
        UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
        if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
          reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
        }
        reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
      }
    }
  }

  // Print what was read from the input file.
  if (debug_level > 1) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
          (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
"Replaceable" : "Dangerous", 00171 unicharset->debug_str(i).string()); 00172 } 00173 AmbigSpec_IT lst_it(lst); 00174 for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { 00175 AmbigSpec *ambig_spec = lst_it.data(); 00176 tprintf("wrong_ngram:"); 00177 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); 00178 tprintf("correct_fragments:"); 00179 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); 00180 } 00181 } 00182 } 00183 if (use_ambigs_for_adaption) { 00184 for (int vec_id = 0; vec_id < 2; ++vec_id) { 00185 const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ? 00186 ambigs_for_adaption_ : reverse_ambigs_for_adaption_; 00187 for (i = 0; i < vec.size(); ++i) { 00188 adaption_ambigs_entry = vec[i]; 00189 if (adaption_ambigs_entry != NULL) { 00190 tprintf("%sAmbigs for adaption for %s:\n", 00191 (vec_id == 0) ? "" : "Reverse ", 00192 unicharset->debug_str(i).string()); 00193 for (j = 0; j < adaption_ambigs_entry->size(); ++j) { 00194 tprintf("%s ", unicharset->debug_str( 00195 (*adaption_ambigs_entry)[j]).string()); 00196 } 00197 tprintf("\n"); 00198 } 00199 } 00200 } 00201 } 00202 } 00203 } 00204 00205 bool UnicharAmbigs::ParseAmbiguityLine( 00206 int line_num, int version, int debug_level, const UNICHARSET &unicharset, 00207 char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, 00208 int *ReplacementAmbigPartSize, char *ReplacementString, int *type) { 00209 int i; 00210 char *token; 00211 char *next_token; 00212 if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || 00213 !sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) { 00214 if (debug_level) tprintf(kIllegalMsg, line_num); 00215 return false; 00216 } 00217 if (*TestAmbigPartSize > MAX_AMBIG_SIZE) { 00218 tprintf("Too many unichars in ambiguity on line %d\n"); 00219 return false; 00220 } 00221 for (i = 0; i < *TestAmbigPartSize; ++i) { 00222 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; 00223 if (!unicharset.contains_unichar(token)) { 00224 if (debug_level) tprintf(kIllegalUnicharMsg, token); 00225 break; 00226 } 00227 TestUnicharIds[i] = unicharset.unichar_to_id(token); 00228 } 00229 TestUnicharIds[i] = INVALID_UNICHAR_ID; 00230 00231 if (i != *TestAmbigPartSize || 00232 !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || 00233 !sscanf(token, "%d", ReplacementAmbigPartSize) || 00234 *ReplacementAmbigPartSize <= 0) { 00235 if (debug_level) tprintf(kIllegalMsg, line_num); 00236 return false; 00237 } 00238 if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) { 00239 tprintf("Too many unichars in ambiguity on line %d\n"); 00240 return false; 00241 } 00242 ReplacementString[0] = '\0'; 00243 for (i = 0; i < *ReplacementAmbigPartSize; ++i) { 00244 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; 00245 strcat(ReplacementString, token); 00246 if (!unicharset.contains_unichar(token)) { 00247 if (debug_level) tprintf(kIllegalUnicharMsg, token); 00248 break; 00249 } 00250 } 00251 if (i != *ReplacementAmbigPartSize) { 00252 if (debug_level) tprintf(kIllegalMsg, line_num); 00253 return false; 00254 } 00255 if (version > 0) { 00256 // The next field being true indicates that the abiguity should 00257 // always be substituted (e.g. '' should always be changed to "). 00258 // For such "certain" n -> m ambigs tesseract will insert character 00259 // fragments for the n pieces in the unicharset. 
    // AmbigsFound() will then replace the incorrect ngram with the character
    // fragments of the correct character (or ngram if m > 1).
    // Note that if m > 1, an ngram will be inserted into the
    // modified word, not the individual unigrams. Tesseract
    // has limited support for ngram unichar (e.g. dawg permuter).
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
        !sscanf(token, "%d", type)) {
      if (debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
  }
  return true;
}

void UnicharAmbigs::InsertIntoTable(
    UnicharAmbigsVector &table, int TestAmbigPartSize,
    UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
    const char *ReplacementString, int type,
    AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
  ambig_spec->type = static_cast<AmbigType>(type);
  if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
      unicharset->to_lower(TestUnicharIds[0]) ==
      unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
    ambig_spec->type = CASE_AMBIG;
  }

  ambig_spec->wrong_ngram_size =
      UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);

  // Since we need to maintain a constant number of unichar positions in
  // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
  // each n->m ambiguity we will have to place n character fragments of the
  // correct ngram into the corresponding positions in the vector (e.g. given
  // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and
  // |ww|1|4 into position 1 and so on). The correct ngram is reconstructed
  // from fragments by dawg_permute_and_select().

  // Insert the corresponding correct ngram into the unicharset.
  // Unicharset code assumes that the "base" ngram is inserted into
  // the unicharset before fragments of this ngram are inserted.
  unicharset->unichar_insert(ReplacementString);
  ambig_spec->correct_ngram_id =
      unicharset->unichar_to_id(ReplacementString);
  if (ReplacementAmbigPartSize > 1) {
    unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
  }
  // Add the corresponding fragments of the wrong ngram to unicharset.
  int i;
  for (i = 0; i < TestAmbigPartSize; ++i) {
    UNICHAR_ID unichar_id;
    if (TestAmbigPartSize == 1) {
      unichar_id = ambig_spec->correct_ngram_id;
    } else {
      STRING frag_str = CHAR_FRAGMENT::to_string(
          ReplacementString, i, TestAmbigPartSize, false);
      unicharset->unichar_insert(frag_str.string());
      unichar_id = unicharset->unichar_to_id(frag_str.string());
    }
    ambig_spec->correct_fragments[i] = unichar_id;
  }
  ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;

  // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
  // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
  if (table[TestUnicharIds[0]] == NULL) {
    table[TestUnicharIds[0]] = new AmbigSpec_LIST();
  }
  table[TestUnicharIds[0]]->add_sorted(
      AmbigSpec::compare_ambig_specs, false, ambig_spec);
}

}  // namespace tesseract
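For context, the sketch below shows one way the loader above might be driven. It is a minimal, hypothetical usage example and not part of ambigs.cc: the file names are placeholders, and the sample ambiguity line merely follows the token layout that ParseAmbiguityLine() expects (wrong-part size, the wrong unichars, replacement-part size, the replacement unichars, and, for v1 files, a trailing type field).

// Hypothetical driver for UnicharAmbigs::LoadUnicharAmbigs() (sketch only).
// File names are placeholders; a real caller would supply matching
// unicharset and unicharambigs data.
//
// Example v1 ambiguity line (tab-delimited), as ParseAmbiguityLine() reads it:
//   2 <tab> ' <tab> ' <tab> 1 <tab> " <tab> 1
// i.e. the two-unichar sequence '' maps to the single unichar ", and the
// trailing nonzero type marks a substitution that is always applied.
#include <cstdio>
#include "ambigs.h"
#include "unicharset.h"

using tesseract::UnicharAmbigs;

int main() {
  UNICHARSET unicharset;
  if (!unicharset.load_from_file("eng.unicharset")) return 1;  // placeholder

  FILE *ambig_file = fopen("eng.unicharambigs", "rb");         // placeholder
  if (ambig_file == NULL) return 1;

  UnicharAmbigs ambigs;
  // end_offset < 0 means "read to end of file"; debug_level 1 prints progress;
  // the final flag controls whether the adaption tables are also built.
  ambigs.LoadUnicharAmbigs(ambig_file, -1, 1, false, &unicharset);
  fclose(ambig_file);
  return 0;
}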