Tesseract  3.02
tesseract-ocr/ccutil/ambigs.cpp
Go to the documentation of this file.
00001 
00002 // File:        ambigs.cc
00003 // Description: Functions for dealing with ambiguities
00004 //              (training and recognition).
00005 // Author:      Daria Antonova
00006 // Created:     Mon Feb 5 11:26:43 PDT 2009
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #include "ambigs.h"
00022 #include "helpers.h"
00023 
00024 #ifdef _WIN32
00025 #ifndef __GNUC__
00026 #define strtok_r strtok_s
00027 #else
00028 #include "strtok_r.h"
00029 #endif  /* __GNUC__ */
00030 #endif  /* _WIN32 */
00031 
00032 namespace tesseract {
00033 
00034 AmbigSpec::AmbigSpec() {
00035   wrong_ngram[0] = INVALID_UNICHAR_ID;
00036   correct_fragments[0] = INVALID_UNICHAR_ID;
00037   correct_ngram_id = INVALID_UNICHAR_ID;
00038   type = NOT_AMBIG;
00039   wrong_ngram_size = 0;
00040 }
00041 
00042 ELISTIZE(AmbigSpec);
00043 
00044 void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
00045                                       inT64 end_offset,
00046                                       int debug_level,
00047                                       bool use_ambigs_for_adaption,
00048                                       UNICHARSET *unicharset) {
00049   int i, j;
00050   UnicharIdVector *adaption_ambigs_entry;
00051   for (i = 0; i < unicharset->size(); ++i) {
00052     replace_ambigs_.push_back(NULL);
00053     dang_ambigs_.push_back(NULL);
00054     one_to_one_definite_ambigs_.push_back(NULL);
00055     if (use_ambigs_for_adaption) {
00056       ambigs_for_adaption_.push_back(NULL);
00057       reverse_ambigs_for_adaption_.push_back(NULL);
00058     }
00059   }
00060   if (debug_level) tprintf("Reading ambiguities\n");
00061 
00062   int TestAmbigPartSize;
00063   int ReplacementAmbigPartSize;
00064   // Maximum line size:
00065   //   10 for sizes of ambigs, tabs, abmig type and newline
00066   //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
00067   // The space for buffer is allocated on the heap to avoid
00068   // GCC frame size warning.
00069   const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
00070   const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
00071   char *buffer = new char[kBufferSize];
00072   char ReplacementString[kMaxAmbigStringSize];
00073   UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
00074   int line_num = 0;
00075   int type = NOT_AMBIG;
00076 
00077   // Determine the version of the ambigs file.
00078   int version = 0;
00079   ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
00080               strlen(buffer) > 0);
00081   if (*buffer == 'v') {
00082     version = static_cast<int>(strtol(buffer+1, NULL, 10));
00083     ++line_num;
00084   } else {
00085     rewind(AmbigFile);
00086   }
00087   while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
00088          fgets(buffer, kBufferSize, AmbigFile) != NULL) {
00089     chomp_string(buffer);
00090     if (debug_level > 2) tprintf("read line %s\n", buffer);
00091     ++line_num;
00092     if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
00093                             buffer, &TestAmbigPartSize, TestUnicharIds,
00094                             &ReplacementAmbigPartSize,
00095                             ReplacementString, &type)) continue;
00096     // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
00097     AmbigSpec *ambig_spec = new AmbigSpec();
00098     InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
00099                     TestAmbigPartSize, TestUnicharIds,
00100                     ReplacementAmbigPartSize, ReplacementString, type,
00101                     ambig_spec, unicharset);
00102 
00103     // Update one_to_one_definite_ambigs_.
00104     if (TestAmbigPartSize == 1 &&
00105         ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
00106       if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
00107         one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
00108       }
00109       one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
00110           ambig_spec->correct_ngram_id);
00111     }
00112     // Update ambigs_for_adaption_.
00113     if (use_ambigs_for_adaption) {
00114       for (i = 0; i < TestAmbigPartSize; ++i) {
00115         if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
00116           ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
00117         }
00118         adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
00119         const char *tmp_ptr = ReplacementString;
00120         const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
00121         int step = unicharset->step(tmp_ptr);
00122         while (step > 0) {
00123           UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
00124           ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
00125           // Add the new unichar id to adaption_ambigs_entry (only if the
00126           // vector does not already contain it) keeping it in sorted order.
00127           for (j = 0; j < adaption_ambigs_entry->size() &&
00128                (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
00129           if (j < adaption_ambigs_entry->size()) {
00130             if ((*adaption_ambigs_entry)[j] != id_to_insert) {
00131               adaption_ambigs_entry->insert(id_to_insert, j);
00132             }
00133           } else {
00134             adaption_ambigs_entry->push_back(id_to_insert);
00135           }
00136           // Update tmp_ptr and step.
00137           tmp_ptr += step;
00138           step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
00139         }
00140       }
00141     }
00142   }
00143   delete[] buffer;
00144 
00145   // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
00146   if (use_ambigs_for_adaption) {
00147     for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
00148       adaption_ambigs_entry = ambigs_for_adaption_[i];
00149       if (adaption_ambigs_entry == NULL) continue;
00150       for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
00151         UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
00152         if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
00153           reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
00154         }
00155         reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
00156       }
00157     }
00158   }
00159 
00160   // Print what was read from the input file.
00161   if (debug_level > 1) {
00162     for (int tbl = 0; tbl < 2; ++tbl) {
00163       const UnicharAmbigsVector &print_table =
00164         (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
00165       for (i = 0; i < print_table.size(); ++i) {
00166         AmbigSpec_LIST *lst = print_table[i];
00167         if (lst == NULL) continue;
00168         if (!lst->empty()) {
00169           tprintf("%s Ambiguities for %s:\n",
00170                   (tbl == 0) ? "Replaceable" : "Dangerous",
00171                   unicharset->debug_str(i).string());
00172         }
00173         AmbigSpec_IT lst_it(lst);
00174         for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
00175           AmbigSpec *ambig_spec = lst_it.data();
00176           tprintf("wrong_ngram:");
00177           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
00178           tprintf("correct_fragments:");
00179           UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
00180         }
00181       }
00182     }
00183     if (use_ambigs_for_adaption) {
00184       for (int vec_id = 0; vec_id < 2; ++vec_id) {
00185         const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
00186           ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
00187         for (i = 0; i < vec.size(); ++i) {
00188           adaption_ambigs_entry = vec[i];
00189           if (adaption_ambigs_entry != NULL) {
00190             tprintf("%sAmbigs for adaption for %s:\n",
00191                     (vec_id == 0) ? "" : "Reverse ",
00192                     unicharset->debug_str(i).string());
00193             for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
00194               tprintf("%s ", unicharset->debug_str(
00195                   (*adaption_ambigs_entry)[j]).string());
00196             }
00197             tprintf("\n");
00198           }
00199         }
00200       }
00201     }
00202   }
00203 }
00204 
00205 bool UnicharAmbigs::ParseAmbiguityLine(
00206     int line_num, int version, int debug_level, const UNICHARSET &unicharset,
00207     char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
00208     int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
00209   int i;
00210   char *token;
00211   char *next_token;
00212   if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
00213       !sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
00214     if (debug_level) tprintf(kIllegalMsg, line_num);
00215     return false;
00216   }
00217   if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
00218     tprintf("Too many unichars in ambiguity on line %d\n");
00219     return false;
00220   }
00221   for (i = 0; i < *TestAmbigPartSize; ++i) {
00222     if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
00223     if (!unicharset.contains_unichar(token)) {
00224       if (debug_level) tprintf(kIllegalUnicharMsg, token);
00225       break;
00226     }
00227     TestUnicharIds[i] = unicharset.unichar_to_id(token);
00228   }
00229   TestUnicharIds[i] = INVALID_UNICHAR_ID;
00230 
00231   if (i != *TestAmbigPartSize ||
00232       !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
00233       !sscanf(token, "%d", ReplacementAmbigPartSize) ||
00234         *ReplacementAmbigPartSize <= 0) {
00235     if (debug_level) tprintf(kIllegalMsg, line_num);
00236     return false;
00237   }
00238   if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
00239     tprintf("Too many unichars in ambiguity on line %d\n");
00240     return false;
00241   }
00242   ReplacementString[0] = '\0';
00243   for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
00244     if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
00245     strcat(ReplacementString, token);
00246     if (!unicharset.contains_unichar(token)) {
00247       if (debug_level) tprintf(kIllegalUnicharMsg, token);
00248       break;
00249     }
00250   }
00251   if (i != *ReplacementAmbigPartSize) {
00252     if (debug_level) tprintf(kIllegalMsg, line_num);
00253     return false;
00254   }
00255   if (version > 0) {
00256     // The next field being true indicates that the abiguity should
00257     // always be substituted (e.g. '' should always be changed to ").
00258     // For such "certain" n -> m ambigs tesseract will insert character
00259     // fragments for the n pieces in the unicharset. AmbigsFound()
00260     // will then replace the incorrect ngram with the character
00261     // fragments of the correct character (or ngram if m > 1).
00262     // Note that if m > 1, an ngram will be inserted into the
00263     // modified word, not the individual unigrams. Tesseract
00264     // has limited support for ngram unichar (e.g. dawg permuter).
00265     if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
00266         !sscanf(token, "%d", type)) {
00267       if (debug_level) tprintf(kIllegalMsg, line_num);
00268       return false;
00269     }
00270   }
00271   return true;
00272 }
00273 
00274 void UnicharAmbigs::InsertIntoTable(
00275     UnicharAmbigsVector &table, int TestAmbigPartSize,
00276     UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
00277     const char *ReplacementString, int type,
00278     AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
00279   ambig_spec->type = static_cast<AmbigType>(type);
00280   if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
00281       unicharset->to_lower(TestUnicharIds[0]) ==
00282       unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
00283     ambig_spec->type = CASE_AMBIG;
00284   }
00285 
00286   ambig_spec->wrong_ngram_size =
00287     UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
00288 
00289   // Since we need to maintain a constant number of unichar positions in
00290   // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
00291   // each n->m ambiguity we will have to place n character fragments of the
00292   // correct ngram into the corresponding positions in the vector (e.g. given
00293   // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and
00294   // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed
00295   // from fragments by dawg_permute_and_select().
00296 
00297   // Insert the corresponding correct ngram into the unicharset.
00298   // Unicharset code assumes that the "base" ngram is inserted into
00299   // the unicharset before fragments of this ngram are inserted.
00300   unicharset->unichar_insert(ReplacementString);
00301   ambig_spec->correct_ngram_id =
00302     unicharset->unichar_to_id(ReplacementString);
00303   if (ReplacementAmbigPartSize > 1) {
00304     unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
00305   }
00306   // Add the corresponding fragments of the wrong ngram to unicharset.
00307   int i;
00308   for (i = 0; i < TestAmbigPartSize; ++i) {
00309     UNICHAR_ID unichar_id;
00310     if (TestAmbigPartSize == 1) {
00311       unichar_id = ambig_spec->correct_ngram_id;
00312     } else {
00313       STRING frag_str = CHAR_FRAGMENT::to_string(
00314           ReplacementString, i, TestAmbigPartSize, false);
00315       unicharset->unichar_insert(frag_str.string());
00316       unichar_id = unicharset->unichar_to_id(frag_str.string());
00317     }
00318     ambig_spec->correct_fragments[i] = unichar_id;
00319   }
00320   ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID;
00321 
00322   // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
00323   // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
00324   if (table[TestUnicharIds[0]] == NULL) {
00325     table[TestUnicharIds[0]] = new AmbigSpec_LIST();
00326   }
00327   table[TestUnicharIds[0]]->add_sorted(
00328       AmbigSpec::compare_ambig_specs, false, ambig_spec);
00329 }
00330 
00331 }  // namespace tesseract