Tesseract  3.02
tesseract-ocr/ccstruct/ratngs.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File: ratngs.cpp  (Formerly ratings.c)
00003  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
00004  * Author: Ray Smith
00005  * Created: Thu Apr 23 13:23:29 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 #include "ratngs.h"
00022 
00023 #include "callcpp.h"
00024 #include "genericvector.h"
00025 #include "unicharset.h"
00026 
00027 ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE);
00028 
00029 const float WERD_CHOICE::kBadRating = 100000.0;
00030 
// Human-readable permuter names.  The table at the bottom is indexed by a
// WERD_CHOICE's permuter_ value (see WERD_CHOICE::permuter_name()), so the
// order of its entries must stay in step with the numeric permuter codes
// noted beside each entry.
namespace {

const char kPermuterTypeNoPerm[] = "None";
const char kPermuterTypePuncPerm[] = "Punctuation";
const char kPermuterTypeTopPerm[] = "Top Choice";
const char kPermuterTypeLowerPerm[] = "Top Lower Case";
const char kPermuterTypeUpperPerm[] = "Top Upper Case";
const char kPermuterTypeNgramPerm[] = "Ngram";
const char kPermuterTypeNumberPerm[] = "Number";
const char kPermuterTypeUserPatPerm[] = "User Pattern";
const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
const char kPermuterTypeCompoundPerm[] = "Compound";

// Lookup table: permuter code -> display name.
const char *const kPermuterTypeNames[] = {
  kPermuterTypeNoPerm,        // 0
  kPermuterTypePuncPerm,      // 1
  kPermuterTypeTopPerm,       // 2
  kPermuterTypeLowerPerm,     // 3
  kPermuterTypeUpperPerm,     // 4
  kPermuterTypeNgramPerm,     // 5
  kPermuterTypeNumberPerm,    // 6
  kPermuterTypeUserPatPerm,   // 7
  kPermuterTypeSysDawgPerm,   // 8
  kPermuterTypeDocDawgPerm,   // 9
  kPermuterTypeUserDawgPerm,  // 10
  kPermuterTypeFreqDawgPerm,  // 11
  kPermuterTypeCompoundPerm   // 12
};

}  // namespace
00060 
00066 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
00067                          float src_rating,         // rating
00068                          float src_cert,           // certainty
00069                          inT16 src_fontinfo_id,     // font
00070                          inT16 src_fontinfo_id2,    // 2nd choice font
00071                          int src_script_id,        // script
00072                          inT16 min_xheight,        // min xheight allowed
00073                          inT16 max_xheight,        // max xheight by this char
00074                          bool adapted              // adapted match or not
00075                         ) {
00076   unichar_id_ = src_unichar_id;
00077   rating_ = src_rating;
00078   certainty_ = src_cert;
00079   fontinfo_id_ = src_fontinfo_id;
00080   fontinfo_id2_ = src_fontinfo_id2;
00081   script_id_ = src_script_id;
00082   language_model_state_ = NULL;
00083   min_xheight_ = min_xheight;
00084   max_xheight_ = max_xheight;
00085   adapted_ = adapted;
00086 }
00087 
00093 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
00094   unichar_id_ = other.unichar_id();
00095   rating_ = other.rating();
00096   certainty_ = other.certainty();
00097   fontinfo_id_ = other.fontinfo_id();
00098   fontinfo_id2_ = other.fontinfo_id2();
00099   script_id_ = other.script_id();
00100   language_model_state_ = NULL;
00101   min_xheight_ = other.min_xheight_;
00102   max_xheight_ = other.max_xheight_;
00103   adapted_ = other.adapted_;
00104 }
00105 
00112 WERD_CHOICE::WERD_CHOICE(const char *src_string,
00113                          const UNICHARSET &unicharset)
00114     : unicharset_(&unicharset){
00115   STRING src_lengths;
00116   const char *ptr = src_string;
00117   const char *end = src_string + strlen(src_string);
00118   int step = unicharset.step(ptr);
00119   for (; ptr < end && step > 0;
00120        step = unicharset.step(ptr), src_lengths += step, ptr += step);
00121   if (step != 0 && ptr == end) {
00122     this->init(src_string, src_lengths.string(),
00123                0.0, 0.0, NO_PERM);
00124   } else {  // there must have been an invalid unichar in the string
00125     this->init(8);
00126     this->make_bad();
00127   }
00128 }
00129 
00140 void WERD_CHOICE::init(const char *src_string,
00141                        const char *src_lengths,
00142                        float src_rating,
00143                        float src_certainty,
00144                        uinT8 src_permuter) {
00145   int src_string_len = strlen(src_string);
00146   if (src_string_len == 0) {
00147     this->init(8);
00148   } else {
00149     this->init(src_lengths ? strlen(src_lengths): src_string_len);
00150     length_ = reserved_;
00151     int offset = 0;
00152     for (int i = 0; i < length_; ++i) {
00153       int unichar_length = src_lengths ? src_lengths[i] : 1;
00154       unichar_ids_[i] =
00155           unicharset_->unichar_to_id(src_string+offset, unichar_length);
00156       fragment_lengths_[i] = 1;
00157       offset += unichar_length;
00158     }
00159   }
00160   rating_ = src_rating;
00161   certainty_ = src_certainty;
00162   permuter_ = src_permuter;
00163 }
00164 
00168 WERD_CHOICE::~WERD_CHOICE() {
00169   delete[] unichar_ids_;
00170   delete[] fragment_lengths_;
00171   delete_blob_choices();
00172 }
00173 
00174 const char *WERD_CHOICE::permuter_name() const {
00175   return kPermuterTypeNames[permuter_];
00176 }
00177 
00184 void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) {
00185   if (blob_choices_ != blob_choices) {
00186     delete_blob_choices();
00187     blob_choices_ = blob_choices;
00188   }
00189 }
00190 
00191 
00197 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
00198   for (int i = 0; i < length_; ++i) {
00199     if (unichar_ids_[i] == unichar_id) {
00200       return true;
00201     }
00202   }
00203   return false;
00204 }
00205 
00213 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
00214   ASSERT_HOST(start >= 0 && start + num <= length_);
00215   for (int i = start; i+num < length_; ++i) {
00216     unichar_ids_[i] = unichar_ids_[i+num];
00217     fragment_lengths_[i] = fragment_lengths_[i+num];
00218   }
00219   length_ -= num;
00220 }
00221 
00227 void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
00228   for (int i = 0; i < length_/2; ++i) {
00229     UNICHAR_ID tmp_id = unichar_ids_[i];
00230     unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
00231     unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
00232   }
00233   if (length_ % 2 != 0) {
00234     unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
00235   }
00236 }
00237 
00245 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
00246   *start = 0;
00247   *end = length() - 1;
00248   while (*start < length() &&
00249          unicharset()->get_ispunctuation(unichar_id(*start))) {
00250     (*start)++;
00251   }
00252   while (*end > -1 &&
00253          unicharset()->get_ispunctuation(unichar_id(*end))) {
00254     (*end)--;
00255   }
00256   (*end)++;
00257 }
00258 
00259 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
00260   ASSERT_HOST(start >= 0 && start <= length_);
00261   ASSERT_HOST(end >= 0 && end <= length_);
00262   if (end < start) { end = start; }
00263   WERD_CHOICE retval(unicharset_, end - start);
00264   for (int i = start; i < end; i++) {
00265     retval.append_unichar_id_space_allocated(
00266         unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f);
00267   }
00268   return retval;
00269 }
00270 
00276 bool WERD_CHOICE::has_rtl_unichar_id() const {
00277   int i;
00278   for (i = 0; i < length_; ++i) {
00279     UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
00280     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00281         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
00282       return true;
00283     }
00284   }
00285   return false;
00286 }
00287 
00294 void WERD_CHOICE::string_and_lengths(STRING *word_str,
00295                                      STRING *word_lengths_str) const {
00296   *word_str = "";
00297   if (word_lengths_str != NULL) *word_lengths_str = "";
00298   for (int i = 0; i < length_; ++i) {
00299     const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
00300     *word_str += ch;
00301     if (word_lengths_str != NULL) {
00302       *word_lengths_str += strlen(ch);
00303     }
00304   }
00305 }
00306 
00313 void WERD_CHOICE::append_unichar_id(
00314     UNICHAR_ID unichar_id, char fragment_length,
00315     float rating, float certainty) {
00316   if (length_ == reserved_) {
00317     this->double_the_size();
00318   }
00319   this->append_unichar_id_space_allocated(unichar_id, fragment_length,
00320                                           rating, certainty);
00321 }
00322 
00330 WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
00331   // TODO(daria): find out why the choice was cleared this way if any
00332   // of the pieces are empty. Add the description of this behavior
00333   // to the comments.
00334   // if (word_string.length () == 0 || second.word_string.length () == 0) {
00335   //   word_string = NULL;          //make it empty
00336   //   word_lengths = NULL;
00337   //   delete_blob_choices();
00338   // } else {
00339   ASSERT_HOST(unicharset_ == second.unicharset_);
00340   while (reserved_ < length_ + second.length()) {
00341     this->double_the_size();
00342   }
00343   const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
00344   const char *other_fragment_lengths = second.fragment_lengths();
00345   for (int i = 0; i < second.length(); ++i) {
00346     unichar_ids_[length_ + i] = other_unichar_ids[i];
00347     fragment_lengths_[length_ + i] = other_fragment_lengths[i];
00348   }
00349   length_ += second.length();
00350   rating_ += second.rating();  // add ratings
00351   if (second.certainty() < certainty_) // take min
00352     certainty_ = second.certainty();
00353   if (permuter_ == NO_PERM) {
00354     permuter_ = second.permuter();
00355   } else if (second.permuter() != NO_PERM &&
00356              second.permuter() != permuter_) {
00357     permuter_ = COMPOUND_PERM;
00358   }
00359 
00360   // Append a deep copy of second blob_choices if it exists.
00361   if (second.blob_choices_ != NULL) {
00362     if (this->blob_choices_ == NULL)
00363       this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST;
00364 
00365     BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
00366     BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
00367 
00368     this_blob_choices_it.set_to_list(this->blob_choices_);
00369     this_blob_choices_it.move_to_last();
00370 
00371     second_blob_choices_it.set_to_list(second.blob_choices_);
00372 
00373     for (second_blob_choices_it.mark_cycle_pt();
00374          !second_blob_choices_it.cycled_list();
00375          second_blob_choices_it.forward()) {
00376 
00377       BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
00378       blob_choices_copy->deep_copy(second_blob_choices_it.data(),
00379                                    &BLOB_CHOICE::deep_copy);
00380 
00381       this_blob_choices_it.add_after_then_move(blob_choices_copy);
00382     }
00383   }
00384   return *this;
00385 }
00386 
00387 
00394 WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
00395   while (reserved_ < source.length()) {
00396     this->double_the_size();
00397   }
00398 
00399   unicharset_ = source.unicharset_;
00400   const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
00401   const char *other_fragment_lengths = source.fragment_lengths();
00402   for (int i = 0; i < source.length(); ++i) {
00403     unichar_ids_[i] = other_unichar_ids[i];
00404     fragment_lengths_[i] = other_fragment_lengths[i];
00405   }
00406   length_ = source.length();
00407   rating_ = source.rating();
00408   certainty_ = source.certainty();
00409   permuter_ = source.permuter();
00410   fragment_mark_ = source.fragment_mark();
00411 
00412   // Delete existing blob_choices
00413   this->delete_blob_choices();
00414 
00415   // Deep copy blob_choices of source
00416   if (source.blob_choices_ != NULL) {
00417     BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
00418     BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
00419 
00420     this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST();
00421 
00422     this_blob_choices_it.set_to_list(this->blob_choices_);
00423     source_blob_choices_it.set_to_list(source.blob_choices_);
00424 
00425     for (source_blob_choices_it.mark_cycle_pt();
00426          !source_blob_choices_it.cycled_list();
00427          source_blob_choices_it.forward()) {
00428 
00429       BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
00430       blob_choices_copy->deep_copy(source_blob_choices_it.data(),
00431                                    &BLOB_CHOICE::deep_copy);
00432 
00433       this_blob_choices_it.add_after_then_move(blob_choices_copy);
00434     }
00435   }
00436   return *this;
00437 }
00438 
00439 /**********************************************************************
00440  * WERD_CHOICE::delete_blob_choices
00441  *
00442  * Clear the blob_choices list, delete it and set it to NULL.
00443  **********************************************************************/
00444 void WERD_CHOICE::delete_blob_choices() {
00445   if (blob_choices_ != NULL) {
00446     blob_choices_->deep_clear();
00447     delete blob_choices_;
00448     blob_choices_ = NULL;
00449   }
00450 }
00451 
00457 const void WERD_CHOICE::print(const char *msg) const {
00458   tprintf("%s WERD_CHOICE:\n", msg);
00459   tprintf("length_ %d reserved_ %d permuter_ %d\n",
00460          length_, reserved_, permuter_);
00461   tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_);
00462   if (fragment_mark_) {
00463     tprintf(" fragment_mark_ true");
00464   }
00465   tprintf("\n");
00466   if (unichar_string_.length() > 0) {
00467     tprintf("unichar_string_ %s unichar_lengths_ %s\n",
00468             unichar_string_.string(), unichar_lengths_.string());
00469   }
00470   tprintf("unichar_ids: ");
00471   int i;
00472   for (i = 0; i < length_; ++i) {
00473     tprintf("%d ", unichar_ids_[i]);
00474   }
00475   tprintf("\nfragment_lengths_: ");
00476   for (i = 0; i < length_; ++i) {
00477     tprintf("%d ", fragment_lengths_[i]);
00478   }
00479   tprintf("\n");
00480   fflush(stdout);
00481 }
00482 
00483 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
00484                                        const WERD_CHOICE &word2) {
00485   const UNICHARSET *uchset = word1.unicharset();
00486   if (word2.unicharset() != uchset) return false;
00487   int w1start, w1end;
00488   word1.punct_stripped(&w1start, &w1end);
00489   int w2start, w2end;
00490   word2.punct_stripped(&w2start, &w2end);
00491   if (w1end - w1start != w2end - w2start) return false;
00492   for (int i = 0; i < w1end - w1start; i++) {
00493     if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
00494         uchset->to_lower(word2.unichar_id(w2start + i))) {
00495         return false;
00496     }
00497   }
00498   return true;
00499 }
00500 
00511 void print_ratings_list(const char *msg,
00512                         BLOB_CHOICE_LIST *ratings,
00513                         const UNICHARSET &current_unicharset) {
00514   if (ratings->length() == 0) {
00515     tprintf("%s:<none>\n", msg);
00516     return;
00517   }
00518   if (*msg != '\0') {
00519     tprintf("%s\n", msg);
00520   }
00521   BLOB_CHOICE_IT c_it;
00522   c_it.set_to_list(ratings);
00523   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00524     c_it.data()->print(&current_unicharset);
00525     if (!c_it.at_last()) tprintf("\n");
00526   }
00527   tprintf("\n");
00528   fflush(stdout);
00529 }
00530 
00536 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) {
00537   if (ratings->length() == 0) {
00538     tprintf("%s:<none>\n", msg);
00539     return;
00540   }
00541   if (*msg != '\0') {
00542     tprintf("%s\n", msg);
00543   }
00544   BLOB_CHOICE_IT c_it;
00545   c_it.set_to_list(ratings);
00546   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00547     c_it.data()->print(NULL);
00548     if (!c_it.at_last()) tprintf("\n");
00549   }
00550   tprintf("\n");
00551   fflush(stdout);
00552 }
00553 
00564 void print_ratings_info(FILE *fp,
00565                         BLOB_CHOICE_LIST *ratings,
00566                         const UNICHARSET &current_unicharset) {
00567   inT32 index;                    // to list
00568   const char* first_char = NULL;  // character
00569   FLOAT32 first_rat;              // rating
00570   FLOAT32 first_cert;             // certainty
00571   const char* sec_char = NULL;    // character
00572   FLOAT32 sec_rat = 0.0f;         // rating
00573   FLOAT32 sec_cert = 0.0f;        // certainty
00574   BLOB_CHOICE_IT c_it = ratings;  // iterator
00575 
00576   index = ratings->length();
00577   if (index > 0) {
00578     first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
00579     first_rat = c_it.data()->rating();
00580     first_cert = -c_it.data()->certainty();
00581     if (index > 1) {
00582       sec_char = current_unicharset.id_to_unichar(
00583           c_it.data_relative(1)->unichar_id());
00584       sec_rat = c_it.data_relative(1)->rating();
00585       sec_cert = -c_it.data_relative(1)->certainty();
00586     } else {
00587       sec_char = NULL;
00588       sec_rat = -1;
00589       sec_cert = -1;
00590     }
00591   } else {
00592     first_char = NULL;
00593     first_rat = -1;
00594     first_cert = -1;
00595   }
00596   if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
00597     first_char = NULL;
00598   if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
00599     sec_char = NULL;
00600   tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
00601           ratings->length(),
00602           first_char != NULL ? first_char : "~",
00603           first_rat, first_cert, sec_char != NULL ? sec_char : "~",
00604           sec_rat, sec_cert);
00605 }
00606 
00610 void print_char_choices_list(const char *msg,
00611                              const BLOB_CHOICE_LIST_VECTOR &char_choices,
00612                              const UNICHARSET &current_unicharset,
00613                              BOOL8 detailed) {
00614   if (*msg != '\0') tprintf("%s\n", msg);
00615   for (int x = 0; x < char_choices.length(); ++x) {
00616     BLOB_CHOICE_IT c_it;
00617     c_it.set_to_list(char_choices.get(x));
00618     tprintf("\nchar[%d]: %s\n", x,
00619             current_unicharset.debug_str( c_it.data()->unichar_id()).string());
00620     if (detailed)
00621       print_ratings_list("", char_choices.get(x), current_unicharset);
00622   }
00623 }
00624 
00628 void print_word_alternates_list(
00629     WERD_CHOICE *word,
00630     GenericVector<WERD_CHOICE *> *alternates) {
00631   if (!word || !alternates) return;
00632 
00633   STRING alternates_str;
00634   for (int i = 0; i < alternates->size(); i++) {
00635     if (i > 0) alternates_str += "\", \"";
00636     alternates_str += alternates->get(i)->unichar_string();
00637   }
00638   tprintf("Alternates for \"%s\": {\"%s\"}\n",
00639           word->unichar_string().string(), alternates_str.string());
00640 }