Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: ratngs.cpp (Formerly ratings.c) 00003 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 13:23:29 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #include "ratngs.h" 00022 00023 #include "callcpp.h" 00024 #include "genericvector.h" 00025 #include "unicharset.h" 00026 00027 ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE); 00028 00029 const float WERD_CHOICE::kBadRating = 100000.0; 00030 00031 static const char kPermuterTypeNoPerm[] = "None"; 00032 static const char kPermuterTypePuncPerm[] = "Punctuation"; 00033 static const char kPermuterTypeTopPerm[] = "Top Choice"; 00034 static const char kPermuterTypeLowerPerm[] = "Top Lower Case"; 00035 static const char kPermuterTypeUpperPerm[] = "Top Upper Case"; 00036 static const char kPermuterTypeNgramPerm[] = "Ngram"; 00037 static const char kPermuterTypeNumberPerm[] = "Number"; 00038 static const char kPermuterTypeUserPatPerm[] = "User Pattern"; 00039 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary"; 00040 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary"; 00041 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary"; 00042 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary"; 00043 static const char kPermuterTypeCompoundPerm[] = "Compound"; 00044 00045 static const char * const kPermuterTypeNames[] = { 00046 kPermuterTypeNoPerm, // 0 00047 kPermuterTypePuncPerm, // 1 00048 kPermuterTypeTopPerm, // 2 00049 kPermuterTypeLowerPerm, // 3 00050 kPermuterTypeUpperPerm, // 4 00051 kPermuterTypeNgramPerm, // 5 00052 kPermuterTypeNumberPerm, // 6 00053 kPermuterTypeUserPatPerm, // 7 00054 kPermuterTypeSysDawgPerm, // 8 00055 kPermuterTypeDocDawgPerm, // 9 00056 kPermuterTypeUserDawgPerm, // 10 00057 kPermuterTypeFreqDawgPerm, // 11 00058 kPermuterTypeCompoundPerm // 12 00059 }; 00060 00066 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 00067 float src_rating, // rating 00068 float src_cert, // certainty 00069 inT16 src_fontinfo_id, // font 00070 inT16 src_fontinfo_id2, // 2nd choice font 00071 int src_script_id, // script 00072 inT16 min_xheight, // min xheight allowed 00073 inT16 max_xheight, // max xheight by this char 00074 bool adapted // adapted match or not 00075 ) { 00076 unichar_id_ = src_unichar_id; 00077 rating_ = src_rating; 00078 certainty_ = src_cert; 00079 fontinfo_id_ = src_fontinfo_id; 00080 fontinfo_id2_ = src_fontinfo_id2; 00081 script_id_ = src_script_id; 00082 language_model_state_ = NULL; 00083 min_xheight_ = min_xheight; 00084 max_xheight_ = max_xheight; 00085 adapted_ = adapted; 00086 } 00087 00093 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { 00094 unichar_id_ = other.unichar_id(); 00095 rating_ = other.rating(); 00096 certainty_ = other.certainty(); 00097 fontinfo_id_ = other.fontinfo_id(); 00098 fontinfo_id2_ = other.fontinfo_id2(); 00099 script_id_ = other.script_id(); 00100 language_model_state_ = NULL; 00101 min_xheight_ = other.min_xheight_; 00102 max_xheight_ = other.max_xheight_; 00103 adapted_ = other.adapted_; 00104 } 00105 00112 WERD_CHOICE::WERD_CHOICE(const char *src_string, 00113 const UNICHARSET &unicharset) 00114 : unicharset_(&unicharset){ 00115 STRING src_lengths; 00116 const char *ptr = src_string; 00117 const char *end = src_string + strlen(src_string); 00118 int step = unicharset.step(ptr); 00119 for (; ptr < end && step > 0; 00120 step = unicharset.step(ptr), src_lengths += step, ptr += step); 00121 if (step != 0 && ptr == end) { 00122 this->init(src_string, src_lengths.string(), 00123 0.0, 0.0, NO_PERM); 00124 } else { // there must have been an invalid unichar in the string 00125 this->init(8); 00126 this->make_bad(); 00127 } 00128 } 00129 00140 void WERD_CHOICE::init(const char *src_string, 00141 const char *src_lengths, 00142 float src_rating, 00143 float src_certainty, 00144 uinT8 src_permuter) { 00145 int src_string_len = strlen(src_string); 00146 if (src_string_len == 0) { 00147 this->init(8); 00148 } else { 00149 this->init(src_lengths ? strlen(src_lengths): src_string_len); 00150 length_ = reserved_; 00151 int offset = 0; 00152 for (int i = 0; i < length_; ++i) { 00153 int unichar_length = src_lengths ? src_lengths[i] : 1; 00154 unichar_ids_[i] = 00155 unicharset_->unichar_to_id(src_string+offset, unichar_length); 00156 fragment_lengths_[i] = 1; 00157 offset += unichar_length; 00158 } 00159 } 00160 rating_ = src_rating; 00161 certainty_ = src_certainty; 00162 permuter_ = src_permuter; 00163 } 00164 00168 WERD_CHOICE::~WERD_CHOICE() { 00169 delete[] unichar_ids_; 00170 delete[] fragment_lengths_; 00171 delete_blob_choices(); 00172 } 00173 00174 const char *WERD_CHOICE::permuter_name() const { 00175 return kPermuterTypeNames[permuter_]; 00176 } 00177 00184 void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) { 00185 if (blob_choices_ != blob_choices) { 00186 delete_blob_choices(); 00187 blob_choices_ = blob_choices; 00188 } 00189 } 00190 00191 00197 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { 00198 for (int i = 0; i < length_; ++i) { 00199 if (unichar_ids_[i] == unichar_id) { 00200 return true; 00201 } 00202 } 00203 return false; 00204 } 00205 00213 void WERD_CHOICE::remove_unichar_ids(int start, int num) { 00214 ASSERT_HOST(start >= 0 && start + num <= length_); 00215 for (int i = start; i+num < length_; ++i) { 00216 unichar_ids_[i] = unichar_ids_[i+num]; 00217 fragment_lengths_[i] = fragment_lengths_[i+num]; 00218 } 00219 length_ -= num; 00220 } 00221 00227 void WERD_CHOICE::reverse_and_mirror_unichar_ids() { 00228 for (int i = 0; i < length_/2; ++i) { 00229 UNICHAR_ID tmp_id = unichar_ids_[i]; 00230 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]); 00231 unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id); 00232 } 00233 if (length_ % 2 != 0) { 00234 unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]); 00235 } 00236 } 00237 00245 void WERD_CHOICE::punct_stripped(int *start, int *end) const { 00246 *start = 0; 00247 *end = length() - 1; 00248 while (*start < length() && 00249 unicharset()->get_ispunctuation(unichar_id(*start))) { 00250 (*start)++; 00251 } 00252 while (*end > -1 && 00253 unicharset()->get_ispunctuation(unichar_id(*end))) { 00254 (*end)--; 00255 } 00256 (*end)++; 00257 } 00258 00259 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const { 00260 ASSERT_HOST(start >= 0 && start <= length_); 00261 ASSERT_HOST(end >= 0 && end <= length_); 00262 if (end < start) { end = start; } 00263 WERD_CHOICE retval(unicharset_, end - start); 00264 for (int i = start; i < end; i++) { 00265 retval.append_unichar_id_space_allocated( 00266 unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f); 00267 } 00268 return retval; 00269 } 00270 00276 bool WERD_CHOICE::has_rtl_unichar_id() const { 00277 int i; 00278 for (i = 0; i < length_; ++i) { 00279 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); 00280 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00281 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { 00282 return true; 00283 } 00284 } 00285 return false; 00286 } 00287 00294 void WERD_CHOICE::string_and_lengths(STRING *word_str, 00295 STRING *word_lengths_str) const { 00296 *word_str = ""; 00297 if (word_lengths_str != NULL) *word_lengths_str = ""; 00298 for (int i = 0; i < length_; ++i) { 00299 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); 00300 *word_str += ch; 00301 if (word_lengths_str != NULL) { 00302 *word_lengths_str += strlen(ch); 00303 } 00304 } 00305 } 00306 00313 void WERD_CHOICE::append_unichar_id( 00314 UNICHAR_ID unichar_id, char fragment_length, 00315 float rating, float certainty) { 00316 if (length_ == reserved_) { 00317 this->double_the_size(); 00318 } 00319 this->append_unichar_id_space_allocated(unichar_id, fragment_length, 00320 rating, certainty); 00321 } 00322 00330 WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) { 00331 // TODO(daria): find out why the choice was cleared this way if any 00332 // of the pieces are empty. Add the description of this behavior 00333 // to the comments. 00334 // if (word_string.length () == 0 || second.word_string.length () == 0) { 00335 // word_string = NULL; //make it empty 00336 // word_lengths = NULL; 00337 // delete_blob_choices(); 00338 // } else { 00339 ASSERT_HOST(unicharset_ == second.unicharset_); 00340 while (reserved_ < length_ + second.length()) { 00341 this->double_the_size(); 00342 } 00343 const UNICHAR_ID *other_unichar_ids = second.unichar_ids(); 00344 const char *other_fragment_lengths = second.fragment_lengths(); 00345 for (int i = 0; i < second.length(); ++i) { 00346 unichar_ids_[length_ + i] = other_unichar_ids[i]; 00347 fragment_lengths_[length_ + i] = other_fragment_lengths[i]; 00348 } 00349 length_ += second.length(); 00350 rating_ += second.rating(); // add ratings 00351 if (second.certainty() < certainty_) // take min 00352 certainty_ = second.certainty(); 00353 if (permuter_ == NO_PERM) { 00354 permuter_ = second.permuter(); 00355 } else if (second.permuter() != NO_PERM && 00356 second.permuter() != permuter_) { 00357 permuter_ = COMPOUND_PERM; 00358 } 00359 00360 // Append a deep copy of second blob_choices if it exists. 00361 if (second.blob_choices_ != NULL) { 00362 if (this->blob_choices_ == NULL) 00363 this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST; 00364 00365 BLOB_CHOICE_LIST_C_IT this_blob_choices_it; 00366 BLOB_CHOICE_LIST_C_IT second_blob_choices_it; 00367 00368 this_blob_choices_it.set_to_list(this->blob_choices_); 00369 this_blob_choices_it.move_to_last(); 00370 00371 second_blob_choices_it.set_to_list(second.blob_choices_); 00372 00373 for (second_blob_choices_it.mark_cycle_pt(); 00374 !second_blob_choices_it.cycled_list(); 00375 second_blob_choices_it.forward()) { 00376 00377 BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST(); 00378 blob_choices_copy->deep_copy(second_blob_choices_it.data(), 00379 &BLOB_CHOICE::deep_copy); 00380 00381 this_blob_choices_it.add_after_then_move(blob_choices_copy); 00382 } 00383 } 00384 return *this; 00385 } 00386 00387 00394 WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) { 00395 while (reserved_ < source.length()) { 00396 this->double_the_size(); 00397 } 00398 00399 unicharset_ = source.unicharset_; 00400 const UNICHAR_ID *other_unichar_ids = source.unichar_ids(); 00401 const char *other_fragment_lengths = source.fragment_lengths(); 00402 for (int i = 0; i < source.length(); ++i) { 00403 unichar_ids_[i] = other_unichar_ids[i]; 00404 fragment_lengths_[i] = other_fragment_lengths[i]; 00405 } 00406 length_ = source.length(); 00407 rating_ = source.rating(); 00408 certainty_ = source.certainty(); 00409 permuter_ = source.permuter(); 00410 fragment_mark_ = source.fragment_mark(); 00411 00412 // Delete existing blob_choices 00413 this->delete_blob_choices(); 00414 00415 // Deep copy blob_choices of source 00416 if (source.blob_choices_ != NULL) { 00417 BLOB_CHOICE_LIST_C_IT this_blob_choices_it; 00418 BLOB_CHOICE_LIST_C_IT source_blob_choices_it; 00419 00420 this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST(); 00421 00422 this_blob_choices_it.set_to_list(this->blob_choices_); 00423 source_blob_choices_it.set_to_list(source.blob_choices_); 00424 00425 for (source_blob_choices_it.mark_cycle_pt(); 00426 !source_blob_choices_it.cycled_list(); 00427 source_blob_choices_it.forward()) { 00428 00429 BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST(); 00430 blob_choices_copy->deep_copy(source_blob_choices_it.data(), 00431 &BLOB_CHOICE::deep_copy); 00432 00433 this_blob_choices_it.add_after_then_move(blob_choices_copy); 00434 } 00435 } 00436 return *this; 00437 } 00438 00439 /********************************************************************** 00440 * WERD_CHOICE::delete_blob_choices 00441 * 00442 * Clear the blob_choices list, delete it and set it to NULL. 00443 **********************************************************************/ 00444 void WERD_CHOICE::delete_blob_choices() { 00445 if (blob_choices_ != NULL) { 00446 blob_choices_->deep_clear(); 00447 delete blob_choices_; 00448 blob_choices_ = NULL; 00449 } 00450 } 00451 00457 const void WERD_CHOICE::print(const char *msg) const { 00458 tprintf("%s WERD_CHOICE:\n", msg); 00459 tprintf("length_ %d reserved_ %d permuter_ %d\n", 00460 length_, reserved_, permuter_); 00461 tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_); 00462 if (fragment_mark_) { 00463 tprintf(" fragment_mark_ true"); 00464 } 00465 tprintf("\n"); 00466 if (unichar_string_.length() > 0) { 00467 tprintf("unichar_string_ %s unichar_lengths_ %s\n", 00468 unichar_string_.string(), unichar_lengths_.string()); 00469 } 00470 tprintf("unichar_ids: "); 00471 int i; 00472 for (i = 0; i < length_; ++i) { 00473 tprintf("%d ", unichar_ids_[i]); 00474 } 00475 tprintf("\nfragment_lengths_: "); 00476 for (i = 0; i < length_; ++i) { 00477 tprintf("%d ", fragment_lengths_[i]); 00478 } 00479 tprintf("\n"); 00480 fflush(stdout); 00481 } 00482 00483 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, 00484 const WERD_CHOICE &word2) { 00485 const UNICHARSET *uchset = word1.unicharset(); 00486 if (word2.unicharset() != uchset) return false; 00487 int w1start, w1end; 00488 word1.punct_stripped(&w1start, &w1end); 00489 int w2start, w2end; 00490 word2.punct_stripped(&w2start, &w2end); 00491 if (w1end - w1start != w2end - w2start) return false; 00492 for (int i = 0; i < w1end - w1start; i++) { 00493 if (uchset->to_lower(word1.unichar_id(w1start + i)) != 00494 uchset->to_lower(word2.unichar_id(w2start + i))) { 00495 return false; 00496 } 00497 } 00498 return true; 00499 } 00500 00511 void print_ratings_list(const char *msg, 00512 BLOB_CHOICE_LIST *ratings, 00513 const UNICHARSET ¤t_unicharset) { 00514 if (ratings->length() == 0) { 00515 tprintf("%s:<none>\n", msg); 00516 return; 00517 } 00518 if (*msg != '\0') { 00519 tprintf("%s\n", msg); 00520 } 00521 BLOB_CHOICE_IT c_it; 00522 c_it.set_to_list(ratings); 00523 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00524 c_it.data()->print(¤t_unicharset); 00525 if (!c_it.at_last()) tprintf("\n"); 00526 } 00527 tprintf("\n"); 00528 fflush(stdout); 00529 } 00530 00536 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) { 00537 if (ratings->length() == 0) { 00538 tprintf("%s:<none>\n", msg); 00539 return; 00540 } 00541 if (*msg != '\0') { 00542 tprintf("%s\n", msg); 00543 } 00544 BLOB_CHOICE_IT c_it; 00545 c_it.set_to_list(ratings); 00546 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00547 c_it.data()->print(NULL); 00548 if (!c_it.at_last()) tprintf("\n"); 00549 } 00550 tprintf("\n"); 00551 fflush(stdout); 00552 } 00553 00564 void print_ratings_info(FILE *fp, 00565 BLOB_CHOICE_LIST *ratings, 00566 const UNICHARSET ¤t_unicharset) { 00567 inT32 index; // to list 00568 const char* first_char = NULL; // character 00569 FLOAT32 first_rat; // rating 00570 FLOAT32 first_cert; // certainty 00571 const char* sec_char = NULL; // character 00572 FLOAT32 sec_rat = 0.0f; // rating 00573 FLOAT32 sec_cert = 0.0f; // certainty 00574 BLOB_CHOICE_IT c_it = ratings; // iterator 00575 00576 index = ratings->length(); 00577 if (index > 0) { 00578 first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id()); 00579 first_rat = c_it.data()->rating(); 00580 first_cert = -c_it.data()->certainty(); 00581 if (index > 1) { 00582 sec_char = current_unicharset.id_to_unichar( 00583 c_it.data_relative(1)->unichar_id()); 00584 sec_rat = c_it.data_relative(1)->rating(); 00585 sec_cert = -c_it.data_relative(1)->certainty(); 00586 } else { 00587 sec_char = NULL; 00588 sec_rat = -1; 00589 sec_cert = -1; 00590 } 00591 } else { 00592 first_char = NULL; 00593 first_rat = -1; 00594 first_cert = -1; 00595 } 00596 if (first_char != NULL && (*first_char == '\0' || *first_char == ' ')) 00597 first_char = NULL; 00598 if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' ')) 00599 sec_char = NULL; 00600 tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n", 00601 ratings->length(), 00602 first_char != NULL ? first_char : "~", 00603 first_rat, first_cert, sec_char != NULL ? sec_char : "~", 00604 sec_rat, sec_cert); 00605 } 00606 00610 void print_char_choices_list(const char *msg, 00611 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00612 const UNICHARSET ¤t_unicharset, 00613 BOOL8 detailed) { 00614 if (*msg != '\0') tprintf("%s\n", msg); 00615 for (int x = 0; x < char_choices.length(); ++x) { 00616 BLOB_CHOICE_IT c_it; 00617 c_it.set_to_list(char_choices.get(x)); 00618 tprintf("\nchar[%d]: %s\n", x, 00619 current_unicharset.debug_str( c_it.data()->unichar_id()).string()); 00620 if (detailed) 00621 print_ratings_list("", char_choices.get(x), current_unicharset); 00622 } 00623 } 00624 00628 void print_word_alternates_list( 00629 WERD_CHOICE *word, 00630 GenericVector<WERD_CHOICE *> *alternates) { 00631 if (!word || !alternates) return; 00632 00633 STRING alternates_str; 00634 for (int i = 0; i < alternates->size(); i++) { 00635 if (i > 0) alternates_str += "\", \""; 00636 alternates_str += alternates->get(i)->unichar_string(); 00637 } 00638 tprintf("Alternates for \"%s\": {\"%s\"}\n", 00639 word->unichar_string().string(), alternates_str.string()); 00640 }