Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: werd.cpp (Formerly word.c) 00003 * Description: Code for the WERD class. 00004 * Author: Ray Smith 00005 * Created: Tue Oct 08 14:32:12 BST 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #include "blckerr.h" 00022 #include "helpers.h" 00023 #include "linlsq.h" 00024 #include "werd.h" 00025 00026 #define FIRST_COLOUR ScrollView::RED //< first rainbow colour 00027 #define LAST_COLOUR ScrollView::AQUAMARINE //< last rainbow colour 00028 #define CHILD_COLOUR ScrollView::BROWN //< colour of children 00029 00030 const ERRCODE CANT_SCALE_EDGESTEPS = 00031 "Attempted to scale an edgestep format word"; 00032 00033 ELIST2IZE(WERD) 00034 00035 00044 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text) 00045 : blanks(blank_count), 00046 flags(0), 00047 script_id_(0), 00048 correct(text) { 00049 C_BLOB_IT start_it = blob_list; 00050 C_BLOB_IT end_it = blob_list; 00051 C_BLOB_IT rej_cblob_it = &rej_cblobs; 00052 C_OUTLINE_IT c_outline_it; 00053 inT16 inverted_vote = 0; 00054 inT16 non_inverted_vote = 0; 00055 00056 // Move blob_list's elements into cblobs. 00057 while (!end_it.at_last()) 00058 end_it.forward(); 00059 cblobs.assign_to_sublist(&start_it, &end_it); 00060 00061 /* 00062 Set white on black flag for the WERD, moving any duff blobs onto the 00063 rej_cblobs list. 00064 First, walk the cblobs checking the inverse flag for each outline of each 00065 cblob. If a cblob has inconsistent flag settings for its different 00066 outlines, move the blob to the reject list. Otherwise, increment the 00067 appropriate w-on-b or b-on-w vote for the word. 00068 00069 Now set the inversion flag for the WERD by maximum vote. 00070 00071 Walk the blobs again, moving any blob whose inversion flag does not agree 00072 with the concencus onto the reject list. 00073 */ 00074 start_it.set_to_list(&cblobs); 00075 if (start_it.empty()) 00076 return; 00077 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00078 BOOL8 reject_blob = FALSE; 00079 BOOL8 blob_inverted; 00080 00081 c_outline_it.set_to_list(start_it.data()->out_list()); 00082 blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); 00083 for (c_outline_it.mark_cycle_pt(); 00084 !c_outline_it.cycled_list() && !reject_blob; 00085 c_outline_it.forward()) { 00086 reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; 00087 } 00088 if (reject_blob) { 00089 rej_cblob_it.add_after_then_move(start_it.extract()); 00090 } else { 00091 if (blob_inverted) 00092 inverted_vote++; 00093 else 00094 non_inverted_vote++; 00095 } 00096 } 00097 00098 flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote)); 00099 00100 start_it.set_to_list(&cblobs); 00101 if (start_it.empty()) 00102 return; 00103 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00104 c_outline_it.set_to_list(start_it.data()->out_list()); 00105 if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE)) 00106 rej_cblob_it.add_after_then_move(start_it.extract()); 00107 } 00108 } 00109 00110 00118 WERD::WERD(C_BLOB_LIST * blob_list, //< In word order 00119 WERD * clone) //< Source of flags 00120 : flags(clone->flags), 00121 script_id_(clone->script_id_), 00122 correct(clone->correct) { 00123 C_BLOB_IT start_it = blob_list; // iterator 00124 C_BLOB_IT end_it = blob_list; // another 00125 00126 while (!end_it.at_last ()) 00127 end_it.forward (); //move to last 00128 ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it); 00129 //move to our list 00130 blanks = clone->blanks; 00131 // fprintf(stderr,"Wrong constructor!!!!\n"); 00132 } 00133 00134 // Construct a WERD from a single_blob and clone the flags from this. 00135 // W_BOL and W_EOL flags are set according to the given values. 00136 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { 00137 C_BLOB_LIST temp_blobs; 00138 C_BLOB_IT temp_it(&temp_blobs); 00139 temp_it.add_after_then_move(blob); 00140 WERD* blob_word = new WERD(&temp_blobs, this); 00141 blob_word->set_flag(W_BOL, bol); 00142 blob_word->set_flag(W_EOL, eol); 00143 return blob_word; 00144 } 00145 00159 TBOX WERD::bounding_box() { 00160 TBOX box; // box being built 00161 C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs 00162 00163 for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list(); 00164 rej_cblob_it.forward()) { 00165 box += rej_cblob_it.data()->bounding_box(); 00166 } 00167 00168 C_BLOB_IT it = &cblobs; // blobs of WERD 00169 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00170 box += it.data()->bounding_box(); 00171 } 00172 return box; 00173 } 00174 00175 00183 void WERD::move(const ICOORD vec) { 00184 C_BLOB_IT cblob_it(&cblobs); // cblob iterator 00185 00186 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) 00187 cblob_it.data()->move(vec); 00188 } 00189 00196 void WERD::join_on(WERD* other) { 00197 C_BLOB_IT blob_it(&cblobs); 00198 C_BLOB_IT src_it(&other->cblobs); 00199 C_BLOB_IT rej_cblob_it(&rej_cblobs); 00200 C_BLOB_IT src_rej_it(&other->rej_cblobs); 00201 00202 while (!src_it.empty()) { 00203 blob_it.add_to_end(src_it.extract()); 00204 src_it.forward(); 00205 } 00206 while (!src_rej_it.empty()) { 00207 rej_cblob_it.add_to_end(src_rej_it.extract()); 00208 src_rej_it.forward(); 00209 } 00210 } 00211 00212 00219 void WERD::copy_on(WERD* other) { 00220 bool reversed = other->bounding_box().left() < bounding_box().left(); 00221 C_BLOB_IT c_blob_it(&cblobs); 00222 C_BLOB_LIST c_blobs; 00223 00224 c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); 00225 if (reversed) { 00226 c_blob_it.add_list_before(&c_blobs); 00227 } else { 00228 c_blob_it.move_to_last(); 00229 c_blob_it.add_list_after(&c_blobs); 00230 } 00231 if (!other->rej_cblobs.empty()) { 00232 C_BLOB_IT rej_c_blob_it(&rej_cblobs); 00233 C_BLOB_LIST new_rej_c_blobs; 00234 00235 new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); 00236 if (reversed) { 00237 rej_c_blob_it.add_list_before(&new_rej_c_blobs); 00238 } else { 00239 rej_c_blob_it.move_to_last(); 00240 rej_c_blob_it.add_list_after(&new_rej_c_blobs); 00241 } 00242 } 00243 } 00244 00251 void WERD::print() { 00252 tprintf("Blanks= %d\n", blanks); 00253 bounding_box().print(); 00254 tprintf("Flags = %d = 0%o\n", flags.val, flags.val); 00255 tprintf(" W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE "); 00256 tprintf(" W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE "); 00257 tprintf(" W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE "); 00258 tprintf(" W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE "); 00259 tprintf(" W_NORMALIZED = %s\n", 00260 flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE "); 00261 tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", 00262 flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE "); 00263 tprintf(" W_SCRIPT_IS_LATIN = %s\n", 00264 flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE "); 00265 tprintf(" W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE "); 00266 tprintf(" W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE "); 00267 tprintf(" W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE "); 00268 tprintf(" W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE "); 00269 tprintf("Correct= %s\n", correct.string()); 00270 tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); 00271 tprintf("Script = %d\n", script_id_); 00272 } 00273 00274 00281 #ifndef GRAPHICS_DISABLED 00282 void WERD::plot(ScrollView *window, ScrollView::Color colour) { 00283 C_BLOB_IT it = &cblobs; 00284 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00285 it.data()->plot(window, colour, colour); 00286 } 00287 plot_rej_blobs(window); 00288 } 00289 #endif // GRAPHICS_DISABLED 00290 00291 // Get the next color in the (looping) rainbow. 00292 ScrollView::Color WERD::NextColor(ScrollView::Color colour) { 00293 ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1); 00294 if (next >= LAST_COLOUR || next < FIRST_COLOUR) 00295 next = FIRST_COLOUR; 00296 return next; 00297 } 00298 00299 #ifndef GRAPHICS_DISABLED 00300 00306 void WERD::plot(ScrollView* window) { 00307 ScrollView::Color colour = FIRST_COLOUR; 00308 C_BLOB_IT it = &cblobs; 00309 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00310 it.data()->plot(window, colour, CHILD_COLOUR); 00311 colour = NextColor(colour); 00312 } 00313 plot_rej_blobs(window); 00314 } 00315 00316 00324 void WERD::plot_rej_blobs(ScrollView *window) { 00325 C_BLOB_IT it = &rej_cblobs; 00326 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00327 it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); 00328 } 00329 } 00330 #endif // GRAPHICS_DISABLED 00331 00332 00339 WERD *WERD::shallow_copy() { 00340 WERD *new_word = new WERD; 00341 00342 new_word->blanks = blanks; 00343 new_word->flags = flags; 00344 new_word->dummy = dummy; 00345 new_word->correct = correct; 00346 return new_word; 00347 } 00348 00349 00356 WERD & WERD::operator= (const WERD & source) { 00357 this->ELIST2_LINK::operator= (source); 00358 blanks = source.blanks; 00359 flags = source.flags; 00360 script_id_ = source.script_id_; 00361 dummy = source.dummy; 00362 correct = source.correct; 00363 if (!cblobs.empty()) 00364 cblobs.clear(); 00365 cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); 00366 00367 if (!rej_cblobs.empty()) 00368 rej_cblobs.clear(); 00369 rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); 00370 return *this; 00371 } 00372 00373 00381 int word_comparator(const void *word1p, const void *word2p) { 00382 WERD *word1 = *(WERD **)word1p; 00383 WERD *word2 = *(WERD **)word2p; 00384 return word1->bounding_box().left() - word2->bounding_box().left(); 00385 } 00386 00399 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, 00400 C_BLOB_LIST* orphan_blobs) { 00401 C_BLOB_LIST current_blob_list; 00402 C_BLOB_IT werd_blobs_it(¤t_blob_list); 00403 // Add the word's c_blobs. 00404 werd_blobs_it.add_list_after(cblob_list()); 00405 00406 // New blob list. These contain the blobs which will form the new word. 00407 C_BLOB_LIST new_werd_blobs; 00408 C_BLOB_IT new_blobs_it(&new_werd_blobs); 00409 00410 // not_found_blobs contains the list of current word's blobs for which a 00411 // corresponding blob wasn't found in the input all_blobs list. 00412 C_BLOB_LIST not_found_blobs; 00413 C_BLOB_IT not_found_it(¬_found_blobs); 00414 not_found_it.move_to_last(); 00415 00416 werd_blobs_it.move_to_first(); 00417 for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); 00418 werd_blobs_it.forward()) { 00419 C_BLOB* werd_blob = werd_blobs_it.extract(); 00420 TBOX werd_blob_box = werd_blob->bounding_box(); 00421 bool found = false; 00422 // Now find the corresponding blob for this blob in the all_blobs 00423 // list. For now, follow the inefficient method of pairwise 00424 // comparisons. Ideally, one can pre-bucket the blobs by row. 00425 C_BLOB_IT all_blobs_it(all_blobs); 00426 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); 00427 all_blobs_it.forward()) { 00428 C_BLOB* a_blob = all_blobs_it.data(); 00429 // Compute the overlap of the two blobs. If major, a_blob should 00430 // be added to the new blobs list. 00431 TBOX a_blob_box = a_blob->bounding_box(); 00432 if (a_blob_box.null_box()) { 00433 tprintf("Bounding box couldn't be ascertained\n"); 00434 } 00435 if (werd_blob_box.contains(a_blob_box) || 00436 werd_blob_box.major_overlap(a_blob_box)) { 00437 // Old blobs are from minimal splits, therefore are expected to be 00438 // bigger. The new small blobs should cover a significant portion. 00439 // This is it. 00440 all_blobs_it.extract(); 00441 new_blobs_it.add_after_then_move(a_blob); 00442 found = true; 00443 } 00444 } 00445 if (!found) { 00446 not_found_it.add_after_then_move(werd_blob); 00447 } else { 00448 delete werd_blob; 00449 } 00450 } 00451 // Iterate over all not found blobs. Some of them may be due to 00452 // under-segmentation (which is OK, since the corresponding blob is already 00453 // in the list in that case. 00454 not_found_it.move_to_first(); 00455 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); 00456 not_found_it.forward()) { 00457 C_BLOB* not_found = not_found_it.data(); 00458 TBOX not_found_box = not_found->bounding_box(); 00459 C_BLOB_IT existing_blobs_it(new_blobs_it); 00460 for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); 00461 existing_blobs_it.forward()) { 00462 C_BLOB* a_blob = existing_blobs_it.data(); 00463 TBOX a_blob_box = a_blob->bounding_box(); 00464 if ((not_found_box.major_overlap(a_blob_box) || 00465 a_blob_box.major_overlap(not_found_box)) && 00466 not_found_box.y_overlap(a_blob_box)) { 00467 // Already taken care of. 00468 delete not_found_it.extract(); 00469 break; 00470 } 00471 } 00472 } 00473 if (orphan_blobs) { 00474 C_BLOB_IT orphan_blobs_it(orphan_blobs); 00475 orphan_blobs_it.move_to_last(); 00476 orphan_blobs_it.add_list_after(¬_found_blobs); 00477 } 00478 00479 // New blobs are ready. Create a new werd object with these. 00480 WERD* new_werd = NULL; 00481 if (!new_werd_blobs.empty()) { 00482 new_werd = new WERD(&new_werd_blobs, this); 00483 } else { 00484 // Add the blobs back to this word so that it can be reused. 00485 C_BLOB_IT this_list_it(cblob_list()); 00486 this_list_it.add_list_after(¬_found_blobs); 00487 } 00488 return new_werd; 00489 }