Tesseract  3.02
tesseract-ocr/ccstruct/werd.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        werd.cpp  (Formerly word.c)
00003  * Description: Code for the WERD class.
00004  * Author:      Ray Smith
00005  * Created:     Tue Oct 08 14:32:12 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 #include "blckerr.h"
00022 #include "helpers.h"
00023 #include "linlsq.h"
00024 #include "werd.h"
00025 
00026 #define FIRST_COLOUR    ScrollView::RED         //< first rainbow colour
00027 #define LAST_COLOUR     ScrollView::AQUAMARINE  //< last rainbow colour
00028 #define CHILD_COLOUR    ScrollView::BROWN       //< colour of children
00029 
00030 const ERRCODE CANT_SCALE_EDGESTEPS =
00031     "Attempted to scale an edgestep format word";
00032 
00033 ELIST2IZE(WERD)
00034 
00035 
00044 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
00045   : blanks(blank_count),
00046     flags(0),
00047     script_id_(0),
00048     correct(text) {
00049   C_BLOB_IT start_it = blob_list;
00050   C_BLOB_IT end_it = blob_list;
00051   C_BLOB_IT rej_cblob_it = &rej_cblobs;
00052   C_OUTLINE_IT c_outline_it;
00053   inT16 inverted_vote = 0;
00054   inT16 non_inverted_vote = 0;
00055 
00056   // Move blob_list's elements into cblobs.
00057   while (!end_it.at_last())
00058     end_it.forward();
00059   cblobs.assign_to_sublist(&start_it, &end_it);
00060 
00061   /*
00062     Set white on black flag for the WERD, moving any duff blobs onto the
00063     rej_cblobs list.
00064     First, walk the cblobs checking the inverse flag for each outline of each
00065     cblob. If a cblob has inconsistent flag settings for its different
00066     outlines, move the blob to the reject list. Otherwise, increment the
00067     appropriate w-on-b or b-on-w vote for the word.
00068 
00069     Now set the inversion flag for the WERD by maximum vote.
00070 
00071     Walk the blobs again, moving any blob whose inversion flag does not agree
00072     with the concencus onto the reject list.
00073   */
00074   start_it.set_to_list(&cblobs);
00075   if (start_it.empty())
00076     return;
00077   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00078     BOOL8 reject_blob = FALSE;
00079     BOOL8 blob_inverted;
00080 
00081     c_outline_it.set_to_list(start_it.data()->out_list());
00082     blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
00083     for (c_outline_it.mark_cycle_pt();
00084          !c_outline_it.cycled_list() && !reject_blob;
00085          c_outline_it.forward()) {
00086       reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
00087     }
00088     if (reject_blob) {
00089       rej_cblob_it.add_after_then_move(start_it.extract());
00090     } else {
00091       if (blob_inverted)
00092         inverted_vote++;
00093       else
00094         non_inverted_vote++;
00095     }
00096   }
00097 
00098   flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
00099 
00100   start_it.set_to_list(&cblobs);
00101   if (start_it.empty())
00102     return;
00103   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00104     c_outline_it.set_to_list(start_it.data()->out_list());
00105     if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
00106       rej_cblob_it.add_after_then_move(start_it.extract());
00107   }
00108 }
00109 
00110 
00118 WERD::WERD(C_BLOB_LIST * blob_list,         //< In word order
00119            WERD * clone)                    //< Source of flags
00120   : flags(clone->flags),
00121     script_id_(clone->script_id_),
00122     correct(clone->correct) {
00123   C_BLOB_IT start_it = blob_list;  // iterator
00124   C_BLOB_IT end_it = blob_list;    // another
00125 
00126   while (!end_it.at_last ())
00127     end_it.forward ();           //move to last
00128   ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
00129   //move to our list
00130   blanks = clone->blanks;
00131   //      fprintf(stderr,"Wrong constructor!!!!\n");
00132 }
00133 
00134 // Construct a WERD from a single_blob and clone the flags from this.
00135 // W_BOL and W_EOL flags are set according to the given values.
00136 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
00137   C_BLOB_LIST temp_blobs;
00138   C_BLOB_IT temp_it(&temp_blobs);
00139   temp_it.add_after_then_move(blob);
00140   WERD* blob_word = new WERD(&temp_blobs, this);
00141   blob_word->set_flag(W_BOL, bol);
00142   blob_word->set_flag(W_EOL, eol);
00143   return blob_word;
00144 }
00145 
00159 TBOX WERD::bounding_box() {
00160   TBOX box;                       // box being built
00161   C_BLOB_IT rej_cblob_it = &rej_cblobs;  // rejected blobs
00162 
00163   for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
00164        rej_cblob_it.forward()) {
00165     box += rej_cblob_it.data()->bounding_box();
00166   }
00167 
00168   C_BLOB_IT it = &cblobs;    // blobs of WERD
00169   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00170     box += it.data()->bounding_box();
00171   }
00172   return box;
00173 }
00174 
00175 
00183 void WERD::move(const ICOORD vec) {
00184   C_BLOB_IT cblob_it(&cblobs);  // cblob iterator
00185 
00186   for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
00187     cblob_it.data()->move(vec);
00188 }
00189 
00196 void WERD::join_on(WERD* other) {
00197   C_BLOB_IT blob_it(&cblobs);
00198   C_BLOB_IT src_it(&other->cblobs);
00199   C_BLOB_IT rej_cblob_it(&rej_cblobs);
00200   C_BLOB_IT src_rej_it(&other->rej_cblobs);
00201 
00202   while (!src_it.empty()) {
00203     blob_it.add_to_end(src_it.extract());
00204     src_it.forward();
00205   }
00206   while (!src_rej_it.empty()) {
00207     rej_cblob_it.add_to_end(src_rej_it.extract());
00208     src_rej_it.forward();
00209   }
00210 }
00211 
00212 
00219 void WERD::copy_on(WERD* other) {
00220   bool reversed = other->bounding_box().left() < bounding_box().left();
00221   C_BLOB_IT c_blob_it(&cblobs);
00222   C_BLOB_LIST c_blobs;
00223 
00224   c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
00225   if (reversed) {
00226     c_blob_it.add_list_before(&c_blobs);
00227   } else {
00228     c_blob_it.move_to_last();
00229     c_blob_it.add_list_after(&c_blobs);
00230   }
00231   if (!other->rej_cblobs.empty()) {
00232     C_BLOB_IT rej_c_blob_it(&rej_cblobs);
00233     C_BLOB_LIST new_rej_c_blobs;
00234 
00235     new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
00236     if (reversed) {
00237       rej_c_blob_it.add_list_before(&new_rej_c_blobs);
00238     } else {
00239       rej_c_blob_it.move_to_last();
00240       rej_c_blob_it.add_list_after(&new_rej_c_blobs);
00241     }
00242   }
00243 }
00244 
00251 void WERD::print() {
00252   tprintf("Blanks= %d\n", blanks);
00253   bounding_box().print();
00254   tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
00255   tprintf("   W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
00256   tprintf("   W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
00257   tprintf("   W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
00258   tprintf("   W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
00259   tprintf("   W_NORMALIZED = %s\n",
00260           flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
00261   tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n",
00262           flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
00263   tprintf("   W_SCRIPT_IS_LATIN = %s\n",
00264           flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
00265   tprintf("   W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
00266   tprintf("   W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
00267   tprintf("   W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
00268   tprintf("   W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
00269   tprintf("Correct= %s\n", correct.string());
00270   tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
00271   tprintf("Script = %d\n", script_id_);
00272 }
00273 
00274 
00281 #ifndef GRAPHICS_DISABLED
00282 void WERD::plot(ScrollView *window, ScrollView::Color colour) {
00283   C_BLOB_IT it = &cblobs;
00284   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00285     it.data()->plot(window, colour, colour);
00286   }
00287   plot_rej_blobs(window);
00288 }
00289 #endif  // GRAPHICS_DISABLED
00290 
00291 // Get the next color in the (looping) rainbow.
00292 ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
00293   ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1);
00294   if (next >= LAST_COLOUR || next < FIRST_COLOUR)
00295     next = FIRST_COLOUR;
00296   return next;
00297 }
00298 
00299 #ifndef GRAPHICS_DISABLED
00300 
00306 void WERD::plot(ScrollView* window) {
00307   ScrollView::Color colour = FIRST_COLOUR;
00308   C_BLOB_IT it = &cblobs;
00309   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00310     it.data()->plot(window, colour, CHILD_COLOUR);
00311     colour = NextColor(colour);
00312   }
00313   plot_rej_blobs(window);
00314 }
00315 
00316 
00324 void WERD::plot_rej_blobs(ScrollView *window) {
00325   C_BLOB_IT it = &rej_cblobs;
00326   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00327     it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
00328   }
00329 }
00330 #endif  // GRAPHICS_DISABLED
00331 
00332 
00339 WERD *WERD::shallow_copy() {
00340   WERD *new_word = new WERD;
00341 
00342   new_word->blanks = blanks;
00343   new_word->flags = flags;
00344   new_word->dummy = dummy;
00345   new_word->correct = correct;
00346   return new_word;
00347 }
00348 
00349 
00356 WERD & WERD::operator= (const WERD & source) {
00357   this->ELIST2_LINK::operator= (source);
00358   blanks = source.blanks;
00359   flags = source.flags;
00360   script_id_ = source.script_id_;
00361   dummy = source.dummy;
00362   correct = source.correct;
00363   if (!cblobs.empty())
00364     cblobs.clear();
00365   cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
00366 
00367   if (!rej_cblobs.empty())
00368     rej_cblobs.clear();
00369   rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
00370   return *this;
00371 }
00372 
00373 
00381 int word_comparator(const void *word1p, const void *word2p) {
00382   WERD *word1 = *(WERD **)word1p;
00383   WERD *word2 = *(WERD **)word2p;
00384   return word1->bounding_box().left() - word2->bounding_box().left();
00385 }
00386 
00399 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
00400                                       C_BLOB_LIST* orphan_blobs) {
00401   C_BLOB_LIST current_blob_list;
00402   C_BLOB_IT werd_blobs_it(&current_blob_list);
00403   // Add the word's c_blobs.
00404   werd_blobs_it.add_list_after(cblob_list());
00405 
00406   // New blob list. These contain the blobs which will form the new word.
00407   C_BLOB_LIST new_werd_blobs;
00408   C_BLOB_IT new_blobs_it(&new_werd_blobs);
00409 
00410   // not_found_blobs contains the list of current word's blobs for which a
00411   // corresponding blob wasn't found in the input all_blobs list.
00412   C_BLOB_LIST not_found_blobs;
00413   C_BLOB_IT not_found_it(&not_found_blobs);
00414   not_found_it.move_to_last();
00415 
00416   werd_blobs_it.move_to_first();
00417   for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list();
00418        werd_blobs_it.forward()) {
00419     C_BLOB* werd_blob = werd_blobs_it.extract();
00420     TBOX werd_blob_box = werd_blob->bounding_box();
00421     bool found = false;
00422     // Now find the corresponding blob for this blob in the all_blobs
00423     // list. For now, follow the inefficient method of pairwise
00424     // comparisons. Ideally, one can pre-bucket the blobs by row.
00425     C_BLOB_IT all_blobs_it(all_blobs);
00426     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
00427          all_blobs_it.forward()) {
00428       C_BLOB* a_blob = all_blobs_it.data();
00429       // Compute the overlap of the two blobs. If major, a_blob should
00430       // be added to the new blobs list.
00431       TBOX a_blob_box = a_blob->bounding_box();
00432       if (a_blob_box.null_box()) {
00433         tprintf("Bounding box couldn't be ascertained\n");
00434       }
00435       if (werd_blob_box.contains(a_blob_box) ||
00436           werd_blob_box.major_overlap(a_blob_box)) {
00437         // Old blobs are from minimal splits, therefore are expected to be
00438         // bigger. The new small blobs should cover a significant portion.
00439         // This is it.
00440         all_blobs_it.extract();
00441         new_blobs_it.add_after_then_move(a_blob);
00442         found = true;
00443       }
00444     }
00445     if (!found) {
00446       not_found_it.add_after_then_move(werd_blob);
00447     } else {
00448       delete werd_blob;
00449     }
00450   }
00451   // Iterate over all not found blobs. Some of them may be due to
00452   // under-segmentation (which is OK, since the corresponding blob is already
00453   // in the list in that case.
00454   not_found_it.move_to_first();
00455   for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
00456        not_found_it.forward()) {
00457     C_BLOB* not_found = not_found_it.data();
00458     TBOX not_found_box = not_found->bounding_box();
00459     C_BLOB_IT existing_blobs_it(new_blobs_it);
00460     for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
00461          existing_blobs_it.forward()) {
00462       C_BLOB* a_blob = existing_blobs_it.data();
00463       TBOX a_blob_box = a_blob->bounding_box();
00464       if ((not_found_box.major_overlap(a_blob_box) ||
00465            a_blob_box.major_overlap(not_found_box)) &&
00466            not_found_box.y_overlap(a_blob_box)) {
00467         // Already taken care of.
00468         delete not_found_it.extract();
00469         break;
00470       }
00471     }
00472   }
00473   if (orphan_blobs) {
00474     C_BLOB_IT orphan_blobs_it(orphan_blobs);
00475     orphan_blobs_it.move_to_last();
00476     orphan_blobs_it.add_list_after(&not_found_blobs);
00477   }
00478 
00479   // New blobs are ready. Create a new werd object with these.
00480   WERD* new_werd = NULL;
00481   if (!new_werd_blobs.empty()) {
00482     new_werd = new WERD(&new_werd_blobs, this);
00483   } else {
00484     // Add the blobs back to this word so that it can be reused.
00485     C_BLOB_IT this_list_it(cblob_list());
00486     this_list_it.add_list_after(&not_found_blobs);
00487   }
00488   return new_werd;
00489 }