tesseract-doc/pageres_8cpp_source.html

00001 /**********************************************************************
00002  * File:        pageres.cpp  (Formerly page_res.c)
00003  * Description: Results classes used by control.c
00004  * Author:              Phil Cheatle
00005  * Created:     Tue Sep 22 08:42:49 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #include "mfcpch.h"
00020 #include          <stdlib.h>
00021 #ifdef __UNIX__
00022 #include          <assert.h>
00023 #endif
00024 #include          "pageres.h"
00025 #include          "blobs.h"
00026
// Short display names, one per reason a recognition result may be blamed.
// They are collected in kIncorrectResultReasonNames, which is indexed
// directly by an IncorrectResultReason value; the table order presumably
// has to mirror the declaration order of that enum -- confirm before
// inserting or reordering entries.
const char kBlameCorrect[] = "corr";
const char kBlameClassifier[] = "cl";
const char kBlameChopper[] = "chop";
const char kBlameClassLMTradeoff[] = "cl/LM";
const char kBlamePageLayout[] = "pglt";
const char kBlameSegsearchHeur[] = "ss_heur";
const char kBlameSegsearchPP[] = "ss_pp";
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
const char kBlameAdaption[] = "adapt";
const char kBlameNoTruthSplit[] = "no_tr_spl";
const char kBlameNoTruth[] = "no_tr";
const char kBlameUnknown[] = "unkn";

// Lookup table from reason index to its display name.
const char * const kIncorrectResultReasonNames[] = {
    kBlameCorrect,             // index 0
    kBlameClassifier,          // index 1
    kBlameChopper,             // index 2
    kBlameClassLMTradeoff,     // index 3
    kBlamePageLayout,          // index 4
    kBlameSegsearchHeur,       // index 5
    kBlameSegsearchPP,         // index 6
    kBlameClassOldLMTradeoff,  // index 7
    kBlameAdaption,            // index 8
    kBlameNoTruthSplit,        // index 9
    kBlameNoTruth,             // index 10
    kBlameUnknown              // index 11
};
00054
00055 const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
00056   return kIncorrectResultReasonNames[irr];
00057 }
00058
00059 const char *BlamerBundle::IncorrectReason() const {
00060   return kIncorrectResultReasonNames[incorrect_result_reason];
00061 }
00062
00063 void BlamerBundle::FillDebugString(const STRING &msg,
00064                                    const WERD_CHOICE *choice,
00065                                    STRING *debug) {
00066   (*debug) += "Truth ";
00067   for (int i = 0; i < this->truth_text.length(); ++i) {
00068     (*debug) += this->truth_text[i];
00069   }
00070   if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)";
00071   if (choice != NULL) {
00072     (*debug) += " Choice ";
00073     STRING choice_str;
00074     choice->string_and_lengths(&choice_str, NULL);
00075     (*debug) += choice_str;
00076   }
00077   if (msg.length() > 0) {
00078     (*debug) += "\n";
00079     (*debug) += msg;
00080   }
00081   (*debug) += "\n";
00082 }
00083
00084 ELISTIZE (BLOCK_RES)
00085 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)
00086 /*************************************************************************
00087  * PAGE_RES::PAGE_RES
00088  *
00089  * Constructor for page results
00090  *************************************************************************/
00091 PAGE_RES::PAGE_RES(
00092     BLOCK_LIST *the_block_list,
00093     WERD_CHOICE **prev_word_best_choice_ptr) {
00094   Init();
00095   BLOCK_IT block_it(the_block_list);
00096   BLOCK_RES_IT block_res_it(&block_res_list);
00097   for (block_it.mark_cycle_pt();
00098        !block_it.cycled_list(); block_it.forward()) {
00099     block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
00100   }
00101   prev_word_best_choice = prev_word_best_choice_ptr;
00102 }
00103
00104 /*************************************************************************
00105  * BLOCK_RES::BLOCK_RES
00106  *
00107  * Constructor for BLOCK results
00108  *************************************************************************/
00109
00110 BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
00111   ROW_IT row_it (the_block->row_list ());
00112   ROW_RES_IT row_res_it(&row_res_list);
00113
00114   char_count = 0;
00115   rej_count = 0;
00116   font_class = -1;               //not assigned
00117   x_height = -1.0;
00118   font_assigned = FALSE;
00119   bold = FALSE;
00120   italic = FALSE;
00121   row_count = 0;
00122
00123   block = the_block;
00124
00125   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00126     row_res_it.add_to_end(new ROW_RES(row_it.data()));
00127   }
00128 }
00129
00130
00131 /*************************************************************************
00132  * ROW_RES::ROW_RES
00133  *
00134  * Constructor for ROW results
00135  *************************************************************************/
00136
00137 ROW_RES::ROW_RES(ROW *the_row) {
00138   WERD_IT word_it(the_row->word_list());
00139   WERD_RES_IT word_res_it(&word_res_list);
00140   WERD_RES *combo = NULL;        // current combination of fuzzies
00141   WERD_RES *word_res;            // current word
00142   WERD *copy_word;
00143
00144   char_count = 0;
00145   rej_count = 0;
00146   whole_word_rej_count = 0;
00147
00148   row = the_row;
00149   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00150     word_res = new WERD_RES(word_it.data());
00151     word_res->x_height = the_row->x_height();
00152
00153     if (word_res->word->flag(W_FUZZY_NON)) {
00154       ASSERT_HOST(combo != NULL);
00155       word_res->part_of_combo = TRUE;
00156       combo->copy_on(word_res);
00157     }
00158     if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
00159       if (combo == NULL) {
00160         copy_word = new WERD;
00161                                  //deep copy
00162         *copy_word = *(word_it.data());
00163         combo = new WERD_RES(copy_word);
00164         combo->x_height = the_row->x_height();
00165         combo->combination = TRUE;
00166         word_res_it.add_to_end(combo);
00167       }
00168       word_res->part_of_combo = TRUE;
00169     } else {
00170       combo = NULL;
00171     }
00172     word_res_it.add_to_end(word_res);
00173   }
00174 }
00175
00176
00177 WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
00178   this->ELIST_LINK::operator=(source);
00179   Clear();
00180   if (source.combination) {
00181     word = new WERD;
00182     *word = *(source.word);      // deep copy
00183   } else {
00184     word = source.word;          // pt to same word
00185   }
00186   if (source.bln_boxes != NULL)
00187     bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
00188   if (source.chopped_word != NULL)
00189     chopped_word = new TWERD(*source.chopped_word);
00190   if (source.rebuild_word != NULL)
00191     rebuild_word = new TWERD(*source.rebuild_word);
00192   // TODO(rays) Do we ever need to copy the seam_array?
00193   denorm = source.denorm;
00194   if (source.box_word != NULL)
00195     box_word = new tesseract::BoxWord(*source.box_word);
00196   best_state = source.best_state;
00197   correct_text = source.correct_text;
00198
00199   if (source.best_choice != NULL) {
00200     best_choice = new WERD_CHOICE(*source.best_choice);
00201     raw_choice = new WERD_CHOICE(*source.raw_choice);
00202     best_choice_fontinfo_ids = source.best_choice_fontinfo_ids;
00203   }
00204   else {
00205     best_choice = NULL;
00206     raw_choice = NULL;
00207     if (!best_choice_fontinfo_ids.empty()) {
00208       best_choice_fontinfo_ids.clear();
00209     }
00210   }
00211   for (int i = 0; i < source.alt_choices.length(); ++i) {
00212     const WERD_CHOICE *choice = source.alt_choices[i];
00213     ASSERT_HOST(choice != NULL);
00214     alt_choices.push_back(new WERD_CHOICE(*choice));
00215   }
00216   alt_states = source.alt_states;
00217   if (source.ep_choice != NULL) {
00218     ep_choice = new WERD_CHOICE(*source.ep_choice);
00219   } else {
00220     ep_choice = NULL;
00221   }
00222   reject_map = source.reject_map;
00223   combination = source.combination;
00224   part_of_combo = source.part_of_combo;
00225   CopySimpleFields(source);
00226   if (source.blamer_bundle != NULL) {
00227     blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
00228   }
00229   return *this;
00230 }
00231
00232 // Copies basic fields that don't involve pointers that might be useful
00233 // to copy when making one WERD_RES from another.
00234 void WERD_RES::CopySimpleFields(const WERD_RES& source) {
00235   tess_failed = source.tess_failed;
00236   tess_accepted = source.tess_accepted;
00237   tess_would_adapt = source.tess_would_adapt;
00238   done = source.done;
00239   unlv_crunch_mode = source.unlv_crunch_mode;
00240   small_caps = source.small_caps;
00241   italic = source.italic;
00242   bold = source.bold;
00243   fontinfo = source.fontinfo;
00244   fontinfo2 = source.fontinfo2;
00245   fontinfo_id_count = source.fontinfo_id_count;
00246   fontinfo_id2_count = source.fontinfo_id2_count;
00247   x_height = source.x_height;
00248   caps_height = source.caps_height;
00249   guessed_x_ht = source.guessed_x_ht;
00250   guessed_caps_ht = source.guessed_caps_ht;
00251   reject_spaces = source.reject_spaces;
00252   uch_set = source.uch_set;
00253   tesseract = source.tesseract;
00254 }
00255
00256 // Initializes a blank (default constructed) WERD_RES from one that has
00257 // already been recognized.
00258 // Use SetupFor*Recognition afterwards to complete the setup and make
00259 // it ready for a retry recognition.
00260 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
00261   word = source.word;
00262   CopySimpleFields(source);
00263   if (source.blamer_bundle != NULL) {
00264     blamer_bundle = new BlamerBundle();
00265     blamer_bundle->CopyTruth(*source.blamer_bundle);
00266   }
00267 }
00268
00269 // Sets up the members used in recognition:
00270 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00271 // Returns false if the word is empty and sets up fake results.
00272 bool WERD_RES::SetupForTessRecognition(const UNICHARSET& unicharset_in,
00273                                    tesseract::Tesseract* tess, Pix* pix,
00274                                    bool numeric_mode,
00275                                    bool use_body_size,
00276                                    ROW *row, BLOCK* block) {
00277   tesseract = tess;
00278   POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
00279   if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) {
00280     // Empty words occur when all the blobs have been moved to the rej_blobs
00281     // list, which seems to occur frequently in junk.
00282     SetupFake(unicharset_in);
00283     word->set_flag(W_REP_CHAR, false);
00284     return false;
00285   }
00286   ClearResults();
00287   SetupWordScript(unicharset_in);
00288   chopped_word = TWERD::PolygonalCopy(word);
00289   if (use_body_size && row->body_size() > 0.0f) {
00290     chopped_word->SetupBLNormalize(block, row, row->body_size(),
00291                                    numeric_mode, &denorm);
00292   } else {
00293     chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
00294   }
00295   // The image will be 8-bit grey if the input was grey or color. Note that in
00296   // a grey image 0 is black and 255 is white. If the input was binary, then
00297   // the pix will be binary and 0 is white, with 1 being black.
00298   // To tell the difference pixGetDepth() will return 8 or 1.
00299   denorm.set_pix(pix);
00300   // The inverse flag will be true iff the word has been determined to be white
00301   // on black, and is independent of whether the pix is 8 bit or 1 bit.
00302   denorm.set_inverse(word->flag(W_INVERSE));
00303   chopped_word->Normalize(denorm);
00304   bln_boxes = tesseract::BoxWord::CopyFromNormalized(NULL, chopped_word);
00305   seam_array = start_seam_list(chopped_word->blobs);
00306   best_choice = new WERD_CHOICE(&unicharset_in);
00307   best_choice->make_bad();
00308   raw_choice = new WERD_CHOICE(&unicharset_in);
00309   raw_choice->make_bad();
00310   SetupBlamerBundle();
00311   return true;
00312 }
00313
00314 // Sets up the members used in recognition:
00315 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00316 // Returns false if the word is empty and sets up fake results.
00317 bool WERD_RES::SetupForCubeRecognition(const UNICHARSET& unicharset_in,
00318                                        tesseract::Tesseract* tess,
00319                                        const BLOCK* block) {
00320   tesseract = tess;
00321   POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
00322   if (pb != NULL && !pb->IsText()) {
00323     // Ignore words in graphic regions.
00324     SetupFake(unicharset_in);
00325     word->set_flag(W_REP_CHAR, false);
00326     return false;
00327   }
00328   ClearResults();
00329   SetupWordScript(unicharset_in);
00330   TBOX word_box = word->bounding_box();
00331   denorm.SetupNormalization(block, NULL, NULL, NULL, NULL, 0,
00332                             word_box.left(), word_box.bottom(),
00333                             1.0f, 1.0f, 0.0f, 0.0f);
00334   SetupBlamerBundle();
00335   return true;
00336 }
00337
00338 // Sets up the members used in recognition for an empty recognition result:
00339 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00340 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
00341   ClearResults();
00342   SetupWordScript(unicharset_in);
00343   chopped_word = new TWERD;
00344   rebuild_word = new TWERD;
00345   bln_boxes = new tesseract::BoxWord;
00346   box_word = new tesseract::BoxWord;
00347   int blob_count = word->cblob_list()->length();
00348   best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
00349                                 TOP_CHOICE_PERM, unicharset_in);
00350   raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
00351                                TOP_CHOICE_PERM, unicharset_in);
00352   if (blob_count > 0) {
00353     BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
00354     // For non-text blocks, just pass any blobs through to the box_word
00355     // and call the word failed with a fake classification.
00356     C_BLOB_IT b_it(word->cblob_list());
00357     int blob_id = 0;
00358     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00359       TBOX box = b_it.data()->bounding_box();
00360       box_word->InsertBox(box_word->length(), box);
00361       fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
00362                                                 -1, -1, -1, 0, 0, false);
00363     }
00364     FakeClassifyWord(blob_count, fake_choices);
00365     delete [] fake_choices;
00366   }
00367   tess_failed = true;
00368 }
00369
00370 void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
00371   uch_set = &uch;
00372   int script = uch.default_sid();
00373   word->set_script_id(script);
00374   word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
00375   word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
00376 }
00377
00378 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
00379 void WERD_RES::SetupBlamerBundle() {
00380   if (blamer_bundle != NULL) {
00381     blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale();
00382     TPOINT topleft;
00383     TPOINT botright;
00384     TPOINT norm_topleft;
00385     TPOINT norm_botright;
00386     for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) {
00387       const TBOX &box = blamer_bundle->truth_word.BlobBox(b);
00388       topleft.x = box.left();
00389       topleft.y = box.top();
00390       botright.x = box.right();
00391       botright.y = box.bottom();
00392       denorm.NormTransform(topleft, &norm_topleft);
00393       denorm.NormTransform(botright, &norm_botright);
00394       TBOX norm_box(norm_topleft.x, norm_botright.y,
00395                     norm_botright.x, norm_topleft.y);
00396       blamer_bundle->norm_truth_word.InsertBox(b, norm_box);
00397     }
00398   }
00399 }
00400
// Transfers ownership of the object held in *src to *dest: whatever *dest
// held before is deleted, *dest takes over the pointer, and *src is left
// NULL so that the object has exactly one owner afterwards.
template<class T> static void MovePointerData(T** dest, T**src) {
  T*& receiver = *dest;
  T*& giver = *src;
  delete receiver;
  receiver = giver;
  giver = NULL;
}
00408
00409 // Moves the results fields from word to this. This takes ownership of all
00410 // the data, so src can be destructed.
00411 void WERD_RES::ConsumeWordResults(WERD_RES* word) {
00412   denorm = word->denorm;
00413   MovePointerData(&chopped_word, &word->chopped_word);
00414   MovePointerData(&rebuild_word, &word->rebuild_word);
00415   MovePointerData(&box_word, &word->box_word);
00416   if (seam_array != NULL)
00417     free_seam_list(seam_array);
00418   seam_array = word->seam_array;
00419   word->seam_array = NULL;
00420   best_state.move(&word->best_state);
00421   correct_text.move(&word->correct_text);
00422   MovePointerData(&best_choice, &word->best_choice);
00423   MovePointerData(&raw_choice, &word->raw_choice);
00424   alt_choices.delete_data_pointers();
00425   alt_choices.move(&word->alt_choices);
00426   alt_states.move(&word->alt_states);
00427   reject_map = word->reject_map;
00428   if (word->blamer_bundle != NULL) {
00429     assert(blamer_bundle != NULL);
00430     blamer_bundle->CopyResults(*(word->blamer_bundle));
00431   }
00432   CopySimpleFields(*word);
00433 }
00434
00435 // Replace the best choice and rebuild box word.
00436 void WERD_RES::ReplaceBestChoice(
00437     const WERD_CHOICE& choice,
00438     const GenericVector<int>& segmentation_state) {
00439   delete best_choice;
00440   best_choice = new WERD_CHOICE(choice);
00441   best_state = segmentation_state;
00442   RebuildBestState();
00443   SetupBoxWord();
00444   // Make up a fake reject map of the right length to keep the
00445   // rejection pass happy.
00446   reject_map.initialise(segmentation_state.length());
00447   done = tess_accepted = tess_would_adapt = true;
00448   SetScriptPositions();
00449 }
00450
00451 // Builds the rebuild_word from the chopped_word and the best_state.
00452 void WERD_RES::RebuildBestState() {
00453   if (rebuild_word != NULL)
00454     delete rebuild_word;
00455   rebuild_word = new TWERD;
00456   if (seam_array == NULL) {
00457     seam_array = start_seam_list(chopped_word->blobs);
00458   }
00459   TBLOB* prev_blob = NULL;
00460   int start = 0;
00461   for (int i = 0; i < best_state.size(); ++i) {
00462     int length = best_state[i];
00463     join_pieces(chopped_word->blobs, seam_array, start, start + length - 1);
00464     TBLOB* blob = chopped_word->blobs;
00465     for (int i = 0; i < start; ++i)
00466       blob = blob->next;
00467     TBLOB* copy_blob = new TBLOB(*blob);
00468     if (prev_blob == NULL)
00469       rebuild_word->blobs = copy_blob;
00470     else
00471       prev_blob->next = copy_blob;
00472     prev_blob = copy_blob;
00473     break_pieces(blob, seam_array, start, start + length - 1);
00474     start += length;
00475   }
00476 }
00477
00478 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
00479 // Also sets up the output box_word.
00480 void WERD_RES::CloneChoppedToRebuild() {
00481   if (rebuild_word != NULL)
00482     delete rebuild_word;
00483   rebuild_word = new TWERD(*chopped_word);
00484   SetupBoxWord();
00485   int word_len = box_word->length();
00486   best_state.reserve(word_len);
00487   correct_text.reserve(word_len);
00488   for (int i = 0; i < word_len; ++i) {
00489     best_state.push_back(1);
00490     correct_text.push_back(STRING(""));
00491   }
00492 }
00493
00494 // Sets/replaces the box_word with one made from the rebuild_word.
00495 void WERD_RES::SetupBoxWord() {
00496   if (box_word != NULL)
00497     delete box_word;
00498   rebuild_word->ComputeBoundingBoxes();
00499   box_word = tesseract::BoxWord::CopyFromNormalized(&denorm, rebuild_word);
00500   box_word->ClipToOriginalWord(denorm.block(), word);
00501 }
00502
00503 // Sets up the script positions in the output boxword using the best_choice
00504 // to get the unichars, and the unicharset to get the target positions.
00505 void WERD_RES::SetScriptPositions() {
00506   box_word->SetScriptPositions(*uch_set, small_caps, rebuild_word,
00507                                best_choice);
00508 }
00509
00510 void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const {
00511   int end = best_choice->length();
00512   while (end > 0 &&
00513          uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) &&
00514          box_word->BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
00515     end--;
00516   }
00517   int start = 0;
00518   while (start < end &&
00519          uch_set->get_isdigit(best_choice->unichar_ids()[start]) &&
00520          box_word->BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
00521     start++;
00522   }
00523   *pstart = start;
00524   *pend = end;
00525 }
00526
00527 void WERD_RES::WithoutFootnoteSpan(
00528     const WERD_CHOICE &word, const GenericVector<int> &state,
00529     int *pstart, int *pend) const {
00530   int len = word.length();
00531   *pstart = 0;
00532   *pend = len;
00533   if (len < 2) return;
00534   if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) &&
00535       !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return;
00536
00537   // ok, now that we know the word ends in digits, do the expensive bit of
00538   // figuring out if they're superscript.
00539   WERD_RES copy(*this);
00540   copy.ReplaceBestChoice(word, state);
00541   copy.WithoutFootnoteSpan(pstart, pend);
00542 }
00543
00544 // Classifies the word with some already-calculated BLOB_CHOICEs.
00545 // The choices are an array of blob_count pointers to BLOB_CHOICE,
00546 // providing a single classifier result for each blob.
00547 // The BLOB_CHOICEs are consumed and the word takes ownership.
00548 // The number of blobs in the outword must match blob_count.
00549 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
00550   // Setup the WERD_RES.
00551   ASSERT_HOST(box_word != NULL);
00552   ASSERT_HOST(blob_count == box_word->length());
00553   ASSERT_HOST(best_choice != NULL);
00554   BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST;
00555   BLOB_CHOICE_LIST_C_IT bc_it(word_choices);
00556   for (int c = 0; c < blob_count; ++c) {
00557     best_choice->append_unichar_id(
00558         choices[c]->unichar_id(), 1,
00559         choices[c]->rating(), choices[c]->certainty());
00560     BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
00561     BLOB_CHOICE_IT choice_it(choice_list);
00562     choice_it.add_after_then_move(choices[c]);
00563     bc_it.add_after_then_move(choice_list);
00564   }
00565   best_choice->set_blob_choices(word_choices);
00566   delete raw_choice;
00567   raw_choice = new WERD_CHOICE(*best_choice);
00568   reject_map.initialise(blob_count);
00569 }
00570
00571 // Copies the best_choice strings to the correct_text for adaption/training.
00572 void WERD_RES::BestChoiceToCorrectText() {
00573   correct_text.clear();
00574   ASSERT_HOST(best_choice != NULL);
00575   for (int i = 0; i < best_choice->length(); ++i) {
00576     UNICHAR_ID choice_id = best_choice->unichar_id(i);
00577     const char* blob_choice = uch_set->id_to_unichar(choice_id);
00578     correct_text.push_back(STRING(blob_choice));
00579   }
00580 }
00581
00582 // Merges 2 adjacent blobs in the result if the permanent callback
00583 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
00584 // callback box_cb is NULL or returns true, setting the merged blob
00585 // result to the class returned from class_cb.
00586 // Returns true if anything was merged.
00587 bool WERD_RES::ConditionalBlobMerge(
00588     TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
00589     TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
00590
00591     BLOB_CHOICE_LIST_CLIST *blob_choices) {
00592   bool modified = false;
00593   for (int i = 0; i + 1 < best_choice->length(); ++i) {
00594     UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
00595                                       best_choice->unichar_id(i+1));
00596     if (new_id != INVALID_UNICHAR_ID &&
00597         (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
00598                                        box_word->BlobBox(i + 1)))) {
00599       if (reject_map.length() == best_choice->length())
00600         reject_map.remove_pos(i);
00601       best_choice->set_unichar_id(new_id, i);
00602       best_choice->remove_unichar_id(i + 1);
00603       raw_choice->set_unichar_id(new_id, i);
00604       raw_choice->remove_unichar_id(i + 1);
00605       modified = true;
00606       rebuild_word->MergeBlobs(i, i + 2);
00607       box_word->MergeBoxes(i, i + 2);
00608       if (i + 1 < best_state.length()) {
00609         best_state[i] += best_state[i + 1];
00610         best_state.remove(i + 1);
00611       }
00612
00613       BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
00614       for (int j = 0; j < i; ++j)
00615         blob_choices_it.forward();
00616       BLOB_CHOICE_IT it1(blob_choices_it.data());            // first choices
00617       BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1);
00618       BLOB_CHOICE_IT it2(target_choices);  // second choices
00619       float certainty = it2.data()->certainty();
00620       float rating = it2.data()->rating();
00621       if (it1.data()->certainty() < certainty) {
00622         certainty = it1.data()->certainty();
00623         rating = it1.data()->rating();
00624         target_choices = blob_choices_it.data();
00625         blob_choices_it.forward();
00626       }
00627       delete blob_choices_it.extract();  // get rid of spare
00628       // TODO(rays) Fix the choices so they contain the desired result.
00629       // Do we really need to ? Only needed for fix_quotes, which should be
00630       // going away.
00631     }
00632   }
00633   delete class_cb;
00634   delete box_cb;
00635   return modified;
00636 }
00637
00638 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
00639 // training data.
00640
// Utility function for fix_quotes.
// Returns non-zero if the UTF-8 character that starts at signed_str and is
// length bytes long is a simple quote: the 1-byte ' or `, or one of the
// 3-byte curved single quotes E2 80 98 / E2 80 99. Returns 0 otherwise.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so that values above 0x7f match the literals.
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
        (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
00657
00658 // Callback helper for fix_quotes returns a double quote if both
00659 // arguments are quote, otherwise INVALID_UNICHAR_ID.
00660 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
00661   const char *ch = uch_set->id_to_unichar(id1);
00662   const char *next_ch = uch_set->id_to_unichar(id2);
00663   if (is_simple_quote(ch, strlen(ch)) &&
00664       is_simple_quote(next_ch, strlen(next_ch)))
00665     return uch_set->unichar_to_id("\"");
00666   return INVALID_UNICHAR_ID;
00667 }
00668
00669 // Change pairs of quotes to double quotes.
00670 void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) {
00671   if (!uch_set->contains_unichar("\"") ||
00672       !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
00673     return;  // Don't create it if it is disallowed.
00674
00675   ConditionalBlobMerge(
00676       NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
00677       NULL,
00678       blob_choices);
00679 }
00680
00681 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
00682 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
00683 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
00684   const char *ch = uch_set->id_to_unichar(id1);
00685   const char *next_ch = uch_set->id_to_unichar(id2);
00686   if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
00687       (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
00688     return uch_set->unichar_to_id("-");
00689   return INVALID_UNICHAR_ID;
00690 }
00691
00692 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
00693 // (assuming both on the same textline, are in order and a chopped em dash.)
00694 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
00695   return box1.right() >= box2.left();
00696 }
00697
00698 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
00699 // Typically a long dash which has been segmented.
00700 void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) {
00701   if (!uch_set->contains_unichar("-") ||
00702       !uch_set->get_enabled(uch_set->unichar_to_id("-")))
00703     return;  // Don't create it if it is disallowed.
00704
00705   ConditionalBlobMerge(
00706       NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
00707       NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap),
00708       blob_choices);
00709 }
00710
00711 // Callback helper for merge_tess_fails returns a space if both
00712 // arguments are space, otherwise INVALID_UNICHAR_ID.
00713 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
00714   if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
00715     return id1;
00716   else
00717     return INVALID_UNICHAR_ID;
00718 }
00719
00720 // Change pairs of tess failures to a single one
00721 void WERD_RES::merge_tess_fails() {
00722   if (ConditionalBlobMerge(
00723       NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL,
00724       best_choice->blob_choices())) {
00725     int len = best_choice->length();
00726     ASSERT_HOST(reject_map.length() == len);
00727     ASSERT_HOST(box_word->length() == len);
00728   }
00729 }
00730
00731 // Returns true if the collection of count pieces, starting at start, are all
00732 // natural connected components, ie there are no real chops involved.
00733 bool WERD_RES::PiecesAllNatural(int start, int count) const {
00734   // all seams must have no splits.
00735   for (int index = start; index < start + count - 1; ++index) {
00736     if (index >= 0 && index < array_count(seam_array)) {
00737       SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index));
00738       if (seam != NULL && seam->split1 != NULL)
00739         return false;
00740     }
00741   }
00742   return true;
00743 }
00744
00745
00746 WERD_RES::~WERD_RES () {
00747   Clear();
00748 }
00749
00750 void WERD_RES::InitNonPointers() {
00751   tess_failed = FALSE;
00752   tess_accepted = FALSE;
00753   tess_would_adapt = FALSE;
00754   done = FALSE;
00755   unlv_crunch_mode = CR_NONE;
00756   small_caps = false;
00757   italic = FALSE;
00758   bold = FALSE;
00759   // The fontinfos and tesseract count as non-pointers as they point to
00760   // data owned elsewhere.
00761   fontinfo = NULL;
00762   fontinfo2 = NULL;
00763   tesseract = NULL;
00764   fontinfo_id_count = 0;
00765   fontinfo_id2_count = 0;
00766   x_height = 0.0;
00767   caps_height = 0.0;
00768   guessed_x_ht = TRUE;
00769   guessed_caps_ht = TRUE;
00770   combination = FALSE;
00771   part_of_combo = FALSE;
00772   reject_spaces = FALSE;
00773 }
00774
00775 void WERD_RES::InitPointers() {
00776   word = NULL;
00777   bln_boxes = NULL;
00778   uch_set = NULL;
00779   chopped_word = NULL;
00780   rebuild_word = NULL;
00781   box_word = NULL;
00782   seam_array = NULL;
00783   best_choice = NULL;
00784   raw_choice = NULL;
00785   ep_choice = NULL;
00786   blamer_bundle = NULL;
00787 }
00788
00789 void WERD_RES::Clear() {
00790   if (word != NULL && combination) {
00791     delete word;
00792   }
00793   word = NULL;
00794   delete blamer_bundle;
00795   blamer_bundle = NULL;
00796   ClearResults();
00797 }
00798
00799 void WERD_RES::ClearResults() {
00800   done = false;
00801   fontinfo = NULL;
00802   fontinfo2 = NULL;
00803   fontinfo_id_count = 0;
00804   fontinfo_id2_count = 0;
00805   if (bln_boxes != NULL) {
00806     delete bln_boxes;
00807     bln_boxes = NULL;
00808   }
00809   if (chopped_word != NULL) {
00810     delete chopped_word;
00811     chopped_word = NULL;
00812   }
00813   if (rebuild_word != NULL) {
00814     delete rebuild_word;
00815     rebuild_word = NULL;
00816   }
00817   if (box_word != NULL) {
00818     delete box_word;
00819     box_word = NULL;
00820   }
00821   best_state.clear();
00822   correct_text.clear();
00823   if (seam_array != NULL) {
00824     free_seam_list(seam_array);
00825     seam_array = NULL;
00826   }
00827   if (best_choice != NULL) {
00828     delete best_choice;
00829     delete raw_choice;
00830     best_choice = NULL;
00831     raw_choice = NULL;
00832   }
00833   if (!alt_choices.empty()) {
00834     alt_choices.delete_data_pointers();
00835     alt_choices.clear();
00836   }
00837   alt_states.clear();
00838   if (ep_choice != NULL) {
00839     delete ep_choice;
00840     ep_choice = NULL;
00841   }
00842   if (blamer_bundle != NULL) blamer_bundle->ClearResults();
00843 }
00844
00845 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
00846   return word_res == other.word_res &&
00847       row_res == other.row_res &&
00848       block_res == other.block_res;
00849 }
00850
00851 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
00852   ASSERT_HOST(page_res == other.page_res);
00853   if (other.block_res == NULL) {
00854     // other points to the end of the page.
00855     if (block_res == NULL)
00856       return 0;
00857     return -1;
00858   }
00859   if (block_res == NULL) {
00860     return 1; // we point to the end of the page.
00861   }
00862   if (block_res == other.block_res) {
00863     if (other.row_res == NULL || row_res == NULL) {
00864       // this should only happen if we hit an image block.
00865       return 0;
00866     }
00867     if (row_res == other.row_res) {
00868       // we point to the same block and row.
00869       ASSERT_HOST(other.word_res != NULL && word_res != NULL);
00870       if (word_res == other.word_res) {
00871         // we point to the same word!
00872         return 0;
00873       }
00874
00875       WERD_RES_IT word_res_it(&row_res->word_res_list);
00876       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00877            word_res_it.forward()) {
00878         if (word_res_it.data() == word_res) {
00879           return -1;
00880         } else if (word_res_it.data() == other.word_res) {
00881           return 1;
00882         }
00883       }
00884       ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
00885     }
00886
00887     // we both point to the same block, but different rows.
00888     ROW_RES_IT row_res_it(&block_res->row_res_list);
00889     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
00890          row_res_it.forward()) {
00891       if (row_res_it.data() == row_res) {
00892         return -1;
00893       } else if (row_res_it.data() == other.row_res) {
00894         return 1;
00895       }
00896     }
00897     ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
00898   }
00899
00900   // We point to different blocks.
00901   BLOCK_RES_IT block_res_it(&page_res->block_res_list);
00902   for (block_res_it.mark_cycle_pt();
00903        !block_res_it.cycled_list(); block_res_it.forward()) {
00904     if (block_res_it.data() == block_res) {
00905       return -1;
00906     } else if (block_res_it.data() == other.block_res) {
00907       return 1;
00908     }
00909   }
00910   // Shouldn't happen...
00911   ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
00912   return 0;
00913 }
00914
00915 // Inserts the new_word and a corresponding WERD_RES before the current
00916 // position. The simple fields of the WERD_RES are copied from clone_res and
00917 // the resulting WERD_RES is returned for further setup with best_choice etc.
00918 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
00919                                              WERD* new_word) {
00920   // Insert new_word into the ROW.
00921   WERD_IT w_it(row()->row->word_list());
00922   for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00923     WERD* word = w_it.data();
00924     if (word == word_res->word)
00925       break;
00926   }
00927   ASSERT_HOST(!w_it.cycled_list());
00928   w_it.add_before_then_move(new_word);
00929   // Make a WERD_RES for the new_word.
00930   WERD_RES* new_res = new WERD_RES(new_word);
00931   new_res->CopySimpleFields(clone_res);
00932   // Insert into the appropriate place in the ROW_RES.
00933   WERD_RES_IT wr_it(&row()->word_res_list);
00934   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
00935     WERD_RES* word = wr_it.data();
00936     if (word == word_res)
00937       break;
00938   }
00939   ASSERT_HOST(!wr_it.cycled_list());
00940   wr_it.add_before_then_move(new_res);
00941   if (wr_it.at_first()) {
00942     // This is the new first word, so reset the member iterator so it
00943     // detects the cycled_list state correctly.
00944     ResetWordIterator();
00945   }
00946   return new_res;
00947 }
00948
00949 // Deletes the current WERD_RES and its underlying WERD.
00950 void PAGE_RES_IT::DeleteCurrentWord() {
00951   // Check that this word is as we expect. part_of_combos are NEVER iterated
00952   // by the normal iterator, so we should never be trying to delete them.
00953   ASSERT_HOST(!word_res->part_of_combo);
00954   if (!word_res->combination) {
00955     // Combinations own their own word, so we won't find the word on the
00956     // row's word_list, but it is legitimate to try to delete them.
00957     // Delete word from the ROW when not a combination.
00958     WERD_IT w_it(row()->row->word_list());
00959     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00960       if (w_it.data() == word_res->word) {
00961         break;
00962       }
00963     }
00964     ASSERT_HOST(!w_it.cycled_list());
00965     delete w_it.extract();
00966   }
00967   // Remove the WERD_RES for the new_word.
00968   // Remove the WORD_RES from the ROW_RES.
00969   WERD_RES_IT wr_it(&row()->word_res_list);
00970   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
00971     if (wr_it.data() == word_res) {
00972       word_res = NULL;
00973       break;
00974     }
00975   }
00976   ASSERT_HOST(!wr_it.cycled_list());
00977   delete wr_it.extract();
00978   ResetWordIterator();
00979 }
00980
00981 /*************************************************************************
00982  * PAGE_RES_IT::restart_page
00983  *
00984  * Set things up at the start of the page
00985  *************************************************************************/
00986
00987 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
00988   block_res_it.set_to_list(&page_res->block_res_list);
00989   block_res_it.mark_cycle_pt();
00990   prev_block_res = NULL;
00991   prev_row_res = NULL;
00992   prev_word_res = NULL;
00993   block_res = NULL;
00994   row_res = NULL;
00995   word_res = NULL;
00996   next_block_res = NULL;
00997   next_row_res = NULL;
00998   next_word_res = NULL;
00999   internal_forward(true, empty_ok);
01000   return internal_forward(false, empty_ok);
01001 }
01002
01003 // Recovers from operations on the current word, such as in InsertCloneWord
01004 // and DeleteCurrentWord.
01005 // Resets the word_res_it so that it is one past the next_word_res, as
01006 // it should be after internal_forward. If next_row_res != row_res,
01007 // then the next_word_res is in the next row, so there is no need to do
01008 // anything, since operations on the current word will not have disturbed
01009 // the word_res_it.
01010 void PAGE_RES_IT::ResetWordIterator() {
01011   if (row_res == next_row_res) {
01012     // Reset the member iterator so it can move forward and detect the
01013     // cycled_list state correctly.
01014     word_res_it.move_to_first();
01015     word_res_it.mark_cycle_pt();
01016     while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
01017       word_res_it.forward();
01018     ASSERT_HOST(!word_res_it.cycled_list());
01019     word_res_it.forward();
01020   }
01021 }
01022
01023 /*************************************************************************
01024  * PAGE_RES_IT::internal_forward
01025  *
01026  * Find the next word on the page. If empty_ok is true, then non-text blocks
01027  * and text blocks with no text are visited as if they contain a single
01028  * imaginary word in a single imaginary row. (word() and row() both return NULL
01029  * in such a block and the return value is NULL.)
01030  * If empty_ok is false, the old behaviour is maintained. Each real word
01031  * is visited and empty and non-text blocks and rows are skipped.
01032  * new_block is used to initialize the iterators for a new block.
01033  * The iterator maintains pointers to block, row and word for the previous,
01034  * current and next words.  These are correct, regardless of block/row
01035  * boundaries. NULL values denote start and end of the page.
01036  *************************************************************************/
01037
01038 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
01039   bool new_row = false;
01040
01041   prev_block_res = block_res;
01042   prev_row_res = row_res;
01043   prev_word_res = word_res;
01044   block_res = next_block_res;
01045   row_res = next_row_res;
01046   word_res = next_word_res;
01047   next_block_res = NULL;
01048   next_row_res = NULL;
01049   next_word_res = NULL;
01050
01051   while (!block_res_it.cycled_list()) {
01052     if (new_block) {
01053       new_block = false;
01054       row_res_it.set_to_list(&block_res_it.data()->row_res_list);
01055       row_res_it.mark_cycle_pt();
01056       if (row_res_it.empty() && empty_ok) {
01057         next_block_res = block_res_it.data();
01058         break;
01059       }
01060       new_row = true;
01061     }
01062     while (!row_res_it.cycled_list()) {
01063       if (new_row) {
01064         new_row = false;
01065         word_res_it.set_to_list(&row_res_it.data()->word_res_list);
01066         word_res_it.mark_cycle_pt();
01067       }
01068       // Skip any part_of_combo words.
01069       while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
01070         word_res_it.forward();
01071       if (!word_res_it.cycled_list()) {
01072         next_block_res = block_res_it.data();
01073         next_row_res = row_res_it.data();
01074         next_word_res = word_res_it.data();
01075         word_res_it.forward();
01076         goto foundword;
01077       }
01078       // end of row reached
01079       row_res_it.forward();
01080       new_row = true;
01081     }
01082     // end of block reached
01083     block_res_it.forward();
01084     new_block = true;
01085   }
01086   foundword:
01087   // Update prev_word_best_choice pointer.
01088   if (page_res != NULL && page_res->prev_word_best_choice != NULL) {
01089     *page_res->prev_word_best_choice =
01090       (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
01091   }
01092   return word_res;
01093 }
01094
01095 /*************************************************************************
01096  * PAGE_RES_IT::restart_row()
01097  *
01098  * Move to the beginning (leftmost word) of the current row.
01099  *************************************************************************/
01100 WERD_RES *PAGE_RES_IT::restart_row() {
01101   ROW_RES *row = this->row();
01102   if (!row) return NULL;
01103   for (restart_page(); this->row() != row; forward()) {
01104     // pass
01105   }
01106   return word();
01107 }
01108
01109 /*************************************************************************
01110  * PAGE_RES_IT::forward_paragraph
01111  *
01112  * Move to the beginning of the next paragraph, allowing empty blocks.
01113  *************************************************************************/
01114
01115 WERD_RES *PAGE_RES_IT::forward_paragraph() {
01116   while (block_res == next_block_res &&
01117          (next_row_res != NULL && next_row_res->row != NULL &&
01118           row_res->row->para() == next_row_res->row->para())) {
01119     internal_forward(false, true);
01120   }
01121   return internal_forward(false, true);
01122 }
01123
01124 /*************************************************************************
01125  * PAGE_RES_IT::forward_block
01126  *
01127  * Move to the beginning of the next block, allowing empty blocks.
01128  *************************************************************************/
01129
01130 WERD_RES *PAGE_RES_IT::forward_block() {
01131   while (block_res == next_block_res) {
01132     internal_forward(false, true);
01133   }
01134   return internal_forward(false, true);
01135 }
01136
01137 void PAGE_RES_IT::rej_stat_word() {
01138   inT16 chars_in_word;
01139   inT16 rejects_in_word = 0;
01140
01141   chars_in_word = word_res->reject_map.length ();
01142   page_res->char_count += chars_in_word;
01143   block_res->char_count += chars_in_word;
01144   row_res->char_count += chars_in_word;
01145
01146   rejects_in_word = word_res->reject_map.reject_count ();
01147
01148   page_res->rej_count += rejects_in_word;
01149   block_res->rej_count += rejects_in_word;
01150   row_res->rej_count += rejects_in_word;
01151   if (chars_in_word == rejects_in_word)
01152     row_res->whole_word_rej_count += rejects_in_word;
01153 }