Tesseract 3.02
00001 /********************************************************************** 00002 * File: applybox.cpp (Formerly applybox.c) 00003 * Description: Re segment rows according to box file data 00004 * Author: Phil Cheatle 00005 * Created: Wed Nov 24 09:11:23 GMT 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #include "mfcpch.h" 00020 00021 #ifdef _MSC_VER 00022 #pragma warning(disable:4244) // Conversion warnings 00023 #endif 00024 00025 #include <ctype.h> 00026 #include <string.h> 00027 #ifdef __UNIX__ 00028 #include <assert.h> 00029 #include <errno.h> 00030 #endif 00031 #include "allheaders.h" 00032 #include "boxread.h" 00033 #include "chopper.h" 00034 #include "pageres.h" 00035 #include "unichar.h" 00036 #include "unicharset.h" 00037 #include "tesseractclass.h" 00038 #include "genericvector.h" 00039 00040 // Max number of blobs to classify together in FindSegmentation. 00041 const int kMaxGroupSize = 4; 00042 // Max fraction of median allowed as deviation in xheight before switching 00043 // to median. 
00044 const double kMaxXHeightDeviationFraction = 0.125; 00045 00046 /************************************************************************* 00047 * The box file is assumed to contain box definitions, one per line, of the 00048 * following format for blob-level boxes: 00049 * <UTF8 str> <left> <bottom> <right> <top> <page id> 00050 * and for word/line-level boxes: 00051 * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> 00052 * NOTES: 00053 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT. 00054 * 00055 * <page id> is 0-based, and the page number is used for multipage input (tiff). 00056 * 00057 * In the blob-level form, each line represents a recognizable unit, which may 00058 * be several UTF-8 bytes, but there is a bounding box around each recognizable 00059 * unit, and no classifier is needed to train in this mode (bootstrapping.) 00060 * 00061 * In the word/line-level form, the line begins with the literal "WordStr", and 00062 * the bounding box bounds either a whole line or a whole word. The recognizable 00063 * units in the word/line are listed after the # at the end of the line and 00064 * are space delimited, ignoring any original spaces on the line. 00065 * Eg. 00066 * word -> #w o r d 00067 * multi word line -> #m u l t i w o r d l i n e 00068 * The recognizable units must be space-delimited in order to allow multiple 00069 * unicodes to be used for a single recognizable unit, eg Hindi. 00070 * In this mode, the classifier must have been pre-trained with the desired 00071 * character set, or it will not be able to find the character segmentations. 
00072 *************************************************************************/ 00073 00074 namespace tesseract { 00075 00076 static void clear_any_old_text(BLOCK_LIST *block_list) { 00077 BLOCK_IT block_it(block_list); 00078 for (block_it.mark_cycle_pt(); 00079 !block_it.cycled_list(); block_it.forward()) { 00080 ROW_IT row_it(block_it.data()->row_list()); 00081 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00082 WERD_IT word_it(row_it.data()->word_list()); 00083 for (word_it.mark_cycle_pt(); 00084 !word_it.cycled_list(); word_it.forward()) { 00085 word_it.data()->set_text(""); 00086 } 00087 } 00088 } 00089 } 00090 00091 // Applies the box file based on the image name fname, and resegments 00092 // the words in the block_list (page), with: 00093 // blob-mode: one blob per line in the box file, words as input. 00094 // word/line-mode: one blob per space-delimited unit after the #, and one word 00095 // per line in the box file. (See comment above for box file format.) 00096 // If find_segmentation is true, (word/line mode) then the classifier is used 00097 // to re-segment words/lines to match the space-delimited truth string for 00098 // each box. In this case, the input box may be for a word or even a whole 00099 // text line, and the output words will contain multiple blobs corresponding 00100 // to the space-delimited input string. 00101 // With find_segmentation false, no classifier is needed, but the chopper 00102 // can still be used to correctly segment touching characters with the help 00103 // of the input boxes. 00104 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00105 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00106 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00107 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 
00108 // Instead, the correct_text member of WERD_RES is set, and this may be later 00109 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00110 // is not required before calling ApplyBoxTraining. 00111 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, 00112 bool find_segmentation, 00113 BLOCK_LIST *block_list) { 00114 int box_count = 0; 00115 int box_failures = 0; 00116 00117 FILE* box_file = OpenBoxFile(fname); 00118 TBOX box; 00119 GenericVector<TBOX> boxes; 00120 GenericVector<STRING> texts, full_texts; 00121 00122 bool found_box = true; 00123 while (found_box) { 00124 int line_number = 0; // Line number of the box file. 00125 STRING text, full_text; 00126 found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box); 00127 if (found_box) { 00128 ++box_count; 00129 MakeBoxFileStr(text.string(), box, applybox_page, &full_text); 00130 } else { 00131 full_text = ""; 00132 } 00133 boxes.push_back(box); 00134 texts.push_back(text); 00135 full_texts.push_back(full_text); 00136 } 00137 00138 // In word mode, we use the boxes to make a word for each box, but 00139 // in blob mode we use the existing words and maximally chop them first. 00140 PAGE_RES* page_res = find_segmentation ? 00141 NULL : SetupApplyBoxes(boxes, block_list); 00142 clear_any_old_text(block_list); 00143 00144 for (int i = 0; i < boxes.size() - 1; i++) { 00145 bool foundit = false; 00146 if (page_res != NULL) { 00147 if (i == 0) { 00148 foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], 00149 full_texts[i].string()); 00150 } else { 00151 foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], 00152 boxes[i + 1], full_texts[i].string()); 00153 } 00154 } else { 00155 foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], 00156 texts[i].string()); 00157 } 00158 if (!foundit) { 00159 box_failures++; 00160 ReportFailedBox(box_count, boxes[i], texts[i].string(), 00161 "FAILURE! 
Couldn't find a matching blob"); 00162 } 00163 } 00164 00165 if (page_res == NULL) { 00166 // In word/line mode, we now maximally chop all the words and resegment 00167 // them with the classifier. 00168 page_res = SetupApplyBoxes(boxes, block_list); 00169 ReSegmentByClassification(page_res); 00170 } 00171 if (applybox_debug > 0) { 00172 tprintf("APPLY_BOXES:\n"); 00173 tprintf(" Boxes read from boxfile: %6d\n", box_count); 00174 if (box_failures > 0) 00175 tprintf(" Boxes failed resegmentation: %6d\n", box_failures); 00176 } 00177 TidyUp(page_res); 00178 return page_res; 00179 } 00180 00181 // Helper computes median xheight in the image. 00182 static double MedianXHeight(BLOCK_LIST *block_list) { 00183 BLOCK_IT block_it(block_list); 00184 STATS xheights(0, block_it.data()->bounding_box().height()); 00185 for (block_it.mark_cycle_pt(); 00186 !block_it.cycled_list(); block_it.forward()) { 00187 ROW_IT row_it(block_it.data()->row_list()); 00188 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00189 xheights.add(IntCastRounded(row_it.data()->x_height()), 1); 00190 } 00191 } 00192 return xheights.median(); 00193 } 00194 00195 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00196 // All fuzzy spaces are removed, and all the words are maximally chopped. 00197 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00198 BLOCK_LIST *block_list) { 00199 double median_xheight = MedianXHeight(block_list); 00200 double max_deviation = kMaxXHeightDeviationFraction * median_xheight; 00201 // Strip all fuzzy space markers to simplify the PAGE_RES. 
00202 BLOCK_IT b_it(block_list); 00203 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00204 BLOCK* block = b_it.data(); 00205 ROW_IT r_it(block->row_list()); 00206 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00207 ROW* row = r_it.data(); 00208 float diff = fabs(row->x_height() - median_xheight); 00209 if (diff > max_deviation) { 00210 if (applybox_debug) { 00211 tprintf("row xheight=%g, but median xheight = %g\n", 00212 row->x_height(), median_xheight); 00213 } 00214 row->set_x_height(static_cast<float>(median_xheight)); 00215 } 00216 WERD_IT w_it(row->word_list()); 00217 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00218 WERD* word = w_it.data(); 00219 if (word->cblob_list()->empty()) { 00220 delete w_it.extract(); 00221 } else { 00222 word->set_flag(W_FUZZY_SP, false); 00223 word->set_flag(W_FUZZY_NON, false); 00224 } 00225 } 00226 } 00227 } 00228 PAGE_RES* page_res = new PAGE_RES(block_list, NULL); 00229 PAGE_RES_IT pr_it(page_res); 00230 WERD_RES* word_res; 00231 while ((word_res = pr_it.word()) != NULL) { 00232 MaximallyChopWord(boxes, pr_it.block()->block, 00233 pr_it.row()->row, word_res); 00234 pr_it.forward(); 00235 } 00236 return page_res; 00237 } 00238 00239 // Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only 00240 // the top choices. Avoids problems with very long words. 00241 static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices, 00242 const UNICHARSET& unicharset, 00243 WERD_CHOICE* word_choice) { 00244 *word_choice = WERD_CHOICE(&unicharset); // clear the word choice. 00245 word_choice->make_bad(); 00246 for (int i = 0; i < char_choices.size(); ++i) { 00247 BLOB_CHOICE_IT it(char_choices[i]); 00248 BLOB_CHOICE* bc = it.data(); 00249 word_choice->append_unichar_id(bc->unichar_id(), 1, 00250 bc->rating(), bc->certainty()); 00251 } 00252 } 00253 00254 // Tests the chopper by exhaustively running chop_one_blob. 
00255 // The word_res will contain filled chopped_word, seam_array, denorm, 00256 // box_word and best_state for the maximally chopped word. 00257 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, 00258 BLOCK* block, ROW* row, 00259 WERD_RES* word_res) { 00260 if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false, 00261 this->textord_use_cjk_fp_model, 00262 row, block)) { 00263 word_res->CloneChoppedToRebuild(); 00264 return; 00265 } 00266 if (chop_debug) { 00267 tprintf("Maximally chopping word at:"); 00268 word_res->word->bounding_box().print(); 00269 } 00270 blob_match_table.init_match_table(); 00271 BLOB_CHOICE_LIST *match_result; 00272 BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); 00273 ASSERT_HOST(word_res->chopped_word->blobs != NULL); 00274 float rating = static_cast<float>(MAX_INT8); 00275 for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL; 00276 blob = blob->next) { 00277 // The rating and certainty are not quite arbitrary. Since 00278 // select_blob_to_chop uses the worst certainty to choose, they all have 00279 // to be different, so starting with MAX_INT8, subtract 1/8 for each blob 00280 // in here, and then divide by e each time they are chopped, which 00281 // should guarantee a set of unequal values for the whole tree of blobs 00282 // produced, however much chopping is required. The chops are thus only 00283 // limited by the ability of the chopper to find suitable chop points, 00284 // and not by the value of the certainties. 00285 match_result = fake_classify_blob(0, rating, -rating); 00286 modify_blob_choice(match_result, 0); 00287 ASSERT_HOST(!match_result->empty()); 00288 *char_choices += match_result; 00289 rating -= 0.125f; 00290 } 00291 inT32 blob_number; 00292 int right_chop_index = 0; 00293 if (!assume_fixed_pitch_char_segment) { 00294 // We only chop if the language is not fixed pitch like CJK. 
00295 if (prioritize_division) { 00296 while (chop_one_blob2(boxes, word_res, &word_res->seam_array)); 00297 } else { 00298 while (chop_one_blob(word_res->chopped_word, char_choices, 00299 &blob_number, &word_res->seam_array, 00300 &right_chop_index)); 00301 } 00302 } 00303 MakeWordChoice(*char_choices, unicharset, word_res->best_choice); 00304 MakeWordChoice(*char_choices, unicharset, word_res->raw_choice); 00305 word_res->CloneChoppedToRebuild(); 00306 blob_match_table.end_match_table(); 00307 if (char_choices != NULL) { 00308 char_choices->delete_data_pointers(); 00309 delete char_choices; 00310 } 00311 } 00312 00313 // Helper to compute the dispute resolution metric. 00314 // Disputed blob resolution. The aim is to give the blob to the most 00315 // appropriate boxfile box. Most of the time it is obvious, but if 00316 // two boxfile boxes overlap significantly it is not. If a small boxfile 00317 // box takes most of the blob, and a large boxfile box does too, then 00318 // we want the small boxfile box to get it, but if the small box 00319 // is much smaller than the blob, we don't want it to get it. 00320 // Details of the disputed blob resolution: 00321 // Given a box with area A, and a blob with area B, with overlap area C, 00322 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum 00323 // miss metric gets the blob. 00324 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { 00325 int overlap_area = box1.intersection(box2).area(); 00326 double miss_metric = box1.area()- overlap_area; 00327 miss_metric /= box1.area(); 00328 miss_metric *= box2.area() - overlap_area; 00329 miss_metric /= box2.area(); 00330 return miss_metric; 00331 } 00332 00333 // Gather consecutive blobs that match the given box into the best_state 00334 // and corresponding correct_text. 00335 // Fights over which box owns which blobs are settled by pre-chopping and 00336 // applying the blobs to box or next_box with the least non-overlap. 
00337 // Returns false if the box was in error, which can only be caused by 00338 // failing to find an appropriate blob for a box. 00339 // This means that occasionally, blobs may be incorrectly segmented if the 00340 // chopper fails to find a suitable chop point. 00341 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00342 const TBOX& box, const TBOX& next_box, 00343 const char* correct_text) { 00344 if (applybox_debug > 1) { 00345 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); 00346 } 00347 PAGE_RES_IT page_res_it(page_res); 00348 WERD_RES* word_res; 00349 for (word_res = page_res_it.word(); word_res != NULL; 00350 word_res = page_res_it.forward()) { 00351 if (!word_res->box_word->bounding_box().major_overlap(box)) 00352 continue; 00353 if (applybox_debug > 1) { 00354 tprintf("Checking word box:"); 00355 word_res->box_word->bounding_box().print(); 00356 } 00357 int word_len = word_res->box_word->length(); 00358 for (int i = 0; i < word_len; ++i) { 00359 TBOX char_box = TBOX(); 00360 int blob_count = 0; 00361 for (blob_count = 0; i + blob_count < word_len; ++blob_count) { 00362 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); 00363 if (!blob_box.major_overlap(box)) 00364 break; 00365 if (word_res->correct_text[i + blob_count].length() > 0) 00366 break; // Blob is claimed already. 00367 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00368 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00369 if (applybox_debug > 2) { 00370 tprintf("Checking blob:"); 00371 blob_box.print(); 00372 tprintf("Current miss metric = %g, next = %g\n", 00373 current_box_miss_metric, next_box_miss_metric); 00374 } 00375 if (current_box_miss_metric > next_box_miss_metric) 00376 break; // Blob is a better match for next box. 
00377 char_box += blob_box; 00378 } 00379 if (blob_count > 0) { 00380 if (applybox_debug > 1) { 00381 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); 00382 } 00383 if (!char_box.almost_equal(box, 3) && 00384 (box.x_gap(next_box) < -3 || 00385 (prev_box != NULL && prev_box->x_gap(box) < -3))) { 00386 return false; 00387 } 00388 // We refine just the box_word, best_state and correct_text here. 00389 // The rebuild_word is made in TidyUp. 00390 // blob_count blobs are put together to match the box. Merge the 00391 // box_word boxes, save the blob_count in the state and the text. 00392 word_res->box_word->MergeBoxes(i, i + blob_count); 00393 word_res->best_state[i] = blob_count; 00394 word_res->correct_text[i] = correct_text; 00395 if (applybox_debug > 2) { 00396 tprintf("%d Blobs match: blob box:", blob_count); 00397 word_res->box_word->BlobBox(i).print(); 00398 tprintf("Matches box:"); 00399 box.print(); 00400 tprintf("With next box:"); 00401 next_box.print(); 00402 } 00403 // Eliminated best_state and correct_text entries for the consumed 00404 // blobs. 00405 for (int j = 1; j < blob_count; ++j) { 00406 word_res->best_state.remove(i + 1); 00407 word_res->correct_text.remove(i + 1); 00408 } 00409 // Assume that no box spans multiple source words, so we are done with 00410 // this box. 00411 if (applybox_debug > 1) { 00412 tprintf("Best state = "); 00413 for (int j = 0; j < word_res->best_state.size(); ++j) { 00414 tprintf("%d ", word_res->best_state[j]); 00415 } 00416 tprintf("\n"); 00417 tprintf("Correct text = [[ "); 00418 for (int j = 0; j < word_res->correct_text.size(); ++j) { 00419 tprintf("%s ", word_res->correct_text[j].string()); 00420 } 00421 tprintf("]]\n"); 00422 } 00423 return true; 00424 } 00425 } 00426 } 00427 if (applybox_debug > 0) { 00428 tprintf("FAIL!\n"); 00429 } 00430 return false; // Failure. 
00431 } 00432 00433 // Consume all source blobs that strongly overlap the given box, 00434 // putting them into a new word, with the correct_text label. 00435 // Fights over which box owns which blobs are settled by 00436 // applying the blobs to box or next_box with the least non-overlap. 00437 // Returns false if the box was in error, which can only be caused by 00438 // failing to find an overlapping blob for a box. 00439 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, 00440 const TBOX& box, const TBOX& next_box, 00441 const char* correct_text) { 00442 if (applybox_debug > 1) { 00443 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); 00444 } 00445 WERD* new_word = NULL; 00446 BLOCK_IT b_it(block_list); 00447 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00448 BLOCK* block = b_it.data(); 00449 if (!box.major_overlap(block->bounding_box())) 00450 continue; 00451 ROW_IT r_it(block->row_list()); 00452 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00453 ROW* row = r_it.data(); 00454 if (!box.major_overlap(row->bounding_box())) 00455 continue; 00456 WERD_IT w_it(row->word_list()); 00457 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00458 WERD* word = w_it.data(); 00459 if (applybox_debug > 2) { 00460 tprintf("Checking word:"); 00461 word->bounding_box().print(); 00462 } 00463 if (word->text() != NULL && word->text()[0] != '\0') 00464 continue; // Ignore words that are already done. 
00465 if (!box.major_overlap(word->bounding_box())) 00466 continue; 00467 C_BLOB_IT blob_it(word->cblob_list()); 00468 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); 00469 blob_it.forward()) { 00470 C_BLOB* blob = blob_it.data(); 00471 TBOX blob_box = blob->bounding_box(); 00472 if (!blob_box.major_overlap(box)) 00473 continue; 00474 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00475 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00476 if (applybox_debug > 2) { 00477 tprintf("Checking blob:"); 00478 blob_box.print(); 00479 tprintf("Current miss metric = %g, next = %g\n", 00480 current_box_miss_metric, next_box_miss_metric); 00481 } 00482 if (current_box_miss_metric > next_box_miss_metric) 00483 continue; // Blob is a better match for next box. 00484 if (applybox_debug > 2) { 00485 tprintf("Blob match: blob:"); 00486 blob_box.print(); 00487 tprintf("Matches box:"); 00488 box.print(); 00489 tprintf("With next box:"); 00490 next_box.print(); 00491 } 00492 if (new_word == NULL) { 00493 // Make a new word with a single blob. 00494 new_word = word->shallow_copy(); 00495 new_word->set_text(correct_text); 00496 w_it.add_to_end(new_word); 00497 } 00498 C_BLOB_IT new_blob_it(new_word->cblob_list()); 00499 new_blob_it.add_to_end(blob_it.extract()); 00500 } 00501 } 00502 } 00503 } 00504 if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); 00505 return new_word != NULL; 00506 } 00507 00508 // Resegments the words by running the classifier in an attempt to find the 00509 // correct segmentation that produces the required string. 00510 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) { 00511 PAGE_RES_IT pr_it(page_res); 00512 WERD_RES* word_res; 00513 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00514 WERD* word = word_res->word; 00515 if (word->text() == NULL || word->text()[0] == '\0') 00516 continue; // Ignore words that have no text. 
00517 // Convert the correct text to a vector of UNICHAR_ID 00518 GenericVector<UNICHAR_ID> target_text; 00519 if (!ConvertStringToUnichars(word->text(), &target_text)) { 00520 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", 00521 word->text()); 00522 pr_it.DeleteCurrentWord(); 00523 continue; 00524 } 00525 if (!FindSegmentation(target_text, word_res)) { 00526 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", 00527 word->text()); 00528 pr_it.DeleteCurrentWord(); 00529 continue; 00530 } 00531 } 00532 } 00533 00534 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00535 // Returns false if an invalid UNICHAR_ID is encountered. 00536 bool Tesseract::ConvertStringToUnichars(const char* utf8, 00537 GenericVector<UNICHAR_ID>* class_ids) { 00538 for (int step = 0; *utf8 != '\0'; utf8 += step) { 00539 const char* next_space = strchr(utf8, ' '); 00540 if (next_space == NULL) 00541 next_space = utf8 + strlen(utf8); 00542 step = next_space - utf8; 00543 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); 00544 if (class_id == INVALID_UNICHAR_ID) { 00545 return false; 00546 } 00547 while (utf8[step] == ' ') 00548 ++step; 00549 class_ids->push_back(class_id); 00550 } 00551 return true; 00552 } 00553 00554 // Resegments the word to achieve the target_text from the classifier. 00555 // Returns false if the re-segmentation fails. 00556 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00557 // applies a full search on the classifier results to find the best classified 00558 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00559 // substitutions ARE used. 00560 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00561 WERD_RES* word_res) { 00562 blob_match_table.init_match_table(); 00563 // Classify all required combinations of blobs and save results in choices. 
00564 int word_length = word_res->box_word->length(); 00565 GenericVector<BLOB_CHOICE_LIST*>* choices = 00566 new GenericVector<BLOB_CHOICE_LIST*>[word_length]; 00567 for (int i = 0; i < word_length; ++i) { 00568 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { 00569 BLOB_CHOICE_LIST* match_result = classify_piece( 00570 word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array, 00571 i, i + j - 1, word_res->blamer_bundle); 00572 if (applybox_debug > 2) { 00573 tprintf("%d+%d:", i, j); 00574 print_ratings_list("Segment:", match_result, unicharset); 00575 } 00576 choices[i].push_back(match_result); 00577 } 00578 } 00579 // Search the segmentation graph for the target text. Must be an exact 00580 // match. Using wildcards makes it difficult to find the correct 00581 // segmentation even when it is there. 00582 word_res->best_state.clear(); 00583 GenericVector<int> search_segmentation; 00584 float best_rating = 0.0f; 00585 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, 00586 &search_segmentation, &best_rating, &word_res->best_state); 00587 blob_match_table.end_match_table(); 00588 for (int i = 0; i < word_length; ++i) 00589 choices[i].delete_data_pointers(); 00590 delete [] choices; 00591 if (word_res->best_state.empty()) { 00592 // Build the original segmentation and if it is the same length as the 00593 // truth, assume it will do. 00594 int blob_count = 1; 00595 for (int s = 0; s < array_count(word_res->seam_array); ++s) { 00596 SEAM* seam = 00597 reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s)); 00598 if (seam->split1 == NULL) { 00599 word_res->best_state.push_back(blob_count); 00600 blob_count = 1; 00601 } else { 00602 ++blob_count; 00603 } 00604 } 00605 word_res->best_state.push_back(blob_count); 00606 if (word_res->best_state.size() != target_text.size()) { 00607 word_res->best_state.clear(); // No good. Original segmentation bad size. 
00608 return false; 00609 } 00610 } 00611 word_res->correct_text.clear(); 00612 for (int i = 0; i < target_text.size(); ++i) { 00613 word_res->correct_text.push_back( 00614 STRING(unicharset.id_to_unichar(target_text[i]))); 00615 } 00616 return true; 00617 } 00618 00619 // Recursive helper to find a match to the target_text (from text_index 00620 // position) in the choices (from choices_pos position). 00621 // Choices is an array of GenericVectors, of length choices_length, with each 00622 // element representing a starting position in the word, and the 00623 // GenericVector holding classification results for a sequence of consecutive 00624 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00625 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00626 int choices_pos, int choices_length, 00627 const GenericVector<UNICHAR_ID>& target_text, 00628 int text_index, 00629 float rating, GenericVector<int>* segmentation, 00630 float* best_rating, 00631 GenericVector<int>* best_segmentation) { 00632 const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); 00633 for (int length = 1; length <= choices[choices_pos].size(); ++length) { 00634 // Rating of matching choice or worst choice if no match. 00635 float choice_rating = 0.0f; 00636 // Find the corresponding best BLOB_CHOICE. 00637 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); 00638 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00639 choice_it.forward()) { 00640 BLOB_CHOICE* choice = choice_it.data(); 00641 choice_rating = choice->rating(); 00642 UNICHAR_ID class_id = choice->unichar_id(); 00643 if (class_id == target_text[text_index]) { 00644 break; 00645 } 00646 // Search ambigs table. 
00647 if (class_id < table.size() && table[class_id] != NULL) { 00648 AmbigSpec_IT spec_it(table[class_id]); 00649 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); 00650 spec_it.forward()) { 00651 const AmbigSpec *ambig_spec = spec_it.data(); 00652 // We'll only do 1-1. 00653 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && 00654 ambig_spec->correct_ngram_id == target_text[text_index]) 00655 break; 00656 } 00657 if (!spec_it.cycled_list()) 00658 break; // Found an ambig. 00659 } 00660 } 00661 if (choice_it.cycled_list()) 00662 continue; // No match. 00663 segmentation->push_back(length); 00664 if (choices_pos + length == choices_length && 00665 text_index + 1 == target_text.size()) { 00666 // This is a complete match. If the rating is good record a new best. 00667 if (applybox_debug > 2) { 00668 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", 00669 rating + choice_rating, *best_rating, segmentation->size(), 00670 best_segmentation->size()); 00671 } 00672 if (best_segmentation->empty() || rating + choice_rating < *best_rating) { 00673 *best_segmentation = *segmentation; 00674 *best_rating = rating + choice_rating; 00675 } 00676 } else if (choices_pos + length < choices_length && 00677 text_index + 1 < target_text.size()) { 00678 if (applybox_debug > 3) { 00679 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", 00680 target_text[text_index], 00681 unicharset.id_to_unichar(target_text[text_index]), 00682 choice_it.data()->unichar_id() == target_text[text_index] 00683 ? 
"Match" : "Ambig", 00684 choices_pos, length); 00685 } 00686 SearchForText(choices, choices_pos + length, choices_length, target_text, 00687 text_index + 1, rating + choice_rating, segmentation, 00688 best_rating, best_segmentation); 00689 if (applybox_debug > 3) { 00690 tprintf("End recursion for %d=%s\n", target_text[text_index], 00691 unicharset.id_to_unichar(target_text[text_index])); 00692 } 00693 } 00694 segmentation->truncate(segmentation->size() - 1); 00695 } 00696 } 00697 00698 // Counts up the labelled words and the blobs within. 00699 // Deletes all unused or emptied words, counting the unused ones. 00700 // Resets W_BOL and W_EOL flags correctly. 00701 // Builds the rebuild_word and rebuilds the box_word and the best_choice. 00702 void Tesseract::TidyUp(PAGE_RES* page_res) { 00703 int ok_blob_count = 0; 00704 int bad_blob_count = 0; 00705 int ok_word_count = 0; 00706 int unlabelled_words = 0; 00707 PAGE_RES_IT pr_it(page_res); 00708 WERD_RES* word_res; 00709 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00710 int ok_in_word = 0; 00711 BLOB_CHOICE_LIST_VECTOR char_choices; 00712 for (int i = word_res->correct_text.size() - 1; i >= 0; i--) { 00713 if (word_res->correct_text[i].length() > 0) { 00714 ++ok_in_word; 00715 } 00716 // Since we only need a fake word_res->best_choice, the actual 00717 // unichar_ids do not matter. Which is fortunate, since TidyUp() 00718 // can be called while training Tesseract, at the stage where 00719 // unicharset is not meaningful yet. 
00720 char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0); 00721 } 00722 if (ok_in_word > 0) { 00723 ok_blob_count += ok_in_word; 00724 bad_blob_count += word_res->correct_text.size() - ok_in_word; 00725 MakeWordChoice(char_choices, unicharset, word_res->best_choice); 00726 } else { 00727 ++unlabelled_words; 00728 if (applybox_debug > 0) { 00729 tprintf("APPLY_BOXES: Unlabelled word at :"); 00730 word_res->word->bounding_box().print(); 00731 } 00732 pr_it.DeleteCurrentWord(); 00733 } 00734 char_choices.delete_data_pointers(); 00735 } 00736 pr_it.restart_page(); 00737 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00738 // Denormalize back to a BoxWord. 00739 word_res->RebuildBestState(); 00740 word_res->SetupBoxWord(); 00741 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); 00742 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); 00743 } 00744 if (applybox_debug > 0) { 00745 tprintf(" Found %d good blobs.\n", ok_blob_count); 00746 if (bad_blob_count > 0) { 00747 tprintf(" Leaving %d unlabelled blobs in %d words.\n", 00748 bad_blob_count, ok_word_count); 00749 } 00750 if (unlabelled_words > 0) 00751 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); 00752 } 00753 } 00754 00755 // Logs a bad box by line in the box file and box coords. 00756 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, 00757 const char *box_ch, const char *err_msg) { 00758 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", 00759 boxfile_lineno, box_ch, 00760 box.left(), box.bottom(), box.right(), box.top(), err_msg); 00761 } 00762 00763 // Creates a fake best_choice entry in each WERD_RES with the correct text. 
00764 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { 00765 PAGE_RES_IT pr_it(page_res); 00766 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00767 word_res = pr_it.forward()) { 00768 WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, 00769 word_res->correct_text.size()); 00770 for (int i = 0; i < word_res->correct_text.size(); ++i) { 00771 // The part before the first space is the real ground truth, and the 00772 // rest is the bounding box location and page number. 00773 GenericVector<STRING> tokens; 00774 word_res->correct_text[i].split(' ', &tokens); 00775 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); 00776 choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f); 00777 } 00778 if (word_res->best_choice != NULL) 00779 delete word_res->best_choice; 00780 word_res->best_choice = choice; 00781 } 00782 } 00783 00784 // Calls LearnWord to extract features for labelled blobs within each word. 00785 // Features are written to the given filename. 00786 void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { 00787 PAGE_RES_IT pr_it(page_res); 00788 int word_count = 0; 00789 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00790 word_res = pr_it.forward()) { 00791 LearnWord(filename.string(), NULL, word_res); 00792 ++word_count; 00793 } 00794 tprintf("Generated training data for %d words\n", word_count); 00795 } 00796 00797 00798 } // namespace tesseract