Tesseract  3.02
tesseract-ocr/wordrec/segsearch.cpp
// File:        segsearch.cpp
// Description: Segmentation search functions.
// Author:      Daria Antonova
// Created:     Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "wordrec.h"

#include "associate.h"
#include "baseline.h"
#include "language_model.h"
#include "matrix.h"
#include "oldheap.h"
#include "params.h"
#include "ratngs.h"
#include "states.h"

ELISTIZE(SEG_SEARCH_PENDING);

namespace tesseract {
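// Performs a best-first segmentation search over the ratings matrix in
// chunks_record. The search is seeded with the classified entries in column
// 0 and then repeatedly asks the language model for "pain points" (promising
// but not yet classified matrix entries), classifies them and updates the
// search state, until further classifications stop improving best_choice
// (see SegSearchDone()) or the pain point queue is exhausted. If
// blamer_bundle is non-NULL, the search may afterwards be continued along
// the known correct segmentation to decide what to blame for an incorrect
// best_choice.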
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state,
                        BlamerBundle *blamer_bundle) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // TODO(antonova): Because we currently do not re-start the segmentation
  // search from the best choice the chopper found, sometimes the segmentation
  // search does not find the best path (that the chopper did discover) and
  // does not get a chance to adapt to it. As soon as we transition to using
  // new-style language model penalties in the chopper this issue will be
  // resolved. But for now we are forced to clear the accumulator choices.
  //
  // Clear the best choice accumulator (which is used for adaption), so that
  // choices adjusted by the chopper do not interfere with the results from
  // the segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components; adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio, rating_cert_scale,
                               pain_points, chunks_record, blamer_bundle,
                               wordrec_debug_blamer);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);
  // pending[i] stores a list of the parent/child pairs of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in non-decreasing order of their column, since
  // this guarantees that all the parents will be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];
  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle, blamer_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (!SegSearchDone(num_futile_classifications) ||
         (blamer_bundle != NULL &&
          blamer_bundle->segsearch_is_looking_for_blame)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
                              best_choice_bundle.best_choice, &pending,
                              chunks_record, pain_points, blamer_bundle);

    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle.updated = false;  // reset updated
    delete pain_point;  // done using this pain point

    // See if it's time to terminate SegSearch or to start a guided search
    // for the true path in order to find the blame for the incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
        blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
        !blamer_bundle->segsearch_is_looking_for_blame &&
        blamer_bundle->truth_has_char_boxes &&
        !ChoiceIsCorrect(getDict().getUnicharset(),
                         best_choice, blamer_bundle->truth_text)) {
      InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
                             pain_points, blamer_bundle, &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
                           blamer_bundle, &blamer_debug);

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
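  // Free the language model state cached in the classified entries of the
  // ratings matrix.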
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}

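// Processes the pending child/parent pairs column by column, starting from
// starting_col. For each pair the language model state is updated; if the
// update changes the state of an entry, pairs of that entry with each of its
// children are added to pending, so that the change is propagated further.
// After a column is processed, new pain points are generated for it, and if
// best_choice was updated, additional pain points are generated from the new
// best choice.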
void Wordrec::UpdateSegSearchNodes(
    int starting_col,
    SEG_SEARCH_PENDING_LIST *pending[],
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    HEAP *pain_points,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
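    // Rows of the pending entries processed in this column; used below to
    // generate pain points for the column.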
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
        language_model_->UpdateState(p->changed, col, p->child_row,
                                     current_node, p->parent, pain_points,
                                     best_path_by_column, chunks_record,
                                     best_choice_bundle, blamer_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add to
        // pending the pairs of this entry (as the parent) with each of its
        // children, so that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
              new SEG_SEARCH_PENDING(child_row, current_node, 0);
            SEG_SEARCH_PENDING *actual_new_pending =
              reinterpret_cast<SEG_SEARCH_PENDING *>(
                  (*pending)[child_col].add_sorted_and_find(
                  SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
              tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                      " changed=0x%x to pending\n", child_col,
                      actual_new_pending->child_row,
                      col, p->child_row, actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    }  // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
      col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
      pain_points, best_path_by_column, chunks_record);
  }  // end for col

  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }

  language_model_->CleanUp();
}

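// Classifies the blob of chunks identified by pain_point, stores the result
// in the ratings matrix, generates initial pain points joining the newly
// classified blob with its left and right neighbors, and records pending
// entries pairing the new blob with each of its parents so that
// UpdateSegSearchNodes() can incorporate it into the search.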
void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority,
                                        const MATRIX_COORD &pain_point,
                                        const WERD_CHOICE *best_choice,
                                        SEG_SEARCH_PENDING_LIST *pending[],
                                        CHUNKS_RECORD *chunks_record,
                                        HEAP *pain_points,
                                        BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
            pain_point_priority, pain_point.col, pain_point.row);
  }
  MATRIX *ratings = chunks_record->ratings;
  BLOB_CHOICE_LIST *classified = classify_piece(
      chunks_record->chunks, chunks_record->word_res->denorm,
      chunks_record->splits,
      pain_point.col, pain_point.row, blamer_bundle);
  ratings->put(pain_point.col, pain_point.row, classified);

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (!classified->empty()) {
    float worst_piece_cert;
    bool fragmented;
    if (pain_point.col > 0) {
      language_model_->GetWorstPieceCertainty(
          pain_point.col-1, pain_point.row, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col-1, pain_point.row, false,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
    if (pain_point.row+1 < ratings->dimension()) {
      language_model_->GetWorstPieceCertainty(
          pain_point.col, pain_point.row+1, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col, pain_point.row+1, true,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
  }

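  // A ratings matrix entry (col, row) covers chunks col through row, so the
  // parents of this pain point are the classified entries that end at chunk
  // pain_point.col - 1, i.e. the entries in row pain_point.col - 1.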
  // Record a pending entry with the pain_point and each of its parents.
  int parent_row = pain_point.col - 1;
  if (parent_row < 0) {  // this node has no parents
    (*pending)[pain_point.col].add_sorted(
        SEG_SEARCH_PENDING::compare, true,
        new SEG_SEARCH_PENDING(pain_point.row, NULL,
                               LanguageModel::kAllChangedFlag));
  } else {
    for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
      if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
        (*pending)[pain_point.col].add_sorted(
            SEG_SEARCH_PENDING::compare, true,
            new SEG_SEARCH_PENDING(pain_point.row,
                                   ratings->get(parent_col, parent_row),
                                   LanguageModel::kAllChangedFlag));
      }
    }
  }
}

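// Switches the blamer into guided-search mode: clears the pain points heap
// and seeds it with a pain point for every entry of the correct segmentation
// that has not been classified yet. If any of these pain points cannot be
// inserted, blame is assigned to the segmentation search heuristics.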
void Wordrec::InitBlamerForSegSearch(const WERD_CHOICE *best_choice,
                                     CHUNKS_RECORD *chunks_record,
                                     HEAP *pain_points,
                                     BlamerBundle *blamer_bundle,
                                     STRING *blamer_debug) {
  blamer_bundle->segsearch_is_looking_for_blame = true;
  if (wordrec_debug_blamer) {
    tprintf("segsearch starting to look for blame\n");
  }
  // Clear the pain points heap.
  int pop;
  float pain_point_priority;
  MATRIX_COORD *pain_point;
  while ((pop = HeapPop(pain_points, &pain_point_priority,
                        &pain_point)) != EMPTY) {
    delete pain_point;
  }
  // Fill pain points for any unclassified blob corresponding to the
  // correct segmentation state.
  *blamer_debug += "Correct segmentation:\n";
  for (int idx = 0;
       idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
    blamer_debug->add_str_int(
        "col=", blamer_bundle->correct_segmentation_cols[idx]);
    blamer_debug->add_str_int(
        " row=", blamer_bundle->correct_segmentation_rows[idx]);
    *blamer_debug += "\n";
    if (chunks_record->ratings->get(
        blamer_bundle->correct_segmentation_cols[idx],
        blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
      if (!language_model_->GeneratePainPoint(
          blamer_bundle->correct_segmentation_cols[idx],
          blamer_bundle->correct_segmentation_rows[idx],
          false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
          NULL, NULL, chunks_record, pain_points)) {
        blamer_bundle->segsearch_is_looking_for_blame = false;
        *blamer_debug += "\nFailed to insert pain point\n";
        blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
                                wordrec_debug_blamer);
        break;
      }
    }
  }  // end for blamer_bundle->correct_segmentation_cols/rows
}

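// Called once the search is over. If the blamer was still looking for blame,
// decides (as explained in the comment below) whether to blame the
// classifier, the pain point prioritization, or the classifier/language
// model tradeoff for the incorrect best_choice.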
void Wordrec::FinishBlamerForSegSearch(const WERD_CHOICE *best_choice,
                                       BlamerBundle *blamer_bundle,
                                       STRING *blamer_debug) {
  // If we are still looking for blame (i.e. best_choice is incorrect, but a
  // path representing the correct segmentation could be constructed), we can
  // blame segmentation search pain point prioritization if the rating of the
  // path corresponding to the correct segmentation is better than that of
  // best_choice (i.e. the language model would have done the correct thing,
  // but because of poor pain point prioritization the correct segmentation
  // was never explored). Otherwise we blame the tradeoff between the language
  // model and the classifier, since even after exploring the path
  // corresponding to the correct segmentation the incorrect best_choice would
  // have been chosen. One special case where we blame the classifier instead
  // is when the best choice is incorrect, but it is a dictionary word and the
  // classifier's top choice.
  if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) {
    blamer_bundle->segsearch_is_looking_for_blame = false;
    if (blamer_bundle->best_choice_is_dict_and_top_choice) {
      *blamer_debug = "Best choice is: incorrect, top choice, dictionary word";
      *blamer_debug += " with permuter ";
      *blamer_debug += best_choice->permuter_name();
      blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    } else if (blamer_bundle->best_correctly_segmented_rating <
        best_choice->rating()) {
      *blamer_debug += "Correct segmentation state was not explored";
      blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    } else {
      if (blamer_bundle->best_correctly_segmented_rating >=
          WERD_CHOICE::kBadRating) {
        *blamer_debug += "Correct segmentation paths were pruned by LM\n";
      } else {
        char debug_buffer[256];
        *blamer_debug += "Best correct segmentation rating ";
        sprintf(debug_buffer, "%g",
                blamer_bundle->best_correctly_segmented_rating);
        *blamer_debug += debug_buffer;
        *blamer_debug += " vs. best choice rating ";
        sprintf(debug_buffer, "%g", best_choice->rating());
        *blamer_debug += debug_buffer;
      }
      blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    }
  }
}

}  // namespace tesseract