Tesseract
3.02
|
///////////////////////////////////////////////////////////////////////
// File:        segsearch.cpp
// Description: Segmentation search functions.
// Author:      Daria Antonova
// Created:     Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "wordrec.h"

#include "associate.h"
#include "baseline.h"
#include "language_model.h"
#include "matrix.h"
#include "oldheap.h"
#include "params.h"
#include "ratngs.h"
#include "states.h"

ELISTIZE(SEG_SEARCH_PENDING);

namespace tesseract {

// Runs the segmentation search on the word described by chunks_record:
// repeatedly classifies unclassified "pain point" cells of the ratings
// matrix (prioritized by the language model via the pain_points heap) and
// re-runs the language-model update over the affected columns until
// SegSearchDone() reports convergence or the heap runs dry.
// On exit best_choice / raw_choice / best_char_choices / output_best_state
// (bundled into a BestChoiceBundle) hold the best path found.
// blamer_bundle may be NULL; when present it can trigger an extra guided
// search for the correct segmentation to assign blame for errors.
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state,
                        BlamerBundle *blamer_bundle) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // TODO(antonova): Due to the fact that we currently do not re-start the
  // segmentation search from the best choice the chopper found, sometimes
  // the segmentation search does not find the best path (that chopper
  // did discover) and does not have a chance to adapt to it. As soon as we
  // transition to using new-style language model penalties in the chopper
  // this issue will be resolved. But for now we are forced to clear the
  // accumulated choices.
  //
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components; adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  // Initialized to "no path" (kBadRating cost, NULL Viterbi state entry).
  BestPathByColumn *best_path_by_column =
      new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio, rating_cert_scale,
                               pain_points, chunks_record, blamer_bundle,
                               wordrec_debug_blamer);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantees that all the parents would be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
      new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle, blamer_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (!SegSearchDone(num_futile_classifications) ||
         (blamer_bundle != NULL &&
          blamer_bundle->segsearch_is_looking_for_blame)) {
    // Get the next valid "pain point": skip (and free) stale heap entries
    // whose matrix cell is out of range or was already classified.
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    // Classify the pain point cell and seed pending entries for it.
    ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
                              best_choice_bundle.best_choice, &pending,
                              chunks_record, pain_points, blamer_bundle);

    // Propagate language model updates from the pain point's column onward.
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle,
                         blamer_bundle);
    // A classification that did not improve best_choice counts as futile;
    // SegSearchDone() uses this count as a termination criterion.
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle.updated = false;  // reset updated
    delete pain_point;  // done using this pain point

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect best_choice.
    if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
        blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
        !blamer_bundle->segsearch_is_looking_for_blame &&
        blamer_bundle->truth_has_char_boxes &&
        !ChoiceIsCorrect(getDict().getUnicharset(),
                         best_choice, blamer_bundle->truth_text)) {
      InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
                             pain_points, blamer_bundle, &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
                           blamer_bundle, &blamer_debug);

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up: free the heap (and any remaining pain points), the per-column
  // arrays, and the language model state attached to each classified cell of
  // the lower-triangular ratings matrix.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}

// Processes the pending child/parent pairs column by column, starting from
// starting_col, calling LanguageModel::UpdateState() on each. Whenever an
// update changes the language model state of an entry, the entry's own
// children (cells in column child_row+1) are queued into pending so the
// change propagates rightward. After each column, asks the language model to
// generate new pain points from the rows that were touched; after all
// columns, generates pain points from the best choice if it was updated.
void Wordrec::UpdateSegSearchNodes(
    int starting_col,
    SEG_SEARCH_PENDING_LIST *pending[],
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    HEAP *pain_points,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      // The pending list is sorted, so duplicate rows arrive adjacently;
      // record each distinct row only once.
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
        language_model_->UpdateState(p->changed, col, p->child_row,
                                     current_node, p->parent, pain_points,
                                     best_path_by_column, chunks_record,
                                     best_choice_bundle, blamer_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add all the
        // pairs with it as a parent and each of its children to pending, so
        // that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
                new SEG_SEARCH_PENDING(child_row, current_node, 0);
            // add_sorted_and_find() returns the existing duplicate if one is
            // already in the list; in that case discard the new entry and
            // merge the changed flags into the existing one.
            SEG_SEARCH_PENDING *actual_new_pending =
                reinterpret_cast<SEG_SEARCH_PENDING *>(
                    (*pending)[child_col].add_sorted_and_find(
                        SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
              tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                      " changed=0x%x to pending\n", child_col,
                      actual_new_pending->child_row,
                      col, p->child_row, actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    }  // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
        col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
        pain_points, best_path_by_column, chunks_record);
  }  // end for col

  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }

  language_model_->CleanUp();
}

// Classifies the blob combination at the given pain point (col, row) of the
// ratings matrix, stores the result in the matrix, seeds follow-up pain
// points joining the new blob with its left/right neighbors, and records
// pending entries pairing the pain point cell with each of its classified
// parents (or with a NULL parent if it is in column 0).
void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority,
                                        const MATRIX_COORD &pain_point,
                                        const WERD_CHOICE *best_choice,
                                        SEG_SEARCH_PENDING_LIST *pending[],
                                        CHUNKS_RECORD *chunks_record,
                                        HEAP *pain_points,
                                        BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
            pain_point_priority, pain_point.col, pain_point.row);
  }
  MATRIX *ratings = chunks_record->ratings;
  BLOB_CHOICE_LIST *classified = classify_piece(
      chunks_record->chunks, chunks_record->word_res->denorm,
      chunks_record->splits,
      pain_point.col, pain_point.row, blamer_bundle);
  ratings->put(pain_point.col, pain_point.row, classified);

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (!classified->empty()) {
    float worst_piece_cert;
    bool fragmented;
    if (pain_point.col > 0) {
      // Left neighbor: extend the piece one column to the left.
      language_model_->GetWorstPieceCertainty(
          pain_point.col-1, pain_point.row, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col-1, pain_point.row, false,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
    if (pain_point.row+1 < ratings->dimension()) {
      // Right neighbor: extend the piece one row (chunk) to the right.
      language_model_->GetWorstPieceCertainty(
          pain_point.col, pain_point.row+1, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col, pain_point.row+1, true,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
  }

  // Record a pending entry with the pain_point and each of its parents.
  int parent_row = pain_point.col - 1;
  if (parent_row < 0) {  // this node has no parents
    (*pending)[pain_point.col].add_sorted(
        SEG_SEARCH_PENDING::compare, true,
        new SEG_SEARCH_PENDING(pain_point.row, NULL,
                               LanguageModel::kAllChangedFlag));
  } else {
    // A parent is any classified cell ending just before the pain point
    // starts, i.e. in row parent_row at any column to the left.
    for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
      if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
        (*pending)[pain_point.col].add_sorted(
            SEG_SEARCH_PENDING::compare, true,
            new SEG_SEARCH_PENDING(pain_point.row,
                                   ratings->get(parent_col, parent_row),
                                   LanguageModel::kAllChangedFlag));
      }
    }
  }
}

// Switches the blamer into "looking for blame" mode: empties the pain points
// heap and refills it with pain points for every still-unclassified cell on
// the correct segmentation path recorded in blamer_bundle. If any such pain
// point cannot be inserted, gives up and blames the segmentation search
// heuristic (IRR_SEGSEARCH_HEUR). Appends a trace to *blamer_debug.
void Wordrec::InitBlamerForSegSearch(const WERD_CHOICE *best_choice,
                                     CHUNKS_RECORD *chunks_record,
                                     HEAP *pain_points,
                                     BlamerBundle *blamer_bundle,
                                     STRING *blamer_debug) {
  blamer_bundle->segsearch_is_looking_for_blame = true;
  if (wordrec_debug_blamer) {
    tprintf("segsearch starting to look for blame\n");
  }
  // Clear pain points heap.
  int pop;
  float pain_point_priority;
  MATRIX_COORD *pain_point;
  while ((pop = HeapPop(pain_points, &pain_point_priority,
                        &pain_point)) != EMPTY) {
    delete pain_point;
  }
  // Fill pain points for any unclassified blob corresponding to the
  // correct segmentation state.
  *blamer_debug += "Correct segmentation:\n";
  for (int idx = 0;
       idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
    blamer_debug->add_str_int(
        "col=", blamer_bundle->correct_segmentation_cols[idx]);
    blamer_debug->add_str_int(
        " row=", blamer_bundle->correct_segmentation_rows[idx]);
    *blamer_debug += "\n";
    if (chunks_record->ratings->get(
        blamer_bundle->correct_segmentation_cols[idx],
        blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
      if (!language_model_->GeneratePainPoint(
          blamer_bundle->correct_segmentation_cols[idx],
          blamer_bundle->correct_segmentation_rows[idx],
          false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
          NULL, NULL, chunks_record, pain_points)) {
        blamer_bundle->segsearch_is_looking_for_blame = false;
        *blamer_debug += "\nFailed to insert pain point\n";
        blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
                                wordrec_debug_blamer);
        break;
      }
    }
  }  // end for blamer_bundle->correct_segmentation_cols/rows
}

// If we are still looking for blame (i.e. best_choice is incorrect, but a
// path representing the correct segmentation could be constructed), we can
// blame segmentation search pain point prioritization if the rating of the
// path corresponding to the correct segmentation is better than that of
// best_choice (i.e. language model would have done the correct thing, but
// because of poor pain point prioritization the correct segmentation was
// never explored). Otherwise we blame the tradeoff between the language model
// and the classifier, since even after exploring the path corresponding to
// the correct segmentation incorrect best_choice would have been chosen.
// One special case when we blame the classifier instead is when best choice
// is incorrect, but it is a dictionary word and it is the classifier's top
// choice.
void Wordrec::FinishBlamerForSegSearch(const WERD_CHOICE *best_choice,
                                       BlamerBundle *blamer_bundle,
                                       STRING *blamer_debug) {
  if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) {
    blamer_bundle->segsearch_is_looking_for_blame = false;
    if (blamer_bundle->best_choice_is_dict_and_top_choice) {
      *blamer_debug = "Best choice is: incorrect, top choice, dictionary word";
      *blamer_debug += " with permuter ";
      *blamer_debug += best_choice->permuter_name();
      blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    } else if (blamer_bundle->best_correctly_segmented_rating <
               best_choice->rating()) {
      *blamer_debug += "Correct segmentation state was not explored";
      blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    } else {
      if (blamer_bundle->best_correctly_segmented_rating >=
          WERD_CHOICE::kBadRating) {
        *blamer_debug += "Correct segmentation paths were pruned by LM\n";
      } else {
        char debug_buffer[256];
        *blamer_debug += "Best correct segmentation rating ";
        sprintf(debug_buffer, "%g",
                blamer_bundle->best_correctly_segmented_rating);
        *blamer_debug += debug_buffer;
        *blamer_debug += " vs. best choice rating ";
        sprintf(debug_buffer, "%g", best_choice->rating());
        *blamer_debug += debug_buffer;
      }
      blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice,
                              wordrec_debug_blamer);
    }
  }
}

}  // namespace tesseract