Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: pieces.c (Formerly pieces.c) 00005 * Description: 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #include "blobs.h" 00030 #include "freelist.h" 00031 #include "helpers.h" 00032 #include "matchtab.h" 00033 #include "matrix.h" 00034 #include "ndminx.h" 00035 #include "plotseg.h" 00036 #include "ratngs.h" 00037 #include "seam.h" 00038 #include "states.h" 00039 #include "wordclass.h" 00040 #include "wordrec.h" 00041 00042 // Include automatically generated configuration file if running autoconf. 00043 #ifdef HAVE_CONFIG_H 00044 #include "config_auto.h" 00045 #endif 00046 00047 /*---------------------------------------------------------------------- 00048 F u n c t i o n s 00049 ----------------------------------------------------------------------*/ 00050 00051 00052 /********************************************************************** 00053 * bounds_of_piece 00054 * 00055 * Find the bounds of the piece that will be created by joining the 00056 * requested collection of pieces together. 00057 **********************************************************************/ 00058 TBOX bounds_of_piece(TBOX *bounds, inT16 start, inT16 end) { 00059 TBOX all_together = bounds[start]; 00060 for (int x = start + 1; x <= end; x++) { 00061 all_together += bounds[x]; 00062 } 00063 return all_together; 00064 } 00065 00066 00067 /********************************************************************** 00068 * classify_piece 00069 * 00070 * Create a larger piece from a collection of smaller ones. Classify 00071 * it and return the results. Take the large piece apart to leave 00072 * the collection of small pieces un modified. 00073 **********************************************************************/ 00074 namespace tesseract { 00075 BLOB_CHOICE_LIST *Wordrec::classify_piece(TBLOB *pieces, 00076 const DENORM& denorm, 00077 SEAMS seams, 00078 inT16 start, 00079 inT16 end, 00080 BlamerBundle *blamer_bundle) { 00081 BLOB_CHOICE_LIST *choices; 00082 TBLOB *blob; 00083 inT16 x; 00084 00085 join_pieces(pieces, seams, start, end); 00086 for (blob = pieces, x = 0; x < start; x++) { 00087 blob = blob->next; 00088 } 00089 choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle); 00090 00091 break_pieces(blob, seams, start, end); 00092 #ifndef GRAPHICS_DISABLED 00093 if (wordrec_display_segmentations > 2) { 00094 STATE current_state; 00095 SEARCH_STATE chunk_groups; 00096 set_n_ones (¤t_state, array_count(seams)); 00097 chunk_groups = bin_to_chunks(¤t_state, array_count(seams)); 00098 display_segmentation(pieces, chunk_groups); 00099 window_wait(segm_window); 00100 memfree(chunk_groups); 00101 } 00102 #endif 00103 00104 return (choices); 00105 } 00106 00107 template<class BLOB_CHOICE> 00108 int SortByUnicharID(const void *void1, const void *void2) { 00109 const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1); 00110 const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2); 00111 00112 return p1->unichar_id() - p2->unichar_id(); 00113 } 00114 00115 template<class BLOB_CHOICE> 00116 int SortByRating(const void *void1, const void *void2) { 00117 const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1); 00118 const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2); 00119 00120 if (p1->rating() < p2->rating()) 00121 return 1; 00122 return -1; 00123 } 00124 00125 00126 /********************************************************************** 00127 * fill_filtered_fragment_list 00128 * 00129 * Filter the fragment list so that the filtered_choices only contain 00130 * fragments that are in the correct position. choices is the list 00131 * that we are going to filter. fragment_pos is the position in the 00132 * fragment that we are looking for and num_frag_parts is the the 00133 * total number of pieces. The result will be appended to 00134 * filtered_choices. 00135 **********************************************************************/ 00136 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, 00137 int fragment_pos, 00138 int num_frag_parts, 00139 BLOB_CHOICE_LIST *filtered_choices) { 00140 BLOB_CHOICE_IT filtered_choices_it(filtered_choices); 00141 BLOB_CHOICE_IT choices_it(choices); 00142 00143 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00144 choices_it.forward()) { 00145 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); 00146 const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); 00147 00148 if (frag != NULL && frag->get_pos() == fragment_pos && 00149 frag->get_total() == num_frag_parts) { 00150 // Recover the unichar_id of the unichar that this fragment is 00151 // a part of 00152 BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data()); 00153 int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); 00154 b->set_unichar_id(original_unichar); 00155 filtered_choices_it.add_to_end(b); 00156 } 00157 } 00158 00159 filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>); 00160 } 00161 00162 00163 /********************************************************************** 00164 * merge_and_put_fragment_lists 00165 * 00166 * Merge the fragment lists in choice_lists and append it to the 00167 * ratings matrix. 00168 **********************************************************************/ 00169 void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, 00170 inT16 num_frag_parts, 00171 BLOB_CHOICE_LIST *choice_lists, 00172 MATRIX *ratings) { 00173 BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; 00174 00175 for (int i = 0; i < num_frag_parts; i++) { 00176 choice_lists_it[i].set_to_list(&choice_lists[i]); 00177 choice_lists_it[i].mark_cycle_pt(); 00178 } 00179 00180 BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); 00181 if (merged_choice == NULL) 00182 merged_choice = new BLOB_CHOICE_LIST; 00183 00184 bool end_of_list = false; 00185 BLOB_CHOICE_IT merged_choice_it(merged_choice); 00186 while (!end_of_list) { 00187 // Find the maximum unichar_id of the current entry the iterators 00188 // are pointing at 00189 UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); 00190 int max_list = 0; 00191 for (int i = 0; i < num_frag_parts; i++) { 00192 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00193 if (max_unichar_id < unichar_id) { 00194 max_unichar_id = unichar_id; 00195 max_list = i; 00196 } 00197 } 00198 00199 // Move the each iterators until it gets to an entry that has a 00200 // value greater than or equal to max_unichar_id 00201 for (int i = 0; i < num_frag_parts; i++) { 00202 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00203 while (!choice_lists_it[i].cycled_list() && 00204 unichar_id < max_unichar_id) { 00205 choice_lists_it[i].forward(); 00206 unichar_id = choice_lists_it[i].data()->unichar_id(); 00207 } 00208 if (choice_lists_it[i].cycled_list()) { 00209 end_of_list = true; 00210 break; 00211 } 00212 } 00213 00214 if (end_of_list) 00215 break; 00216 00217 // Checks if the fragments are parts of the same character 00218 UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); 00219 bool same_unichar = true; 00220 for (int i = 1; i < num_frag_parts; i++) { 00221 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00222 if (unichar_id != first_unichar_id) { 00223 same_unichar = false; 00224 break; 00225 } 00226 } 00227 00228 if (same_unichar) { 00229 // Add the merged character to the result 00230 UNICHAR_ID merged_unichar_id = first_unichar_id; 00231 inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); 00232 inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); 00233 inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight(); 00234 inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight(); 00235 int merged_script_id = choice_lists_it[0].data()->script_id(); 00236 bool merged_adapted = choice_lists_it[0].data()->adapted(); 00237 00238 float merged_rating = 0, merged_certainty = 0; 00239 for (int i = 0; i < num_frag_parts; i++) { 00240 float rating = choice_lists_it[i].data()->rating(); 00241 float certainty = choice_lists_it[i].data()->certainty(); 00242 00243 if (i == 0 || certainty < merged_certainty) 00244 merged_certainty = certainty; 00245 merged_rating += rating; 00246 00247 choice_lists_it[i].forward(); 00248 if (choice_lists_it[i].cycled_list()) 00249 end_of_list = true; 00250 IntersectRange(choice_lists_it[i].data()->min_xheight(), 00251 choice_lists_it[i].data()->max_xheight(), 00252 &merged_min_xheight, &merged_max_xheight); 00253 } 00254 00255 merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, 00256 merged_rating, 00257 merged_certainty, 00258 merged_fontinfo_id, 00259 merged_fontinfo_id2, 00260 merged_script_id, 00261 merged_min_xheight, 00262 merged_max_xheight, 00263 merged_adapted)); 00264 } 00265 } 00266 00267 if (classify_debug_level) 00268 print_ratings_list("Merged Fragments", merged_choice, 00269 unicharset); 00270 00271 if (merged_choice->empty()) 00272 delete merged_choice; 00273 else 00274 ratings->put(row, column, merged_choice); 00275 00276 delete [] choice_lists_it; 00277 } 00278 00279 00280 /********************************************************************** 00281 * get_fragment_lists 00282 * 00283 * Recursively go through the ratings matrix to find lists of fragments 00284 * to be merged in the function merge_and_put_fragment_lists. 00285 * current_frag is the postion of the piece we are looking for. 00286 * current_row is the row in the rating matrix we are currently at. 00287 * start is the row we started initially, so that we can know where 00288 * to append the results to the matrix. num_frag_parts is the total 00289 * number of pieces we are looking for and num_blobs is the size of the 00290 * ratings matrix. 00291 **********************************************************************/ 00292 void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row, 00293 inT16 start, inT16 num_frag_parts, 00294 inT16 num_blobs, MATRIX *ratings, 00295 BLOB_CHOICE_LIST *choice_lists) { 00296 if (current_frag == num_frag_parts) { 00297 merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, 00298 choice_lists, ratings); 00299 return; 00300 } 00301 00302 for (inT16 x = current_row; x < num_blobs; x++) { 00303 BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); 00304 if (choices == NULL) 00305 continue; 00306 00307 fill_filtered_fragment_list(choices, current_frag, num_frag_parts, 00308 &choice_lists[current_frag]); 00309 if (!choice_lists[current_frag].empty()) { 00310 get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, 00311 num_blobs, ratings, choice_lists); 00312 choice_lists[current_frag].clear(); 00313 } 00314 } 00315 } 00316 00317 00318 /********************************************************************** 00319 * merge_fragments 00320 * 00321 * Try to merge fragments in the ratings matrix and put the result in 00322 * the corresponding row and column 00323 **********************************************************************/ 00324 void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) { 00325 BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks]; 00326 for (inT16 start = 0; start < num_blobs; start++) { 00327 for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks; 00328 frag_parts++) { 00329 get_fragment_lists(0, start, start, frag_parts, num_blobs, 00330 ratings, choice_lists); 00331 } 00332 } 00333 00334 // Delete fragments from the rating matrix 00335 for (inT16 x = 0; x < num_blobs; x++) { 00336 for (inT16 y = x; y < num_blobs; y++) { 00337 BLOB_CHOICE_LIST *choices = ratings->get(x, y); 00338 if (choices != NULL) { 00339 BLOB_CHOICE_IT choices_it(choices); 00340 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00341 choices_it.forward()) { 00342 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); 00343 const CHAR_FRAGMENT *frag = 00344 unicharset.get_fragment(choice_unichar_id); 00345 if (frag != NULL) 00346 delete choices_it.extract(); 00347 } 00348 } 00349 } 00350 } 00351 } 00352 00353 00354 /********************************************************************** 00355 * get_piece_rating 00356 * 00357 * Check to see if this piece has already been classified. If it has 00358 * return that rating. Otherwise build the piece from the smaller 00359 * pieces, classify it, store the rating for later, and take the piece 00360 * apart again. 00361 **********************************************************************/ 00362 BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings, 00363 TBLOB *blobs, 00364 const DENORM& denorm, 00365 SEAMS seams, 00366 inT16 start, 00367 inT16 end, 00368 BlamerBundle *blamer_bundle) { 00369 BLOB_CHOICE_LIST *choices = ratings->get(start, end); 00370 if (choices == NOT_CLASSIFIED) { 00371 choices = classify_piece(blobs, 00372 denorm, 00373 seams, 00374 start, 00375 end, 00376 blamer_bundle); 00377 ratings->put(start, end, choices); 00378 if (wordrec_debug_level > 1) { 00379 tprintf("get_piece_rating(): updated ratings matrix\n"); 00380 ratings->print(getDict().getUnicharset()); 00381 } 00382 } 00383 return (choices); 00384 } 00385 00386 00387 /********************************************************************** 00388 * record_blob_bounds 00389 * 00390 * Set up and initialize an array that holds the bounds of a set of 00391 * blobs. Caller should delete[] the array. 00392 **********************************************************************/ 00393 TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) { 00394 int nblobs = count_blobs(blobs); 00395 TBOX *bboxes = new TBOX[nblobs]; 00396 00397 inT16 x = 0; 00398 for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { 00399 bboxes[x] = blob->bounding_box(); 00400 x++; 00401 } 00402 return bboxes; 00403 } 00404 00405 00406 /********************************************************************** 00407 * record_piece_ratings 00408 * 00409 * Save the choices for all the pieces that have been classified into 00410 * a matrix that can be used to look them up later. A two dimensional 00411 * matrix is created. The indices correspond to the starting and 00412 * ending initial piece number. 00413 **********************************************************************/ 00414 MATRIX *Wordrec::record_piece_ratings(TBLOB *blobs) { 00415 inT16 num_blobs = count_blobs(blobs); 00416 TBOX *bounds = record_blob_bounds(blobs); 00417 MATRIX *ratings = new MATRIX(num_blobs); 00418 00419 for (int x = 0; x < num_blobs; x++) { 00420 for (int y = x; y < num_blobs; y++) { 00421 TBOX piecebox = bounds_of_piece(bounds, x, y); 00422 BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox); 00423 if (choices != NULL) { 00424 ratings->put(x, y, choices); 00425 } 00426 } 00427 } 00428 00429 if (merge_fragments_in_matrix) 00430 merge_fragments(ratings, num_blobs); 00431 00432 delete []bounds; 00433 return ratings; 00434 } 00435 00436 } // namespace tesseract