Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: edgblob.c (Formerly edgeloop.c) 00003 * Description: Functions to clean up an outline before approximation. 00004 * Author: Ray Smith 00005 * Created: Tue Mar 26 16:56:25 GMT 1991 00006 * 00007 *(C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0(the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #include "scanedg.h" 00022 #include "drawedg.h" 00023 #include "edgloop.h" 00024 #include "edgblob.h" 00025 00026 // Include automatically generated configuration file if running autoconf. 00027 #ifdef HAVE_CONFIG_H 00028 #include "config_auto.h" 00029 #endif 00030 00031 #define EXTERN 00032 00033 // Control parameters used in outline_complexity(), which rejects an outline 00034 // if any one of the 3 conditions is satisfied: 00035 // - number of children exceeds edges_max_children_per_outline 00036 // - number of nested layers exceeds edges_max_children_layers 00037 // - joint complexity exceeds edges_children_count_limit(as in child_count()) 00038 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE, 00039 "Use the new outline complexity module"); 00040 EXTERN INT_VAR(edges_max_children_per_outline, 10, 00041 "Max number of children inside a character outline"); 00042 EXTERN INT_VAR(edges_max_children_layers, 5, 00043 "Max layers of nested children inside a character outline"); 00044 EXTERN BOOL_VAR(edges_debug, FALSE, 00045 "turn on debugging for this module"); 00046 00047 00048 EXTERN INT_VAR(edges_children_per_grandchild, 10, 00049 "Importance ratio for chucking outlines"); 00050 EXTERN INT_VAR(edges_children_count_limit, 45, 00051 "Max holes allowed in blob"); 00052 EXTERN BOOL_VAR(edges_children_fix, FALSE, 00053 "Remove boxy parents of char-like children"); 00054 EXTERN INT_VAR(edges_min_nonhole, 12, 00055 "Min pixels for potential char in box"); 00056 EXTERN INT_VAR(edges_patharea_ratio, 40, 00057 "Max lensq/area for acceptable child outline"); 00058 EXTERN double_VAR(edges_childarea, 0.5, 00059 "Min area fraction of child outline"); 00060 EXTERN double_VAR(edges_boxarea, 0.875, 00061 "Min area fraction of grandchild for box"); 00062 00069 OL_BUCKETS::OL_BUCKETS( 00070 ICOORD bleft, // corners 00071 ICOORD tright): bl(bleft), tr(tright) { 00072 bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1; 00073 bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1; 00074 // make array 00075 buckets = new C_OUTLINE_LIST[bxdim * bydim]; 00076 index = 0; 00077 } 00078 00079 00087 C_OUTLINE_LIST * 00088 OL_BUCKETS::operator()( // array access 00089 inT16 x, // image coords 00090 inT16 y) { 00091 return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE]; 00092 } 00093 00094 00115 inT32 OL_BUCKETS::outline_complexity( 00116 C_OUTLINE *outline, // parent outline 00117 inT32 max_count, // max output 00118 inT16 depth // recurion depth 00119 ) { 00120 inT16 xmin, xmax; // coord limits 00121 inT16 ymin, ymax; 00122 inT16 xindex, yindex; // current bucket 00123 C_OUTLINE *child; // current child 00124 inT32 child_count; // no of children 00125 inT32 grandchild_count; // no of grandchildren 00126 C_OUTLINE_IT child_it; // search iterator 00127 00128 TBOX olbox = outline->bounding_box(); 00129 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00130 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00131 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00132 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00133 child_count = 0; 00134 grandchild_count = 0; 00135 if (++depth > edges_max_children_layers) // nested loops are too deep 00136 return max_count + depth; 00137 00138 for (yindex = ymin; yindex <= ymax; yindex++) { 00139 for (xindex = xmin; xindex <= xmax; xindex++) { 00140 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00141 if (child_it.empty()) 00142 continue; 00143 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00144 child_it.forward()) { 00145 child = child_it.data(); 00146 if (child == outline || !(*child < *outline)) 00147 continue; 00148 child_count++; 00149 00150 if (child_count > edges_max_children_per_outline) { // too fragmented 00151 if (edges_debug) 00152 tprintf("Discard outline on child_count=%d > " 00153 "max_children_per_outline=%d\n", 00154 child_count, 00155 static_cast<inT32>(edges_max_children_per_outline)); 00156 return max_count + child_count; 00157 } 00158 00159 // Compute the "complexity" of each child recursively 00160 inT32 remaining_count = max_count - child_count - grandchild_count; 00161 if (remaining_count > 0) 00162 grandchild_count += edges_children_per_grandchild * 00163 outline_complexity(child, remaining_count, depth); 00164 if (child_count + grandchild_count > max_count) { // too complex 00165 if (edges_debug) 00166 tprintf("Disgard outline on child_count=%d + grandchild_count=%d " 00167 "> max_count=%d\n", 00168 child_count, grandchild_count, max_count); 00169 return child_count + grandchild_count; 00170 } 00171 } 00172 } 00173 } 00174 return child_count + grandchild_count; 00175 } 00176 00177 00184 inT32 OL_BUCKETS::count_children( // recursive count 00185 C_OUTLINE *outline, // parent outline 00186 inT32 max_count // max output 00187 ) { 00188 BOOL8 parent_box; // could it be boxy 00189 inT16 xmin, xmax; // coord limits 00190 inT16 ymin, ymax; 00191 inT16 xindex, yindex; // current bucket 00192 C_OUTLINE *child; // current child 00193 inT32 child_count; // no of children 00194 inT32 grandchild_count; // no of grandchildren 00195 inT32 parent_area; // potential box 00196 FLOAT32 max_parent_area; // potential box 00197 inT32 child_area; // current child 00198 inT32 child_length; // current child 00199 TBOX olbox; 00200 C_OUTLINE_IT child_it; // search iterator 00201 00202 olbox = outline->bounding_box(); 00203 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00204 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00205 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00206 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00207 child_count = 0; 00208 grandchild_count = 0; 00209 parent_area = 0; 00210 max_parent_area = 0; 00211 parent_box = TRUE; 00212 for (yindex = ymin; yindex <= ymax; yindex++) { 00213 for (xindex = xmin; xindex <= xmax; xindex++) { 00214 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00215 if (child_it.empty()) 00216 continue; 00217 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00218 child_it.forward()) { 00219 child = child_it.data(); 00220 if (child != outline && *child < *outline) { 00221 child_count++; 00222 if (child_count <= max_count) { 00223 int max_grand =(max_count - child_count) / 00224 edges_children_per_grandchild; 00225 if (max_grand > 0) 00226 grandchild_count += count_children(child, max_grand) * 00227 edges_children_per_grandchild; 00228 else 00229 grandchild_count += count_children(child, 1); 00230 } 00231 if (child_count + grandchild_count > max_count) { 00232 if (edges_debug) 00233 tprintf("Discarding parent with child count=%d, gc=%d\n", 00234 child_count,grandchild_count); 00235 return child_count + grandchild_count; 00236 } 00237 if (parent_area == 0) { 00238 parent_area = outline->outer_area(); 00239 if (parent_area < 0) 00240 parent_area = -parent_area; 00241 max_parent_area = outline->bounding_box().area() * edges_boxarea; 00242 if (parent_area < max_parent_area) 00243 parent_box = FALSE; 00244 } 00245 if (parent_box && 00246 (!edges_children_fix || 00247 child->bounding_box().height() > edges_min_nonhole)) { 00248 child_area = child->outer_area(); 00249 if (child_area < 0) 00250 child_area = -child_area; 00251 if (edges_children_fix) { 00252 if (parent_area - child_area < max_parent_area) { 00253 parent_box = FALSE; 00254 continue; 00255 } 00256 if (grandchild_count > 0) { 00257 if (edges_debug) 00258 tprintf("Discarding parent of area %d, child area=%d, max%g " 00259 "with gc=%d\n", 00260 parent_area, child_area, max_parent_area, 00261 grandchild_count); 00262 return max_count + 1; 00263 } 00264 child_length = child->pathlength(); 00265 if (child_length * child_length > 00266 child_area * edges_patharea_ratio) { 00267 if (edges_debug) 00268 tprintf("Discarding parent of area %d, child area=%d, max%g " 00269 "with child length=%d\n", 00270 parent_area, child_area, max_parent_area, 00271 child_length); 00272 return max_count + 1; 00273 } 00274 } 00275 if (child_area < child->bounding_box().area() * edges_childarea) { 00276 if (edges_debug) 00277 tprintf("Discarding parent of area %d, child area=%d, max%g " 00278 "with child rect=%d\n", 00279 parent_area, child_area, max_parent_area, 00280 child->bounding_box().area()); 00281 return max_count + 1; 00282 } 00283 } 00284 } 00285 } 00286 } 00287 } 00288 return child_count + grandchild_count; 00289 } 00290 00291 00292 00293 00300 void OL_BUCKETS::extract_children( // recursive count 00301 C_OUTLINE *outline, // parent outline 00302 C_OUTLINE_IT *it // destination iterator 00303 ) { 00304 inT16 xmin, xmax; // coord limits 00305 inT16 ymin, ymax; 00306 inT16 xindex, yindex; // current bucket 00307 TBOX olbox; 00308 C_OUTLINE_IT child_it; // search iterator 00309 00310 olbox = outline->bounding_box(); 00311 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00312 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00313 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00314 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00315 for (yindex = ymin; yindex <= ymax; yindex++) { 00316 for (xindex = xmin; xindex <= xmax; xindex++) { 00317 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00318 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00319 child_it.forward()) { 00320 if (*child_it.data() < *outline) { 00321 it->add_after_then_move(child_it.extract()); 00322 } 00323 } 00324 } 00325 } 00326 } 00327 00328 00335 void extract_edges(Pix* pix, // thresholded image 00336 BLOCK *block) { // block to scan 00337 C_OUTLINE_LIST outlines; // outlines in block 00338 C_OUTLINE_IT out_it = &outlines; 00339 00340 // TODO(rays) move the pix all the way down to the bottom. 00341 IMAGE image; 00342 image.FromPix(pix); 00343 00344 block_edges(&image, block, &out_it); 00345 ICOORD bleft; // block box 00346 ICOORD tright; 00347 block->bounding_box(bleft, tright); 00348 // make blobs 00349 outlines_to_blobs(block, bleft, tright, &outlines); 00350 } 00351 00352 00359 void outlines_to_blobs( // find blobs 00360 BLOCK *block, // block to scan 00361 ICOORD bleft, 00362 ICOORD tright, 00363 C_OUTLINE_LIST *outlines) { 00364 // make buckets 00365 OL_BUCKETS buckets(bleft, tright); 00366 00367 fill_buckets(outlines, &buckets); 00368 empty_buckets(block, &buckets); 00369 } 00370 00371 00378 void fill_buckets( // find blobs 00379 C_OUTLINE_LIST *outlines, // outlines in block 00380 OL_BUCKETS *buckets // output buckets 00381 ) { 00382 TBOX ol_box; // outline box 00383 C_OUTLINE_IT out_it = outlines; // iterator 00384 C_OUTLINE_IT bucket_it; // iterator in bucket 00385 C_OUTLINE *outline; // current outline 00386 00387 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { 00388 outline = out_it.extract(); // take off list 00389 // get box 00390 ol_box = outline->bounding_box(); 00391 bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom())); 00392 bucket_it.add_to_end(outline); 00393 } 00394 } 00395 00396 00403 void empty_buckets( // find blobs 00404 BLOCK *block, // block to scan 00405 OL_BUCKETS *buckets // output buckets 00406 ) { 00407 BOOL8 good_blob; // healthy blob 00408 C_OUTLINE_LIST outlines; // outlines in block 00409 // iterator 00410 C_OUTLINE_IT out_it = &outlines; 00411 C_OUTLINE_IT bucket_it = buckets->start_scan(); 00412 C_OUTLINE_IT parent_it; // parent outline 00413 C_BLOB *blob; // new blob 00414 C_BLOB_IT good_blobs = block->blob_list(); 00415 C_BLOB_IT junk_blobs = block->reject_blobs(); 00416 00417 while (!bucket_it.empty()) { 00418 out_it.set_to_list(&outlines); 00419 do { 00420 parent_it = bucket_it; // find outermost 00421 do { 00422 bucket_it.forward(); 00423 } while (!bucket_it.at_first() && 00424 !(*parent_it.data() < *bucket_it.data())); 00425 } while (!bucket_it.at_first()); 00426 00427 // move to new list 00428 out_it.add_after_then_move(parent_it.extract()); 00429 good_blob = capture_children(buckets, &junk_blobs, &out_it); 00430 blob = new C_BLOB(&outlines); 00431 if (good_blob) 00432 good_blobs.add_after_then_move(blob); 00433 else 00434 junk_blobs.add_after_then_move(blob); 00435 00436 bucket_it.set_to_list(buckets->scan_next()); 00437 } 00438 } 00439 00440 00449 BOOL8 capture_children( // find children 00450 OL_BUCKETS *buckets, // bucket sort clanss 00451 C_BLOB_IT *reject_it, // dead grandchildren 00452 C_OUTLINE_IT *blob_it // output outlines 00453 ) { 00454 C_OUTLINE *outline; // master outline 00455 inT32 child_count; // no of children 00456 00457 outline = blob_it->data(); 00458 if (edges_use_new_outline_complexity) 00459 child_count = buckets->outline_complexity(outline, 00460 edges_children_count_limit, 00461 0); 00462 else 00463 child_count = buckets->count_children(outline, 00464 edges_children_count_limit); 00465 if (child_count > edges_children_count_limit) 00466 return FALSE; 00467 00468 if (child_count > 0) 00469 buckets->extract_children(outline, blob_it); 00470 return TRUE; 00471 }