Tesseract  3.02
tesseract-ocr/textord/edgblob.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        edgblob.c (Formerly edgeloop.c)
00003  * Description: Functions to clean up an outline before approximation.
00004  * Author:              Ray Smith
00005  * Created:             Tue Mar 26 16:56:25 GMT 1991
00006  *
00007  *(C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0(the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 #include "scanedg.h"
00022 #include "drawedg.h"
00023 #include "edgloop.h"
00024 #include "edgblob.h"
00025 
00026 // Include automatically generated configuration file if running autoconf.
00027 #ifdef HAVE_CONFIG_H
00028 #include "config_auto.h"
00029 #endif
00030 
00031 #define EXTERN
00032 
00033 // Control parameters used in outline_complexity(), which rejects an outline
00034 // if any one of the 3 conditions is satisfied:
00035 //  - number of children exceeds edges_max_children_per_outline
00036 //  - number of nested layers exceeds edges_max_children_layers
00037 //  - joint complexity exceeds edges_children_count_limit(as in child_count())
00038 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE,
00039                 "Use the new outline complexity module");
00040 EXTERN INT_VAR(edges_max_children_per_outline, 10,
00041                "Max number of children inside a character outline");
00042 EXTERN INT_VAR(edges_max_children_layers, 5,
00043                "Max layers of nested children inside a character outline");
00044 EXTERN BOOL_VAR(edges_debug, FALSE,
00045                 "turn on debugging for this module");
00046 
00047 
00048 EXTERN INT_VAR(edges_children_per_grandchild, 10,
00049                "Importance ratio for chucking outlines");
00050 EXTERN INT_VAR(edges_children_count_limit, 45,
00051                "Max holes allowed in blob");
00052 EXTERN BOOL_VAR(edges_children_fix, FALSE,
00053                 "Remove boxy parents of char-like children");
00054 EXTERN INT_VAR(edges_min_nonhole, 12,
00055                "Min pixels for potential char in box");
00056 EXTERN INT_VAR(edges_patharea_ratio, 40,
00057                "Max lensq/area for acceptable child outline");
00058 EXTERN double_VAR(edges_childarea, 0.5,
00059                   "Min area fraction of child outline");
00060 EXTERN double_VAR(edges_boxarea, 0.875,
00061                   "Min area fraction of grandchild for box");
00062 
00069 OL_BUCKETS::OL_BUCKETS(
00070 ICOORD bleft,                    // corners
00071 ICOORD tright):         bl(bleft), tr(tright) {
00072   bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
00073   bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
00074                                  // make array
00075   buckets = new C_OUTLINE_LIST[bxdim * bydim];
00076   index = 0;
00077 }
00078 
00079 
00087 C_OUTLINE_LIST *
00088 OL_BUCKETS::operator()(       // array access
00089 inT16 x,                      // image coords
00090 inT16 y) {
00091   return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
00092 }
00093 
00094 
00115 inT32 OL_BUCKETS::outline_complexity(
00116                                      C_OUTLINE *outline,   // parent outline
00117                                      inT32 max_count,      // max output
00118                                      inT16 depth           // recurion depth
00119                                     ) {
00120   inT16 xmin, xmax;              // coord limits
00121   inT16 ymin, ymax;
00122   inT16 xindex, yindex;          // current bucket
00123   C_OUTLINE *child;              // current child
00124   inT32 child_count;             // no of children
00125   inT32 grandchild_count;        // no of grandchildren
00126   C_OUTLINE_IT child_it;         // search iterator
00127 
00128   TBOX olbox = outline->bounding_box();
00129   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00130   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00131   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00132   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00133   child_count = 0;
00134   grandchild_count = 0;
00135   if (++depth > edges_max_children_layers)  // nested loops are too deep
00136     return max_count + depth;
00137 
00138   for (yindex = ymin; yindex <= ymax; yindex++) {
00139     for (xindex = xmin; xindex <= xmax; xindex++) {
00140       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00141       if (child_it.empty())
00142         continue;
00143       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00144            child_it.forward()) {
00145         child = child_it.data();
00146         if (child == outline || !(*child < *outline))
00147           continue;
00148         child_count++;
00149 
00150         if (child_count > edges_max_children_per_outline) {   // too fragmented
00151           if (edges_debug)
00152             tprintf("Discard outline on child_count=%d > "
00153                     "max_children_per_outline=%d\n",
00154                     child_count,
00155                     static_cast<inT32>(edges_max_children_per_outline));
00156           return max_count + child_count;
00157         }
00158 
00159         // Compute the "complexity" of each child recursively
00160         inT32 remaining_count = max_count - child_count - grandchild_count;
00161         if (remaining_count > 0)
00162           grandchild_count += edges_children_per_grandchild *
00163                               outline_complexity(child, remaining_count, depth);
00164         if (child_count + grandchild_count > max_count) {  // too complex
00165           if (edges_debug)
00166             tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
00167                     "> max_count=%d\n",
00168                     child_count, grandchild_count, max_count);
00169           return child_count + grandchild_count;
00170         }
00171       }
00172     }
00173   }
00174   return child_count + grandchild_count;
00175 }
00176 
00177 
00184 inT32 OL_BUCKETS::count_children(                     // recursive count
00185                                  C_OUTLINE *outline,  // parent outline
00186                                  inT32 max_count      // max output
00187                                 ) {
00188   BOOL8 parent_box;              // could it be boxy
00189   inT16 xmin, xmax;              // coord limits
00190   inT16 ymin, ymax;
00191   inT16 xindex, yindex;          // current bucket
00192   C_OUTLINE *child;              // current child
00193   inT32 child_count;             // no of children
00194   inT32 grandchild_count;        // no of grandchildren
00195   inT32 parent_area;             // potential box
00196   FLOAT32 max_parent_area;       // potential box
00197   inT32 child_area;              // current child
00198   inT32 child_length;            // current child
00199   TBOX olbox;
00200   C_OUTLINE_IT child_it;         // search iterator
00201 
00202   olbox = outline->bounding_box();
00203   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00204   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00205   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00206   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00207   child_count = 0;
00208   grandchild_count = 0;
00209   parent_area = 0;
00210   max_parent_area = 0;
00211   parent_box = TRUE;
00212   for (yindex = ymin; yindex <= ymax; yindex++) {
00213     for (xindex = xmin; xindex <= xmax; xindex++) {
00214       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00215       if (child_it.empty())
00216         continue;
00217       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00218            child_it.forward()) {
00219         child = child_it.data();
00220         if (child != outline && *child < *outline) {
00221           child_count++;
00222           if (child_count <= max_count) {
00223             int max_grand =(max_count - child_count) /
00224                             edges_children_per_grandchild;
00225             if (max_grand > 0)
00226               grandchild_count += count_children(child, max_grand) *
00227                                   edges_children_per_grandchild;
00228             else
00229               grandchild_count += count_children(child, 1);
00230           }
00231           if (child_count + grandchild_count > max_count) {
00232             if (edges_debug)
00233               tprintf("Discarding parent with child count=%d, gc=%d\n",
00234                       child_count,grandchild_count);
00235             return child_count + grandchild_count;
00236           }
00237           if (parent_area == 0) {
00238             parent_area = outline->outer_area();
00239             if (parent_area < 0)
00240               parent_area = -parent_area;
00241             max_parent_area = outline->bounding_box().area() * edges_boxarea;
00242             if (parent_area < max_parent_area)
00243               parent_box = FALSE;
00244           }
00245           if (parent_box &&
00246               (!edges_children_fix ||
00247                child->bounding_box().height() > edges_min_nonhole)) {
00248             child_area = child->outer_area();
00249             if (child_area < 0)
00250               child_area = -child_area;
00251             if (edges_children_fix) {
00252               if (parent_area - child_area < max_parent_area) {
00253                 parent_box = FALSE;
00254                 continue;
00255               }
00256               if (grandchild_count > 0) {
00257                 if (edges_debug)
00258                   tprintf("Discarding parent of area %d, child area=%d, max%g "
00259                           "with gc=%d\n",
00260                           parent_area, child_area, max_parent_area,
00261                           grandchild_count);
00262                 return max_count + 1;
00263               }
00264               child_length = child->pathlength();
00265               if (child_length * child_length >
00266                   child_area * edges_patharea_ratio) {
00267                 if (edges_debug)
00268                   tprintf("Discarding parent of area %d, child area=%d, max%g "
00269                           "with child length=%d\n",
00270                           parent_area, child_area, max_parent_area,
00271                           child_length);
00272                 return max_count + 1;
00273               }
00274             }
00275             if (child_area < child->bounding_box().area() * edges_childarea) {
00276               if (edges_debug)
00277                 tprintf("Discarding parent of area %d, child area=%d, max%g "
00278                         "with child rect=%d\n",
00279                         parent_area, child_area, max_parent_area,
00280                         child->bounding_box().area());
00281               return max_count + 1;
00282             }
00283           }
00284         }
00285       }
00286     }
00287   }
00288   return child_count + grandchild_count;
00289 }
00290 
00291 
00292 
00293 
00300 void OL_BUCKETS::extract_children(                     // recursive count
00301                                   C_OUTLINE *outline,  // parent outline
00302                                   C_OUTLINE_IT *it     // destination iterator
00303                                  ) {
00304   inT16 xmin, xmax;              // coord limits
00305   inT16 ymin, ymax;
00306   inT16 xindex, yindex;          // current bucket
00307   TBOX olbox;
00308   C_OUTLINE_IT child_it;         // search iterator
00309 
00310   olbox = outline->bounding_box();
00311   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00312   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00313   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00314   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00315   for (yindex = ymin; yindex <= ymax; yindex++) {
00316     for (xindex = xmin; xindex <= xmax; xindex++) {
00317       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00318       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00319            child_it.forward()) {
00320         if (*child_it.data() < *outline) {
00321           it->add_after_then_move(child_it.extract());
00322         }
00323       }
00324     }
00325   }
00326 }
00327 
00328 
00335 void extract_edges(Pix* pix,  // thresholded image
00336                    BLOCK *block) {  // block to scan
00337   C_OUTLINE_LIST outlines;       // outlines in block
00338   C_OUTLINE_IT out_it = &outlines;
00339 
00340   // TODO(rays) move the pix all the way down to the bottom.
00341   IMAGE image;
00342   image.FromPix(pix);
00343 
00344   block_edges(&image, block, &out_it);
00345   ICOORD bleft;                  // block box
00346   ICOORD tright;
00347   block->bounding_box(bleft, tright);
00348                                  // make blobs
00349   outlines_to_blobs(block, bleft, tright, &outlines);
00350 }
00351 
00352 
00359 void outlines_to_blobs(               // find blobs
00360                        BLOCK *block,  // block to scan
00361                        ICOORD bleft,
00362                        ICOORD tright,
00363                        C_OUTLINE_LIST *outlines) {
00364                                  // make buckets
00365   OL_BUCKETS buckets(bleft, tright);
00366 
00367   fill_buckets(outlines, &buckets);
00368   empty_buckets(block, &buckets);
00369 }
00370 
00371 
00378 void fill_buckets(                           // find blobs
00379                   C_OUTLINE_LIST *outlines,  // outlines in block
00380                   OL_BUCKETS *buckets        // output buckets
00381                  ) {
00382   TBOX ol_box;                     // outline box
00383   C_OUTLINE_IT out_it = outlines;  // iterator
00384   C_OUTLINE_IT bucket_it;          // iterator in bucket
00385   C_OUTLINE *outline;              // current outline
00386 
00387   for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
00388     outline = out_it.extract();  // take off list
00389                                  // get box
00390     ol_box = outline->bounding_box();
00391     bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
00392     bucket_it.add_to_end(outline);
00393   }
00394 }
00395 
00396 
00403 void empty_buckets(                     // find blobs
00404                    BLOCK *block,        // block to scan
00405                    OL_BUCKETS *buckets  // output buckets
00406                   ) {
00407   BOOL8 good_blob;               // healthy blob
00408   C_OUTLINE_LIST outlines;       // outlines in block
00409                                  // iterator
00410   C_OUTLINE_IT out_it = &outlines;
00411   C_OUTLINE_IT bucket_it = buckets->start_scan();
00412   C_OUTLINE_IT parent_it;        // parent outline
00413   C_BLOB *blob;                  // new blob
00414   C_BLOB_IT good_blobs = block->blob_list();
00415   C_BLOB_IT junk_blobs = block->reject_blobs();
00416 
00417   while (!bucket_it.empty()) {
00418     out_it.set_to_list(&outlines);
00419     do {
00420       parent_it = bucket_it;     // find outermost
00421       do {
00422         bucket_it.forward();
00423       } while (!bucket_it.at_first() &&
00424                !(*parent_it.data() < *bucket_it.data()));
00425     } while (!bucket_it.at_first());
00426 
00427                                  // move to new list
00428     out_it.add_after_then_move(parent_it.extract());
00429     good_blob = capture_children(buckets, &junk_blobs, &out_it);
00430     blob = new C_BLOB(&outlines);
00431     if (good_blob)
00432       good_blobs.add_after_then_move(blob);
00433     else
00434       junk_blobs.add_after_then_move(blob);
00435 
00436     bucket_it.set_to_list(buckets->scan_next());
00437   }
00438 }
00439 
00440 
00449 BOOL8 capture_children(                       // find children
00450                        OL_BUCKETS *buckets,   // bucket sort clanss
00451                        C_BLOB_IT *reject_it,  // dead grandchildren
00452                        C_OUTLINE_IT *blob_it  // output outlines
00453                       ) {
00454   C_OUTLINE *outline;            // master outline
00455   inT32 child_count;             // no of children
00456 
00457   outline = blob_it->data();
00458   if (edges_use_new_outline_complexity)
00459     child_count = buckets->outline_complexity(outline,
00460                                                edges_children_count_limit,
00461                                                0);
00462   else
00463     child_count = buckets->count_children(outline,
00464                                            edges_children_count_limit);
00465   if (child_count > edges_children_count_limit)
00466     return FALSE;
00467 
00468   if (child_count > 0)
00469     buckets->extract_children(outline, blob_it);
00470   return TRUE;
00471 }