Tesseract
3.02
|
#include <edgblob.h>
Public Member Functions | |
~OL_BUCKETS () | |
C_OUTLINE_LIST * | start_scan () |
C_OUTLINE_LIST * | scan_next () |
OL_BUCKETS::OL_BUCKETS | |
Construct an array of buckets for associating outlines into blobs. | |
OL_BUCKETS (ICOORD bleft, ICOORD tright) | |
OL_BUCKETS::operator() | |
Return a pointer to a list of C_OUTLINEs corresponding to the given pixel coordinates. | |
C_OUTLINE_LIST * | operator() (inT16 x, inT16 y) |
OL_BUCKETS::count_children | |
Find number of descendants of this outline. | |
inT32 | count_children (C_OUTLINE *outline, inT32 max_count) |
OL_BUCKETS::outline_complexity | |
This is the new version of count_children. The goal of this function is to determine whether an outline and its interiors could be part of a character blob. This is done by computing a "complexity" index for the outline, which is the return value of this function, and checking it against a threshold. The max_count is used for short-circuiting the recursion and forcing a rejection that is guaranteed to fail the threshold test. The complexity F for an outline X with N children X[i] is F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild, so each layer of nesting increases complexity exponentially. An outline can be rejected as a text blob candidate if its complexity is too high, if it has too many children (likely a container), or if it has too many layers of nested inner loops. This has the side effect of flattening out boxed or reverse-video text regions. | |
inT32 | outline_complexity (C_OUTLINE *outline, inT32 max_count, inT16 depth) |
OL_BUCKETS::extract_children | |
Extract the descendants of this outline from the buckets and add them to the list at the given iterator. | |
void | extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it) |
OL_BUCKETS::OL_BUCKETS (ICOORD bleft, ICOORD tright)
Definition at line 69 of file edgblob.cpp.
: bl(bleft), tr(tright) { bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1; bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1; // make array buckets = new C_OUTLINE_LIST[bxdim * bydim]; index = 0; }
OL_BUCKETS::~OL_BUCKETS () [inline]
inT32 OL_BUCKETS::count_children (C_OUTLINE *outline, inT32 max_count)
Definition at line 184 of file edgblob.cpp.
{ BOOL8 parent_box; // could it be boxy inT16 xmin, xmax; // coord limits inT16 ymin, ymax; inT16 xindex, yindex; // current bucket C_OUTLINE *child; // current child inT32 child_count; // no of children inT32 grandchild_count; // no of grandchildren inT32 parent_area; // potential box FLOAT32 max_parent_area; // potential box inT32 child_area; // current child inT32 child_length; // current child TBOX olbox; C_OUTLINE_IT child_it; // search iterator olbox = outline->bounding_box(); xmin =(olbox.left() - bl.x()) / BUCKETSIZE; xmax =(olbox.right() - bl.x()) / BUCKETSIZE; ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; ymax =(olbox.top() - bl.y()) / BUCKETSIZE; child_count = 0; grandchild_count = 0; parent_area = 0; max_parent_area = 0; parent_box = TRUE; for (yindex = ymin; yindex <= ymax; yindex++) { for (xindex = xmin; xindex <= xmax; xindex++) { child_it.set_to_list(&buckets[yindex * bxdim + xindex]); if (child_it.empty()) continue; for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) { child = child_it.data(); if (child != outline && *child < *outline) { child_count++; if (child_count <= max_count) { int max_grand =(max_count - child_count) / edges_children_per_grandchild; if (max_grand > 0) grandchild_count += count_children(child, max_grand) * edges_children_per_grandchild; else grandchild_count += count_children(child, 1); } if (child_count + grandchild_count > max_count) { if (edges_debug) tprintf("Discarding parent with child count=%d, gc=%d\n", child_count,grandchild_count); return child_count + grandchild_count; } if (parent_area == 0) { parent_area = outline->outer_area(); if (parent_area < 0) parent_area = -parent_area; max_parent_area = outline->bounding_box().area() * edges_boxarea; if (parent_area < max_parent_area) parent_box = FALSE; } if (parent_box && (!edges_children_fix || child->bounding_box().height() > edges_min_nonhole)) { child_area = child->outer_area(); if (child_area < 0) child_area = -child_area; if 
(edges_children_fix) { if (parent_area - child_area < max_parent_area) { parent_box = FALSE; continue; } if (grandchild_count > 0) { if (edges_debug) tprintf("Discarding parent of area %d, child area=%d, max%g " "with gc=%d\n", parent_area, child_area, max_parent_area, grandchild_count); return max_count + 1; } child_length = child->pathlength(); if (child_length * child_length > child_area * edges_patharea_ratio) { if (edges_debug) tprintf("Discarding parent of area %d, child area=%d, max%g " "with child length=%d\n", parent_area, child_area, max_parent_area, child_length); return max_count + 1; } } if (child_area < child->bounding_box().area() * edges_childarea) { if (edges_debug) tprintf("Discarding parent of area %d, child area=%d, max%g " "with child rect=%d\n", parent_area, child_area, max_parent_area, child->bounding_box().area()); return max_count + 1; } } } } } } return child_count + grandchild_count; }
void OL_BUCKETS::extract_children (C_OUTLINE *outline, C_OUTLINE_IT *it)
Definition at line 300 of file edgblob.cpp.
{ inT16 xmin, xmax; // coord limits inT16 ymin, ymax; inT16 xindex, yindex; // current bucket TBOX olbox; C_OUTLINE_IT child_it; // search iterator olbox = outline->bounding_box(); xmin =(olbox.left() - bl.x()) / BUCKETSIZE; xmax =(olbox.right() - bl.x()) / BUCKETSIZE; ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; ymax =(olbox.top() - bl.y()) / BUCKETSIZE; for (yindex = ymin; yindex <= ymax; yindex++) { for (xindex = xmin; xindex <= xmax; xindex++) { child_it.set_to_list(&buckets[yindex * bxdim + xindex]); for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) { if (*child_it.data() < *outline) { it->add_after_then_move(child_it.extract()); } } } } }
C_OUTLINE_LIST* OL_BUCKETS::operator() (inT16 x, inT16 y)
Definition at line 88 of file edgblob.cpp.
{ return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE]; }
inT32 OL_BUCKETS::outline_complexity (C_OUTLINE *outline, inT32 max_count, inT16 depth)
Definition at line 115 of file edgblob.cpp.
{ inT16 xmin, xmax; // coord limits inT16 ymin, ymax; inT16 xindex, yindex; // current bucket C_OUTLINE *child; // current child inT32 child_count; // no of children inT32 grandchild_count; // no of grandchildren C_OUTLINE_IT child_it; // search iterator TBOX olbox = outline->bounding_box(); xmin =(olbox.left() - bl.x()) / BUCKETSIZE; xmax =(olbox.right() - bl.x()) / BUCKETSIZE; ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; ymax =(olbox.top() - bl.y()) / BUCKETSIZE; child_count = 0; grandchild_count = 0; if (++depth > edges_max_children_layers) // nested loops are too deep return max_count + depth; for (yindex = ymin; yindex <= ymax; yindex++) { for (xindex = xmin; xindex <= xmax; xindex++) { child_it.set_to_list(&buckets[yindex * bxdim + xindex]); if (child_it.empty()) continue; for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) { child = child_it.data(); if (child == outline || !(*child < *outline)) continue; child_count++; if (child_count > edges_max_children_per_outline) { // too fragmented if (edges_debug) tprintf("Discard outline on child_count=%d > " "max_children_per_outline=%d\n", child_count, static_cast<inT32>(edges_max_children_per_outline)); return max_count + child_count; } // Compute the "complexity" of each child recursively inT32 remaining_count = max_count - child_count - grandchild_count; if (remaining_count > 0) grandchild_count += edges_children_per_grandchild * outline_complexity(child, remaining_count, depth); if (child_count + grandchild_count > max_count) { // too complex if (edges_debug) tprintf("Disgard outline on child_count=%d + grandchild_count=%d " "> max_count=%d\n", child_count, grandchild_count, max_count); return child_count + grandchild_count; } } } } return child_count + grandchild_count; }
C_OUTLINE_LIST* OL_BUCKETS::scan_next () [inline]
C_OUTLINE_LIST* OL_BUCKETS::start_scan () [inline]