Tesseract
3.02
|
00001 00002 // File: imagefind.cpp 00003 // Description: Function to find image and drawing regions in an image 00004 // and create a corresponding list of empty blobs. 00005 // Author: Ray Smith 00006 // Created: Thu Mar 20 09:49:01 PDT 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifdef _MSC_VER 00022 #pragma warning(disable:4244) // Conversion warnings 00023 #endif 00024 00025 #include "imagefind.h" 00026 #include "colpartitiongrid.h" 00027 #include "linlsq.h" 00028 #include "ndminx.h" 00029 #include "statistc.h" 00030 #include "params.h" 00031 00032 // This entire file is dependent upon leptonica. If you don't have it, 00033 // you don't get this functionality. 00034 #ifdef HAVE_CONFIG_H 00035 #include "config_auto.h" 00036 #endif 00037 #include "allheaders.h" 00038 00039 INT_VAR(textord_tabfind_show_images, false, "Show image blobs"); 00040 00041 namespace tesseract { 00042 00043 // Fraction of width or height of on pixels that can be discarded from a 00044 // roughly rectangular image. 00045 const double kMinRectangularFraction = 0.125; 00046 // Fraction of width or height to consider image completely used. 00047 const double kMaxRectangularFraction = 0.75; 00048 // Fraction of width or height to allow transition from kMinRectangularFraction 00049 // to kMaxRectangularFraction, equivalent to a dy/dx skew. 00050 const double kMaxRectangularGradient = 0.1; // About 6 degrees. 00051 // Minimum image size to be worth looking for images on. 00052 const int kMinImageFindSize = 100; 00053 // Scale factor for the rms color fit error. 00054 const double kRMSFitScaling = 8.0; 00055 // Min color difference to call it two colors. 00056 const int kMinColorDifference = 16; 00057 // Pixel padding for noise blobs and partitions when rendering on the image 00058 // mask to encourage them to join together. Make it too big and images 00059 // will fatten out too much and have to be clipped to text. 00060 const int kNoisePadding = 4; 00061 00062 // Finds image regions within the BINARY source pix (page image) and returns 00063 // the image regions as a mask image. 00064 // The returned pix may be NULL, meaning no images found. 00065 // If not NULL, it must be PixDestroyed by the caller. 00066 Pix* ImageFind::FindImages(Pix* pix) { 00067 // Not worth looking at small images. 00068 if (pixGetWidth(pix) < kMinImageFindSize || 00069 pixGetHeight(pix) < kMinImageFindSize) 00070 return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1); 00071 // Reduce by factor 2. 00072 Pix *pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0); 00073 pixDisplayWrite(pixr, textord_tabfind_show_images); 00074 00075 // Get the halftone mask directly from Leptonica. 00076 l_int32 ht_found = 0; 00077 Pix *pixht2 = pixGenHalftoneMask(pixr, NULL, &ht_found, 00078 textord_tabfind_show_images); 00079 pixDestroy(&pixr); 00080 if (!ht_found && pixht2 != NULL) 00081 pixDestroy(&pixht2); 00082 if (pixht2 == NULL) 00083 return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1); 00084 00085 // Expand back up again. 00086 Pix *pixht = pixExpandReplicate(pixht2, 2); 00087 pixDisplayWrite(pixht, textord_tabfind_show_images); 00088 pixDestroy(&pixht2); 00089 00090 // Fill to capture pixels near the mask edges that were missed 00091 Pix *pixt = pixSeedfillBinary(NULL, pixht, pix, 8); 00092 pixOr(pixht, pixht, pixt); 00093 pixDestroy(&pixt); 00094 00095 // Eliminate lines and bars that may be joined to images. 00096 Pix* pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3); 00097 pixDilateBrick(pixfinemask, pixfinemask, 5, 5); 00098 pixDisplayWrite(pixfinemask, textord_tabfind_show_images); 00099 Pix* pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1); 00100 Pix* pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0); 00101 pixDestroy(&pixreduced); 00102 pixDilateBrick(pixreduced2, pixreduced2, 5, 5); 00103 Pix* pixcoarsemask = pixExpandReplicate(pixreduced2, 8); 00104 pixDestroy(&pixreduced2); 00105 pixDisplayWrite(pixcoarsemask, textord_tabfind_show_images); 00106 // Combine the coarse and fine image masks. 00107 pixAnd(pixcoarsemask, pixcoarsemask, pixfinemask); 00108 pixDestroy(&pixfinemask); 00109 // Dilate a bit to make sure we get everything. 00110 pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3); 00111 Pix* pixmask = pixExpandReplicate(pixcoarsemask, 16); 00112 pixDestroy(&pixcoarsemask); 00113 if (textord_tabfind_show_images) 00114 pixWrite("junkexpandedcoarsemask.png", pixmask, IFF_PNG); 00115 // And the image mask with the line and bar remover. 00116 pixAnd(pixht, pixht, pixmask); 00117 pixDestroy(&pixmask); 00118 if (textord_tabfind_show_images) 00119 pixWrite("junkfinalimagemask.png", pixht, IFF_PNG); 00120 // Make the result image the same size as the input. 00121 Pix* result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1); 00122 pixOr(result, result, pixht); 00123 pixDestroy(&pixht); 00124 return result; 00125 } 00126 00127 // Generates a Boxa, Pixa pair from the input binary (image mask) pix, 00128 // analgous to pixConnComp, except that connected components which are nearly 00129 // rectangular are replaced with solid rectangles. 00130 // The returned boxa, pixa may be NULL, meaning no images found. 00131 // If not NULL, they must be destroyed by the caller. 00132 // Resolution of pix should match the source image (Tesseract::pix_binary_) 00133 // so the output coordinate systems match. 00134 void ImageFind::ConnCompAndRectangularize(Pix* pix, Boxa** boxa, Pixa** pixa) { 00135 *boxa = NULL; 00136 *pixa = NULL; 00137 00138 if (textord_tabfind_show_images) 00139 pixWrite("junkconncompimage.png", pix, IFF_PNG); 00140 // Find the individual image regions in the mask image. 00141 *boxa = pixConnComp(pix, pixa, 8); 00142 // Rectangularize the individual images. If a sharp edge in vertical and/or 00143 // horizontal occupancy can be found, it indicates a probably rectangular 00144 // image with unwanted bits merged on, so clip to the approximate rectangle. 00145 int npixes = pixaGetCount(*pixa); 00146 for (int i = 0; i < npixes; ++i) { 00147 int x_start, x_end, y_start, y_end; 00148 Pix* img_pix = pixaGetPix(*pixa, i, L_CLONE); 00149 pixDisplayWrite(img_pix, textord_tabfind_show_images); 00150 if (pixNearlyRectangular(img_pix, kMinRectangularFraction, 00151 kMaxRectangularFraction, 00152 kMaxRectangularGradient, 00153 &x_start, &y_start, &x_end, &y_end)) { 00154 Pix* simple_pix = pixCreate(x_end - x_start, y_end - y_start, 1); 00155 pixSetAll(simple_pix); 00156 pixDestroy(&img_pix); 00157 // pixaReplacePix takes ownership of the simple_pix. 00158 pixaReplacePix(*pixa, i, simple_pix, NULL); 00159 img_pix = pixaGetPix(*pixa, i, L_CLONE); 00160 // Fix the box to match the new pix. 00161 l_int32 x, y, width, height; 00162 boxaGetBoxGeometry(*boxa, i, &x, &y, &width, &height); 00163 Box* simple_box = boxCreate(x + x_start, y + y_start, 00164 x_end - x_start, y_end - y_start); 00165 boxaReplaceBox(*boxa, i, simple_box); 00166 } 00167 pixDestroy(&img_pix); 00168 } 00169 } 00170 00171 // Scans horizontally on x=[x_start,x_end), starting with y=*y_start, 00172 // stepping y+=y_step, until y=y_end. *ystart is input/output. 00173 // If the number of black pixels in a row, pix_count fits this pattern: 00174 // 0 or more rows with pix_count < min_count then 00175 // <= mid_width rows with min_count <= pix_count <= max_count then 00176 // a row with pix_count > max_count then 00177 // true is returned, and *y_start = the first y with pix_count >= min_count. 00178 static bool HScanForEdge(uinT32* data, int wpl, int x_start, int x_end, 00179 int min_count, int mid_width, int max_count, 00180 int y_end, int y_step, int* y_start) { 00181 int mid_rows = 0; 00182 for (int y = *y_start; y != y_end; y += y_step) { 00183 // Need pixCountPixelsInRow(pix, y, &pix_count, NULL) to count in a subset. 00184 int pix_count = 0; 00185 uinT32* line = data + wpl * y; 00186 for (int x = x_start; x < x_end; ++x) { 00187 if (GET_DATA_BIT(line, x)) 00188 ++pix_count; 00189 } 00190 if (mid_rows == 0 && pix_count < min_count) 00191 continue; // In the min phase. 00192 if (mid_rows == 0) 00193 *y_start = y; // Save the y_start where we came out of the min phase. 00194 if (pix_count > max_count) 00195 return true; // Found the pattern. 00196 ++mid_rows; 00197 if (mid_rows > mid_width) 00198 break; // Middle too big. 00199 } 00200 return false; // Never found max_count. 00201 } 00202 00203 // Scans vertically on y=[y_start,y_end), starting with x=*x_start, 00204 // stepping x+=x_step, until x=x_end. *x_start is input/output. 00205 // If the number of black pixels in a column, pix_count fits this pattern: 00206 // 0 or more cols with pix_count < min_count then 00207 // <= mid_width cols with min_count <= pix_count <= max_count then 00208 // a column with pix_count > max_count then 00209 // true is returned, and *x_start = the first x with pix_count >= min_count. 00210 static bool VScanForEdge(uinT32* data, int wpl, int y_start, int y_end, 00211 int min_count, int mid_width, int max_count, 00212 int x_end, int x_step, int* x_start) { 00213 int mid_cols = 0; 00214 for (int x = *x_start; x != x_end; x += x_step) { 00215 int pix_count = 0; 00216 uinT32* line = data + y_start * wpl; 00217 for (int y = y_start; y < y_end; ++y, line += wpl) { 00218 if (GET_DATA_BIT(line, x)) 00219 ++pix_count; 00220 } 00221 if (mid_cols == 0 && pix_count < min_count) 00222 continue; // In the min phase. 00223 if (mid_cols == 0) 00224 *x_start = x; // Save the place where we came out of the min phase. 00225 if (pix_count > max_count) 00226 return true; // found the pattern. 00227 ++mid_cols; 00228 if (mid_cols > mid_width) 00229 break; // Middle too big. 00230 } 00231 return false; // Never found max_count. 00232 } 00233 00234 // Returns true if there is a rectangle in the source pix, such that all 00235 // pixel rows and column slices outside of it have less than 00236 // min_fraction of the pixels black, and within max_skew_gradient fraction 00237 // of the pixels on the inside, there are at least max_fraction of the 00238 // pixels black. In other words, the inside of the rectangle looks roughly 00239 // rectangular, and the outside of it looks like extra bits. 00240 // On return, the rectangle is defined by x_start, y_start, x_end and y_end. 00241 // Note: the algorithm is iterative, allowing it to slice off pixels from 00242 // one edge, allowing it to then slice off more pixels from another edge. 00243 bool ImageFind::pixNearlyRectangular(Pix* pix, 00244 double min_fraction, double max_fraction, 00245 double max_skew_gradient, 00246 int* x_start, int* y_start, 00247 int* x_end, int* y_end) { 00248 ASSERT_HOST(pix != NULL); 00249 *x_start = 0; 00250 *x_end = pixGetWidth(pix); 00251 *y_start = 0; 00252 *y_end = pixGetHeight(pix); 00253 00254 uinT32* data = pixGetData(pix); 00255 int wpl = pixGetWpl(pix); 00256 bool any_cut = false; 00257 bool left_done = false; 00258 bool right_done = false; 00259 bool top_done = false; 00260 bool bottom_done = false; 00261 do { 00262 any_cut = false; 00263 // Find the top/bottom edges. 00264 int width = *x_end - *x_start; 00265 int min_count = static_cast<int>(width * min_fraction); 00266 int max_count = static_cast<int>(width * max_fraction); 00267 int edge_width = static_cast<int>(width * max_skew_gradient); 00268 if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, 00269 max_count, *y_end, 1, y_start) && !top_done) { 00270 top_done = true; 00271 any_cut = true; 00272 } 00273 --(*y_end); 00274 if (HScanForEdge(data, wpl, *x_start, *x_end, min_count, edge_width, 00275 max_count, *y_start, -1, y_end) && !bottom_done) { 00276 bottom_done = true; 00277 any_cut = true; 00278 } 00279 ++(*y_end); 00280 00281 // Find the left/right edges. 00282 int height = *y_end - *y_start; 00283 min_count = static_cast<int>(height * min_fraction); 00284 max_count = static_cast<int>(height * max_fraction); 00285 edge_width = static_cast<int>(height * max_skew_gradient); 00286 if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, 00287 max_count, *x_end, 1, x_start) && !left_done) { 00288 left_done = true; 00289 any_cut = true; 00290 } 00291 --(*x_end); 00292 if (VScanForEdge(data, wpl, *y_start, *y_end, min_count, edge_width, 00293 max_count, *x_start, -1, x_end) && !right_done) { 00294 right_done = true; 00295 any_cut = true; 00296 } 00297 ++(*x_end); 00298 } while (any_cut); 00299 00300 // All edges must satisfy the condition of sharp gradient in pixel density 00301 // in order for the full rectangle to be present. 00302 return left_done && right_done && top_done && bottom_done; 00303 } 00304 00305 // Given an input pix, and a bounding rectangle, the sides of the rectangle 00306 // are shrunk inwards until they bound any black pixels found within the 00307 // original rectangle. Returns false if the rectangle contains no black 00308 // pixels at all. 00309 bool ImageFind::BoundsWithinRect(Pix* pix, int* x_start, int* y_start, 00310 int* x_end, int* y_end) { 00311 Box* input_box = boxCreate(*x_start, *y_start, *x_end - *x_start, 00312 *y_end - *y_start); 00313 Box* output_box = NULL; 00314 pixClipBoxToForeground(pix, input_box, NULL, &output_box); 00315 bool result = output_box != NULL; 00316 if (result) { 00317 l_int32 x, y, width, height; 00318 boxGetGeometry(output_box, &x, &y, &width, &height); 00319 *x_start = x; 00320 *y_start = y; 00321 *x_end = x + width; 00322 *y_end = y + height; 00323 boxDestroy(&output_box); 00324 } 00325 boxDestroy(&input_box); 00326 return result; 00327 } 00328 00329 // Given a point in 3-D (RGB) space, returns the squared Euclidean distance 00330 // of the point from the given line, defined by a pair of points in the 3-D 00331 // (RGB) space, line1 and line2. 00332 double ImageFind::ColorDistanceFromLine(const uinT8* line1, 00333 const uinT8* line2, 00334 const uinT8* point) { 00335 int line_vector[kRGBRMSColors]; 00336 int point_vector[kRGBRMSColors]; 00337 for (int i = 0; i < kRGBRMSColors; ++i) { 00338 line_vector[i] = static_cast<int>(line2[i]) - static_cast<int>(line1[i]); 00339 point_vector[i] = static_cast<int>(point[i]) - static_cast<int>(line1[i]); 00340 } 00341 line_vector[L_ALPHA_CHANNEL] = 0; 00342 // Now the cross product in 3d. 00343 int cross[kRGBRMSColors]; 00344 cross[COLOR_RED] = line_vector[COLOR_GREEN] * point_vector[COLOR_BLUE] 00345 - line_vector[COLOR_BLUE] * point_vector[COLOR_GREEN]; 00346 cross[COLOR_GREEN] = line_vector[COLOR_BLUE] * point_vector[COLOR_RED] 00347 - line_vector[COLOR_RED] * point_vector[COLOR_BLUE]; 00348 cross[COLOR_BLUE] = line_vector[COLOR_RED] * point_vector[COLOR_GREEN] 00349 - line_vector[COLOR_GREEN] * point_vector[COLOR_RED]; 00350 cross[L_ALPHA_CHANNEL] = 0; 00351 // Now the sums of the squares. 00352 double cross_sq = 0.0; 00353 double line_sq = 0.0; 00354 for (int j = 0; j < kRGBRMSColors; ++j) { 00355 cross_sq += static_cast<double>(cross[j]) * cross[j]; 00356 line_sq += static_cast<double>(line_vector[j]) * line_vector[j]; 00357 } 00358 if (line_sq == 0.0) { 00359 return 0.0; 00360 } 00361 return cross_sq / line_sq; // This is the squared distance. 00362 } 00363 00364 00365 // Returns the leptonica combined code for the given RGB triplet. 00366 uinT32 ImageFind::ComposeRGB(uinT32 r, uinT32 g, uinT32 b) { 00367 l_uint32 result; 00368 composeRGBPixel(r, g, b, &result); 00369 return result; 00370 } 00371 00372 // Returns the input value clipped to a uinT8. 00373 uinT8 ImageFind::ClipToByte(double pixel) { 00374 if (pixel < 0.0) 00375 return 0; 00376 else if (pixel >= 255.0) 00377 return 255; 00378 return static_cast<uinT8>(pixel); 00379 } 00380 00381 // Computes the light and dark extremes of color in the given rectangle of 00382 // the given pix, which is factor smaller than the coordinate system in rect. 00383 // The light and dark points are taken to be the upper and lower 8th-ile of 00384 // the most deviant of R, G and B. The value of the other 2 channels are 00385 // computed by linear fit against the most deviant. 00386 // The colors of the two points are returned in color1 and color2, with the 00387 // alpha channel set to a scaled mean rms of the fits. 00388 // If color_map1 is not null then it and color_map2 get rect pasted in them 00389 // with the two calculated colors, and rms map gets a pasted rect of the rms. 00390 // color_map1, color_map2 and rms_map are assumed to be the same scale as pix. 00391 void ImageFind::ComputeRectangleColors(const TBOX& rect, Pix* pix, int factor, 00392 Pix* color_map1, Pix* color_map2, 00393 Pix* rms_map, 00394 uinT8* color1, uinT8* color2) { 00395 ASSERT_HOST(pix != NULL && pixGetDepth(pix) == 32); 00396 // Pad the rectangle outwards by 2 (scaled) pixels if possible to get more 00397 // background. 00398 int width = pixGetWidth(pix); 00399 int height = pixGetHeight(pix); 00400 int left_pad = MAX(rect.left() - 2 * factor, 0) / factor; 00401 int top_pad = (rect.top() + 2 * factor + (factor - 1)) / factor; 00402 top_pad = MIN(height, top_pad); 00403 int right_pad = (rect.right() + 2 * factor + (factor - 1)) / factor; 00404 right_pad = MIN(width, right_pad); 00405 int bottom_pad = MAX(rect.bottom() - 2 * factor, 0) / factor; 00406 int width_pad = right_pad - left_pad; 00407 int height_pad = top_pad - bottom_pad; 00408 if (width_pad < 1 || height_pad < 1 || width_pad + height_pad < 4) 00409 return; 00410 // Now crop the pix to the rectangle. 00411 Box* scaled_box = boxCreate(left_pad, height - top_pad, 00412 width_pad, height_pad); 00413 Pix* scaled = pixClipRectangle(pix, scaled_box, NULL); 00414 00415 // Compute stats over the whole image. 00416 STATS red_stats(0, 256); 00417 STATS green_stats(0, 256); 00418 STATS blue_stats(0, 256); 00419 uinT32* data = pixGetData(scaled); 00420 ASSERT_HOST(pixGetWpl(scaled) == width_pad); 00421 for (int y = 0; y < height_pad; ++y) { 00422 for (int x = 0; x < width_pad; ++x, ++data) { 00423 int r = GET_DATA_BYTE(data, COLOR_RED); 00424 int g = GET_DATA_BYTE(data, COLOR_GREEN); 00425 int b = GET_DATA_BYTE(data, COLOR_BLUE); 00426 red_stats.add(r, 1); 00427 green_stats.add(g, 1); 00428 blue_stats.add(b, 1); 00429 } 00430 } 00431 // Find the RGB component with the greatest 8th-ile-range. 00432 // 8th-iles are used instead of quartiles to get closer to the true 00433 // foreground color, which is going to be faint at best because of the 00434 // pre-scaling of the input image. 00435 int best_l8 = static_cast<int>(red_stats.ile(0.125f)); 00436 int best_u8 = static_cast<int>(ceil(red_stats.ile(0.875f))); 00437 int best_i8r = best_u8 - best_l8; 00438 int x_color = COLOR_RED; 00439 int y1_color = COLOR_GREEN; 00440 int y2_color = COLOR_BLUE; 00441 int l8 = static_cast<int>(green_stats.ile(0.125f)); 00442 int u8 = static_cast<int>(ceil(green_stats.ile(0.875f))); 00443 if (u8 - l8 > best_i8r) { 00444 best_i8r = u8 - l8; 00445 best_l8 = l8; 00446 best_u8 = u8; 00447 x_color = COLOR_GREEN; 00448 y1_color = COLOR_RED; 00449 } 00450 l8 = static_cast<int>(blue_stats.ile(0.125f)); 00451 u8 = static_cast<int>(ceil(blue_stats.ile(0.875f))); 00452 if (u8 - l8 > best_i8r) { 00453 best_i8r = u8 - l8; 00454 best_l8 = l8; 00455 best_u8 = u8; 00456 x_color = COLOR_BLUE; 00457 y1_color = COLOR_GREEN; 00458 y2_color = COLOR_RED; 00459 } 00460 if (best_i8r >= kMinColorDifference) { 00461 LLSQ line1; 00462 LLSQ line2; 00463 uinT32* data = pixGetData(scaled); 00464 for (int im_y = 0; im_y < height_pad; ++im_y) { 00465 for (int im_x = 0; im_x < width_pad; ++im_x, ++data) { 00466 int x = GET_DATA_BYTE(data, x_color); 00467 int y1 = GET_DATA_BYTE(data, y1_color); 00468 int y2 = GET_DATA_BYTE(data, y2_color); 00469 line1.add(x, y1); 00470 line2.add(x, y2); 00471 } 00472 } 00473 double m1 = line1.m(); 00474 double c1 = line1.c(m1); 00475 double m2 = line2.m(); 00476 double c2 = line2.c(m2); 00477 double rms = line1.rms(m1, c1) + line2.rms(m2, c2); 00478 rms *= kRMSFitScaling; 00479 // Save the results. 00480 color1[x_color] = ClipToByte(best_l8); 00481 color1[y1_color] = ClipToByte(m1 * best_l8 + c1 + 0.5); 00482 color1[y2_color] = ClipToByte(m2 * best_l8 + c2 + 0.5); 00483 color1[L_ALPHA_CHANNEL] = ClipToByte(rms); 00484 color2[x_color] = ClipToByte(best_u8); 00485 color2[y1_color] = ClipToByte(m1 * best_u8 + c1 + 0.5); 00486 color2[y2_color] = ClipToByte(m2 * best_u8 + c2 + 0.5); 00487 color2[L_ALPHA_CHANNEL] = ClipToByte(rms); 00488 } else { 00489 // There is only one color. 00490 color1[COLOR_RED] = ClipToByte(red_stats.median()); 00491 color1[COLOR_GREEN] = ClipToByte(green_stats.median()); 00492 color1[COLOR_BLUE] = ClipToByte(blue_stats.median()); 00493 color1[L_ALPHA_CHANNEL] = 0; 00494 memcpy(color2, color1, 4); 00495 } 00496 if (color_map1 != NULL) { 00497 pixSetInRectArbitrary(color_map1, scaled_box, 00498 ComposeRGB(color1[COLOR_RED], 00499 color1[COLOR_GREEN], 00500 color1[COLOR_BLUE])); 00501 pixSetInRectArbitrary(color_map2, scaled_box, 00502 ComposeRGB(color2[COLOR_RED], 00503 color2[COLOR_GREEN], 00504 color2[COLOR_BLUE])); 00505 pixSetInRectArbitrary(rms_map, scaled_box, color1[L_ALPHA_CHANNEL]); 00506 } 00507 pixDestroy(&scaled); 00508 boxDestroy(&scaled_box); 00509 } 00510 00511 // ================ CUTTING POLYGONAL IMAGES FROM A RECTANGLE ================ 00512 // The following functions are responsible for cutting a polygonal image from 00513 // a rectangle: CountPixelsInRotatedBox, AttemptToShrinkBox, CutChunkFromParts 00514 // with DivideImageIntoParts as the master. 00515 // Problem statement: 00516 // We start with a single connected component from the image mask: we get 00517 // a Pix of the component, and its location on the page (im_box). 00518 // The objective of cutting a polygonal image from its rectangle is to avoid 00519 // interfering text, but not text that completely overlaps the image. 00520 // ------------------------------ ------------------------------ 00521 // | Single input partition | | 1 Cut up output partitions | 00522 // | | ------------------------------ 00523 // Av|oid | Avoid | | 00524 // | | |________________________| 00525 // Int|erfering | Interfering | | 00526 // | | _____|__________________| 00527 // T|ext | Text | | 00528 // | Text-on-image | | Text-on-image | 00529 // ------------------------------ -------------------------- 00530 // DivideImageIntoParts does this by building a ColPartition_LIST (not in the 00531 // grid) with each ColPartition representing one of the rectangles needed, 00532 // starting with a single rectangle for the whole image component, and cutting 00533 // bits out of it with CutChunkFromParts as needed to avoid text. The output 00534 // ColPartitions are supposed to be ordered from top to bottom. 00535 00536 // The problem is complicated by the fact that we have rotated the coordinate 00537 // system to make text lines horizontal, so if we need to look at the component 00538 // image, we have to rotate the coordinates. Throughout the functions in this 00539 // section im_box is the rectangle representing the image component in the 00540 // rotated page coordinates (where we are building our output ColPartitions), 00541 // rotation is the rotation that we used to get there, and rerotation is the 00542 // rotation required to get back to original page image coordinates. 00543 // To get to coordinates in the component image, pix, we rotate the im_box, 00544 // the point we want to locate, and subtract the rotated point from the top-left 00545 // of the rotated im_box. 00546 // im_box is therefore essential to calculating coordinates within the pix. 00547 00548 // Returns true if there are no black pixels in between the boxes. 00549 // The im_box must represent the bounding box of the pix in tesseract 00550 // coordinates, which may be negative, due to rotations to make the textlines 00551 // horizontal. The boxes are rotated by rotation, which should undo such 00552 // rotations, before mapping them onto the pix. 00553 bool ImageFind::BlankImageInBetween(const TBOX& box1, const TBOX& box2, 00554 const TBOX& im_box, const FCOORD& rotation, 00555 Pix* pix) { 00556 TBOX search_box(box1); 00557 search_box += box2; 00558 if (box1.x_gap(box2) >= box1.y_gap(box2)) { 00559 if (box1.x_gap(box2) <= 0) 00560 return true; 00561 search_box.set_left(MIN(box1.right(), box2.right())); 00562 search_box.set_right(MAX(box1.left(), box2.left())); 00563 } else { 00564 if (box1.y_gap(box2) <= 0) 00565 return true; 00566 search_box.set_top(MAX(box1.bottom(), box2.bottom())); 00567 search_box.set_bottom(MIN(box1.top(), box2.top())); 00568 } 00569 return CountPixelsInRotatedBox(search_box, im_box, rotation, pix) == 0; 00570 } 00571 00572 // Returns the number of pixels in box in the pix. 00573 // rotation, pix and im_box are defined in the large comment above. 00574 int ImageFind::CountPixelsInRotatedBox(TBOX box, const TBOX& im_box, 00575 const FCOORD& rotation, Pix* pix) { 00576 // Intersect it with the image box. 00577 box &= im_box; // This is in-place box intersection. 00578 if (box.null_box()) 00579 return 0; 00580 box.rotate(rotation); 00581 TBOX rotated_im_box(im_box); 00582 rotated_im_box.rotate(rotation); 00583 Pix* rect_pix = pixCreate(box.width(), box.height(), 1); 00584 pixRasterop(rect_pix, 0, 0, box.width(), box.height(), 00585 PIX_SRC, pix, box.left() - rotated_im_box.left(), 00586 rotated_im_box.top() - box.top()); 00587 l_int32 result; 00588 pixCountPixels(rect_pix, &result, NULL); 00589 pixDestroy(&rect_pix); 00590 return result; 00591 } 00592 00593 // The box given by slice contains some black pixels, but not necessarily 00594 // over the whole box. Shrink the x bounds of slice, but not the y bounds 00595 // until there is at least one black pixel in the outermost columns. 00596 // rotation, rerotation, pix and im_box are defined in the large comment above. 00597 static void AttemptToShrinkBox(const FCOORD& rotation, const FCOORD& rerotation, 00598 const TBOX& im_box, Pix* pix, TBOX* slice) { 00599 TBOX rotated_box(*slice); 00600 rotated_box.rotate(rerotation); 00601 TBOX rotated_im_box(im_box); 00602 rotated_im_box.rotate(rerotation); 00603 int left = rotated_box.left() - rotated_im_box.left(); 00604 int right = rotated_box.right() - rotated_im_box.left(); 00605 int top = rotated_im_box.top() - rotated_box.top(); 00606 int bottom = rotated_im_box.top() - rotated_box.bottom(); 00607 ImageFind::BoundsWithinRect(pix, &left, &top, &right, &bottom); 00608 top = rotated_im_box.top() - top; 00609 bottom = rotated_im_box.top() - bottom; 00610 left += rotated_im_box.left(); 00611 right += rotated_im_box.left(); 00612 rotated_box.set_to_given_coords(left, bottom, right, top); 00613 rotated_box.rotate(rotation); 00614 slice->set_left(rotated_box.left()); 00615 slice->set_right(rotated_box.right()); 00616 } 00617 00618 // The meat of cutting a polygonal image around text. 00619 // This function covers the general case of cutting a box out of a box 00620 // as shown: 00621 // Input Output 00622 // ------------------------------ ------------------------------ 00623 // | Single input partition | | 1 Cut up output partitions | 00624 // | | ------------------------------ 00625 // | ---------- | --------- ---------- 00626 // | | box | | | 2 | box | 3 | 00627 // | | | | | | is cut | | 00628 // | ---------- | --------- out ---------- 00629 // | | ------------------------------ 00630 // | | | 4 | 00631 // ------------------------------ ------------------------------ 00632 // In the context that this function is used, at most 3 of the above output 00633 // boxes will be created, as the overlapping box is never contained by the 00634 // input. 00635 // The above cutting operation is executed for each element of part_list that 00636 // is overlapped by the input box. Each modified ColPartition is replaced 00637 // in place in the list by the output of the cutting operation in the order 00638 // shown above, so iff no holes are ever created, the output will be in 00639 // top-to-bottom order, but in extreme cases, hole creation is possible. 00640 // In such cases, the output order may cause strange block polygons. 00641 // rotation, rerotation, pix and im_box are defined in the large comment above. 00642 static void CutChunkFromParts(const TBOX& box, const TBOX& im_box, 00643 const FCOORD& rotation, const FCOORD& rerotation, 00644 Pix* pix, ColPartition_LIST* part_list) { 00645 ASSERT_HOST(!part_list->empty()); 00646 ColPartition_IT part_it(part_list); 00647 do { 00648 ColPartition* part = part_it.data(); 00649 TBOX part_box = part->bounding_box(); 00650 if (part_box.overlap(box)) { 00651 // This part must be cut and replaced with the remains. There are 00652 // upto 4 pieces to be made. Start with the first one and use 00653 // add_before_stay_put. For each piece if it has no black pixels 00654 // left, just don't make the box. 00655 // Above box. 00656 if (box.top() < part_box.top()) { 00657 TBOX slice(part_box); 00658 slice.set_bottom(box.top()); 00659 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, 00660 pix) > 0) { 00661 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); 00662 part_it.add_before_stay_put( 00663 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, 00664 BTFT_NONTEXT)); 00665 } 00666 } 00667 // Left of box. 00668 if (box.left() > part_box.left()) { 00669 TBOX slice(part_box); 00670 slice.set_right(box.left()); 00671 if (box.top() < part_box.top()) 00672 slice.set_top(box.top()); 00673 if (box.bottom() > part_box.bottom()) 00674 slice.set_bottom(box.bottom()); 00675 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, 00676 pix) > 0) { 00677 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); 00678 part_it.add_before_stay_put( 00679 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, 00680 BTFT_NONTEXT)); 00681 } 00682 } 00683 // Right of box. 00684 if (box.right() < part_box.right()) { 00685 TBOX slice(part_box); 00686 slice.set_left(box.right()); 00687 if (box.top() < part_box.top()) 00688 slice.set_top(box.top()); 00689 if (box.bottom() > part_box.bottom()) 00690 slice.set_bottom(box.bottom()); 00691 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, 00692 pix) > 0) { 00693 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); 00694 part_it.add_before_stay_put( 00695 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, 00696 BTFT_NONTEXT)); 00697 } 00698 } 00699 // Below box. 00700 if (box.bottom() > part_box.bottom()) { 00701 TBOX slice(part_box); 00702 slice.set_top(box.bottom()); 00703 if (ImageFind::CountPixelsInRotatedBox(slice, im_box, rerotation, 00704 pix) > 0) { 00705 AttemptToShrinkBox(rotation, rerotation, im_box, pix, &slice); 00706 part_it.add_before_stay_put( 00707 ColPartition::FakePartition(slice, PT_UNKNOWN, BRT_POLYIMAGE, 00708 BTFT_NONTEXT)); 00709 } 00710 } 00711 part->DeleteBoxes(); 00712 delete part_it.extract(); 00713 } 00714 part_it.forward(); 00715 } while (!part_it.at_first()); 00716 } 00717 00718 // Starts with the bounding box of the image component and cuts it up 00719 // so that it doesn't intersect text where possible. 00720 // Strong fully contained horizontal text is marked as text on image, 00721 // and does not cause a division of the image. 00722 // For more detail see the large comment above on cutting polygonal images 00723 // from a rectangle. 00724 // rotation, rerotation, pix and im_box are defined in the large comment above. 00725 static void DivideImageIntoParts(const TBOX& im_box, const FCOORD& rotation, 00726 const FCOORD& rerotation, Pix* pix, 00727 ColPartitionGridSearch* rectsearch, 00728 ColPartition_LIST* part_list) { 00729 // Add the full im_box partition to the list to begin with. 00730 ColPartition* pix_part = ColPartition::FakePartition(im_box, PT_UNKNOWN, 00731 BRT_RECTIMAGE, 00732 BTFT_NONTEXT); 00733 ColPartition_IT part_it(part_list); 00734 part_it.add_after_then_move(pix_part); 00735 00736 rectsearch->StartRectSearch(im_box); 00737 ColPartition* part; 00738 while ((part = rectsearch->NextRectSearch()) != NULL) { 00739 TBOX part_box = part->bounding_box(); 00740 if (part_box.contains(im_box) && part->flow() >= BTFT_CHAIN) { 00741 // This image is completely covered by an existing text partition. 00742 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) { 00743 ColPartition* pix_part = part_it.extract(); 00744 pix_part->DeleteBoxes(); 00745 delete pix_part; 00746 } 00747 } else if (part->flow() == BTFT_STRONG_CHAIN) { 00748 // Text intersects the box. 00749 TBOX overlap_box = part_box.intersection(im_box); 00750 // Intersect it with the image box. 00751 int black_area = ImageFind::CountPixelsInRotatedBox(overlap_box, im_box, 00752 rerotation, pix); 00753 if (black_area * 2 < part_box.area() || !im_box.contains(part_box)) { 00754 // Eat a piece out of the image. 00755 // Pad it so that pieces eaten out look decent. 00756 int padding = part->blob_type() == BRT_VERT_TEXT 00757 ? part_box.width() : part_box.height(); 00758 part_box.set_top(part_box.top() + padding / 2); 00759 part_box.set_bottom(part_box.bottom() - padding / 2); 00760 CutChunkFromParts(part_box, im_box, rotation, rerotation, 00761 pix, part_list); 00762 } else { 00763 // Strong overlap with the black area, so call it text on image. 00764 part->set_flow(BTFT_TEXT_ON_IMAGE); 00765 } 00766 } 00767 if (part_list->empty()) { 00768 break; 00769 } 00770 } 00771 } 00772 00773 // Search for the rightmost text that overlaps vertically and is to the left 00774 // of the given box, but within the given left limit. 00775 static int ExpandImageLeft(const TBOX& box, int left_limit, 00776 ColPartitionGrid* part_grid) { 00777 ColPartitionGridSearch search(part_grid); 00778 ColPartition* part; 00779 // Search right to left for any text that overlaps. 00780 search.StartSideSearch(box.left(), box.bottom(), box.top()); 00781 while ((part = search.NextSideSearch(true)) != NULL) { 00782 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00783 const TBOX& part_box(part->bounding_box()); 00784 if (part_box.y_gap(box) < 0) { 00785 if (part_box.right() > left_limit && part_box.right() < box.left()) 00786 left_limit = part_box.right(); 00787 break; 00788 } 00789 } 00790 } 00791 if (part != NULL) { 00792 // Search for the nearest text up to the one we already found. 00793 TBOX search_box(left_limit, box.bottom(), box.left(), box.top()); 00794 search.StartRectSearch(search_box); 00795 while ((part = search.NextRectSearch()) != NULL) { 00796 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00797 const TBOX& part_box(part->bounding_box()); 00798 if (part_box.y_gap(box) < 0) { 00799 if (part_box.right() > left_limit && part_box.right() < box.left()) { 00800 left_limit = part_box.right(); 00801 } 00802 } 00803 } 00804 } 00805 } 00806 return left_limit; 00807 } 00808 00809 // Search for the leftmost text that overlaps vertically and is to the right 00810 // of the given box, but within the given right limit. 00811 static int ExpandImageRight(const TBOX& box, int right_limit, 00812 ColPartitionGrid* part_grid) { 00813 ColPartitionGridSearch search(part_grid); 00814 ColPartition* part; 00815 // Search left to right for any text that overlaps. 00816 search.StartSideSearch(box.right(), box.bottom(), box.top()); 00817 while ((part = search.NextSideSearch(false)) != NULL) { 00818 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00819 const TBOX& part_box(part->bounding_box()); 00820 if (part_box.y_gap(box) < 0) { 00821 if (part_box.left() < right_limit && part_box.left() > box.right()) 00822 right_limit = part_box.left(); 00823 break; 00824 } 00825 } 00826 } 00827 if (part != NULL) { 00828 // Search for the nearest text up to the one we already found. 00829 TBOX search_box(box.left(), box.bottom(), right_limit, box.top()); 00830 search.StartRectSearch(search_box); 00831 while ((part = search.NextRectSearch()) != NULL) { 00832 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00833 const TBOX& part_box(part->bounding_box()); 00834 if (part_box.y_gap(box) < 0) { 00835 if (part_box.left() < right_limit && part_box.left() > box.right()) 00836 right_limit = part_box.left(); 00837 } 00838 } 00839 } 00840 } 00841 return right_limit; 00842 } 00843 00844 // Search for the topmost text that overlaps horizontally and is below 00845 // the given box, but within the given bottom limit. 00846 static int ExpandImageBottom(const TBOX& box, int bottom_limit, 00847 ColPartitionGrid* part_grid) { 00848 ColPartitionGridSearch search(part_grid); 00849 ColPartition* part; 00850 // Search right to left for any text that overlaps. 00851 search.StartVerticalSearch(box.left(), box.right(), box.bottom()); 00852 while ((part = search.NextVerticalSearch(true)) != NULL) { 00853 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00854 const TBOX& part_box(part->bounding_box()); 00855 if (part_box.x_gap(box) < 0) { 00856 if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) 00857 bottom_limit = part_box.top(); 00858 break; 00859 } 00860 } 00861 } 00862 if (part != NULL) { 00863 // Search for the nearest text up to the one we already found. 00864 TBOX search_box(box.left(), bottom_limit, box.right(), box.bottom()); 00865 search.StartRectSearch(search_box); 00866 while ((part = search.NextRectSearch()) != NULL) { 00867 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00868 const TBOX& part_box(part->bounding_box()); 00869 if (part_box.x_gap(box) < 0) { 00870 if (part_box.top() > bottom_limit && part_box.top() < box.bottom()) 00871 bottom_limit = part_box.top(); 00872 } 00873 } 00874 } 00875 } 00876 return bottom_limit; 00877 } 00878 00879 // Search for the bottommost text that overlaps horizontally and is above 00880 // the given box, but within the given top limit. 00881 static int ExpandImageTop(const TBOX& box, int top_limit, 00882 ColPartitionGrid* part_grid) { 00883 ColPartitionGridSearch search(part_grid); 00884 ColPartition* part; 00885 // Search right to left for any text that overlaps. 00886 search.StartVerticalSearch(box.left(), box.right(), box.top()); 00887 while ((part = search.NextVerticalSearch(false)) != NULL) { 00888 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00889 const TBOX& part_box(part->bounding_box()); 00890 if (part_box.x_gap(box) < 0) { 00891 if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) 00892 top_limit = part_box.bottom(); 00893 break; 00894 } 00895 } 00896 } 00897 if (part != NULL) { 00898 // Search for the nearest text up to the one we already found. 00899 TBOX search_box(box.left(), box.top(), box.right(), top_limit); 00900 search.StartRectSearch(search_box); 00901 while ((part = search.NextRectSearch()) != NULL) { 00902 if (part->flow() == BTFT_STRONG_CHAIN || part->flow() == BTFT_CHAIN) { 00903 const TBOX& part_box(part->bounding_box()); 00904 if (part_box.x_gap(box) < 0) { 00905 if (part_box.bottom() < top_limit && part_box.bottom() > box.top()) 00906 top_limit = part_box.bottom(); 00907 } 00908 } 00909 } 00910 } 00911 return top_limit; 00912 } 00913 00914 // Expands the image box in the given direction until it hits text, 00915 // limiting the expansion to the given limit box, returning the result 00916 // in the expanded box, and 00917 // returning the increase in area resulting from the expansion. 00918 static int ExpandImageDir(BlobNeighbourDir dir, const TBOX& im_box, 00919 const TBOX& limit_box, 00920 ColPartitionGrid* part_grid, TBOX* expanded_box) { 00921 *expanded_box = im_box; 00922 switch (dir) { 00923 case BND_LEFT: 00924 expanded_box->set_left(ExpandImageLeft(im_box, limit_box.left(), 00925 part_grid)); 00926 break; 00927 case BND_RIGHT: 00928 expanded_box->set_right(ExpandImageRight(im_box, limit_box.right(), 00929 part_grid)); 00930 break; 00931 case BND_ABOVE: 00932 expanded_box->set_top(ExpandImageTop(im_box, limit_box.top(), part_grid)); 00933 break; 00934 case BND_BELOW: 00935 expanded_box->set_bottom(ExpandImageBottom(im_box, limit_box.bottom(), 00936 part_grid)); 00937 break; 00938 default: 00939 return 0; 00940 } 00941 return expanded_box->area() - im_box.area(); 00942 } 00943 00944 // Expands the image partition into any non-text until it touches text. 00945 // The expansion proceeds in the order of increasing increase in area 00946 // as a heuristic to find the best rectangle by expanding in the most 00947 // constrained direction first. 00948 static void MaximalImageBoundingBox(ColPartitionGrid* part_grid, TBOX* im_box) { 00949 bool dunnit[BND_COUNT]; 00950 memset(dunnit, 0, sizeof(dunnit)); 00951 TBOX limit_box(part_grid->bleft().x(), part_grid->bleft().y(), 00952 part_grid->tright().x(), part_grid->tright().y()); 00953 TBOX text_box(*im_box); 00954 for (int iteration = 0; iteration < BND_COUNT; ++iteration) { 00955 // Find the direction with least area increase. 00956 int best_delta = -1; 00957 BlobNeighbourDir best_dir = BND_LEFT; 00958 TBOX expanded_boxes[BND_COUNT]; 00959 for (int dir = 0; dir < BND_COUNT; ++dir) { 00960 BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir); 00961 if (!dunnit[bnd]) { 00962 TBOX expanded_box; 00963 int area_delta = ExpandImageDir(bnd, text_box, limit_box, part_grid, 00964 &expanded_boxes[bnd]); 00965 if (best_delta < 0 || area_delta < best_delta) { 00966 best_delta = area_delta; 00967 best_dir = bnd; 00968 } 00969 } 00970 } 00971 // Run the best and remember the direction. 00972 dunnit[best_dir] = true; 00973 text_box = expanded_boxes[best_dir]; 00974 } 00975 *im_box = text_box; 00976 } 00977 00978 // Helper deletes the given partition but first marks up all the blobs as 00979 // noise, so they get deleted later, and disowns them. 00980 // If the initial type of the partition is image, then it actually deletes 00981 // the blobs, as the partition owns them in that case. 00982 static void DeletePartition(ColPartition* part) { 00983 BlobRegionType type = part->blob_type(); 00984 if (type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) { 00985 // The partition owns the boxes of these types, so just delete them. 00986 part->DeleteBoxes(); // From a previous iteration. 00987 } else { 00988 // Once marked, the blobs will be swept up by TidyBlobs. 00989 part->set_flow(BTFT_NONTEXT); 00990 part->set_blob_type(BRT_NOISE); 00991 part->SetBlobTypes(); 00992 part->DisownBoxes(); // Created before FindImagePartitions. 00993 } 00994 delete part; 00995 } 00996 00997 // The meat of joining fragmented images and consuming ColPartitions of 00998 // uncertain type. 00999 // *part_ptr is an input/output BRT_RECTIMAGE ColPartition that is to be 01000 // expanded to consume overlapping and nearby ColPartitions of uncertain type 01001 // and other BRT_RECTIMAGE partitions, but NOT to be expanded beyond 01002 // max_image_box. *part_ptr is NOT in the part_grid. 01003 // rectsearch is already constructed on the part_grid, and is used for 01004 // searching for overlapping and nearby ColPartitions. 01005 // ExpandImageIntoParts is called iteratively until it returns false. Each 01006 // time it absorbs the nearest non-contained candidate, and everything that 01007 // is fully contained within part_ptr's bounding box. 01008 // TODO(rays) what if it just eats everything inside max_image_box in one go? 01009 static bool ExpandImageIntoParts(const TBOX& max_image_box, 01010 ColPartitionGridSearch* rectsearch, 01011 ColPartitionGrid* part_grid, 01012 ColPartition** part_ptr) { 01013 ColPartition* image_part = *part_ptr; 01014 TBOX im_part_box = image_part->bounding_box(); 01015 if (textord_tabfind_show_images > 1) { 01016 tprintf("Searching for merge with image part:"); 01017 im_part_box.print(); 01018 tprintf("Text box="); 01019 max_image_box.print(); 01020 } 01021 rectsearch->StartRectSearch(max_image_box); 01022 ColPartition* part; 01023 ColPartition* best_part = NULL; 01024 int best_dist = 0; 01025 while ((part = rectsearch->NextRectSearch()) != NULL) { 01026 if (textord_tabfind_show_images > 1) { 01027 tprintf("Considering merge with part:"); 01028 part->Print(); 01029 if (im_part_box.contains(part->bounding_box())) 01030 tprintf("Fully contained\n"); 01031 else if (!max_image_box.contains(part->bounding_box())) 01032 tprintf("Not within text box\n"); 01033 else if (part->flow() == BTFT_STRONG_CHAIN) 01034 tprintf("Too strong text\n"); 01035 else 01036 tprintf("Real candidate\n"); 01037 } 01038 if (part->flow() == BTFT_STRONG_CHAIN || 01039 part->flow() == BTFT_TEXT_ON_IMAGE || 01040 part->blob_type() == BRT_POLYIMAGE) 01041 continue; 01042 TBOX box = part->bounding_box(); 01043 if (max_image_box.contains(box) && part->blob_type() != BRT_NOISE) { 01044 if (im_part_box.contains(box)) { 01045 // Eat it completely. 01046 rectsearch->RemoveBBox(); 01047 DeletePartition(part); 01048 continue; 01049 } 01050 int x_dist = MAX(0, box.x_gap(im_part_box)); 01051 int y_dist = MAX(0, box.y_gap(im_part_box)); 01052 int dist = x_dist * x_dist + y_dist * y_dist; 01053 if (dist > box.area() || dist > im_part_box.area()) 01054 continue; // Not close enough. 01055 if (best_part == NULL || dist < best_dist) { 01056 // We keep the nearest qualifier, which is not necessarily the nearest. 01057 best_part = part; 01058 best_dist = dist; 01059 } 01060 } 01061 } 01062 if (best_part != NULL) { 01063 // It needs expanding. We can do it without touching text. 01064 TBOX box = best_part->bounding_box(); 01065 if (textord_tabfind_show_images > 1) { 01066 tprintf("Merging image part:"); 01067 im_part_box.print(); 01068 tprintf("with part:"); 01069 box.print(); 01070 } 01071 im_part_box += box; 01072 *part_ptr = ColPartition::FakePartition(im_part_box, PT_UNKNOWN, 01073 BRT_RECTIMAGE, 01074 BTFT_NONTEXT); 01075 DeletePartition(image_part); 01076 part_grid->RemoveBBox(best_part); 01077 DeletePartition(best_part); 01078 rectsearch->RepositionIterator(); 01079 return true; 01080 } 01081 return false; 01082 } 01083 01084 // Helper function to compute the overlap area between the box and the 01085 // given list of partitions. 01086 static int IntersectArea(const TBOX& box, ColPartition_LIST* part_list) { 01087 int intersect_area = 0; 01088 ColPartition_IT part_it(part_list); 01089 // Iterate the parts and subtract intersecting area. 01090 for (part_it.mark_cycle_pt(); !part_it.cycled_list(); 01091 part_it.forward()) { 01092 ColPartition* image_part = part_it.data(); 01093 TBOX intersect = box.intersection(image_part->bounding_box()); 01094 intersect_area += intersect.area(); 01095 } 01096 return intersect_area; 01097 } 01098 01099 // part_list is a set of ColPartitions representing a polygonal image, and 01100 // im_box is the union of the bounding boxes of all the parts in part_list. 01101 // Tests whether part is to be consumed by the polygonal image. 01102 // Returns true if part is weak text and more than half of its area is 01103 // intersected by parts from the part_list, and it is contained within im_box. 01104 static bool TestWeakIntersectedPart(const TBOX& im_box, 01105 ColPartition_LIST* part_list, 01106 ColPartition* part) { 01107 if (part->flow() < BTFT_STRONG_CHAIN) { 01108 // A weak partition intersects the box. 01109 TBOX part_box = part->bounding_box(); 01110 if (im_box.contains(part_box)) { 01111 int area = part_box.area(); 01112 int intersect_area = IntersectArea(part_box, part_list); 01113 if (area < 2 * intersect_area) { 01114 return true; 01115 } 01116 } 01117 } 01118 return false; 01119 } 01120 01121 // A rectangular or polygonal image has been completed, in part_list, bounding 01122 // box in im_box. We want to eliminate weak text or other uncertain partitions 01123 // (basically anything that is not BRT_STRONG_CHAIN or better) from both the 01124 // part_grid and the big_parts list that are contained within im_box and 01125 // overlapped enough by the possibly polygonal image. 01126 static void EliminateWeakParts(const TBOX& im_box, 01127 ColPartitionGrid* part_grid, 01128 ColPartition_LIST* big_parts, 01129 ColPartition_LIST* part_list) { 01130 ColPartitionGridSearch rectsearch(part_grid); 01131 ColPartition* part; 01132 rectsearch.StartRectSearch(im_box); 01133 while ((part = rectsearch.NextRectSearch()) != NULL) { 01134 if (TestWeakIntersectedPart(im_box, part_list, part)) { 01135 BlobRegionType type = part->blob_type(); 01136 if (type == BRT_POLYIMAGE || type == BRT_RECTIMAGE) { 01137 rectsearch.RemoveBBox(); 01138 DeletePartition(part); 01139 } else { 01140 // The part is mostly covered, so mark it. Non-image partitions are 01141 // kept hanging around to mark the image for pass2 01142 part->set_flow(BTFT_NONTEXT); 01143 part->set_blob_type(BRT_NOISE); 01144 part->SetBlobTypes(); 01145 } 01146 } 01147 } 01148 ColPartition_IT big_it(big_parts); 01149 for (big_it.mark_cycle_pt(); !big_it.cycled_list(); big_it.forward()) { 01150 part = big_it.data(); 01151 if (TestWeakIntersectedPart(im_box, part_list, part)) { 01152 // Once marked, the blobs will be swept up by TidyBlobs. 01153 DeletePartition(big_it.extract()); 01154 } 01155 } 01156 } 01157 01158 // Helper scans for good text partitions overlapping the given box. 01159 // If there are no good text partitions overlapping an expanded box, then 01160 // the box is expanded, otherwise, the original box is returned. 01161 // If good text overlaps the box, true is returned. 01162 static bool ScanForOverlappingText(ColPartitionGrid* part_grid, TBOX* box) { 01163 ColPartitionGridSearch rectsearch(part_grid); 01164 TBOX padded_box(*box); 01165 padded_box.pad(kNoisePadding, kNoisePadding); 01166 rectsearch.StartRectSearch(padded_box); 01167 ColPartition* part; 01168 bool any_text_in_padded_rect = false; 01169 while ((part = rectsearch.NextRectSearch()) != NULL) { 01170 if (part->flow() == BTFT_CHAIN || 01171 part->flow() == BTFT_STRONG_CHAIN) { 01172 // Text intersects the box. 01173 any_text_in_padded_rect = true; 01174 TBOX part_box = part->bounding_box(); 01175 if (box->overlap(part_box)) { 01176 return true; 01177 } 01178 } 01179 } 01180 if (!any_text_in_padded_rect) 01181 *box = padded_box; 01182 return false; 01183 } 01184 01185 // Renders the boxes of image parts from the supplied list onto the image_pix, 01186 // except where they interfere with existing strong text in the part_grid, 01187 // and then deletes them. 01188 // Box coordinates are rotated by rerotate to match the image. 01189 static void MarkAndDeleteImageParts(const FCOORD& rerotate, 01190 ColPartitionGrid* part_grid, 01191 ColPartition_LIST* image_parts, 01192 Pix* image_pix) { 01193 if (image_pix == NULL) 01194 return; 01195 int imageheight = pixGetHeight(image_pix); 01196 ColPartition_IT part_it(image_parts); 01197 for (; !part_it.empty(); part_it.forward()) { 01198 ColPartition* part = part_it.extract(); 01199 TBOX part_box = part->bounding_box(); 01200 BlobRegionType type = part->blob_type(); 01201 if (!ScanForOverlappingText(part_grid, &part_box) || 01202 type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) { 01203 // Mark the box on the image. 01204 // All coords need to be rotated to match the image. 01205 part_box.rotate(rerotate); 01206 int left = part_box.left(); 01207 int top = part_box.top(); 01208 pixRasterop(image_pix, left, imageheight - top, 01209 part_box.width(), part_box.height(), PIX_SET, NULL, 0, 0); 01210 } 01211 DeletePartition(part); 01212 } 01213 } 01214 01215 // Locates all the image partitions in the part_grid, that were found by a 01216 // previous call to FindImagePartitions, marks them in the image_mask, 01217 // removes them from the grid, and deletes them. This makes it possble to 01218 // call FindImagePartitions again to produce less broken-up and less 01219 // overlapping image partitions. 01220 // rerotation specifies how to rotate the partition coords to match 01221 // the image_mask, since this function is used after orientation correction. 01222 void ImageFind::TransferImagePartsToImageMask(const FCOORD& rerotation, 01223 ColPartitionGrid* part_grid, 01224 Pix* image_mask) { 01225 // Extract the noise parts from the grid and put them on a temporary list. 01226 ColPartition_LIST parts_list; 01227 ColPartition_IT part_it(&parts_list); 01228 ColPartitionGridSearch gsearch(part_grid); 01229 gsearch.StartFullSearch(); 01230 ColPartition* part; 01231 while ((part = gsearch.NextFullSearch()) != NULL) { 01232 BlobRegionType type = part->blob_type(); 01233 if (type == BRT_NOISE || type == BRT_RECTIMAGE || type == BRT_POLYIMAGE) { 01234 part_it.add_after_then_move(part); 01235 gsearch.RemoveBBox(); 01236 } 01237 } 01238 // Render listed noise partitions to the image mask. 01239 MarkAndDeleteImageParts(rerotation, part_grid, &parts_list, image_mask); 01240 } 01241 01242 // Removes and deletes all image partitions that are too small to be worth 01243 // keeping. We have to do this as a separate phase after creating the image 01244 // partitions as the small images are needed to join the larger ones together. 01245 static void DeleteSmallImages(ColPartitionGrid* part_grid) { 01246 if (part_grid != NULL) return; 01247 ColPartitionGridSearch gsearch(part_grid); 01248 gsearch.StartFullSearch(); 01249 ColPartition* part; 01250 while ((part = gsearch.NextFullSearch()) != NULL) { 01251 // Only delete rectangular images, since if it became a poly image, it 01252 // is more evidence that it is somehow important. 01253 if (part->blob_type() == BRT_RECTIMAGE) { 01254 const TBOX& part_box = part->bounding_box(); 01255 if (part_box.width() < kMinImageFindSize || 01256 part_box.height() < kMinImageFindSize) { 01257 // It is too small to keep. Just make it disappear. 01258 gsearch.RemoveBBox(); 01259 DeletePartition(part); 01260 } 01261 } 01262 } 01263 } 01264 01265 // Runs a CC analysis on the image_pix mask image, and creates 01266 // image partitions from them, cutting out strong text, and merging with 01267 // nearby image regions such that they don't interfere with text. 01268 // Rotation and rerotation specify how to rotate image coords to match 01269 // the blob and partition coords and back again. 01270 // The input/output part_grid owns all the created partitions, and 01271 // the partitions own all the fake blobs that belong in the partitions. 01272 // Since the other blobs in the other partitions will be owned by the block, 01273 // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this 01274 // situation and collect the image blobs. 01275 void ImageFind::FindImagePartitions(Pix* image_pix, 01276 const FCOORD& rotation, 01277 const FCOORD& rerotation, 01278 TO_BLOCK* block, 01279 TabFind* tab_grid, 01280 ColPartitionGrid* part_grid, 01281 ColPartition_LIST* big_parts) { 01282 int imageheight = pixGetHeight(image_pix); 01283 Boxa* boxa; 01284 Pixa* pixa; 01285 ConnCompAndRectangularize(image_pix, &boxa, &pixa); 01286 // Iterate the connected components in the image regions mask. 01287 int nboxes = boxaGetCount(boxa); 01288 for (int i = 0; i < nboxes; ++i) { 01289 l_int32 x, y, width, height; 01290 boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height); 01291 Pix* pix = pixaGetPix(pixa, i, L_CLONE); 01292 TBOX im_box(x, imageheight -y - height, x + width, imageheight - y); 01293 im_box.rotate(rotation); // Now matches all partitions and blobs. 01294 ColPartitionGridSearch rectsearch(part_grid); 01295 rectsearch.SetUniqueMode(true); 01296 ColPartition_LIST part_list; 01297 DivideImageIntoParts(im_box, rotation, rerotation, pix, 01298 &rectsearch, &part_list); 01299 if (textord_tabfind_show_images) { 01300 pixWrite("junkimagecomponent.png", pix, IFF_PNG); 01301 tprintf("Component has %d parts\n", part_list.length()); 01302 } 01303 pixDestroy(&pix); 01304 if (!part_list.empty()) { 01305 ColPartition_IT part_it(&part_list); 01306 if (part_list.singleton()) { 01307 // We didn't have to chop it into a polygon to fit around text, so 01308 // try expanding it to merge fragmented image parts, as long as it 01309 // doesn't touch strong text. 01310 ColPartition* part = part_it.extract(); 01311 TBOX text_box(im_box); 01312 MaximalImageBoundingBox(part_grid, &text_box); 01313 while (ExpandImageIntoParts(text_box, &rectsearch, part_grid, &part)); 01314 part_it.set_to_list(&part_list); 01315 part_it.add_after_then_move(part); 01316 im_box = part->bounding_box(); 01317 } 01318 EliminateWeakParts(im_box, part_grid, big_parts, &part_list); 01319 // Iterate the part_list and put the parts into the grid. 01320 for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) { 01321 ColPartition* image_part = part_it.extract(); 01322 im_box = image_part->bounding_box(); 01323 part_grid->InsertBBox(true, true, image_part); 01324 if (!part_it.at_last()) { 01325 ColPartition* neighbour = part_it.data_relative(1); 01326 image_part->AddPartner(false, neighbour); 01327 neighbour->AddPartner(true, image_part); 01328 } 01329 } 01330 } 01331 } 01332 boxaDestroy(&boxa); 01333 pixaDestroy(&pixa); 01334 DeleteSmallImages(part_grid); 01335 if (textord_tabfind_show_images) { 01336 ScrollView* images_win_ = part_grid->MakeWindow(1000, 400, "With Images"); 01337 part_grid->DisplayBoxes(images_win_); 01338 } 01339 } 01340 01341 01342 } // namespace tesseract. 01343