Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: devanagari_processing.cpp 00003 * Description: Methods to process images containing devanagari symbols, 00004 * prior to classification. 00005 * Author: Shobhit Saxena 00006 * Created: Mon Nov 17 20:26:01 IST 2008 00007 * 00008 * (C) Copyright 2008, Google Inc. 00009 ** Licensed under the Apache License, Version 2.0 (the "License"); 00010 ** you may not use this file except in compliance with the License. 00011 ** You may obtain a copy of the License at 00012 ** http://www.apache.org/licenses/LICENSE-2.0 00013 ** Unless required by applicable law or agreed to in writing, software 00014 ** distributed under the License is distributed on an "AS IS" BASIS, 00015 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 ** See the License for the specific language governing permissions and 00017 ** limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #include "devanagari_processing.h" 00022 #include "allheaders.h" 00023 #include "tordmain.h" 00024 #include "img.h" 00025 #include "statistc.h" 00026 00027 // Flags controlling the debugging information for shiro-rekha splitting 00028 // strategies. 00029 INT_VAR(devanagari_split_debuglevel, 0, 00030 "Debug level for split shiro-rekha process."); 00031 00032 BOOL_VAR(devanagari_split_debugimage, 0, 00033 "Whether to create a debug image for split shiro-rekha process."); 00034 00035 namespace tesseract { 00036 00037 ShiroRekhaSplitter::ShiroRekhaSplitter() { 00038 orig_pix_ = NULL; 00039 segmentation_block_list_ = NULL; 00040 splitted_image_ = NULL; 00041 global_xheight_ = kUnspecifiedXheight; 00042 perform_close_ = false; 00043 debug_image_ = NULL; 00044 pageseg_split_strategy_ = NO_SPLIT; 00045 ocr_split_strategy_ = NO_SPLIT; 00046 } 00047 00048 ShiroRekhaSplitter::~ShiroRekhaSplitter() { 00049 Clear(); 00050 } 00051 00052 void ShiroRekhaSplitter::Clear() { 00053 pixDestroy(&orig_pix_); 00054 pixDestroy(&splitted_image_); 00055 pageseg_split_strategy_ = NO_SPLIT; 00056 ocr_split_strategy_ = NO_SPLIT; 00057 pixDestroy(&debug_image_); 00058 segmentation_block_list_ = NULL; 00059 global_xheight_ = kUnspecifiedXheight; 00060 perform_close_ = false; 00061 } 00062 00063 // This method dumps a debug image to the specified location. 00064 void ShiroRekhaSplitter::DumpDebugImage(const char* filename) const { 00065 pixWrite(filename, debug_image_, IFF_PNG); 00066 } 00067 00068 // On setting the input image, a clone of it is owned by this class. 00069 void ShiroRekhaSplitter::set_orig_pix(Pix* pix) { 00070 if (orig_pix_) { 00071 pixDestroy(&orig_pix_); 00072 } 00073 orig_pix_ = pixClone(pix); 00074 } 00075 00076 // Top-level method to perform splitting based on current settings. 00077 // Returns true if a split was actually performed. 00078 // split_for_pageseg should be true if the splitting is being done prior to 00079 // page segmentation. This mode uses the flag 00080 // pageseg_devanagari_split_strategy to determine the splitting strategy. 00081 bool ShiroRekhaSplitter::Split(bool split_for_pageseg) { 00082 SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : 00083 ocr_split_strategy_; 00084 if (split_strategy == NO_SPLIT) { 00085 return false; // Nothing to do. 00086 } 00087 ASSERT_HOST(split_strategy == MINIMAL_SPLIT || 00088 split_strategy == MAXIMAL_SPLIT); 00089 ASSERT_HOST(orig_pix_); 00090 if (devanagari_split_debuglevel > 0) { 00091 tprintf("Splitting shiro-rekha ...\n"); 00092 tprintf("Split strategy = %s\n", 00093 split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal"); 00094 tprintf("Initial pageseg available = %s\n", 00095 segmentation_block_list_ ? "yes" : "no"); 00096 } 00097 // Create a copy of original image to store the splitting output. 00098 pixDestroy(&splitted_image_); 00099 splitted_image_ = pixCopy(NULL, orig_pix_); 00100 00101 // Initialize debug image if required. 00102 if (devanagari_split_debugimage) { 00103 pixDestroy(&debug_image_); 00104 debug_image_ = pixConvertTo32(orig_pix_); 00105 } 00106 00107 // Determine all connected components in the input image. A close operation 00108 // may be required prior to this, depending on the current settings. 00109 Pix* pix_for_ccs = pixClone(orig_pix_); 00110 if (perform_close_ && global_xheight_ != kUnspecifiedXheight && 00111 !segmentation_block_list_) { 00112 if (devanagari_split_debuglevel > 0) { 00113 tprintf("Performing a global close operation..\n"); 00114 } 00115 // A global measure is available for xheight, but no local information 00116 // exists. 00117 pixDestroy(&pix_for_ccs); 00118 pix_for_ccs = pixCopy(NULL, orig_pix_); 00119 PerformClose(pix_for_ccs, global_xheight_); 00120 } 00121 Pixa* ccs; 00122 Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8); 00123 boxaDestroy(&tmp_boxa); 00124 pixDestroy(&pix_for_ccs); 00125 00126 // Iterate over all connected components. Get their bounding boxes and clip 00127 // out the image regions corresponding to these boxes from the original image. 00128 // Conditionally run splitting on each of them. 00129 Boxa* regions_to_clear = boxaCreate(0); 00130 for (int i = 0; i < pixaGetCount(ccs); ++i) { 00131 Box* box = ccs->boxa->box[i]; 00132 Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL); 00133 ASSERT_HOST(word_pix); 00134 int xheight = GetXheightForCC(box); 00135 if (xheight == kUnspecifiedXheight && segmentation_block_list_ && 00136 devanagari_split_debugimage) { 00137 pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0); 00138 } 00139 // If some xheight measure is available, attempt to pre-eliminate small 00140 // blobs from the shiro-rekha process. This is primarily to save the CCs 00141 // corresponding to punctuation marks/small dots etc which are part of 00142 // larger graphemes. 00143 if (xheight == kUnspecifiedXheight || 00144 (box->w > xheight / 3 && box->h > xheight / 2)) { 00145 SplitWordShiroRekha(split_strategy, word_pix, xheight, 00146 box->x, box->y, regions_to_clear); 00147 } else if (devanagari_split_debuglevel > 0) { 00148 tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", 00149 box->x, box->y, box->w, box->h); 00150 } 00151 pixDestroy(&word_pix); 00152 } 00153 // Actually clear the boxes now. 00154 for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) { 00155 Box* box = boxaGetBox(regions_to_clear, i, L_CLONE); 00156 pixClearInRect(splitted_image_, box); 00157 boxDestroy(&box); 00158 } 00159 boxaDestroy(®ions_to_clear); 00160 pixaDestroy(&ccs); 00161 if (devanagari_split_debugimage) { 00162 DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" : 00163 "ocr_split_debug.png"); 00164 } 00165 return true; 00166 } 00167 00168 // Method to perform a close operation on the input image. The xheight 00169 // estimate decides the size of sel used. 00170 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) { 00171 pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3); 00172 } 00173 00174 // This method resolves the cc bbox to a particular row and returns the row's 00175 // xheight. 00176 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) { 00177 if (!segmentation_block_list_) { 00178 return global_xheight_; 00179 } 00180 // Compute the box coordinates in Tesseract's coordinate system. 00181 TBOX bbox(cc_bbox->x, 00182 pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1, 00183 cc_bbox->x + cc_bbox->w, 00184 pixGetHeight(orig_pix_) - cc_bbox->y - 1); 00185 // Iterate over all blocks. 00186 BLOCK_IT block_it(segmentation_block_list_); 00187 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00188 BLOCK* block = block_it.data(); 00189 // Iterate over all rows in the block. 00190 ROW_IT row_it(block->row_list()); 00191 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00192 ROW* row = row_it.data(); 00193 if (!row->bounding_box().major_overlap(bbox)) { 00194 continue; 00195 } 00196 // Row could be skewed, warped, etc. Use the position of the box to 00197 // determine the baseline position of the row for that x-coordinate. 00198 // Create a square TBOX whose baseline's mid-point lies at this point 00199 // and side is row's xheight. Take the overlap of this box with the input 00200 // box and check if it is a 'major overlap'. If so, this box lies in this 00201 // row. In that case, return the xheight for this row. 00202 float box_middle = 0.5 * (bbox.left() + bbox.right()); 00203 int baseline = static_cast<int>(row->base_line(box_middle) + 0.5); 00204 TBOX test_box(box_middle - row->x_height() / 2, 00205 baseline, 00206 box_middle + row->x_height() / 2, 00207 static_cast<int>(baseline + row->x_height())); 00208 // Compute overlap. If it is is a major overlap, this is the right row. 00209 if (bbox.major_overlap(test_box)) { 00210 return row->x_height(); 00211 } 00212 } 00213 } 00214 // No row found for this bbox. 00215 return kUnspecifiedXheight; 00216 } 00217 00218 // Returns a list of regions (boxes) which should be cleared in the original 00219 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one 00220 // (or less) word only. Xheight measure could be the global estimate, the row 00221 // estimate, or unspecified. If unspecified, over splitting may occur, since a 00222 // conservative estimate of stroke width along with an associated multiplier 00223 // is used in its place. It is advisable to have a specified xheight when 00224 // splitting for classification/training. 00225 // A vertical projection histogram of all the on-pixels in the input pix is 00226 // computed. The maxima of this histogram is regarded as an approximate location 00227 // of the shiro-rekha. By descending on the maxima's peak on both sides, 00228 // stroke width of shiro-rekha is estimated. 00229 // A horizontal projection histogram is computed for a sub-image of the input 00230 // image, which extends from just below the shiro-rekha down to a certain 00231 // leeway. The leeway depends on the input xheight, if provided, else a 00232 // conservative multiplier on approximate stroke width is used (which may lead 00233 // to over-splitting). 00234 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, 00235 Pix* pix, 00236 int xheight, 00237 int word_left, 00238 int word_top, 00239 Boxa* regions_to_clear) { 00240 if (split_strategy == NO_SPLIT) { 00241 return; 00242 } 00243 int width = pixGetWidth(pix); 00244 int height = pixGetHeight(pix); 00245 // Statistically determine the yextents of the shiro-rekha. 00246 int shirorekha_top, shirorekha_bottom, shirorekha_ylevel; 00247 GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, 00248 &shirorekha_ylevel); 00249 // Since the shiro rekha is also a stroke, its width is equal to the stroke 00250 // width. 00251 int stroke_width = shirorekha_bottom - shirorekha_top + 1; 00252 00253 // Some safeguards to protect CCs we do not want to be split. 00254 // These are particularly useful when the word wasn't eliminated earlier 00255 // because xheight information was unavailable. 00256 if (shirorekha_ylevel > height / 2) { 00257 // Shirorekha shouldn't be in the bottom half of the word. 00258 if (devanagari_split_debuglevel > 0) { 00259 tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", 00260 word_left, word_top); 00261 } 00262 return; 00263 } 00264 if (stroke_width > height / 3) { 00265 // Even the boldest of fonts shouldn't do this. 00266 if (devanagari_split_debuglevel > 0) { 00267 tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", 00268 word_left, word_top); 00269 } 00270 return; 00271 } 00272 00273 // Clear the ascender and descender regions of the word. 00274 // Obtain a vertical projection histogram for the resulting image. 00275 Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, 00276 width, 5 * stroke_width / 3); 00277 Pix* word_in_xheight = pixCopy(NULL, pix); 00278 pixClearInRect(word_in_xheight, box_to_clear); 00279 // Also clear any pixels which are below shirorekha_bottom + some leeway. 00280 // The leeway is set to xheight if the information is available, else it is a 00281 // multiplier applied to the stroke width. 00282 int leeway_to_keep = stroke_width * 3; 00283 if (xheight != kUnspecifiedXheight) { 00284 // This is because the xheight-region typically includes the shiro-rekha 00285 // inside it, i.e., the top of the xheight range corresponds to the top of 00286 // shiro-rekha. 00287 leeway_to_keep = xheight - stroke_width; 00288 } 00289 box_to_clear->y = shirorekha_bottom + leeway_to_keep; 00290 box_to_clear->h = height - box_to_clear->y; 00291 pixClearInRect(word_in_xheight, box_to_clear); 00292 boxDestroy(&box_to_clear); 00293 00294 PixelHistogram vert_hist; 00295 vert_hist.ConstructVerticalCountHist(word_in_xheight); 00296 pixDestroy(&word_in_xheight); 00297 00298 // If the number of black pixel in any column of the image is less than a 00299 // fraction of the stroke width, treat it as noise / a stray mark. Perform 00300 // these changes inside the vert_hist data itself, as that is used later on as 00301 // a bit vector for the final split decision at every column. 00302 for (int i = 0; i < width; ++i) { 00303 if (vert_hist.hist()[i] <= stroke_width / 4) 00304 vert_hist.hist()[i] = 0; 00305 else 00306 vert_hist.hist()[i] = 1; 00307 } 00308 // In order to split the line at any point, we make sure that the width of the 00309 // gap is atleast half the stroke width. 00310 int i = 0; 00311 int cur_component_width = 0; 00312 while (i < width) { 00313 if (!vert_hist.hist()[i]) { 00314 int j = 0; 00315 while (i + j < width && !vert_hist.hist()[i+j]) 00316 ++j; 00317 if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) { 00318 // Perform a shiro-rekha split. The intervening region lies from i to 00319 // i+j-1. 00320 // A minimal single-pixel split makes the estimation of intra- and 00321 // inter-word spacing easier during page layout analysis, 00322 // whereas a maximal split may be needed for OCR, depending on 00323 // how the engine was trained. 00324 bool minimal_split = (split_strategy == MINIMAL_SPLIT); 00325 int split_width = minimal_split ? 1 : j; 00326 int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i; 00327 if (!minimal_split || (i != 0 && i + j != width)) { 00328 Box* box_to_clear = 00329 boxCreate(word_left + split_left, 00330 word_top + shirorekha_top - stroke_width / 3, 00331 split_width, 00332 5 * stroke_width / 3); 00333 if (box_to_clear) { 00334 boxaAddBox(regions_to_clear, box_to_clear, L_CLONE); 00335 // Mark this in the debug image if needed. 00336 if (devanagari_split_debugimage) { 00337 pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128); 00338 } 00339 boxDestroy(&box_to_clear); 00340 cur_component_width = 0; 00341 } 00342 } 00343 } 00344 i += j; 00345 } else { 00346 ++i; 00347 ++cur_component_width; 00348 } 00349 } 00350 } 00351 00352 // Refreshes the words in the segmentation block list by using blobs in the 00353 // input block list. 00354 // The segmentation block list must be set. 00355 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs( 00356 C_BLOB_LIST* new_blobs) { 00357 // The segmentation block list must have been specified. 00358 ASSERT_HOST(segmentation_block_list_); 00359 if (devanagari_split_debuglevel > 0) { 00360 tprintf("Before refreshing blobs:\n"); 00361 PrintSegmentationStats(segmentation_block_list_); 00362 tprintf("New Blobs found: %d\n", new_blobs->length()); 00363 } 00364 00365 C_BLOB_LIST not_found_blobs; 00366 RefreshWordBlobsFromNewBlobs(segmentation_block_list_, 00367 new_blobs, 00368 ((devanagari_split_debugimage && debug_image_) ? 00369 ¬_found_blobs : NULL)); 00370 00371 if (devanagari_split_debuglevel > 0) { 00372 tprintf("After refreshing blobs:\n"); 00373 PrintSegmentationStats(segmentation_block_list_); 00374 } 00375 if (devanagari_split_debugimage && debug_image_) { 00376 // Plot out the original blobs for which no match was found in the new 00377 // all_blobs list. 00378 C_BLOB_IT not_found_it(¬_found_blobs); 00379 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); 00380 not_found_it.forward()) { 00381 C_BLOB* not_found = not_found_it.data(); 00382 TBOX not_found_box = not_found->bounding_box(); 00383 Box* box_to_plot = GetBoxForTBOX(not_found_box); 00384 pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255); 00385 boxDestroy(&box_to_plot); 00386 } 00387 00388 // Plot out the blobs unused from all blobs. 00389 C_BLOB_IT all_blobs_it(new_blobs); 00390 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); 00391 all_blobs_it.forward()) { 00392 C_BLOB* a_blob = all_blobs_it.data(); 00393 Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box()); 00394 pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0); 00395 boxDestroy(&box_to_plot); 00396 } 00397 } 00398 } 00399 00400 // Returns a new box object for the corresponding TBOX, based on the original 00401 // image's coordinate system. 00402 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const { 00403 return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, 00404 tbox.width(), tbox.height()); 00405 } 00406 00407 // This method returns the computed mode-height of blobs in the pix. 00408 // It also prunes very small blobs from calculation. 00409 int ShiroRekhaSplitter::GetModeHeight(Pix* pix) { 00410 Boxa* boxa = pixConnComp(pix, NULL, 8); 00411 STATS heights(0, pixGetHeight(pix)); 00412 heights.clear(); 00413 for (int i = 0; i < boxaGetCount(boxa); ++i) { 00414 Box* box = boxaGetBox(boxa, i, L_CLONE); 00415 if (box->h >= 3 || box->w >= 3) { 00416 heights.add(box->h, 1); 00417 } 00418 boxDestroy(&box); 00419 } 00420 boxaDestroy(&boxa); 00421 return heights.mode(); 00422 } 00423 00424 // This method returns y-extents of the shiro-rekha computed from the input 00425 // word image. 00426 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix, 00427 int* shirorekha_top, 00428 int* shirorekha_bottom, 00429 int* shirorekha_ylevel) { 00430 // Compute a histogram from projecting the word on a vertical line. 00431 PixelHistogram hist_horiz; 00432 hist_horiz.ConstructHorizontalCountHist(word_pix); 00433 // Get the ylevel where the top-line exists. This is basically the global 00434 // maxima in the horizontal histogram. 00435 int topline_onpixel_count = 0; 00436 int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count); 00437 00438 // Get the upper and lower extents of the shiro rekha. 00439 int thresh = (topline_onpixel_count * 70) / 100; 00440 int ulimit = topline_ylevel; 00441 int llimit = topline_ylevel; 00442 while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) 00443 --ulimit; 00444 while (llimit < word_pix->h && hist_horiz.hist()[llimit] >= thresh) 00445 ++llimit; 00446 00447 if (shirorekha_top) *shirorekha_top = ulimit; 00448 if (shirorekha_bottom) *shirorekha_bottom = llimit; 00449 if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel; 00450 } 00451 00452 // This method returns the global-maxima for the histogram. The frequency of 00453 // the global maxima is returned in count, if specified. 00454 int PixelHistogram::GetHistogramMaximum(int* count) const { 00455 int best_value = 0; 00456 for (int i = 0; i < length_; ++i) { 00457 if (hist_[i] > hist_[best_value]) { 00458 best_value = i; 00459 } 00460 } 00461 if (count) { 00462 *count = hist_[best_value]; 00463 } 00464 return best_value; 00465 } 00466 00467 // Methods to construct histograms from images. 00468 void PixelHistogram::ConstructVerticalCountHist(Pix* pix) { 00469 Clear(); 00470 int width = pixGetWidth(pix); 00471 int height = pixGetHeight(pix); 00472 hist_ = new int[width]; 00473 length_ = width; 00474 int wpl = pixGetWpl(pix); 00475 l_uint32 *data = pixGetData(pix); 00476 for (int i = 0; i < width; ++i) 00477 hist_[i] = 0; 00478 for (int i = 0; i < height; ++i) { 00479 l_uint32 *line = data + i * wpl; 00480 for (int j = 0; j < width; ++j) 00481 if (GET_DATA_BIT(line, j)) 00482 ++(hist_[j]); 00483 } 00484 } 00485 00486 void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) { 00487 Clear(); 00488 Numa* counts = pixCountPixelsByRow(pix, NULL); 00489 length_ = numaGetCount(counts); 00490 hist_ = new int[length_]; 00491 for (int i = 0; i < length_; ++i) { 00492 l_int32 val = 0; 00493 numaGetIValue(counts, i, &val); 00494 hist_[i] = val; 00495 } 00496 numaDestroy(&counts); 00497 } 00498 00499 } // namespace tesseract.