Tesseract  3.02
tesseract-ocr/textord/devanagari_processing.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        devanagari_processing.cpp
00003  * Description: Methods to process images containing devanagari symbols,
00004  *              prior to classification.
00005  * Author:      Shobhit Saxena
00006  * Created:     Mon Nov 17 20:26:01 IST 2008
00007  *
00008  * (C) Copyright 2008, Google Inc.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #include "devanagari_processing.h"
00022 #include "allheaders.h"
00023 #include "tordmain.h"
00024 #include "img.h"
00025 #include "statistc.h"
00026 
00027 // Flags controlling the debugging information for shiro-rekha splitting
00028 // strategies.
00029 INT_VAR(devanagari_split_debuglevel, 0,
00030         "Debug level for split shiro-rekha process.");
00031 
00032 BOOL_VAR(devanagari_split_debugimage, 0,
00033          "Whether to create a debug image for split shiro-rekha process.");
00034 
00035 namespace tesseract {
00036 
00037 ShiroRekhaSplitter::ShiroRekhaSplitter() {
00038   orig_pix_ = NULL;
00039   segmentation_block_list_ = NULL;
00040   splitted_image_ = NULL;
00041   global_xheight_ = kUnspecifiedXheight;
00042   perform_close_ = false;
00043   debug_image_ = NULL;
00044   pageseg_split_strategy_ = NO_SPLIT;
00045   ocr_split_strategy_ = NO_SPLIT;
00046 }
00047 
00048 ShiroRekhaSplitter::~ShiroRekhaSplitter() {
00049   Clear();
00050 }
00051 
00052 void ShiroRekhaSplitter::Clear() {
00053   pixDestroy(&orig_pix_);
00054   pixDestroy(&splitted_image_);
00055   pageseg_split_strategy_ = NO_SPLIT;
00056   ocr_split_strategy_ = NO_SPLIT;
00057   pixDestroy(&debug_image_);
00058   segmentation_block_list_ = NULL;
00059   global_xheight_ = kUnspecifiedXheight;
00060   perform_close_ = false;
00061 }
00062 
00063 // This method dumps a debug image to the specified location.
00064 void ShiroRekhaSplitter::DumpDebugImage(const char* filename) const {
00065   pixWrite(filename, debug_image_, IFF_PNG);
00066 }
00067 
00068 // On setting the input image, a clone of it is owned by this class.
00069 void ShiroRekhaSplitter::set_orig_pix(Pix* pix) {
00070   if (orig_pix_) {
00071     pixDestroy(&orig_pix_);
00072   }
00073   orig_pix_ = pixClone(pix);
00074 }
00075 
00076 // Top-level method to perform splitting based on current settings.
00077 // Returns true if a split was actually performed.
00078 // split_for_pageseg should be true if the splitting is being done prior to
00079 // page segmentation. This mode uses the flag
00080 // pageseg_devanagari_split_strategy to determine the splitting strategy.
00081 bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
00082   SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
00083       ocr_split_strategy_;
00084   if (split_strategy == NO_SPLIT) {
00085     return false;  // Nothing to do.
00086   }
00087   ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
00088               split_strategy == MAXIMAL_SPLIT);
00089   ASSERT_HOST(orig_pix_);
00090   if (devanagari_split_debuglevel > 0) {
00091     tprintf("Splitting shiro-rekha ...\n");
00092     tprintf("Split strategy = %s\n",
00093             split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
00094     tprintf("Initial pageseg available = %s\n",
00095             segmentation_block_list_ ? "yes" : "no");
00096   }
00097   // Create a copy of original image to store the splitting output.
00098   pixDestroy(&splitted_image_);
00099   splitted_image_ = pixCopy(NULL, orig_pix_);
00100 
00101   // Initialize debug image if required.
00102   if (devanagari_split_debugimage) {
00103     pixDestroy(&debug_image_);
00104     debug_image_ = pixConvertTo32(orig_pix_);
00105   }
00106 
00107   // Determine all connected components in the input image. A close operation
00108   // may be required prior to this, depending on the current settings.
00109   Pix* pix_for_ccs = pixClone(orig_pix_);
00110   if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
00111       !segmentation_block_list_) {
00112     if (devanagari_split_debuglevel > 0) {
00113       tprintf("Performing a global close operation..\n");
00114     }
00115     // A global measure is available for xheight, but no local information
00116     // exists.
00117     pixDestroy(&pix_for_ccs);
00118     pix_for_ccs = pixCopy(NULL, orig_pix_);
00119     PerformClose(pix_for_ccs, global_xheight_);
00120   }
00121   Pixa* ccs;
00122   Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
00123   boxaDestroy(&tmp_boxa);
00124   pixDestroy(&pix_for_ccs);
00125 
00126   // Iterate over all connected components. Get their bounding boxes and clip
00127   // out the image regions corresponding to these boxes from the original image.
00128   // Conditionally run splitting on each of them.
00129   Boxa* regions_to_clear = boxaCreate(0);
00130   for (int i = 0; i < pixaGetCount(ccs); ++i) {
00131     Box* box = ccs->boxa->box[i];
00132     Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
00133     ASSERT_HOST(word_pix);
00134     int xheight = GetXheightForCC(box);
00135     if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
00136         devanagari_split_debugimage) {
00137       pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
00138     }
00139     // If some xheight measure is available, attempt to pre-eliminate small
00140     // blobs from the shiro-rekha process. This is primarily to save the CCs
00141     // corresponding to punctuation marks/small dots etc which are part of
00142     // larger graphemes.
00143     if (xheight == kUnspecifiedXheight ||
00144         (box->w > xheight / 3 && box->h > xheight / 2)) {
00145       SplitWordShiroRekha(split_strategy, word_pix, xheight,
00146                           box->x, box->y, regions_to_clear);
00147     } else if (devanagari_split_debuglevel > 0) {
00148       tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
00149               box->x, box->y, box->w, box->h);
00150     }
00151     pixDestroy(&word_pix);
00152   }
00153   // Actually clear the boxes now.
00154   for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
00155     Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
00156     pixClearInRect(splitted_image_, box);
00157     boxDestroy(&box);
00158   }
00159   boxaDestroy(&regions_to_clear);
00160   pixaDestroy(&ccs);
00161   if (devanagari_split_debugimage) {
00162     DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" :
00163                    "ocr_split_debug.png");
00164   }
00165   return true;
00166 }
00167 
00168 // Method to perform a close operation on the input image. The xheight
00169 // estimate decides the size of sel used.
00170 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
00171   pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
00172 }
00173 
00174 // This method resolves the cc bbox to a particular row and returns the row's
00175 // xheight.
00176 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
00177   if (!segmentation_block_list_) {
00178     return global_xheight_;
00179   }
00180   // Compute the box coordinates in Tesseract's coordinate system.
00181   TBOX bbox(cc_bbox->x,
00182             pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
00183             cc_bbox->x + cc_bbox->w,
00184             pixGetHeight(orig_pix_) - cc_bbox->y - 1);
00185   // Iterate over all blocks.
00186   BLOCK_IT block_it(segmentation_block_list_);
00187   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00188     BLOCK* block = block_it.data();
00189     // Iterate over all rows in the block.
00190     ROW_IT row_it(block->row_list());
00191     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00192       ROW* row = row_it.data();
00193       if (!row->bounding_box().major_overlap(bbox)) {
00194         continue;
00195       }
00196       // Row could be skewed, warped, etc. Use the position of the box to
00197       // determine the baseline position of the row for that x-coordinate.
00198       // Create a square TBOX whose baseline's mid-point lies at this point
00199       // and side is row's xheight. Take the overlap of this box with the input
00200       // box and check if it is a 'major overlap'. If so, this box lies in this
00201       // row. In that case, return the xheight for this row.
00202       float box_middle = 0.5 * (bbox.left() + bbox.right());
00203       int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
00204       TBOX test_box(box_middle - row->x_height() / 2,
00205                     baseline,
00206                     box_middle + row->x_height() / 2,
00207                     static_cast<int>(baseline + row->x_height()));
00208       // Compute overlap. If it is is a major overlap, this is the right row.
00209       if (bbox.major_overlap(test_box)) {
00210         return row->x_height();
00211       }
00212     }
00213   }
00214   // No row found for this bbox.
00215   return kUnspecifiedXheight;
00216 }
00217 
00218 // Returns a list of regions (boxes) which should be cleared in the original
00219 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one
00220 // (or less) word only. Xheight measure could be the global estimate, the row
00221 // estimate, or unspecified. If unspecified, over splitting may occur, since a
00222 // conservative estimate of stroke width along with an associated multiplier
00223 // is used in its place. It is advisable to have a specified xheight when
00224 // splitting for classification/training.
00225 // A vertical projection histogram of all the on-pixels in the input pix is
00226 // computed. The maxima of this histogram is regarded as an approximate location
00227 // of the shiro-rekha. By descending on the maxima's peak on both sides,
00228 // stroke width of shiro-rekha is estimated.
00229 // A horizontal projection histogram is computed for a sub-image of the input
00230 // image, which extends from just below the shiro-rekha down to a certain
00231 // leeway. The leeway depends on the input xheight, if provided, else a
00232 // conservative multiplier on approximate stroke width is used (which may lead
00233 // to over-splitting).
00234 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
00235                                              Pix* pix,
00236                                              int xheight,
00237                                              int word_left,
00238                                              int word_top,
00239                                              Boxa* regions_to_clear) {
00240   if (split_strategy == NO_SPLIT) {
00241     return;
00242   }
00243   int width = pixGetWidth(pix);
00244   int height = pixGetHeight(pix);
00245   // Statistically determine the yextents of the shiro-rekha.
00246   int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
00247   GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
00248                         &shirorekha_ylevel);
00249   // Since the shiro rekha is also a stroke, its width is equal to the stroke
00250   // width.
00251   int stroke_width = shirorekha_bottom - shirorekha_top + 1;
00252 
00253   // Some safeguards to protect CCs we do not want to be split.
00254   // These are particularly useful when the word wasn't eliminated earlier
00255   // because xheight information was unavailable.
00256   if (shirorekha_ylevel > height / 2) {
00257     // Shirorekha shouldn't be in the bottom half of the word.
00258     if (devanagari_split_debuglevel > 0) {
00259       tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
00260               word_left, word_top);
00261     }
00262     return;
00263   }
00264   if (stroke_width > height / 3) {
00265     // Even the boldest of fonts shouldn't do this.
00266     if (devanagari_split_debuglevel > 0) {
00267       tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
00268               word_left, word_top);
00269     }
00270     return;
00271   }
00272 
00273   // Clear the ascender and descender regions of the word.
00274   // Obtain a vertical projection histogram for the resulting image.
00275   Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
00276                                 width, 5 * stroke_width / 3);
00277   Pix* word_in_xheight = pixCopy(NULL, pix);
00278   pixClearInRect(word_in_xheight, box_to_clear);
00279   // Also clear any pixels which are below shirorekha_bottom + some leeway.
00280   // The leeway is set to xheight if the information is available, else it is a
00281   // multiplier applied to the stroke width.
00282   int leeway_to_keep = stroke_width * 3;
00283   if (xheight != kUnspecifiedXheight) {
00284     // This is because the xheight-region typically includes the shiro-rekha
00285     // inside it, i.e., the top of the xheight range corresponds to the top of
00286     // shiro-rekha.
00287     leeway_to_keep = xheight - stroke_width;
00288   }
00289   box_to_clear->y = shirorekha_bottom + leeway_to_keep;
00290   box_to_clear->h = height - box_to_clear->y;
00291   pixClearInRect(word_in_xheight, box_to_clear);
00292   boxDestroy(&box_to_clear);
00293 
00294   PixelHistogram vert_hist;
00295   vert_hist.ConstructVerticalCountHist(word_in_xheight);
00296   pixDestroy(&word_in_xheight);
00297 
00298   // If the number of black pixel in any column of the image is less than a
00299   // fraction of the stroke width, treat it as noise / a stray mark. Perform
00300   // these changes inside the vert_hist data itself, as that is used later on as
00301   // a bit vector for the final split decision at every column.
00302   for (int i = 0; i < width; ++i) {
00303     if (vert_hist.hist()[i] <= stroke_width / 4)
00304       vert_hist.hist()[i] = 0;
00305     else
00306       vert_hist.hist()[i] = 1;
00307   }
00308   // In order to split the line at any point, we make sure that the width of the
00309   // gap is atleast half the stroke width.
00310   int i = 0;
00311   int cur_component_width = 0;
00312   while (i < width) {
00313     if (!vert_hist.hist()[i]) {
00314       int j = 0;
00315       while (i + j < width && !vert_hist.hist()[i+j])
00316         ++j;
00317       if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
00318         // Perform a shiro-rekha split. The intervening region lies from i to
00319         // i+j-1.
00320         // A minimal single-pixel split makes the estimation of intra- and
00321         // inter-word spacing easier during page layout analysis,
00322         // whereas a maximal split may be needed for OCR, depending on
00323         // how the engine was trained.
00324         bool minimal_split = (split_strategy == MINIMAL_SPLIT);
00325         int split_width = minimal_split ? 1 : j;
00326         int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
00327         if (!minimal_split || (i != 0 && i + j != width)) {
00328           Box* box_to_clear =
00329               boxCreate(word_left + split_left,
00330                         word_top + shirorekha_top - stroke_width / 3,
00331                         split_width,
00332                         5 * stroke_width / 3);
00333           if (box_to_clear) {
00334             boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
00335             // Mark this in the debug image if needed.
00336             if (devanagari_split_debugimage) {
00337               pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
00338             }
00339             boxDestroy(&box_to_clear);
00340             cur_component_width = 0;
00341           }
00342         }
00343       }
00344       i += j;
00345     } else {
00346       ++i;
00347       ++cur_component_width;
00348     }
00349   }
00350 }
00351 
00352 // Refreshes the words in the segmentation block list by using blobs in the
00353 // input block list.
00354 // The segmentation block list must be set.
00355 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(
00356     C_BLOB_LIST* new_blobs) {
00357   // The segmentation block list must have been specified.
00358   ASSERT_HOST(segmentation_block_list_);
00359   if (devanagari_split_debuglevel > 0) {
00360     tprintf("Before refreshing blobs:\n");
00361     PrintSegmentationStats(segmentation_block_list_);
00362     tprintf("New Blobs found: %d\n", new_blobs->length());
00363   }
00364 
00365   C_BLOB_LIST not_found_blobs;
00366   RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
00367                                new_blobs,
00368                                ((devanagari_split_debugimage && debug_image_) ?
00369                                 &not_found_blobs : NULL));
00370 
00371   if (devanagari_split_debuglevel > 0) {
00372     tprintf("After refreshing blobs:\n");
00373     PrintSegmentationStats(segmentation_block_list_);
00374   }
00375   if (devanagari_split_debugimage && debug_image_) {
00376     // Plot out the original blobs for which no match was found in the new
00377     // all_blobs list.
00378     C_BLOB_IT not_found_it(&not_found_blobs);
00379     for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
00380          not_found_it.forward()) {
00381       C_BLOB* not_found = not_found_it.data();
00382       TBOX not_found_box = not_found->bounding_box();
00383       Box* box_to_plot = GetBoxForTBOX(not_found_box);
00384       pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
00385       boxDestroy(&box_to_plot);
00386     }
00387 
00388     // Plot out the blobs unused from all blobs.
00389     C_BLOB_IT all_blobs_it(new_blobs);
00390     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
00391          all_blobs_it.forward()) {
00392       C_BLOB* a_blob = all_blobs_it.data();
00393       Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
00394       pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
00395       boxDestroy(&box_to_plot);
00396     }
00397   }
00398 }
00399 
00400 // Returns a new box object for the corresponding TBOX, based on the original
00401 // image's coordinate system.
00402 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
00403   return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
00404                    tbox.width(), tbox.height());
00405 }
00406 
00407 // This method returns the computed mode-height of blobs in the pix.
00408 // It also prunes very small blobs from calculation.
00409 int ShiroRekhaSplitter::GetModeHeight(Pix* pix) {
00410   Boxa* boxa = pixConnComp(pix, NULL, 8);
00411   STATS heights(0, pixGetHeight(pix));
00412   heights.clear();
00413   for (int i = 0; i < boxaGetCount(boxa); ++i) {
00414     Box* box = boxaGetBox(boxa, i, L_CLONE);
00415     if (box->h >= 3 || box->w >= 3) {
00416       heights.add(box->h, 1);
00417     }
00418     boxDestroy(&box);
00419   }
00420   boxaDestroy(&boxa);
00421   return heights.mode();
00422 }
00423 
00424 // This method returns y-extents of the shiro-rekha computed from the input
00425 // word image.
00426 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
00427                                                int* shirorekha_top,
00428                                                int* shirorekha_bottom,
00429                                                int* shirorekha_ylevel) {
00430   // Compute a histogram from projecting the word on a vertical line.
00431   PixelHistogram hist_horiz;
00432   hist_horiz.ConstructHorizontalCountHist(word_pix);
00433   // Get the ylevel where the top-line exists. This is basically the global
00434   // maxima in the horizontal histogram.
00435   int topline_onpixel_count = 0;
00436   int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
00437 
00438   // Get the upper and lower extents of the shiro rekha.
00439   int thresh = (topline_onpixel_count * 70) / 100;
00440   int ulimit = topline_ylevel;
00441   int llimit = topline_ylevel;
00442   while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
00443     --ulimit;
00444   while (llimit < word_pix->h && hist_horiz.hist()[llimit] >= thresh)
00445     ++llimit;
00446 
00447   if (shirorekha_top) *shirorekha_top = ulimit;
00448   if (shirorekha_bottom) *shirorekha_bottom = llimit;
00449   if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
00450 }
00451 
00452 // This method returns the global-maxima for the histogram. The frequency of
00453 // the global maxima is returned in count, if specified.
00454 int PixelHistogram::GetHistogramMaximum(int* count) const {
00455   int best_value = 0;
00456   for (int i = 0; i < length_; ++i) {
00457     if (hist_[i] > hist_[best_value]) {
00458       best_value = i;
00459     }
00460   }
00461   if (count) {
00462     *count = hist_[best_value];
00463   }
00464   return best_value;
00465 }
00466 
00467 // Methods to construct histograms from images.
00468 void PixelHistogram::ConstructVerticalCountHist(Pix* pix) {
00469   Clear();
00470   int width = pixGetWidth(pix);
00471   int height = pixGetHeight(pix);
00472   hist_ = new int[width];
00473   length_ = width;
00474   int wpl = pixGetWpl(pix);
00475   l_uint32 *data = pixGetData(pix);
00476   for (int i = 0; i < width; ++i)
00477     hist_[i] = 0;
00478   for (int i = 0; i < height; ++i) {
00479     l_uint32 *line = data + i * wpl;
00480     for (int j = 0; j < width; ++j)
00481       if (GET_DATA_BIT(line, j))
00482         ++(hist_[j]);
00483   }
00484 }
00485 
00486 void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) {
00487   Clear();
00488   Numa* counts = pixCountPixelsByRow(pix, NULL);
00489   length_ = numaGetCount(counts);
00490   hist_ = new int[length_];
00491   for (int i = 0; i < length_; ++i) {
00492     l_int32 val = 0;
00493     numaGetIValue(counts, i, &val);
00494     hist_[i] = val;
00495   }
00496   numaDestroy(&counts);
00497 }
00498 
00499 }  // namespace tesseract.