Tesseract  3.02
tesseract-ocr/ccmain/pagesegmain.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pagesegmain.cpp
00003  * Description: Top-level page segmenter for Tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Thu Sep 25 17:12:01 PDT 2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _WIN32
00021 #ifndef __GNUC__
00022 #include <windows.h>
00023 #endif  /* __GNUC__ */
00024 #else
00025 #include <unistd.h>
00026 #endif
00027 #ifdef _MSC_VER
00028 #pragma warning(disable:4244)  // Conversion warnings
00029 #endif
00030 
00031 // Include automatically generated configuration file if running autoconf.
00032 #ifdef HAVE_CONFIG_H
00033 #include "config_auto.h"
00034 #endif
00035 
00036 #include "allheaders.h"
00037 #include "blobbox.h"
00038 #include "blread.h"
00039 #include "colfind.h"
00040 #include "equationdetect.h"
00041 #include "imagefind.h"
00042 #include "img.h"
00043 #include "linefind.h"
00044 #include "makerow.h"
00045 #include "osdetect.h"
00046 #include "tabvector.h"
00047 #include "tesseractclass.h"
00048 #include "tessvars.h"
00049 #include "textord.h"
00050 #include "tordmain.h"
00051 #include "wordseg.h"
00052 
00053 namespace tesseract {
00054 
// Lowest resolution regarded as credible, going by the name. The constant is
// not referenced in the code visible in this file section.
// NOTE(review): units (presumably pixels per inch) and usage to be confirmed.
const int kMinCredibleResolution = 70;
// Default resolution; presumably substituted when the reported one is not
// credible. Not referenced in the code visible in this file section.
// NOTE(review): confirm against the callers.
const int kDefaultResolution = 300;
// Max erosions to perform in removing an enclosing circle.
// Used as the exclusive upper bound of the erosion loop in
// RemoveEnclosingCircle, which starts counting at 1.
const int kMaxCircleErosions = 8;
00061 
00062 // Helper to remove an enclosing circle from an image.
00063 // If there isn't one, then the image will most likely get badly mangled.
00064 // The returned pix must be pixDestroyed after use. NULL may be returned
00065 // if the image doesn't meet the trivial conditions that it uses to determine
00066 // success.
00067 static Pix* RemoveEnclosingCircle(Pix* pixs) {
00068   Pix* pixsi = pixInvert(NULL, pixs);
00069   Pix* pixc = pixCreateTemplate(pixs);
00070   pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
00071   pixSeedfillBinary(pixc, pixc, pixsi, 4);
00072   pixInvert(pixc, pixc);
00073   pixDestroy(&pixsi);
00074   Pix* pixt = pixAnd(NULL, pixs, pixc);
00075   l_int32 max_count;
00076   pixCountConnComp(pixt, 8, &max_count);
00077   // The count has to go up before we start looking for the minimum.
00078   l_int32 min_count = MAX_INT32;
00079   Pix* pixout = NULL;
00080   for (int i = 1; i < kMaxCircleErosions; i++) {
00081     pixDestroy(&pixt);
00082     pixErodeBrick(pixc, pixc, 3, 3);
00083     pixt = pixAnd(NULL, pixs, pixc);
00084     l_int32 count;
00085     pixCountConnComp(pixt, 8, &count);
00086     if (i == 1 || count > max_count) {
00087       max_count = count;
00088       min_count = count;
00089     } else if (i > 1 && count < min_count) {
00090       min_count = count;
00091       pixDestroy(&pixout);
00092       pixout = pixCopy(NULL, pixt);  // Save the best.
00093     } else if (count >= min_count) {
00094       break;  // We have passed by the best.
00095     }
00096   }
00097   pixDestroy(&pixt);
00098   pixDestroy(&pixc);
00099   return pixout;
00100 }
00101 
00107 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00108                            Tesseract* osd_tess, OSResults* osr) {
00109   ASSERT_HOST(pix_binary_ != NULL);
00110   int width = pixGetWidth(pix_binary_);
00111   int height = pixGetHeight(pix_binary_);
00112   // Get page segmentation mode.
00113   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00114       static_cast<int>(tessedit_pageseg_mode));
00115   // If a UNLV zone file can be found, use that instead of segmentation.
00116   if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
00117       input_file != NULL && input_file->length() > 0) {
00118     STRING name = *input_file;
00119     const char* lastdot = strrchr(name.string(), '.');
00120     if (lastdot != NULL)
00121       name[lastdot - name.string()] = '\0';
00122     read_unlv_file(name, width, height, blocks);
00123   }
00124   if (blocks->empty()) {
00125     // No UNLV file present. Work according to the PageSegMode.
00126     // First make a single block covering the whole image.
00127     BLOCK_IT block_it(blocks);
00128     BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
00129     block->set_right_to_left(right_to_left());
00130     block_it.add_to_end(block);
00131   } else {
00132     // UNLV file present. Use PSM_SINGLE_BLOCK.
00133     pageseg_mode = PSM_SINGLE_BLOCK;
00134   }
00135   bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode);
00136   bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode);
00137   bool osd_only = pageseg_mode == PSM_OSD_ONLY;
00138 
00139   int auto_page_seg_ret_val = 0;
00140   TO_BLOCK_LIST to_blocks;
00141   if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) {
00142     auto_page_seg_ret_val =
00143         AutoPageSeg(single_column, osd_enabled, osd_only,
00144                     blocks, &to_blocks, osd_tess, osr);
00145     if (osd_only)
00146       return auto_page_seg_ret_val;
00147     // To create blobs from the image region bounds uncomment this line:
00148     //  to_blocks.clear();  // Uncomment to go back to the old mode.
00149   } else {
00150     deskew_ = FCOORD(1.0f, 0.0f);
00151     reskew_ = FCOORD(1.0f, 0.0f);
00152     if (pageseg_mode == PSM_CIRCLE_WORD) {
00153       Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
00154       if (pixcleaned != NULL) {
00155         pixDestroy(&pix_binary_);
00156         pix_binary_ = pixcleaned;
00157       }
00158     }
00159   }
00160 
00161   if (auto_page_seg_ret_val < 0) {
00162     return -1;
00163   }
00164 
00165   if (blocks->empty()) {
00166     if (textord_debug_tabfind)
00167       tprintf("Empty page\n");
00168     return 0;  // AutoPageSeg found an empty page.
00169   }
00170 
00171   textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
00172                        blocks, &to_blocks);
00173   return auto_page_seg_ret_val;
00174 }
00175 
00176 // Helper writes a grey image to a file for use by scrollviewer.
00177 // Normally for speed we don't display the image in the layout debug windows.
00178 // If textord_debug_images is true, we draw the image as a background to some
00179 // of the debug windows. printable determines whether these
00180 // images are optimized for printing instead of screen display.
00181 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
00182   Pix* grey_pix = pixCreate(pixGetWidth(pix_binary),
00183                             pixGetHeight(pix_binary), 8);
00184   // Printable images are light grey on white, but for screen display
00185   // they are black on dark grey so the other colors show up well.
00186   if (printable) {
00187     pixSetAll(grey_pix);
00188     pixSetMasked(grey_pix, pix_binary, 192);
00189   } else {
00190     pixSetAllArbitrary(grey_pix, 64);
00191     pixSetMasked(grey_pix, pix_binary, 0);
00192   }
00193   AlignedBlob::IncrementDebugPix();
00194   pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
00195   pixDestroy(&grey_pix);
00196 }
00197 
00198 
00218 int Tesseract::AutoPageSeg(bool single_column, bool osd, bool only_osd,
00219                            BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
00220                            Tesseract* osd_tess, OSResults* osr) {
00221   if (textord_debug_images) {
00222     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
00223   }
00224   Pix* photomask_pix = NULL;
00225   Pix* musicmask_pix = NULL;
00226   // The blocks made by the ColumnFinder. Moved to blocks before return.
00227   BLOCK_LIST found_blocks;
00228   TO_BLOCK_LIST temp_blocks;
00229 
00230   ColumnFinder* finder = SetupPageSegAndDetectOrientation(
00231       single_column, osd, only_osd, blocks, osd_tess, osr,
00232       &temp_blocks, &photomask_pix, &musicmask_pix);
00233   if (finder != NULL) {
00234     TO_BLOCK_IT to_block_it(&temp_blocks);
00235     TO_BLOCK* to_block = to_block_it.data();
00236     if (musicmask_pix != NULL) {
00237       // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
00238       // blocks separately. For now combine with photomask_pix.
00239       pixOr(photomask_pix, photomask_pix, musicmask_pix);
00240     }
00241     if (equ_detect_) {
00242       finder->SetEquationDetect(equ_detect_);
00243     }
00244     if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_,
00245                            to_block, photomask_pix,
00246                            &found_blocks, to_blocks) < 0) {
00247       pixDestroy(&photomask_pix);
00248       pixDestroy(&musicmask_pix);
00249       return -1;
00250     }
00251     finder->GetDeskewVectors(&deskew_, &reskew_);
00252     delete finder;
00253   }
00254   pixDestroy(&photomask_pix);
00255   pixDestroy(&musicmask_pix);
00256   blocks->clear();
00257   BLOCK_IT block_it(blocks);
00258   // Move the found blocks to the input/output blocks.
00259   block_it.add_list_after(&found_blocks);
00260 
00261   if (textord_debug_images) {
00262     // The debug image is no longer needed so delete it.
00263     unlink(AlignedBlob::textord_debug_pix().string());
00264   }
00265   return 0;
00266 }
00267 
00281 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
00282     bool single_column, bool osd, bool only_osd,
00283     BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr,
00284     TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) {
00285   int vertical_x = 0;
00286   int vertical_y = 1;
00287   TabVector_LIST v_lines;
00288   TabVector_LIST h_lines;
00289   ICOORD bleft(0, 0);
00290 
00291   ASSERT_HOST(pix_binary_ != NULL);
00292   if (tessedit_dump_pageseg_images) {
00293     pixWrite("tessinput.png", pix_binary_, IFF_PNG);
00294   }
00295   // Leptonica is used to find the rule/separator lines in the input.
00296   LineFinder::FindAndRemoveLines(source_resolution_,
00297                                  textord_tabfind_show_vlines, pix_binary_,
00298                                  &vertical_x, &vertical_y, music_mask_pix,
00299                                  &v_lines, &h_lines);
00300   if (tessedit_dump_pageseg_images)
00301     pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
00302   // Leptonica is used to find a mask of the photo regions in the input.
00303   *photo_mask_pix = ImageFind::FindImages(pix_binary_);
00304   if (tessedit_dump_pageseg_images)
00305     pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
00306   if (single_column)
00307     v_lines.clear();
00308 
00309   // The rest of the algorithm uses the usual connected components.
00310   textord_.find_components(pix_binary_, blocks, to_blocks);
00311 
00312   TO_BLOCK_IT to_block_it(to_blocks);
00313   // There must be exactly one input block.
00314   // TODO(rays) handle new textline finding with a UNLV zone file.
00315   ASSERT_HOST(to_blocks->singleton());
00316   TO_BLOCK* to_block = to_block_it.data();
00317   TBOX blkbox = to_block->block->bounding_box();
00318   ColumnFinder* finder = NULL;
00319 
00320   if (to_block->line_size >= 2) {
00321     finder = new ColumnFinder(static_cast<int>(to_block->line_size),
00322                               blkbox.botleft(), blkbox.topright(),
00323                               source_resolution_,
00324                               &v_lines, &h_lines, vertical_x, vertical_y);
00325 
00326     finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
00327 
00328     if (equ_detect_) {
00329       equ_detect_->LabelSpecialText(to_block);
00330     }
00331 
00332     BLOBNBOX_CLIST osd_blobs;
00333     // osd_orientation is the number of 90 degree rotations to make the
00334     // characters upright. (See osdetect.h for precise definition.)
00335     // We want the text lines horizontal, (vertical text indicates vertical
00336     // textlines) which may conflict (eg vertically written CJK).
00337     int osd_orientation = 0;
00338     bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
00339     if (osd && osd_tess != NULL && osr != NULL) {
00340       os_detect_blobs(&osd_blobs, osr, osd_tess);
00341       if (only_osd) {
00342         delete finder;
00343         return NULL;
00344       }
00345       osd_orientation = osr->best_result.orientation_id;
00346       double osd_score = osr->orientations[osd_orientation];
00347       double osd_margin = min_orientation_margin * 2;
00348       for (int i = 0; i < 4; ++i) {
00349         if (i != osd_orientation &&
00350             osd_score - osr->orientations[i] < osd_margin) {
00351           osd_margin = osd_score - osr->orientations[i];
00352         }
00353       }
00354       if (osd_margin < min_orientation_margin) {
00355         // The margin is weak.
00356         int best_script_id = osr->best_result.script_id;
00357         bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
00358             (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
00359             (best_script_id == osd_tess->unicharset.katakana_sid());
00360 
00361         if (!cjk && !vertical_text && osd_orientation == 2) {
00362           // upside down latin text is improbable with such a weak margin.
00363           tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
00364                   "Don't rotate.\n", osd_margin);
00365           osd_orientation = 0;
00366         } else {
00367           tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
00368                   "but using orientation anyway: %d\n",
00369                   osd_blobs.length(), osd_margin, osd_orientation);
00370         }
00371       }
00372     }
00373     osd_blobs.shallow_clear();
00374     finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
00375   }
00376 
00377   return finder;
00378 }
00379 
00380 }  // namespace tesseract.