Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: pagesegmain.cpp 00003 * Description: Top-level page segmenter for Tesseract. 00004 * Author: Ray Smith 00005 * Created: Thu Sep 25 17:12:01 PDT 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _WIN32 00021 #ifndef __GNUC__ 00022 #include <windows.h> 00023 #endif /* __GNUC__ */ 00024 #else 00025 #include <unistd.h> 00026 #endif 00027 #ifdef _MSC_VER 00028 #pragma warning(disable:4244) // Conversion warnings 00029 #endif 00030 00031 // Include automatically generated configuration file if running autoconf. 00032 #ifdef HAVE_CONFIG_H 00033 #include "config_auto.h" 00034 #endif 00035 00036 #include "allheaders.h" 00037 #include "blobbox.h" 00038 #include "blread.h" 00039 #include "colfind.h" 00040 #include "equationdetect.h" 00041 #include "imagefind.h" 00042 #include "img.h" 00043 #include "linefind.h" 00044 #include "makerow.h" 00045 #include "osdetect.h" 00046 #include "tabvector.h" 00047 #include "tesseractclass.h" 00048 #include "tessvars.h" 00049 #include "textord.h" 00050 #include "tordmain.h" 00051 #include "wordseg.h" 00052 00053 namespace tesseract { 00054 00056 const int kMinCredibleResolution = 70; 00058 const int kDefaultResolution = 300; 00059 // Max erosions to perform in removing an enclosing circle. 00060 const int kMaxCircleErosions = 8; 00061 00062 // Helper to remove an enclosing circle from an image. 00063 // If there isn't one, then the image will most likely get badly mangled. 00064 // The returned pix must be pixDestroyed after use. NULL may be returned 00065 // if the image doesn't meet the trivial conditions that it uses to determine 00066 // success. 00067 static Pix* RemoveEnclosingCircle(Pix* pixs) { 00068 Pix* pixsi = pixInvert(NULL, pixs); 00069 Pix* pixc = pixCreateTemplate(pixs); 00070 pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET); 00071 pixSeedfillBinary(pixc, pixc, pixsi, 4); 00072 pixInvert(pixc, pixc); 00073 pixDestroy(&pixsi); 00074 Pix* pixt = pixAnd(NULL, pixs, pixc); 00075 l_int32 max_count; 00076 pixCountConnComp(pixt, 8, &max_count); 00077 // The count has to go up before we start looking for the minimum. 00078 l_int32 min_count = MAX_INT32; 00079 Pix* pixout = NULL; 00080 for (int i = 1; i < kMaxCircleErosions; i++) { 00081 pixDestroy(&pixt); 00082 pixErodeBrick(pixc, pixc, 3, 3); 00083 pixt = pixAnd(NULL, pixs, pixc); 00084 l_int32 count; 00085 pixCountConnComp(pixt, 8, &count); 00086 if (i == 1 || count > max_count) { 00087 max_count = count; 00088 min_count = count; 00089 } else if (i > 1 && count < min_count) { 00090 min_count = count; 00091 pixDestroy(&pixout); 00092 pixout = pixCopy(NULL, pixt); // Save the best. 00093 } else if (count >= min_count) { 00094 break; // We have passed by the best. 00095 } 00096 } 00097 pixDestroy(&pixt); 00098 pixDestroy(&pixc); 00099 return pixout; 00100 } 00101 00107 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00108 Tesseract* osd_tess, OSResults* osr) { 00109 ASSERT_HOST(pix_binary_ != NULL); 00110 int width = pixGetWidth(pix_binary_); 00111 int height = pixGetHeight(pix_binary_); 00112 // Get page segmentation mode. 00113 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00114 static_cast<int>(tessedit_pageseg_mode)); 00115 // If a UNLV zone file can be found, use that instead of segmentation. 00116 if (!PSM_COL_FIND_ENABLED(pageseg_mode) && 00117 input_file != NULL && input_file->length() > 0) { 00118 STRING name = *input_file; 00119 const char* lastdot = strrchr(name.string(), '.'); 00120 if (lastdot != NULL) 00121 name[lastdot - name.string()] = '\0'; 00122 read_unlv_file(name, width, height, blocks); 00123 } 00124 if (blocks->empty()) { 00125 // No UNLV file present. Work according to the PageSegMode. 00126 // First make a single block covering the whole image. 00127 BLOCK_IT block_it(blocks); 00128 BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height); 00129 block->set_right_to_left(right_to_left()); 00130 block_it.add_to_end(block); 00131 } else { 00132 // UNLV file present. Use PSM_SINGLE_BLOCK. 00133 pageseg_mode = PSM_SINGLE_BLOCK; 00134 } 00135 bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode); 00136 bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode); 00137 bool osd_only = pageseg_mode == PSM_OSD_ONLY; 00138 00139 int auto_page_seg_ret_val = 0; 00140 TO_BLOCK_LIST to_blocks; 00141 if (osd_enabled || PSM_BLOCK_FIND_ENABLED(pageseg_mode)) { 00142 auto_page_seg_ret_val = 00143 AutoPageSeg(single_column, osd_enabled, osd_only, 00144 blocks, &to_blocks, osd_tess, osr); 00145 if (osd_only) 00146 return auto_page_seg_ret_val; 00147 // To create blobs from the image region bounds uncomment this line: 00148 // to_blocks.clear(); // Uncomment to go back to the old mode. 00149 } else { 00150 deskew_ = FCOORD(1.0f, 0.0f); 00151 reskew_ = FCOORD(1.0f, 0.0f); 00152 if (pageseg_mode == PSM_CIRCLE_WORD) { 00153 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_); 00154 if (pixcleaned != NULL) { 00155 pixDestroy(&pix_binary_); 00156 pix_binary_ = pixcleaned; 00157 } 00158 } 00159 } 00160 00161 if (auto_page_seg_ret_val < 0) { 00162 return -1; 00163 } 00164 00165 if (blocks->empty()) { 00166 if (textord_debug_tabfind) 00167 tprintf("Empty page\n"); 00168 return 0; // AutoPageSeg found an empty page. 00169 } 00170 00171 textord_.TextordPage(pageseg_mode, width, height, pix_binary_, 00172 blocks, &to_blocks); 00173 return auto_page_seg_ret_val; 00174 } 00175 00176 // Helper writes a grey image to a file for use by scrollviewer. 00177 // Normally for speed we don't display the image in the layout debug windows. 00178 // If textord_debug_images is true, we draw the image as a background to some 00179 // of the debug windows. printable determines whether these 00180 // images are optimized for printing instead of screen display. 00181 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { 00182 Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), 00183 pixGetHeight(pix_binary), 8); 00184 // Printable images are light grey on white, but for screen display 00185 // they are black on dark grey so the other colors show up well. 00186 if (printable) { 00187 pixSetAll(grey_pix); 00188 pixSetMasked(grey_pix, pix_binary, 192); 00189 } else { 00190 pixSetAllArbitrary(grey_pix, 64); 00191 pixSetMasked(grey_pix, pix_binary, 0); 00192 } 00193 AlignedBlob::IncrementDebugPix(); 00194 pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); 00195 pixDestroy(&grey_pix); 00196 } 00197 00198 00218 int Tesseract::AutoPageSeg(bool single_column, bool osd, bool only_osd, 00219 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, 00220 Tesseract* osd_tess, OSResults* osr) { 00221 if (textord_debug_images) { 00222 WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); 00223 } 00224 Pix* photomask_pix = NULL; 00225 Pix* musicmask_pix = NULL; 00226 // The blocks made by the ColumnFinder. Moved to blocks before return. 00227 BLOCK_LIST found_blocks; 00228 TO_BLOCK_LIST temp_blocks; 00229 00230 ColumnFinder* finder = SetupPageSegAndDetectOrientation( 00231 single_column, osd, only_osd, blocks, osd_tess, osr, 00232 &temp_blocks, &photomask_pix, &musicmask_pix); 00233 if (finder != NULL) { 00234 TO_BLOCK_IT to_block_it(&temp_blocks); 00235 TO_BLOCK* to_block = to_block_it.data(); 00236 if (musicmask_pix != NULL) { 00237 // TODO(rays) pass the musicmask_pix into FindBlocks and mark music 00238 // blocks separately. For now combine with photomask_pix. 00239 pixOr(photomask_pix, photomask_pix, musicmask_pix); 00240 } 00241 if (equ_detect_) { 00242 finder->SetEquationDetect(equ_detect_); 00243 } 00244 if (finder->FindBlocks(single_column, scaled_color_, scaled_factor_, 00245 to_block, photomask_pix, 00246 &found_blocks, to_blocks) < 0) { 00247 pixDestroy(&photomask_pix); 00248 pixDestroy(&musicmask_pix); 00249 return -1; 00250 } 00251 finder->GetDeskewVectors(&deskew_, &reskew_); 00252 delete finder; 00253 } 00254 pixDestroy(&photomask_pix); 00255 pixDestroy(&musicmask_pix); 00256 blocks->clear(); 00257 BLOCK_IT block_it(blocks); 00258 // Move the found blocks to the input/output blocks. 00259 block_it.add_list_after(&found_blocks); 00260 00261 if (textord_debug_images) { 00262 // The debug image is no longer needed so delete it. 00263 unlink(AlignedBlob::textord_debug_pix().string()); 00264 } 00265 return 0; 00266 } 00267 00281 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( 00282 bool single_column, bool osd, bool only_osd, 00283 BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, 00284 TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) { 00285 int vertical_x = 0; 00286 int vertical_y = 1; 00287 TabVector_LIST v_lines; 00288 TabVector_LIST h_lines; 00289 ICOORD bleft(0, 0); 00290 00291 ASSERT_HOST(pix_binary_ != NULL); 00292 if (tessedit_dump_pageseg_images) { 00293 pixWrite("tessinput.png", pix_binary_, IFF_PNG); 00294 } 00295 // Leptonica is used to find the rule/separator lines in the input. 00296 LineFinder::FindAndRemoveLines(source_resolution_, 00297 textord_tabfind_show_vlines, pix_binary_, 00298 &vertical_x, &vertical_y, music_mask_pix, 00299 &v_lines, &h_lines); 00300 if (tessedit_dump_pageseg_images) 00301 pixWrite("tessnolines.png", pix_binary_, IFF_PNG); 00302 // Leptonica is used to find a mask of the photo regions in the input. 00303 *photo_mask_pix = ImageFind::FindImages(pix_binary_); 00304 if (tessedit_dump_pageseg_images) 00305 pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); 00306 if (single_column) 00307 v_lines.clear(); 00308 00309 // The rest of the algorithm uses the usual connected components. 00310 textord_.find_components(pix_binary_, blocks, to_blocks); 00311 00312 TO_BLOCK_IT to_block_it(to_blocks); 00313 // There must be exactly one input block. 00314 // TODO(rays) handle new textline finding with a UNLV zone file. 00315 ASSERT_HOST(to_blocks->singleton()); 00316 TO_BLOCK* to_block = to_block_it.data(); 00317 TBOX blkbox = to_block->block->bounding_box(); 00318 ColumnFinder* finder = NULL; 00319 00320 if (to_block->line_size >= 2) { 00321 finder = new ColumnFinder(static_cast<int>(to_block->line_size), 00322 blkbox.botleft(), blkbox.topright(), 00323 source_resolution_, 00324 &v_lines, &h_lines, vertical_x, vertical_y); 00325 00326 finder->SetupAndFilterNoise(*photo_mask_pix, to_block); 00327 00328 if (equ_detect_) { 00329 equ_detect_->LabelSpecialText(to_block); 00330 } 00331 00332 BLOBNBOX_CLIST osd_blobs; 00333 // osd_orientation is the number of 90 degree rotations to make the 00334 // characters upright. (See osdetect.h for precise definition.) 00335 // We want the text lines horizontal, (vertical text indicates vertical 00336 // textlines) which may conflict (eg vertically written CJK). 00337 int osd_orientation = 0; 00338 bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs); 00339 if (osd && osd_tess != NULL && osr != NULL) { 00340 os_detect_blobs(&osd_blobs, osr, osd_tess); 00341 if (only_osd) { 00342 delete finder; 00343 return NULL; 00344 } 00345 osd_orientation = osr->best_result.orientation_id; 00346 double osd_score = osr->orientations[osd_orientation]; 00347 double osd_margin = min_orientation_margin * 2; 00348 for (int i = 0; i < 4; ++i) { 00349 if (i != osd_orientation && 00350 osd_score - osr->orientations[i] < osd_margin) { 00351 osd_margin = osd_score - osr->orientations[i]; 00352 } 00353 } 00354 if (osd_margin < min_orientation_margin) { 00355 // The margin is weak. 00356 int best_script_id = osr->best_result.script_id; 00357 bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) || 00358 (best_script_id == osd_tess->unicharset.hiragana_sid()) || 00359 (best_script_id == osd_tess->unicharset.katakana_sid()); 00360 00361 if (!cjk && !vertical_text && osd_orientation == 2) { 00362 // upside down latin text is improbable with such a weak margin. 00363 tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: " 00364 "Don't rotate.\n", osd_margin); 00365 osd_orientation = 0; 00366 } else { 00367 tprintf("OSD: Weak margin (%.2f) for %d blob text block, " 00368 "but using orientation anyway: %d\n", 00369 osd_blobs.length(), osd_margin, osd_orientation); 00370 } 00371 } 00372 } 00373 osd_blobs.shallow_clear(); 00374 finder->CorrectOrientation(to_block, vertical_text, osd_orientation); 00375 } 00376 00377 return finder; 00378 } 00379 00380 } // namespace tesseract.