Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: tordmain.cpp (Formerly textordp.c) 00003 * Description: C++ top level textord code. 00004 * Author: Ray Smith 00005 * Created: Tue Jul 28 17:12:33 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #include "mfcpch.h" 00020 #ifdef __UNIX__ 00021 #include <assert.h> 00022 #endif 00023 #include "stderr.h" 00024 #include "globaloc.h" 00025 #include "blread.h" 00026 #include "blobbox.h" 00027 #include "ccstruct.h" 00028 #include "edgblob.h" 00029 #include "drawtord.h" 00030 #include "makerow.h" 00031 #include "wordseg.h" 00032 #include "imgs.h" 00033 #include "textord.h" 00034 #include "tordmain.h" 00035 #include "secname.h" 00036 00037 // Include automatically generated configuration file if running autoconf. 00038 #ifdef HAVE_CONFIG_H 00039 #include "config_auto.h" 00040 #endif 00041 00042 #include "allheaders.h" 00043 00044 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"; 00045 00046 #undef EXTERN 00047 #define EXTERN 00048 00049 #define MAX_NEAREST_DIST 600 //for block skew stats 00050 00051 /********************************************************************** 00052 * SetBlobStrokeWidth 00053 * 00054 * Set the horizontal and vertical stroke widths in the blob. 00055 **********************************************************************/ 00056 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { 00057 // Cut the blob rectangle into a Pix. 00058 int pix_height = pixGetHeight(pix); 00059 const TBOX& box = blob->bounding_box(); 00060 int width = box.width(); 00061 int height = box.height(); 00062 Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(), 00063 width, height); 00064 Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL); 00065 boxDestroy(&blob_pix_box); 00066 Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); 00067 pixDestroy(&pix_blob); 00068 // Compute the stroke widths. 00069 uinT32* data = pixGetData(dist_pix); 00070 int wpl = pixGetWpl(dist_pix); 00071 // Horizontal width of stroke. 00072 STATS h_stats(0, width + 1); 00073 for (int y = 0; y < height; ++y) { 00074 uinT32* pixels = data + y*wpl; 00075 int prev_pixel = 0; 00076 int pixel = GET_DATA_BYTE(pixels, 0); 00077 for (int x = 1; x < width; ++x) { 00078 int next_pixel = GET_DATA_BYTE(pixels, x); 00079 // We are looking for a pixel that is equal to its vertical neighbours, 00080 // yet greater than its left neighbour. 00081 if (prev_pixel < pixel && 00082 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00083 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { 00084 if (pixel > next_pixel) { 00085 // Single local max, so an odd width. 00086 h_stats.add(pixel * 2 - 1, 1); 00087 } else if (pixel == next_pixel && x + 1 < width && 00088 pixel > GET_DATA_BYTE(pixels, x + 1)) { 00089 // Double local max, so an even width. 00090 h_stats.add(pixel * 2, 1); 00091 } 00092 } 00093 prev_pixel = pixel; 00094 pixel = next_pixel; 00095 } 00096 } 00097 // Vertical width of stroke. 00098 STATS v_stats(0, height + 1); 00099 for (int x = 0; x < width; ++x) { 00100 int prev_pixel = 0; 00101 int pixel = GET_DATA_BYTE(data, x); 00102 for (int y = 1; y < height; ++y) { 00103 uinT32* pixels = data + y*wpl; 00104 int next_pixel = GET_DATA_BYTE(pixels, x); 00105 // We are looking for a pixel that is equal to its horizontal neighbours, 00106 // yet greater than its upper neighbour. 00107 if (prev_pixel < pixel && 00108 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00109 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { 00110 if (pixel > next_pixel) { 00111 // Single local max, so an odd width. 00112 v_stats.add(pixel * 2 - 1, 1); 00113 } else if (pixel == next_pixel && y + 1 < height && 00114 pixel > GET_DATA_BYTE(pixels + wpl, x)) { 00115 // Double local max, so an even width. 00116 v_stats.add(pixel * 2, 1); 00117 } 00118 } 00119 prev_pixel = pixel; 00120 pixel = next_pixel; 00121 } 00122 } 00123 pixDestroy(&dist_pix); 00124 // Store the horizontal and vertical width in the blob, keeping both 00125 // widths if there is enough information, otherwse only the one with 00126 // the most samples. 00127 // If there are insufficent samples, store zero, rather than using 00128 // 2*area/perimeter, as the numbers that gives do not match the numbers 00129 // from the distance method. 00130 if (h_stats.get_total() >= (width + height) / 4) { 00131 blob->set_horz_stroke_width(h_stats.ile(0.5f)); 00132 if (v_stats.get_total() >= (width + height) / 4) 00133 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00134 else 00135 blob->set_vert_stroke_width(0.0f); 00136 } else { 00137 if (v_stats.get_total() >= (width + height) / 4 || 00138 v_stats.get_total() > h_stats.get_total()) { 00139 blob->set_horz_stroke_width(0.0f); 00140 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00141 } else { 00142 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) 00143 : 0.0f); 00144 blob->set_vert_stroke_width(0.0f); 00145 } 00146 } 00147 } 00148 00149 00150 /********************************************************************** 00151 * assign_blobs_to_blocks2 00152 * 00153 * Make a list of TO_BLOCKs for portrait and landscape orientation. 00154 **********************************************************************/ 00155 00156 void assign_blobs_to_blocks2(Pix* pix, 00157 BLOCK_LIST *blocks, // blocks to process 00158 TO_BLOCK_LIST *port_blocks) { // output list 00159 BLOCK *block; // current block 00160 BLOBNBOX *newblob; // created blob 00161 C_BLOB *blob; // current blob 00162 BLOCK_IT block_it = blocks; 00163 C_BLOB_IT blob_it; // iterator 00164 BLOBNBOX_IT port_box_it; // iterator 00165 // destination iterator 00166 TO_BLOCK_IT port_block_it = port_blocks; 00167 TO_BLOCK *port_block; // created block 00168 00169 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00170 block = block_it.data(); 00171 port_block = new TO_BLOCK(block); 00172 00173 // Convert the good outlines to block->blob_list 00174 port_box_it.set_to_list(&port_block->blobs); 00175 blob_it.set_to_list(block->blob_list()); 00176 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00177 blob = blob_it.extract(); 00178 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 00179 SetBlobStrokeWidth(pix, newblob); 00180 port_box_it.add_after_then_move(newblob); 00181 } 00182 00183 // Put the rejected outlines in block->noise_blobs, which allows them to 00184 // be reconsidered and sorted back into rows and recover outlines mistakenly 00185 // rejected. 00186 port_box_it.set_to_list(&port_block->noise_blobs); 00187 blob_it.set_to_list(block->reject_blobs()); 00188 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00189 blob = blob_it.extract(); 00190 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 00191 SetBlobStrokeWidth(pix, newblob); 00192 port_box_it.add_after_then_move(newblob); 00193 } 00194 00195 port_block_it.add_after_then_move(port_block); 00196 } 00197 } 00198 00199 namespace tesseract { 00200 /********************************************************************** 00201 * find_components 00202 * 00203 * Find the C_OUTLINEs of the connected components in each block, put them 00204 * in C_BLOBs, and filter them by size, putting the different size 00205 * grades on different lists in the matching TO_BLOCK in to_blocks. 00206 **********************************************************************/ 00207 00208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks, 00209 TO_BLOCK_LIST *to_blocks) { 00210 int width = pixGetWidth(pix); 00211 int height = pixGetHeight(pix); 00212 if (width > MAX_INT16 || height > MAX_INT16) { 00213 tprintf("Input image too large! (%d, %d)\n", width, height); 00214 return; // Can't handle it. 00215 } 00216 00217 set_global_loc_code(LOC_EDGE_PROG); 00218 00219 BLOCK_IT block_it(blocks); // iterator 00220 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00221 block_it.forward()) { 00222 BLOCK* block = block_it.data(); 00223 if (block->poly_block() == NULL || block->poly_block()->IsText()) { 00224 extract_edges(pix, block); 00225 } 00226 } 00227 00228 assign_blobs_to_blocks2(pix, blocks, to_blocks); 00229 ICOORD page_tr(width, height); 00230 filter_blobs(page_tr, to_blocks, !textord_test_landscape); 00231 } 00232 00233 /********************************************************************** 00234 * filter_blobs 00235 * 00236 * Sort the blobs into sizes in all the blocks for later work. 00237 **********************************************************************/ 00238 00239 void Textord::filter_blobs(ICOORD page_tr, // top right 00240 TO_BLOCK_LIST *blocks, // output list 00241 BOOL8 testing_on) { // for plotting 00242 TO_BLOCK_IT block_it = blocks; // destination iterator 00243 TO_BLOCK *block; // created block 00244 00245 #ifndef GRAPHICS_DISABLED 00246 if (to_win != NULL) 00247 to_win->Clear(); 00248 #endif // GRAPHICS_DISABLED 00249 00250 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00251 block_it.forward()) { 00252 block = block_it.data(); 00253 block->line_size = filter_noise_blobs(&block->blobs, 00254 &block->noise_blobs, 00255 &block->small_blobs, 00256 &block->large_blobs); 00257 block->line_spacing = block->line_size * 00258 (tesseract::CCStruct::kDescenderFraction + 00259 tesseract::CCStruct::kXHeightFraction + 00260 2 * tesseract::CCStruct::kAscenderFraction) / 00261 tesseract::CCStruct::kXHeightFraction; 00262 block->line_size *= textord_min_linesize; 00263 block->max_blob_size = block->line_size * textord_excess_blobsize; 00264 00265 #ifndef GRAPHICS_DISABLED 00266 if (textord_show_blobs && testing_on) { 00267 if (to_win == NULL) 00268 create_to_win(page_tr); 00269 block->plot_graded_blobs(to_win); 00270 } 00271 if (textord_show_boxes && testing_on) { 00272 if (to_win == NULL) 00273 create_to_win(page_tr); 00274 plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); 00275 plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); 00276 plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); 00277 plot_box_list(to_win, &block->blobs, ScrollView::WHITE); 00278 } 00279 #endif // GRAPHICS_DISABLED 00280 } 00281 } 00282 00283 /********************************************************************** 00284 * filter_noise_blobs 00285 * 00286 * Move small blobs to a separate list. 00287 **********************************************************************/ 00288 00289 float Textord::filter_noise_blobs( 00290 BLOBNBOX_LIST *src_list, // original list 00291 BLOBNBOX_LIST *noise_list, // noise list 00292 BLOBNBOX_LIST *small_list, // small blobs 00293 BLOBNBOX_LIST *large_list) { // large blobs 00294 inT16 height; //height of blob 00295 inT16 width; //of blob 00296 BLOBNBOX *blob; //current blob 00297 float initial_x; //first guess 00298 BLOBNBOX_IT src_it = src_list; //iterators 00299 BLOBNBOX_IT noise_it = noise_list; 00300 BLOBNBOX_IT small_it = small_list; 00301 BLOBNBOX_IT large_it = large_list; 00302 STATS size_stats (0, MAX_NEAREST_DIST); 00303 //blob heights 00304 float min_y; //size limits 00305 float max_y; 00306 float max_x; 00307 float max_height; //of good blobs 00308 00309 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00310 blob = src_it.data (); 00311 if (blob->bounding_box ().height () < textord_max_noise_size) 00312 noise_it.add_after_then_move (src_it.extract ()); 00313 else if (blob->enclosed_area () >= blob->bounding_box ().height () 00314 * blob->bounding_box ().width () * textord_noise_area_ratio) 00315 small_it.add_after_then_move (src_it.extract ()); 00316 } 00317 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00318 size_stats.add (src_it.data ()->bounding_box ().height (), 1); 00319 } 00320 initial_x = size_stats.ile (textord_initialx_ile); 00321 max_y = ceil(initial_x * 00322 (tesseract::CCStruct::kDescenderFraction + 00323 tesseract::CCStruct::kXHeightFraction + 00324 2 * tesseract::CCStruct::kAscenderFraction) / 00325 tesseract::CCStruct::kXHeightFraction); 00326 min_y = floor (initial_x / 2); 00327 max_x = ceil (initial_x * textord_width_limit); 00328 small_it.move_to_first (); 00329 for (small_it.mark_cycle_pt (); !small_it.cycled_list (); 00330 small_it.forward ()) { 00331 height = small_it.data()->bounding_box().height(); 00332 if (height > max_y) 00333 large_it.add_after_then_move(small_it.extract ()); 00334 else if (height >= min_y) 00335 src_it.add_after_then_move(small_it.extract ()); 00336 } 00337 size_stats.clear (); 00338 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00339 height = src_it.data ()->bounding_box ().height (); 00340 width = src_it.data ()->bounding_box ().width (); 00341 if (height < min_y) 00342 small_it.add_after_then_move (src_it.extract ()); 00343 else if (height > max_y || width > max_x) 00344 large_it.add_after_then_move (src_it.extract ()); 00345 else 00346 size_stats.add (height, 1); 00347 } 00348 max_height = size_stats.ile (textord_initialasc_ile); 00349 // printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", 00350 // max_y,min_y,initial_x,max_height); 00351 max_height *= tesseract::CCStruct::kXHeightCapRatio; 00352 if (max_height > initial_x) 00353 initial_x = max_height; 00354 // printf(" ret=%g\n",initial_x); 00355 return initial_x; 00356 } 00357 00358 /********************************************************************** 00359 * cleanup_blocks 00360 * 00361 * Delete empty blocks, rows from the page. 00362 **********************************************************************/ 00363 00364 void Textord::cleanup_blocks( //remove empties 00365 BLOCK_LIST *blocks //list 00366 ) { 00367 BLOCK_IT block_it = blocks; //iterator 00368 ROW_IT row_it; //row iterator 00369 00370 int num_rows = 0; 00371 int num_rows_all = 0; 00372 int num_blocks = 0; 00373 int num_blocks_all = 0; 00374 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00375 block_it.forward ()) { 00376 num_rows = 0; 00377 num_rows_all = 0; 00378 row_it.set_to_list (block_it.data ()->row_list ()); 00379 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00380 ++num_rows_all; 00381 clean_small_noise_from_words(row_it.data()); 00382 if ((textord_noise_rejrows 00383 && !row_it.data ()->word_list ()->empty () 00384 && clean_noise_from_row (row_it.data ())) 00385 || row_it.data ()->word_list ()->empty ()) 00386 delete row_it.extract ();//lose empty row 00387 else { 00388 if (textord_noise_rejwords) 00389 clean_noise_from_words (row_it.data ()); 00390 if (textord_blshift_maxshift >= 0) 00391 tweak_row_baseline(row_it.data(), 00392 textord_blshift_maxshift, 00393 textord_blshift_xfraction); 00394 ++num_rows; 00395 } 00396 } 00397 if (block_it.data()->row_list()->empty() && 00398 (block_it.data()->poly_block() == NULL || 00399 block_it.data()->poly_block()->IsText())) { 00400 delete block_it.extract(); // Lose empty text blocks but not other types. 00401 } else { 00402 ++num_blocks; 00403 } 00404 ++num_blocks_all; 00405 if (textord_noise_debug) 00406 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); 00407 } 00408 if (textord_noise_debug) 00409 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); 00410 } 00411 00412 00413 /********************************************************************** 00414 * clean_noise_from_row 00415 * 00416 * Move blobs of words from rows of garbage into the reject blobs list. 00417 **********************************************************************/ 00418 00419 BOOL8 Textord::clean_noise_from_row( //remove empties 00420 ROW *row //row to clean 00421 ) { 00422 BOOL8 testing_on; 00423 TBOX blob_box; //bounding box 00424 C_BLOB *blob; //current blob 00425 C_OUTLINE *outline; //current outline 00426 WERD *word; //current word 00427 inT32 blob_size; //biggest size 00428 inT32 trans_count = 0; //no of transitions 00429 inT32 trans_threshold; //noise tolerance 00430 inT32 dot_count; //small objects 00431 inT32 norm_count; //normal objects 00432 inT32 super_norm_count; //real char-like 00433 //words of row 00434 WERD_IT word_it = row->word_list (); 00435 C_BLOB_IT blob_it; //blob iterator 00436 C_OUTLINE_IT out_it; //outline iterator 00437 00438 if (textord_test_y > row->base_line (textord_test_x) 00439 && textord_show_blobs 00440 && textord_test_y < row->base_line (textord_test_x) + row->x_height ()) 00441 testing_on = TRUE; 00442 else 00443 testing_on = FALSE; 00444 dot_count = 0; 00445 norm_count = 0; 00446 super_norm_count = 0; 00447 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00448 word = word_it.data (); //current word 00449 //blobs in word 00450 blob_it.set_to_list (word->cblob_list ()); 00451 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00452 blob_it.forward ()) { 00453 blob = blob_it.data (); 00454 if (!word->flag (W_DONT_CHOP)) { 00455 //get outlines 00456 out_it.set_to_list (blob->out_list ()); 00457 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00458 out_it.forward ()) { 00459 outline = out_it.data (); 00460 blob_box = outline->bounding_box (); 00461 blob_size = 00462 blob_box.width () > 00463 blob_box.height ()? blob_box.width () : blob_box. 00464 height(); 00465 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00466 dot_count++; //count smal outlines 00467 if (!outline->child ()->empty () 00468 && blob_box.height () < 00469 (1 + textord_noise_syfract) * row->x_height () 00470 && blob_box.height () > 00471 (1 - textord_noise_syfract) * row->x_height () 00472 && blob_box.width () < 00473 (1 + textord_noise_sxfract) * row->x_height () 00474 && blob_box.width () > 00475 (1 - textord_noise_sxfract) * row->x_height ()) 00476 super_norm_count++; //count smal outlines 00477 } 00478 } 00479 else 00480 super_norm_count++; 00481 blob_box = blob->bounding_box (); 00482 blob_size = 00483 blob_box.width () > 00484 blob_box.height ()? blob_box.width () : blob_box.height (); 00485 if (blob_size >= textord_noise_sizelimit * row->x_height () 00486 && blob_size < row->x_height () * 2) { 00487 trans_threshold = blob_size / textord_noise_sizefraction; 00488 trans_count = blob->count_transitions (trans_threshold); 00489 if (trans_count < textord_noise_translimit) 00490 norm_count++; 00491 } 00492 else if (blob_box.height () > row->x_height () * 2 00493 && (!word_it.at_first () || !blob_it.at_first ())) 00494 dot_count += 2; 00495 #ifndef SECURE_NAMES 00496 if (testing_on) { 00497 tprintf 00498 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", 00499 blob_box.left (), blob_box.bottom (), blob_box.right (), 00500 blob_box.top (), blob->out_list ()->length (), trans_count, 00501 blob_box.bottom () - row->base_line (blob_box.left ())); 00502 } 00503 #endif 00504 } 00505 } 00506 #ifndef SECURE_NAMES 00507 if (textord_noise_debug) { 00508 tprintf ("Row ending at (%d,%g):", 00509 blob_box.right (), row->base_line (blob_box.right ())); 00510 tprintf (" R=%g, dc=%d, nc=%d, %s\n", 00511 norm_count > 0 ? (float) dot_count / norm_count : 9999, 00512 dot_count, norm_count, 00513 dot_count > norm_count * textord_noise_normratio 00514 && dot_count > 2 ? "REJECTED" : "ACCEPTED"); 00515 } 00516 #endif 00517 return super_norm_count < textord_noise_sncount 00518 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; 00519 } 00520 00521 /********************************************************************** 00522 * clean_noise_from_words 00523 * 00524 * Move blobs of words from rows of garbage into the reject blobs list. 00525 **********************************************************************/ 00526 00527 void Textord::clean_noise_from_words( //remove empties 00528 ROW *row //row to clean 00529 ) { 00530 TBOX blob_box; //bounding box 00531 inT8 *word_dud; //was it chucked 00532 C_BLOB *blob; //current blob 00533 C_OUTLINE *outline; //current outline 00534 WERD *word; //current word 00535 inT32 blob_size; //biggest size 00536 inT32 trans_count; //no of transitions 00537 inT32 trans_threshold; //noise tolerance 00538 inT32 dot_count; //small objects 00539 inT32 norm_count; //normal objects 00540 inT32 dud_words; //number discarded 00541 inT32 ok_words; //number remaining 00542 inT32 word_index; //current word 00543 //words of row 00544 WERD_IT word_it = row->word_list (); 00545 C_BLOB_IT blob_it; //blob iterator 00546 C_OUTLINE_IT out_it; //outline iterator 00547 00548 ok_words = word_it.length (); 00549 if (ok_words == 0 || textord_no_rejects) 00550 return; 00551 word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); 00552 dud_words = 0; 00553 ok_words = 0; 00554 word_index = 0; 00555 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00556 word = word_it.data (); //current word 00557 dot_count = 0; 00558 norm_count = 0; 00559 //blobs in word 00560 blob_it.set_to_list (word->cblob_list ()); 00561 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00562 blob_it.forward ()) { 00563 blob = blob_it.data (); 00564 if (!word->flag (W_DONT_CHOP)) { 00565 //get outlines 00566 out_it.set_to_list (blob->out_list ()); 00567 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00568 out_it.forward ()) { 00569 outline = out_it.data (); 00570 blob_box = outline->bounding_box (); 00571 blob_size = 00572 blob_box.width () > 00573 blob_box.height ()? blob_box.width () : blob_box. 00574 height(); 00575 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00576 dot_count++; //count smal outlines 00577 if (!outline->child ()->empty () 00578 && blob_box.height () < 00579 (1 + textord_noise_syfract) * row->x_height () 00580 && blob_box.height () > 00581 (1 - textord_noise_syfract) * row->x_height () 00582 && blob_box.width () < 00583 (1 + textord_noise_sxfract) * row->x_height () 00584 && blob_box.width () > 00585 (1 - textord_noise_sxfract) * row->x_height ()) 00586 norm_count++; //count smal outlines 00587 } 00588 } 00589 else 00590 norm_count++; 00591 blob_box = blob->bounding_box (); 00592 blob_size = 00593 blob_box.width () > 00594 blob_box.height ()? blob_box.width () : blob_box.height (); 00595 if (blob_size >= textord_noise_sizelimit * row->x_height () 00596 && blob_size < row->x_height () * 2) { 00597 trans_threshold = blob_size / textord_noise_sizefraction; 00598 trans_count = blob->count_transitions (trans_threshold); 00599 if (trans_count < textord_noise_translimit) 00600 norm_count++; 00601 } 00602 else if (blob_box.height () > row->x_height () * 2 00603 && (!word_it.at_first () || !blob_it.at_first ())) 00604 dot_count += 2; 00605 } 00606 if (dot_count > 2) { 00607 if (dot_count > norm_count * textord_noise_normratio * 2) 00608 word_dud[word_index] = 2; 00609 else if (dot_count > norm_count * textord_noise_normratio) 00610 word_dud[word_index] = 1; 00611 else 00612 word_dud[word_index] = 0; 00613 } 00614 else 00615 word_dud[word_index] = 0; 00616 if (word_dud[word_index] == 2) 00617 dud_words++; 00618 else 00619 ok_words++; 00620 word_index++; 00621 } 00622 00623 word_index = 0; 00624 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00625 if (word_dud[word_index] == 2 00626 || (word_dud[word_index] == 1 && dud_words > ok_words)) { 00627 word = word_it.data (); //current word 00628 //rejected blobs 00629 blob_it.set_to_list (word->rej_cblob_list ()); 00630 //move from blobs 00631 blob_it.add_list_after (word->cblob_list ()); 00632 } 00633 word_index++; 00634 } 00635 free_mem(word_dud); 00636 } 00637 00638 // Remove outlines that are a tiny fraction in either width or height 00639 // of the word height. 00640 void Textord::clean_small_noise_from_words(ROW *row) { 00641 WERD_IT word_it(row->word_list()); 00642 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00643 WERD* word = word_it.data(); 00644 int min_size = static_cast<int>( 00645 textord_noise_hfract * word->bounding_box().height() + 0.5); 00646 C_BLOB_IT blob_it(word->cblob_list()); 00647 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00648 C_BLOB* blob = blob_it.data(); 00649 C_OUTLINE_IT out_it(blob->out_list()); 00650 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { 00651 C_OUTLINE* outline = out_it.data(); 00652 outline->RemoveSmallRecursive(min_size, &out_it); 00653 } 00654 if (blob->out_list()->empty()) { 00655 delete blob_it.extract(); 00656 } 00657 } 00658 if (word->cblob_list()->empty()) { 00659 if (!word_it.at_last()) { 00660 // The next word is no longer a fuzzy non space if it was before, 00661 // since the word before is about to be deleted. 00662 WERD* next_word = word_it.data_relative(1); 00663 if (next_word->flag(W_FUZZY_NON)) { 00664 next_word->set_flag(W_FUZZY_NON, false); 00665 } 00666 } 00667 delete word_it.extract(); 00668 } 00669 } 00670 } 00671 } // tesseract 00672 00673 /********************************************************************** 00674 * tweak_row_baseline 00675 * 00676 * Shift baseline to fit the blobs more accurately where they are 00677 * close enough. 00678 **********************************************************************/ 00679 00680 void tweak_row_baseline(ROW *row, 00681 double blshift_maxshift, 00682 double blshift_xfraction) { 00683 TBOX blob_box; //bounding box 00684 C_BLOB *blob; //current blob 00685 WERD *word; //current word 00686 inT32 blob_count; //no of blobs 00687 inT32 src_index; //source segment 00688 inT32 dest_index; //destination segment 00689 inT32 *xstarts; //spline segments 00690 double *coeffs; //spline coeffs 00691 float ydiff; //baseline error 00692 float x_centre; //centre of blob 00693 //words of row 00694 WERD_IT word_it = row->word_list (); 00695 C_BLOB_IT blob_it; //blob iterator 00696 00697 blob_count = 0; 00698 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00699 word = word_it.data (); //current word 00700 //get total blobs 00701 blob_count += word->cblob_list ()->length (); 00702 } 00703 if (blob_count == 0) 00704 return; 00705 xstarts = 00706 (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) * 00707 sizeof (inT32)); 00708 coeffs = 00709 (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 * 00710 sizeof (double)); 00711 00712 src_index = 0; 00713 dest_index = 0; 00714 xstarts[0] = row->baseline.xcoords[0]; 00715 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00716 word = word_it.data (); //current word 00717 //blobs in word 00718 blob_it.set_to_list (word->cblob_list ()); 00719 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00720 blob_it.forward ()) { 00721 blob = blob_it.data (); 00722 blob_box = blob->bounding_box (); 00723 x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 00724 ydiff = blob_box.bottom () - row->base_line (x_centre); 00725 if (ydiff < 0) 00726 ydiff = -ydiff / row->x_height (); 00727 else 00728 ydiff = ydiff / row->x_height (); 00729 if (ydiff < blshift_maxshift 00730 && blob_box.height () / row->x_height () > blshift_xfraction) { 00731 if (xstarts[dest_index] >= x_centre) 00732 xstarts[dest_index] = blob_box.left (); 00733 coeffs[dest_index * 3] = 0; 00734 coeffs[dest_index * 3 + 1] = 0; 00735 coeffs[dest_index * 3 + 2] = blob_box.bottom (); 00736 //shift it 00737 dest_index++; 00738 xstarts[dest_index] = blob_box.right () + 1; 00739 } 00740 else { 00741 if (xstarts[dest_index] <= x_centre) { 00742 while (row->baseline.xcoords[src_index + 1] <= x_centre 00743 && src_index < row->baseline.segments - 1) { 00744 if (row->baseline.xcoords[src_index + 1] > 00745 xstarts[dest_index]) { 00746 coeffs[dest_index * 3] = 00747 row->baseline.quadratics[src_index].a; 00748 coeffs[dest_index * 3 + 1] = 00749 row->baseline.quadratics[src_index].b; 00750 coeffs[dest_index * 3 + 2] = 00751 row->baseline.quadratics[src_index].c; 00752 dest_index++; 00753 xstarts[dest_index] = 00754 row->baseline.xcoords[src_index + 1]; 00755 } 00756 src_index++; 00757 } 00758 coeffs[dest_index * 3] = 00759 row->baseline.quadratics[src_index].a; 00760 coeffs[dest_index * 3 + 1] = 00761 row->baseline.quadratics[src_index].b; 00762 coeffs[dest_index * 3 + 2] = 00763 row->baseline.quadratics[src_index].c; 00764 dest_index++; 00765 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; 00766 } 00767 } 00768 } 00769 } 00770 while (src_index < row->baseline.segments 00771 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) 00772 src_index++; 00773 while (src_index < row->baseline.segments) { 00774 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; 00775 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; 00776 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; 00777 dest_index++; 00778 src_index++; 00779 xstarts[dest_index] = row->baseline.xcoords[src_index]; 00780 } 00781 //turn to spline 00782 row->baseline = QSPLINE (dest_index, xstarts, coeffs); 00783 free_mem(xstarts); 00784 free_mem(coeffs); 00785 } 00786 00787 /********************************************************************** 00788 * blob_y_order 00789 * 00790 * Sort function to sort blobs in y from page top. 00791 **********************************************************************/ 00792 00793 inT32 blob_y_order( //sort function 00794 void *item1, //items to compare 00795 void *item2) { 00796 //converted ptr 00797 BLOBNBOX *blob1 = *(BLOBNBOX **) item1; 00798 //converted ptr 00799 BLOBNBOX *blob2 = *(BLOBNBOX **) item2; 00800 00801 if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) 00802 return -1; 00803 else if (blob1->bounding_box ().bottom () < 00804 blob2->bounding_box ().bottom ()) 00805 return 1; 00806 else { 00807 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) 00808 return -1; 00809 else if (blob1->bounding_box ().left () > 00810 blob2->bounding_box ().left ()) 00811 return 1; 00812 else 00813 return 0; 00814 } 00815 }