Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: topitch.cpp (Formerly to_pitch.c) 00003 * Description: Code to determine fixed pitchness and the pitch if fixed. 00004 * Author: Ray Smith 00005 * Created: Tue Aug 24 16:57:29 BST 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #ifdef __UNIX__ 00022 #include <assert.h> 00023 #endif 00024 #include "stderr.h" 00025 #include "blobbox.h" 00026 #include "statistc.h" 00027 #include "drawtord.h" 00028 #include "makerow.h" 00029 #include "pitsync1.h" 00030 #include "pithsync.h" 00031 #include "tovars.h" 00032 #include "wordseg.h" 00033 #include "topitch.h" 00034 #include "secname.h" 00035 #include "helpers.h" 00036 00037 // Include automatically generated configuration file if running autoconf. 00038 #ifdef HAVE_CONFIG_H 00039 #include "config_auto.h" 00040 #endif 00041 00042 #define EXTERN 00043 00044 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text"); 00045 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE, 00046 "Debug on fixed pitch test"); 00047 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE, 00048 "Turn off dp fixed pitch algorithm"); 00049 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE, 00050 "Do even faster pitch algorithm"); 00051 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE, 00052 "Write full metric stuff"); 00053 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts"); 00054 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts"); 00055 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE, 00056 "Use correct answer for fixed/prop"); 00057 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE, 00058 "Attempt whole doc/block fixed pitch"); 00059 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts"); 00060 EXTERN double_VAR (textord_balance_factor, 1.0, 00061 "Ding rate for unbalanced char cells"); 00062 00063 #define FIXED_WIDTH_MULTIPLE 5 00064 #define BLOCK_STATS_CLUSTERS 10 00065 #define MAX_ALLOWED_PITCH 100 //max pixel pitch. 00066 00067 /********************************************************************** 00068 * compute_fixed_pitch 00069 * 00070 * Decide whether each row is fixed pitch individually. 00071 * Correlate definite and uncertain results to obtain an individual 00072 * result for each row in the TO_ROW class. 00073 **********************************************************************/ 00074 00075 void compute_fixed_pitch(ICOORD page_tr, // top right 00076 TO_BLOCK_LIST *port_blocks, // input list 00077 float gradient, // page skew 00078 FCOORD rotation, // for drawing 00079 BOOL8 testing_on) { // correct orientation 00080 TO_BLOCK_IT block_it; //iterator 00081 TO_BLOCK *block; //current block; 00082 TO_ROW_IT row_it; //row iterator 00083 TO_ROW *row; //current row 00084 int block_index; //block number 00085 int row_index; //row number 00086 00087 #ifndef GRAPHICS_DISABLED 00088 if (textord_show_initial_words && testing_on) { 00089 if (to_win == NULL) 00090 create_to_win(page_tr); 00091 } 00092 #endif 00093 00094 block_it.set_to_list (port_blocks); 00095 block_index = 1; 00096 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00097 block_it.forward ()) { 00098 block = block_it.data (); 00099 compute_block_pitch(block, rotation, block_index, testing_on); 00100 block_index++; 00101 } 00102 00103 if (!try_doc_fixed (page_tr, port_blocks, gradient)) { 00104 block_index = 1; 00105 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00106 block_it.forward ()) { 00107 block = block_it.data (); 00108 if (!try_block_fixed (block, block_index)) 00109 try_rows_fixed(block, block_index, testing_on); 00110 block_index++; 00111 } 00112 } 00113 00114 block_index = 1; 00115 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00116 block_it.forward()) { 00117 block = block_it.data (); 00118 POLY_BLOCK* pb = block->block->poly_block(); 00119 if (pb != NULL && !pb->IsText()) continue; // Non-text doesn't exist! 00120 row_it.set_to_list (block->get_rows ()); 00121 row_index = 1; 00122 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00123 row = row_it.data (); 00124 fix_row_pitch(row, block, port_blocks, row_index, block_index); 00125 row_index++; 00126 } 00127 block_index++; 00128 } 00129 #ifndef GRAPHICS_DISABLED 00130 if (textord_show_initial_words && testing_on) { 00131 ScrollView::Update(); 00132 } 00133 #endif 00134 } 00135 00136 00137 /********************************************************************** 00138 * fix_row_pitch 00139 * 00140 * Get a pitch_decision for this row by voting among similar rows in the 00141 * block, then similar rows over all the page, or any other rows at all. 00142 **********************************************************************/ 00143 00144 void fix_row_pitch(TO_ROW *bad_row, // row to fix 00145 TO_BLOCK *bad_block, // block of bad_row 00146 TO_BLOCK_LIST *blocks, // blocks to scan 00147 inT32 row_target, // number of row 00148 inT32 block_target) { // number of block 00149 inT16 mid_cuts; 00150 int block_votes; //votes in block 00151 int like_votes; //votes over page 00152 int other_votes; //votes of unlike blocks 00153 int block_index; //number of block 00154 int row_index; //number of row 00155 int maxwidth; //max pitch 00156 TO_BLOCK_IT block_it = blocks; //block iterator 00157 TO_ROW_IT row_it; 00158 TO_BLOCK *block; //current block 00159 TO_ROW *row; //current row 00160 float sp_sd; //space deviation 00161 STATS block_stats; //pitches in block 00162 STATS like_stats; //pitches in page 00163 00164 block_votes = like_votes = other_votes = 0; 00165 maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace); 00166 if (bad_row->pitch_decision != PITCH_DEF_FIXED 00167 && bad_row->pitch_decision != PITCH_DEF_PROP) { 00168 block_stats.set_range (0, maxwidth); 00169 like_stats.set_range (0, maxwidth); 00170 block_index = 1; 00171 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00172 block_it.forward()) { 00173 block = block_it.data(); 00174 POLY_BLOCK* pb = block->block->poly_block(); 00175 if (pb != NULL && !pb->IsText()) continue; // Non text doesn't exist! 00176 row_index = 1; 00177 row_it.set_to_list (block->get_rows ()); 00178 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00179 row_it.forward ()) { 00180 row = row_it.data (); 00181 if ((bad_row->all_caps 00182 && row->xheight + row->ascrise 00183 < 00184 (bad_row->xheight + bad_row->ascrise) * (1 + 00185 textord_pitch_rowsimilarity) 00186 && row->xheight + row->ascrise > 00187 (bad_row->xheight + bad_row->ascrise) * (1 - 00188 textord_pitch_rowsimilarity)) 00189 || (!bad_row->all_caps 00190 && row->xheight < 00191 bad_row->xheight * (1 + textord_pitch_rowsimilarity) 00192 && row->xheight > 00193 bad_row->xheight * (1 - textord_pitch_rowsimilarity))) { 00194 if (block_index == block_target) { 00195 if (row->pitch_decision == PITCH_DEF_FIXED) { 00196 block_votes += textord_words_veto_power; 00197 block_stats.add ((inT32) row->fixed_pitch, 00198 textord_words_veto_power); 00199 } 00200 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00201 || row->pitch_decision == PITCH_CORR_FIXED) { 00202 block_votes++; 00203 block_stats.add ((inT32) row->fixed_pitch, 1); 00204 } 00205 else if (row->pitch_decision == PITCH_DEF_PROP) 00206 block_votes -= textord_words_veto_power; 00207 else if (row->pitch_decision == PITCH_MAYBE_PROP 00208 || row->pitch_decision == PITCH_CORR_PROP) 00209 block_votes--; 00210 } 00211 else { 00212 if (row->pitch_decision == PITCH_DEF_FIXED) { 00213 like_votes += textord_words_veto_power; 00214 like_stats.add ((inT32) row->fixed_pitch, 00215 textord_words_veto_power); 00216 } 00217 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00218 || row->pitch_decision == PITCH_CORR_FIXED) { 00219 like_votes++; 00220 like_stats.add ((inT32) row->fixed_pitch, 1); 00221 } 00222 else if (row->pitch_decision == PITCH_DEF_PROP) 00223 like_votes -= textord_words_veto_power; 00224 else if (row->pitch_decision == PITCH_MAYBE_PROP 00225 || row->pitch_decision == PITCH_CORR_PROP) 00226 like_votes--; 00227 } 00228 } 00229 else { 00230 if (row->pitch_decision == PITCH_DEF_FIXED) 00231 other_votes += textord_words_veto_power; 00232 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00233 || row->pitch_decision == PITCH_CORR_FIXED) 00234 other_votes++; 00235 else if (row->pitch_decision == PITCH_DEF_PROP) 00236 other_votes -= textord_words_veto_power; 00237 else if (row->pitch_decision == PITCH_MAYBE_PROP 00238 || row->pitch_decision == PITCH_CORR_PROP) 00239 other_votes--; 00240 } 00241 row_index++; 00242 } 00243 block_index++; 00244 } 00245 if (block_votes > textord_words_veto_power) { 00246 bad_row->fixed_pitch = block_stats.ile (0.5); 00247 bad_row->pitch_decision = PITCH_CORR_FIXED; 00248 } 00249 else if (block_votes <= textord_words_veto_power && like_votes > 0) { 00250 bad_row->fixed_pitch = like_stats.ile (0.5); 00251 bad_row->pitch_decision = PITCH_CORR_FIXED; 00252 } 00253 else { 00254 bad_row->pitch_decision = PITCH_CORR_PROP; 00255 #ifndef SECURE_NAMES 00256 if (block_votes == 0 && like_votes == 0 && other_votes > 0 00257 && (textord_debug_pitch_test || textord_debug_pitch_metric)) 00258 tprintf 00259 ("Warning:row %d of block %d set prop with no like rows against trend\n", 00260 row_target, block_target); 00261 #endif 00262 } 00263 } 00264 if (textord_debug_pitch_metric) { 00265 tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", 00266 block_votes, like_votes, other_votes); 00267 tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise); 00268 } 00269 if (bad_row->pitch_decision == PITCH_CORR_FIXED) { 00270 if (bad_row->fixed_pitch < textord_min_xheight) { 00271 if (block_votes > 0) 00272 bad_row->fixed_pitch = block_stats.ile (0.5); 00273 else if (block_votes == 0 && like_votes > 0) 00274 bad_row->fixed_pitch = like_stats.ile (0.5); 00275 else { 00276 tprintf 00277 ("Warning:guessing pitch as xheight on row %d, block %d\n", 00278 row_target, block_target); 00279 bad_row->fixed_pitch = bad_row->xheight; 00280 } 00281 } 00282 if (bad_row->fixed_pitch < textord_min_xheight) 00283 bad_row->fixed_pitch = (float) textord_min_xheight; 00284 bad_row->kern_size = bad_row->fixed_pitch / 4; 00285 bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6); 00286 bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4); 00287 bad_row->space_threshold = 00288 (bad_row->min_space + bad_row->max_nonspace) / 2; 00289 bad_row->space_size = bad_row->fixed_pitch; 00290 if (bad_row->char_cells.empty ()) 00291 tune_row_pitch (bad_row, &bad_row->projection, 00292 bad_row->projection_left, bad_row->projection_right, 00293 (bad_row->fixed_pitch + 00294 bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, 00295 sp_sd, mid_cuts, &bad_row->char_cells, FALSE); 00296 } 00297 else if (bad_row->pitch_decision == PITCH_CORR_PROP 00298 || bad_row->pitch_decision == PITCH_DEF_PROP) { 00299 bad_row->fixed_pitch = 0.0f; 00300 bad_row->char_cells.clear (); 00301 } 00302 } 00303 00304 00305 /********************************************************************** 00306 * compute_block_pitch 00307 * 00308 * Decide whether each block is fixed pitch individually. 00309 **********************************************************************/ 00310 00311 void compute_block_pitch(TO_BLOCK *block, // input list 00312 FCOORD rotation, // for drawing 00313 inT32 block_index, // block number 00314 BOOL8 testing_on) { // correct orientation 00315 TBOX block_box; //bounding box 00316 00317 block_box = block->block->bounding_box (); 00318 if (testing_on && textord_debug_pitch_test) { 00319 tprintf ("Block %d at (%d,%d)->(%d,%d)\n", 00320 block_index, 00321 block_box.left (), block_box.bottom (), 00322 block_box.right (), block_box.top ()); 00323 } 00324 block->min_space = (inT32) floor (block->xheight 00325 * textord_words_default_minspace); 00326 block->max_nonspace = (inT32) ceil (block->xheight 00327 * textord_words_default_nonspace); 00328 block->fixed_pitch = 0.0f; 00329 block->space_size = (float) block->min_space; 00330 block->kern_size = (float) block->max_nonspace; 00331 block->pr_nonsp = block->xheight * words_default_prop_nonspace; 00332 block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop; 00333 if (!block->get_rows ()->empty ()) { 00334 ASSERT_HOST (block->xheight > 0); 00335 find_repeated_chars(block, textord_show_initial_words && testing_on); 00336 #ifndef GRAPHICS_DISABLED 00337 if (textord_show_initial_words && testing_on) 00338 //overlap_picture_ops(TRUE); 00339 ScrollView::Update(); 00340 #endif 00341 compute_rows_pitch(block, 00342 block_index, 00343 textord_debug_pitch_test &&testing_on); 00344 } 00345 } 00346 00347 00348 /********************************************************************** 00349 * compute_rows_pitch 00350 * 00351 * Decide whether each row is fixed pitch individually. 00352 **********************************************************************/ 00353 00354 BOOL8 compute_rows_pitch( //find line stats 00355 TO_BLOCK *block, //block to do 00356 inT32 block_index, //block number 00357 BOOL8 testing_on //correct orientation 00358 ) { 00359 inT32 maxwidth; //of spaces 00360 TO_ROW *row; //current row 00361 inT32 row_index; //row number. 00362 float lower, upper; //cluster thresholds 00363 TO_ROW_IT row_it = block->get_rows (); 00364 00365 row_index = 1; 00366 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00367 row = row_it.data (); 00368 ASSERT_HOST (row->xheight > 0); 00369 row->compute_vertical_projection (); 00370 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace); 00371 if (row_pitch_stats (row, maxwidth, testing_on) 00372 && find_row_pitch (row, maxwidth, 00373 textord_dotmatrix_gap + 1, block, block_index, 00374 row_index, testing_on)) { 00375 if (row->fixed_pitch == 0) { 00376 lower = row->pr_nonsp; 00377 upper = row->pr_space; 00378 row->space_size = upper; 00379 row->kern_size = lower; 00380 } 00381 } 00382 else { 00383 row->fixed_pitch = 0.0f; //insufficient data 00384 row->pitch_decision = PITCH_DUNNO; 00385 } 00386 row_index++; 00387 } 00388 return FALSE; 00389 } 00390 00391 00392 /********************************************************************** 00393 * try_doc_fixed 00394 * 00395 * Attempt to call the entire document fixed pitch. 00396 **********************************************************************/ 00397 00398 BOOL8 try_doc_fixed( //determine pitch 00399 ICOORD page_tr, //top right 00400 TO_BLOCK_LIST *port_blocks, //input list 00401 float gradient //page skew 00402 ) { 00403 inT16 master_x; //uniform shifts 00404 inT16 pitch; //median pitch. 00405 int x; //profile coord 00406 int prop_blocks; //correct counts 00407 int fixed_blocks; 00408 int total_row_count; //total in page 00409 //iterator 00410 TO_BLOCK_IT block_it = port_blocks; 00411 TO_BLOCK *block; //current block; 00412 TO_ROW_IT row_it; //row iterator 00413 TO_ROW *row; //current row 00414 inT16 projection_left; //edges 00415 inT16 projection_right; 00416 inT16 row_left; //edges of row 00417 inT16 row_right; 00418 ICOORDELT_LIST *master_cells; //cells for page 00419 float master_y; //uniform shifts 00420 float shift_factor; //page skew correction 00421 float row_shift; //shift for row 00422 float final_pitch; //output pitch 00423 float row_y; //baseline 00424 STATS projection; //entire page 00425 STATS pitches (0, MAX_ALLOWED_PITCH); 00426 //for median 00427 float sp_sd; //space sd 00428 inT16 mid_cuts; //no of cheap cuts 00429 float pitch_sd; //sync rating 00430 00431 if (block_it.empty () 00432 // || block_it.data()==block_it.data_relative(1) 00433 || !textord_blockndoc_fixed) 00434 return FALSE; 00435 shift_factor = gradient / (gradient * gradient + 1); 00436 row_it.set_to_list (block_it.data ()->get_rows ()); 00437 master_x = row_it.data ()->projection_left; 00438 master_y = row_it.data ()->baseline.y (master_x); 00439 projection_left = MAX_INT16; 00440 projection_right = -MAX_INT16; 00441 prop_blocks = 0; 00442 fixed_blocks = 0; 00443 total_row_count = 0; 00444 00445 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00446 block_it.forward ()) { 00447 block = block_it.data (); 00448 row_it.set_to_list (block->get_rows ()); 00449 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00450 row = row_it.data (); 00451 total_row_count++; 00452 if (row->fixed_pitch > 0) 00453 pitches.add ((inT32) (row->fixed_pitch), 1); 00454 //find median 00455 row_y = row->baseline.y (master_x); 00456 row_left = 00457 (inT16) (row->projection_left - 00458 shift_factor * (master_y - row_y)); 00459 row_right = 00460 (inT16) (row->projection_right - 00461 shift_factor * (master_y - row_y)); 00462 if (row_left < projection_left) 00463 projection_left = row_left; 00464 if (row_right > projection_right) 00465 projection_right = row_right; 00466 } 00467 } 00468 if (pitches.get_total () == 0) 00469 return FALSE; 00470 projection.set_range (projection_left, projection_right); 00471 00472 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00473 block_it.forward ()) { 00474 block = block_it.data (); 00475 row_it.set_to_list (block->get_rows ()); 00476 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00477 row = row_it.data (); 00478 row_y = row->baseline.y (master_x); 00479 row_left = 00480 (inT16) (row->projection_left - 00481 shift_factor * (master_y - row_y)); 00482 for (x = row->projection_left; x < row->projection_right; 00483 x++, row_left++) { 00484 projection.add (row_left, row->projection.pile_count (x)); 00485 } 00486 } 00487 } 00488 00489 row_it.set_to_list (block_it.data ()->get_rows ()); 00490 row = row_it.data (); 00491 #ifndef GRAPHICS_DISABLED 00492 if (textord_show_page_cuts && to_win != NULL) 00493 projection.plot (to_win, projection_left, 00494 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 00495 #endif 00496 final_pitch = pitches.ile (0.5); 00497 pitch = (inT16) final_pitch; 00498 pitch_sd = 00499 tune_row_pitch (row, &projection, projection_left, projection_right, 00500 pitch * 0.75, final_pitch, sp_sd, mid_cuts, 00501 &row->char_cells, FALSE); 00502 00503 if (textord_debug_pitch_metric) 00504 tprintf 00505 ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n", 00506 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, 00507 pitch_sd / total_row_count, pitch_sd / pitch, 00508 pitch_sd / total_row_count / pitch); 00509 00510 #ifndef GRAPHICS_DISABLED 00511 if (textord_show_page_cuts && to_win != NULL) { 00512 master_cells = &row->char_cells; 00513 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00514 block_it.forward ()) { 00515 block = block_it.data (); 00516 row_it.set_to_list (block->get_rows ()); 00517 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00518 row_it.forward ()) { 00519 row = row_it.data (); 00520 row_y = row->baseline.y (master_x); 00521 row_shift = shift_factor * (master_y - row_y); 00522 plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells); 00523 } 00524 } 00525 } 00526 #endif 00527 row->char_cells.clear (); 00528 return FALSE; 00529 } 00530 00531 00532 /********************************************************************** 00533 * try_block_fixed 00534 * 00535 * Try to call the entire block fixed. 00536 **********************************************************************/ 00537 00538 BOOL8 try_block_fixed( //find line stats 00539 TO_BLOCK *block, //block to do 00540 inT32 block_index //block number 00541 ) { 00542 return FALSE; 00543 } 00544 00545 00546 /********************************************************************** 00547 * try_rows_fixed 00548 * 00549 * Decide whether each row is fixed pitch individually. 00550 **********************************************************************/ 00551 00552 BOOL8 try_rows_fixed( //find line stats 00553 TO_BLOCK *block, //block to do 00554 inT32 block_index, //block number 00555 BOOL8 testing_on //correct orientation 00556 ) { 00557 inT32 maxwidth; //of spaces 00558 TO_ROW *row; //current row 00559 inT32 row_index; //row number. 00560 inT32 def_fixed = 0; //counters 00561 inT32 def_prop = 0; 00562 inT32 maybe_fixed = 0; 00563 inT32 maybe_prop = 0; 00564 inT32 dunno = 0; 00565 inT32 corr_fixed = 0; 00566 inT32 corr_prop = 0; 00567 float lower, upper; //cluster thresholds 00568 TO_ROW_IT row_it = block->get_rows (); 00569 00570 row_index = 1; 00571 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00572 row = row_it.data (); 00573 ASSERT_HOST (row->xheight > 0); 00574 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace); 00575 if (row->fixed_pitch > 0 && 00576 fixed_pitch_row(row, block->block, block_index)) { 00577 if (row->fixed_pitch == 0) { 00578 lower = row->pr_nonsp; 00579 upper = row->pr_space; 00580 row->space_size = upper; 00581 row->kern_size = lower; 00582 } 00583 } 00584 row_index++; 00585 } 00586 count_block_votes(block, 00587 def_fixed, 00588 def_prop, 00589 maybe_fixed, 00590 maybe_prop, 00591 corr_fixed, 00592 corr_prop, 00593 dunno); 00594 if (testing_on 00595 && (textord_debug_pitch_test 00596 || textord_blocksall_prop || textord_blocksall_fixed)) { 00597 tprintf ("Initially:"); 00598 print_block_counts(block, block_index); 00599 } 00600 if (def_fixed > def_prop * textord_words_veto_power) 00601 block->pitch_decision = PITCH_DEF_FIXED; 00602 else if (def_prop > def_fixed * textord_words_veto_power) 00603 block->pitch_decision = PITCH_DEF_PROP; 00604 else if (def_fixed > 0 || def_prop > 0) 00605 block->pitch_decision = PITCH_DUNNO; 00606 else if (maybe_fixed > maybe_prop * textord_words_veto_power) 00607 block->pitch_decision = PITCH_MAYBE_FIXED; 00608 else if (maybe_prop > maybe_fixed * textord_words_veto_power) 00609 block->pitch_decision = PITCH_MAYBE_PROP; 00610 else 00611 block->pitch_decision = PITCH_DUNNO; 00612 return FALSE; 00613 } 00614 00615 00616 /********************************************************************** 00617 * print_block_counts 00618 * 00619 * Count up how many rows have what decision and print the results. 00620 **********************************************************************/ 00621 00622 void print_block_counts( //find line stats 00623 TO_BLOCK *block, //block to do 00624 inT32 block_index //block number 00625 ) { 00626 inT32 def_fixed = 0; //counters 00627 inT32 def_prop = 0; 00628 inT32 maybe_fixed = 0; 00629 inT32 maybe_prop = 0; 00630 inT32 dunno = 0; 00631 inT32 corr_fixed = 0; 00632 inT32 corr_prop = 0; 00633 00634 count_block_votes(block, 00635 def_fixed, 00636 def_prop, 00637 maybe_fixed, 00638 maybe_prop, 00639 corr_fixed, 00640 corr_prop, 00641 dunno); 00642 tprintf ("Block %d has (%d,%d,%d)", 00643 block_index, def_fixed, maybe_fixed, corr_fixed); 00644 if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) 00645 tprintf (" (Wrongly)"); 00646 tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop); 00647 if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) 00648 tprintf (" (Wrongly)"); 00649 tprintf (" prop, %d dunno\n", dunno); 00650 } 00651 00652 00653 /********************************************************************** 00654 * count_block_votes 00655 * 00656 * Count the number of rows in the block with each kind of pitch_decision. 00657 **********************************************************************/ 00658 00659 void count_block_votes( //find line stats 00660 TO_BLOCK *block, //block to do 00661 inT32 &def_fixed, //add to counts 00662 inT32 &def_prop, 00663 inT32 &maybe_fixed, 00664 inT32 &maybe_prop, 00665 inT32 &corr_fixed, 00666 inT32 &corr_prop, 00667 inT32 &dunno) { 00668 TO_ROW *row; //current row 00669 TO_ROW_IT row_it = block->get_rows (); 00670 00671 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00672 row = row_it.data (); 00673 switch (row->pitch_decision) { 00674 case PITCH_DUNNO: 00675 dunno++; 00676 break; 00677 case PITCH_DEF_PROP: 00678 def_prop++; 00679 break; 00680 case PITCH_MAYBE_PROP: 00681 maybe_prop++; 00682 break; 00683 case PITCH_DEF_FIXED: 00684 def_fixed++; 00685 break; 00686 case PITCH_MAYBE_FIXED: 00687 maybe_fixed++; 00688 break; 00689 case PITCH_CORR_PROP: 00690 corr_prop++; 00691 break; 00692 case PITCH_CORR_FIXED: 00693 corr_fixed++; 00694 break; 00695 } 00696 } 00697 } 00698 00699 00700 /********************************************************************** 00701 * row_pitch_stats 00702 * 00703 * Decide whether each row is fixed pitch individually. 00704 **********************************************************************/ 00705 00706 BOOL8 row_pitch_stats( //find line stats 00707 TO_ROW *row, //current row 00708 inT32 maxwidth, //of spaces 00709 BOOL8 testing_on //correct orientation 00710 ) { 00711 BLOBNBOX *blob; //current blob 00712 int gap_index; //current gap 00713 inT32 prev_x; //end of prev blob 00714 inT32 cluster_count; //no of clusters 00715 inT32 prev_count; //of clusters 00716 inT32 smooth_factor; //for smoothing stats 00717 TBOX blob_box; //bounding box 00718 float lower, upper; //cluster thresholds 00719 //gap sizes 00720 float gaps[BLOCK_STATS_CLUSTERS]; 00721 //blobs 00722 BLOBNBOX_IT blob_it = row->blob_list (); 00723 STATS gap_stats (0, maxwidth); 00724 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; 00725 //clusters 00726 00727 smooth_factor = 00728 (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5); 00729 if (!blob_it.empty ()) { 00730 prev_x = blob_it.data ()->bounding_box ().right (); 00731 blob_it.forward (); 00732 while (!blob_it.at_first ()) { 00733 blob = blob_it.data (); 00734 if (!blob->joined_to_prev ()) { 00735 blob_box = blob->bounding_box (); 00736 if (blob_box.left () - prev_x < maxwidth) 00737 gap_stats.add (blob_box.left () - prev_x, 1); 00738 prev_x = blob_box.right (); 00739 } 00740 blob_it.forward (); 00741 } 00742 } 00743 if (gap_stats.get_total () == 0) { 00744 return FALSE; 00745 } 00746 cluster_count = 0; 00747 lower = row->xheight * words_initial_lower; 00748 upper = row->xheight * words_initial_upper; 00749 gap_stats.smooth (smooth_factor); 00750 do { 00751 prev_count = cluster_count; 00752 cluster_count = gap_stats.cluster (lower, upper, 00753 textord_spacesize_ratioprop, 00754 BLOCK_STATS_CLUSTERS, cluster_stats); 00755 } 00756 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); 00757 if (cluster_count < 1) { 00758 return FALSE; 00759 } 00760 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00761 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); 00762 //get medians 00763 if (testing_on) { 00764 tprintf ("cluster_count=%d:", cluster_count); 00765 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00766 tprintf (" %g(%d)", gaps[gap_index], 00767 cluster_stats[gap_index + 1].get_total ()); 00768 tprintf ("\n"); 00769 } 00770 qsort (gaps, cluster_count, sizeof (float), sort_floats); 00771 00772 //Try to find proportional non-space and space for row. 00773 lower = row->xheight * words_default_prop_nonspace; 00774 upper = row->xheight * textord_words_min_minspace; 00775 for (gap_index = 0; gap_index < cluster_count 00776 && gaps[gap_index] < lower; gap_index++); 00777 if (gap_index == 0) { 00778 if (testing_on) 00779 tprintf ("No clusters below nonspace threshold!!\n"); 00780 if (cluster_count > 1) { 00781 row->pr_nonsp = gaps[0]; 00782 row->pr_space = gaps[1]; 00783 } 00784 else { 00785 row->pr_nonsp = lower; 00786 row->pr_space = gaps[0]; 00787 } 00788 } 00789 else { 00790 row->pr_nonsp = gaps[gap_index - 1]; 00791 while (gap_index < cluster_count && gaps[gap_index] < upper) 00792 gap_index++; 00793 if (gap_index == cluster_count) { 00794 if (testing_on) 00795 tprintf ("No clusters above nonspace threshold!!\n"); 00796 row->pr_space = lower * textord_spacesize_ratioprop; 00797 } 00798 else 00799 row->pr_space = gaps[gap_index]; 00800 } 00801 00802 //Now try to find the fixed pitch space and non-space. 00803 upper = row->xheight * words_default_fixed_space; 00804 for (gap_index = 0; gap_index < cluster_count 00805 && gaps[gap_index] < upper; gap_index++); 00806 if (gap_index == 0) { 00807 if (testing_on) 00808 tprintf ("No clusters below space threshold!!\n"); 00809 row->fp_nonsp = upper; 00810 row->fp_space = gaps[0]; 00811 } 00812 else { 00813 row->fp_nonsp = gaps[gap_index - 1]; 00814 if (gap_index == cluster_count) { 00815 if (testing_on) 00816 tprintf ("No clusters above space threshold!!\n"); 00817 row->fp_space = row->xheight; 00818 } 00819 else 00820 row->fp_space = gaps[gap_index]; 00821 } 00822 if (testing_on) { 00823 tprintf 00824 ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n", 00825 row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space); 00826 } 00827 return TRUE; //computed some stats 00828 } 00829 00830 00831 /********************************************************************** 00832 * find_row_pitch 00833 * 00834 * Check to see if this row could be fixed pitch using the given spacings. 00835 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00836 * The larger threshold is the word gap threshold. 00837 **********************************************************************/ 00838 00839 BOOL8 find_row_pitch( //find lines 00840 TO_ROW *row, //row to do 00841 inT32 maxwidth, //max permitted space 00842 inT32 dm_gap, //ignorable gaps 00843 TO_BLOCK *block, //block of row 00844 inT32 block_index, //block_number 00845 inT32 row_index, //number of row 00846 BOOL8 testing_on //correct orientation 00847 ) { 00848 BOOL8 used_dm_model; //looks lik dot matrix 00849 float min_space; //estimate threshold 00850 float non_space; //gap size 00851 float gap_iqr; //interquartile range 00852 float pitch_iqr; 00853 float dm_gap_iqr; //interquartile range 00854 float dm_pitch_iqr; 00855 float dm_pitch; //pitch with dm on 00856 float pitch; //revised estimate 00857 float initial_pitch; //guess at pitch 00858 STATS gap_stats (0, maxwidth); 00859 //centre-centre 00860 STATS pitch_stats (0, maxwidth); 00861 00862 row->fixed_pitch = 0.0f; 00863 initial_pitch = row->fp_space; 00864 if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) 00865 initial_pitch = row->xheight;//keep pitch decent 00866 non_space = row->fp_nonsp; 00867 if (non_space > initial_pitch) 00868 non_space = initial_pitch; 00869 min_space = (initial_pitch + non_space) / 2; 00870 00871 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00872 initial_pitch, min_space, TRUE, FALSE, dm_gap)) { 00873 dm_gap_iqr = 0.0001; 00874 dm_pitch_iqr = maxwidth * 2.0f; 00875 dm_pitch = initial_pitch; 00876 } 00877 else { 00878 dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00879 dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00880 dm_pitch = pitch_stats.ile (0.5); 00881 } 00882 gap_stats.clear (); 00883 pitch_stats.clear (); 00884 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00885 initial_pitch, min_space, TRUE, FALSE, 0)) { 00886 gap_iqr = 0.0001; 00887 pitch_iqr = maxwidth * 3.0f; 00888 } 00889 else { 00890 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00891 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00892 if (testing_on) 00893 tprintf 00894 ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00895 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00896 initial_pitch = pitch_stats.ile (0.5); 00897 if (min_space > initial_pitch 00898 && count_pitch_stats (row, &gap_stats, &pitch_stats, 00899 initial_pitch, initial_pitch, TRUE, FALSE, 0)) { 00900 min_space = initial_pitch; 00901 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00902 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00903 if (testing_on) 00904 tprintf 00905 ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00906 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00907 initial_pitch = pitch_stats.ile (0.5); 00908 } 00909 } 00910 if (textord_debug_pitch_metric) 00911 tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", 00912 block_index, row_index, 'X', 00913 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr, 00914 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' : 00915 (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M')); 00916 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) { 00917 row->pitch_decision = PITCH_DUNNO; 00918 if (textord_debug_pitch_metric) 00919 tprintf ("\n"); 00920 return FALSE; //insufficient data 00921 } 00922 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) { 00923 if (testing_on) 00924 tprintf 00925 ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00926 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00927 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00928 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00929 pitch = pitch_stats.ile (0.5); 00930 used_dm_model = FALSE; 00931 } 00932 else { 00933 if (testing_on) 00934 tprintf 00935 ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00936 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00937 gap_iqr = dm_gap_iqr; 00938 pitch_iqr = dm_pitch_iqr; 00939 pitch = dm_pitch; 00940 used_dm_model = TRUE; 00941 } 00942 if (textord_debug_pitch_metric) { 00943 tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", 00944 pitch_iqr, gap_iqr, pitch); 00945 tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", 00946 pitch_iqr / gap_iqr, pitch_iqr / block->xheight, 00947 pitch_iqr < gap_iqr * textord_fpiqr_ratio 00948 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00949 && pitch < block->xheight * textord_words_default_maxspace 00950 ? 'F' : 'P'); 00951 } 00952 if (pitch_iqr < gap_iqr * textord_fpiqr_ratio 00953 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00954 && pitch < block->xheight * textord_words_default_maxspace) 00955 row->pitch_decision = PITCH_MAYBE_FIXED; 00956 else 00957 row->pitch_decision = PITCH_MAYBE_PROP; 00958 row->fixed_pitch = pitch; 00959 row->kern_size = gap_stats.ile (0.5); 00960 row->min_space = (inT32) (row->fixed_pitch + non_space) / 2; 00961 if (row->min_space > row->fixed_pitch) 00962 row->min_space = (inT32) row->fixed_pitch; 00963 row->max_nonspace = row->min_space; 00964 row->space_size = row->fixed_pitch; 00965 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00966 row->used_dm_model = used_dm_model; 00967 return TRUE; 00968 } 00969 00970 00971 /********************************************************************** 00972 * fixed_pitch_row 00973 * 00974 * Check to see if this row could be fixed pitch using the given spacings. 00975 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00976 * The larger threshold is the word gap threshold. 00977 **********************************************************************/ 00978 00979 BOOL8 fixed_pitch_row(TO_ROW *row, // row to do 00980 BLOCK* block, 00981 inT32 block_index // block_number 00982 ) { 00983 const char *res_string; //pitch result 00984 inT16 mid_cuts; //no of cheap cuts 00985 float non_space; //gap size 00986 float pitch_sd; //error on pitch 00987 float sp_sd; //space sd 00988 00989 non_space = row->fp_nonsp; 00990 if (non_space > row->fixed_pitch) 00991 non_space = row->fixed_pitch; 00992 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00993 if (textord_all_prop || (pb != NULL && !pb->IsText())) { 00994 // Set the decision to definitely proportional. 00995 pitch_sd = textord_words_def_prop * row->fixed_pitch; 00996 row->pitch_decision = PITCH_DEF_PROP; 00997 } else { 00998 pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left, 00999 row->projection_right, 01000 (row->fixed_pitch + non_space * 3) / 4, 01001 row->fixed_pitch, sp_sd, mid_cuts, 01002 &row->char_cells, 01003 block_index == textord_debug_block); 01004 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch 01005 && ((pitsync_linear_version & 3) < 3 01006 || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model 01007 || sp_sd > 20 01008 || (pitch_sd == 0 && sp_sd > 10))))) { 01009 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch 01010 && !row->all_caps 01011 && ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) 01012 row->pitch_decision = PITCH_DEF_FIXED; 01013 else 01014 row->pitch_decision = PITCH_MAYBE_FIXED; 01015 } 01016 else if ((pitsync_linear_version & 3) < 3 01017 || sp_sd > 20 01018 || mid_cuts > 0 01019 || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) { 01020 if (pitch_sd < textord_words_def_prop * row->fixed_pitch) 01021 row->pitch_decision = PITCH_MAYBE_PROP; 01022 else 01023 row->pitch_decision = PITCH_DEF_PROP; 01024 } 01025 else 01026 row->pitch_decision = PITCH_DUNNO; 01027 } 01028 01029 if (textord_debug_pitch_metric) { 01030 res_string = "??"; 01031 switch (row->pitch_decision) { 01032 case PITCH_DEF_PROP: 01033 res_string = "DP"; 01034 break; 01035 case PITCH_MAYBE_PROP: 01036 res_string = "MP"; 01037 break; 01038 case PITCH_DEF_FIXED: 01039 res_string = "DF"; 01040 break; 01041 case PITCH_MAYBE_FIXED: 01042 res_string = "MF"; 01043 default: 01044 res_string = "??"; 01045 } 01046 tprintf (":sd/p=%g:occ=%g:init_res=%s\n", 01047 pitch_sd / row->fixed_pitch, sp_sd, res_string); 01048 } 01049 return TRUE; 01050 } 01051 01052 01053 /********************************************************************** 01054 * count_pitch_stats 01055 * 01056 * Count up the gap and pitch stats on the block to see if it is fixed pitch. 01057 * Blobs with gaps smaller than the lower threshold are assumed to be one. 01058 * The larger threshold is the word gap threshold. 01059 * The return value indicates whether there were any decent values to use. 01060 **********************************************************************/ 01061 01062 BOOL8 count_pitch_stats( //find lines 01063 TO_ROW *row, //row to do 01064 STATS *gap_stats, //blob gaps 01065 STATS *pitch_stats, //centre-centre stats 01066 float initial_pitch, //guess at pitch 01067 float min_space, //estimate space size 01068 BOOL8 ignore_outsize, //discard big objects 01069 BOOL8 split_outsize, //split big objects 01070 inT32 dm_gap //ignorable gaps 01071 ) { 01072 BOOL8 prev_valid; //not word broken 01073 BLOBNBOX *blob; //current blob 01074 //blobs 01075 BLOBNBOX_IT blob_it = row->blob_list (); 01076 inT32 prev_right; //end of prev blob 01077 inT32 prev_centre; //centre of previous blob 01078 inT32 x_centre; //centre of this blob 01079 inT32 blob_width; //width of blob 01080 inT32 width_units; //no of widths in blob 01081 float width; //blob width 01082 TBOX blob_box; //bounding box 01083 TBOX joined_box; //of super blob 01084 01085 gap_stats->clear (); 01086 pitch_stats->clear (); 01087 if (blob_it.empty ()) 01088 return FALSE; 01089 prev_valid = FALSE; 01090 prev_centre = 0; 01091 prev_right = 0; //stop complier warning 01092 joined_box = blob_it.data ()->bounding_box (); 01093 do { 01094 blob_it.forward (); 01095 blob = blob_it.data (); 01096 if (!blob->joined_to_prev ()) { 01097 blob_box = blob->bounding_box (); 01098 if ((blob_box.left () - joined_box.right () < dm_gap 01099 && !blob_it.at_first ()) 01100 || blob->cblob() == NULL) 01101 joined_box += blob_box; //merge blobs 01102 else { 01103 blob_width = joined_box.width (); 01104 if (split_outsize) { 01105 width_units = 01106 (inT32) floor ((float) blob_width / initial_pitch + 0.5); 01107 if (width_units < 1) 01108 width_units = 1; 01109 width_units--; 01110 } 01111 else if (ignore_outsize) { 01112 width = (float) blob_width / initial_pitch; 01113 width_units = width < 1 + words_default_fixed_limit 01114 && width > 1 - words_default_fixed_limit ? 0 : -1; 01115 } 01116 else 01117 width_units = 0; //everything in 01118 x_centre = (inT32) (joined_box.left () 01119 + (blob_width - 01120 width_units * initial_pitch) / 2); 01121 if (prev_valid && width_units >= 0) { 01122 // if (width_units>0) 01123 // { 01124 // tprintf("wu=%d, width=%d, xc=%d, adding %d\n", 01125 // width_units,blob_width,x_centre,x_centre-prev_centre); 01126 // } 01127 gap_stats->add (joined_box.left () - prev_right, 1); 01128 pitch_stats->add (x_centre - prev_centre, 1); 01129 } 01130 prev_centre = (inT32) (x_centre + width_units * initial_pitch); 01131 prev_right = joined_box.right (); 01132 prev_valid = blob_box.left () - joined_box.right () < min_space; 01133 prev_valid = prev_valid && width_units >= 0; 01134 joined_box = blob_box; 01135 } 01136 } 01137 } 01138 while (!blob_it.at_first ()); 01139 return gap_stats->get_total () >= 3; 01140 } 01141 01142 01143 /********************************************************************** 01144 * tune_row_pitch 01145 * 01146 * Use a dp algorithm to fit the character cells and return the sd of 01147 * the cell size over the row. 01148 **********************************************************************/ 01149 01150 float tune_row_pitch( //find fp cells 01151 TO_ROW *row, //row to do 01152 STATS *projection, //vertical projection 01153 inT16 projection_left, //edge of projection 01154 inT16 projection_right, //edge of projection 01155 float space_size, //size of blank 01156 float &initial_pitch, //guess at pitch 01157 float &best_sp_sd, //space sd 01158 inT16 &best_mid_cuts, //no of cheap cuts 01159 ICOORDELT_LIST *best_cells, //row cells 01160 BOOL8 testing_on //inidividual words 01161 ) { 01162 int pitch_delta; //offset pitch 01163 inT16 mid_cuts; //cheap cuts 01164 float pitch_sd; //current sd 01165 float best_sd; //best result 01166 float best_pitch; //pitch for best result 01167 float initial_sd; //starting error 01168 float sp_sd; //space sd 01169 ICOORDELT_LIST test_cells; //row cells 01170 ICOORDELT_IT best_it; //start of best list 01171 01172 if (textord_fast_pitch_test) 01173 return tune_row_pitch2 (row, projection, projection_left, 01174 projection_right, space_size, initial_pitch, 01175 best_sp_sd, 01176 //space sd 01177 best_mid_cuts, best_cells, testing_on); 01178 if (textord_disable_pitch_test) { 01179 best_sp_sd = initial_pitch; 01180 return initial_pitch; 01181 } 01182 initial_sd = 01183 compute_pitch_sd(row, 01184 projection, 01185 projection_left, 01186 projection_right, 01187 space_size, 01188 initial_pitch, 01189 best_sp_sd, 01190 best_mid_cuts, 01191 best_cells, 01192 testing_on); 01193 best_sd = initial_sd; 01194 best_pitch = initial_pitch; 01195 if (testing_on) 01196 tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd); 01197 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01198 pitch_sd = 01199 compute_pitch_sd (row, projection, projection_left, projection_right, 01200 space_size, initial_pitch + pitch_delta, sp_sd, 01201 mid_cuts, &test_cells, testing_on); 01202 if (testing_on) 01203 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, 01204 pitch_sd); 01205 if (pitch_sd < best_sd) { 01206 best_sd = pitch_sd; 01207 best_mid_cuts = mid_cuts; 01208 best_sp_sd = sp_sd; 01209 best_pitch = initial_pitch + pitch_delta; 01210 best_cells->clear (); 01211 best_it.set_to_list (best_cells); 01212 best_it.add_list_after (&test_cells); 01213 } 01214 else 01215 test_cells.clear (); 01216 if (pitch_sd > initial_sd) 01217 break; //getting worse 01218 } 01219 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01220 pitch_sd = 01221 compute_pitch_sd (row, projection, projection_left, projection_right, 01222 space_size, initial_pitch - pitch_delta, sp_sd, 01223 mid_cuts, &test_cells, testing_on); 01224 if (testing_on) 01225 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, 01226 pitch_sd); 01227 if (pitch_sd < best_sd) { 01228 best_sd = pitch_sd; 01229 best_mid_cuts = mid_cuts; 01230 best_sp_sd = sp_sd; 01231 best_pitch = initial_pitch - pitch_delta; 01232 best_cells->clear (); 01233 best_it.set_to_list (best_cells); 01234 best_it.add_list_after (&test_cells); 01235 } 01236 else 01237 test_cells.clear (); 01238 if (pitch_sd > initial_sd) 01239 break; 01240 } 01241 initial_pitch = best_pitch; 01242 01243 if (textord_debug_pitch_metric) 01244 print_pitch_sd(row, 01245 projection, 01246 projection_left, 01247 projection_right, 01248 space_size, 01249 best_pitch); 01250 01251 return best_sd; 01252 } 01253 01254 01255 /********************************************************************** 01256 * tune_row_pitch 01257 * 01258 * Use a dp algorithm to fit the character cells and return the sd of 01259 * the cell size over the row. 01260 **********************************************************************/ 01261 01262 float tune_row_pitch2( //find fp cells 01263 TO_ROW *row, //row to do 01264 STATS *projection, //vertical projection 01265 inT16 projection_left, //edge of projection 01266 inT16 projection_right, //edge of projection 01267 float space_size, //size of blank 01268 float &initial_pitch, //guess at pitch 01269 float &best_sp_sd, //space sd 01270 inT16 &best_mid_cuts, //no of cheap cuts 01271 ICOORDELT_LIST *best_cells, //row cells 01272 BOOL8 testing_on //inidividual words 01273 ) { 01274 int pitch_delta; //offset pitch 01275 inT16 pixel; //pixel coord 01276 inT16 best_pixel; //pixel coord 01277 inT16 best_delta; //best pitch 01278 inT16 best_pitch; //best pitch 01279 inT16 start; //of good range 01280 inT16 end; //of good range 01281 inT32 best_count; //lowest sum 01282 float best_sd; //best result 01283 STATS *sum_proj; //summed projection 01284 01285 best_sp_sd = initial_pitch; 01286 01287 if (textord_disable_pitch_test) { 01288 return initial_pitch; 01289 } 01290 sum_proj = new STATS[textord_pitch_range * 2 + 1]; 01291 if (sum_proj == NULL) 01292 return initial_pitch; 01293 best_pitch = (inT32) initial_pitch; 01294 01295 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01296 pitch_delta++) 01297 sum_proj[textord_pitch_range + pitch_delta].set_range (0, 01298 best_pitch + 01299 pitch_delta + 1); 01300 for (pixel = projection_left; pixel <= projection_right; pixel++) { 01301 for (pitch_delta = -textord_pitch_range; 01302 pitch_delta <= textord_pitch_range; pitch_delta++) 01303 sum_proj[textord_pitch_range + 01304 pitch_delta].add ((pixel - projection_left) % (best_pitch + 01305 pitch_delta), 01306 projection->pile_count (pixel)); 01307 } 01308 best_count = sum_proj[textord_pitch_range].pile_count (0); 01309 best_delta = 0; 01310 best_pixel = 0; 01311 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01312 pitch_delta++) { 01313 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) { 01314 if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel) 01315 < best_count) { 01316 best_count = 01317 sum_proj[textord_pitch_range + 01318 pitch_delta].pile_count (pixel); 01319 best_delta = pitch_delta; 01320 best_pixel = pixel; 01321 } 01322 } 01323 } 01324 if (testing_on) 01325 tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", 01326 initial_pitch, best_delta, best_count); 01327 best_pitch += best_delta; 01328 initial_pitch = best_pitch; 01329 best_count++; 01330 best_count += best_count; 01331 for (start = best_pixel - 2; start > best_pixel - best_pitch 01332 && sum_proj[textord_pitch_range + 01333 best_delta].pile_count (start % best_pitch) <= best_count; 01334 start--); 01335 for (end = best_pixel + 2; 01336 end < best_pixel + best_pitch 01337 && sum_proj[textord_pitch_range + 01338 best_delta].pile_count (end % best_pitch) <= best_count; 01339 end++); 01340 01341 best_sd = 01342 compute_pitch_sd(row, 01343 projection, 01344 projection_left, 01345 projection_right, 01346 space_size, 01347 initial_pitch, 01348 best_sp_sd, 01349 best_mid_cuts, 01350 best_cells, 01351 testing_on, 01352 start, 01353 end); 01354 if (testing_on) 01355 tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, 01356 best_sd); 01357 01358 if (textord_debug_pitch_metric) 01359 print_pitch_sd(row, 01360 projection, 01361 projection_left, 01362 projection_right, 01363 space_size, 01364 initial_pitch); 01365 01366 delete[]sum_proj; 01367 01368 return best_sd; 01369 } 01370 01371 01372 /********************************************************************** 01373 * compute_pitch_sd 01374 * 01375 * Use a dp algorithm to fit the character cells and return the sd of 01376 * the cell size over the row. 01377 **********************************************************************/ 01378 01379 float compute_pitch_sd( //find fp cells 01380 TO_ROW *row, //row to do 01381 STATS *projection, //vertical projection 01382 inT16 projection_left, //edge 01383 inT16 projection_right, //edge 01384 float space_size, //size of blank 01385 float initial_pitch, //guess at pitch 01386 float &sp_sd, //space sd 01387 inT16 &mid_cuts, //no of free cuts 01388 ICOORDELT_LIST *row_cells, //list of chop pts 01389 BOOL8 testing_on, //inidividual words 01390 inT16 start, //start of good range 01391 inT16 end //end of good range 01392 ) { 01393 inT16 occupation; //no of cells in word. 01394 //blobs 01395 BLOBNBOX_IT blob_it = row->blob_list (); 01396 BLOBNBOX_IT start_it; //start of word 01397 BLOBNBOX_IT plot_it; //for plotting 01398 inT16 blob_count; //no of blobs 01399 TBOX blob_box; //bounding box 01400 TBOX prev_box; //of super blob 01401 inT32 prev_right; //of word sync 01402 int scale_factor; //on scores for big words 01403 inT32 sp_count; //spaces 01404 FPSEGPT_LIST seg_list; //char cells 01405 FPSEGPT_IT seg_it; //iterator 01406 inT16 segpos; //position of segment 01407 inT16 cellpos; //previous cell boundary 01408 //iterator 01409 ICOORDELT_IT cell_it = row_cells; 01410 ICOORDELT *cell; //new cell 01411 double sqsum; //sum of squares 01412 double spsum; //of spaces 01413 double sp_var; //space error 01414 double word_sync; //result for word 01415 inT32 total_count; //total blobs 01416 01417 if ((pitsync_linear_version & 3) > 1) { 01418 word_sync = compute_pitch_sd2 (row, projection, projection_left, 01419 projection_right, initial_pitch, 01420 occupation, mid_cuts, row_cells, 01421 testing_on, start, end); 01422 sp_sd = occupation; 01423 return word_sync; 01424 } 01425 mid_cuts = 0; 01426 cellpos = 0; 01427 total_count = 0; 01428 sqsum = 0; 01429 sp_count = 0; 01430 spsum = 0; 01431 prev_right = -1; 01432 if (blob_it.empty ()) 01433 return space_size * 10; 01434 #ifndef GRAPHICS_DISABLED 01435 if (testing_on && to_win > 0) { 01436 blob_box = blob_it.data ()->bounding_box (); 01437 projection->plot (to_win, projection_left, 01438 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01439 } 01440 #endif 01441 start_it = blob_it; 01442 blob_count = 0; 01443 blob_box = box_next (&blob_it);//first blob 01444 blob_it.mark_cycle_pt (); 01445 do { 01446 for (; blob_count > 0; blob_count--) 01447 box_next(&start_it); 01448 do { 01449 prev_box = blob_box; 01450 blob_count++; 01451 blob_box = box_next (&blob_it); 01452 } 01453 while (!blob_it.cycled_list () 01454 && blob_box.left () - prev_box.right () < space_size); 01455 plot_it = start_it; 01456 if (pitsync_linear_version & 3) 01457 word_sync = 01458 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01459 projection, projection_left, projection_right, 01460 row->xheight * textord_projection_scale, 01461 occupation, &seg_list, start, end); 01462 else 01463 word_sync = 01464 check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2, 01465 projection, &seg_list); 01466 if (testing_on) { 01467 tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ", 01468 prev_box.right (), prev_box.top (), 01469 seg_list.length () - 1, word_sync); 01470 seg_it.set_to_list (&seg_list); 01471 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); 01472 seg_it.forward ()) { 01473 if (seg_it.data ()->faked) 01474 tprintf ("(F)"); 01475 tprintf ("%d, ", seg_it.data ()->position ()); 01476 // tprintf("C=%g, s=%g, sq=%g\n", 01477 // seg_it.data()->cost_function(), 01478 // seg_it.data()->sum(), 01479 // seg_it.data()->squares()); 01480 } 01481 tprintf ("\n"); 01482 } 01483 #ifndef GRAPHICS_DISABLED 01484 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) 01485 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01486 #endif 01487 seg_it.set_to_list (&seg_list); 01488 if (prev_right >= 0) { 01489 sp_var = seg_it.data ()->position () - prev_right; 01490 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01491 sp_var *= sp_var; 01492 spsum += sp_var; 01493 sp_count++; 01494 } 01495 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01496 segpos = seg_it.data ()->position (); 01497 if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) { 01498 //big gap 01499 while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) { 01500 cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0); 01501 cell_it.add_after_then_move (cell); 01502 cellpos += (inT16) initial_pitch; 01503 } 01504 //make new one 01505 cell = new ICOORDELT (segpos, 0); 01506 cell_it.add_after_then_move (cell); 01507 cellpos = segpos; 01508 } 01509 else if (segpos > cellpos - initial_pitch / 2) { 01510 cell = cell_it.data (); 01511 //average positions 01512 cell->set_x ((cellpos + segpos) / 2); 01513 cellpos = cell->x (); 01514 } 01515 } 01516 seg_it.move_to_last (); 01517 prev_right = seg_it.data ()->position (); 01518 if (textord_pitch_scalebigwords) { 01519 scale_factor = (seg_list.length () - 2) / 2; 01520 if (scale_factor < 1) 01521 scale_factor = 1; 01522 } 01523 else 01524 scale_factor = 1; 01525 sqsum += word_sync * scale_factor; 01526 total_count += (seg_list.length () - 1) * scale_factor; 01527 seg_list.clear (); 01528 } 01529 while (!blob_it.cycled_list ()); 01530 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01531 return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01532 } 01533 01534 01535 /********************************************************************** 01536 * compute_pitch_sd2 01537 * 01538 * Use a dp algorithm to fit the character cells and return the sd of 01539 * the cell size over the row. 01540 **********************************************************************/ 01541 01542 float compute_pitch_sd2( //find fp cells 01543 TO_ROW *row, //row to do 01544 STATS *projection, //vertical projection 01545 inT16 projection_left, //edge 01546 inT16 projection_right, //edge 01547 float initial_pitch, //guess at pitch 01548 inT16 &occupation, //no of occupied cells 01549 inT16 &mid_cuts, //no of free cuts 01550 ICOORDELT_LIST *row_cells, //list of chop pts 01551 BOOL8 testing_on, //inidividual words 01552 inT16 start, //start of good range 01553 inT16 end //end of good range 01554 ) { 01555 //blobs 01556 BLOBNBOX_IT blob_it = row->blob_list (); 01557 BLOBNBOX_IT plot_it; 01558 inT16 blob_count; //no of blobs 01559 TBOX blob_box; //bounding box 01560 FPSEGPT_LIST seg_list; //char cells 01561 FPSEGPT_IT seg_it; //iterator 01562 inT16 segpos; //position of segment 01563 //iterator 01564 ICOORDELT_IT cell_it = row_cells; 01565 ICOORDELT *cell; //new cell 01566 double word_sync; //result for word 01567 01568 mid_cuts = 0; 01569 if (blob_it.empty ()) { 01570 occupation = 0; 01571 return initial_pitch * 10; 01572 } 01573 #ifndef GRAPHICS_DISABLED 01574 if (testing_on && to_win > 0) { 01575 projection->plot (to_win, projection_left, 01576 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01577 } 01578 #endif 01579 blob_count = 0; 01580 blob_it.mark_cycle_pt (); 01581 do { 01582 //first blob 01583 blob_box = box_next (&blob_it); 01584 blob_count++; 01585 } 01586 while (!blob_it.cycled_list ()); 01587 plot_it = blob_it; 01588 word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch, 01589 2, projection, projection_left, 01590 projection_right, 01591 row->xheight * textord_projection_scale, 01592 occupation, &seg_list, start, end); 01593 if (testing_on) { 01594 tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ", 01595 blob_box.right (), blob_box.top (), 01596 seg_list.length () - 1, word_sync); 01597 seg_it.set_to_list (&seg_list); 01598 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01599 if (seg_it.data ()->faked) 01600 tprintf ("(F)"); 01601 tprintf ("%d, ", seg_it.data ()->position ()); 01602 // tprintf("C=%g, s=%g, sq=%g\n", 01603 // seg_it.data()->cost_function(), 01604 // seg_it.data()->sum(), 01605 // seg_it.data()->squares()); 01606 } 01607 tprintf ("\n"); 01608 } 01609 #ifndef GRAPHICS_DISABLED 01610 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) 01611 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01612 #endif 01613 seg_it.set_to_list (&seg_list); 01614 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01615 segpos = seg_it.data ()->position (); 01616 //make new one 01617 cell = new ICOORDELT (segpos, 0); 01618 cell_it.add_after_then_move (cell); 01619 if (seg_it.at_last ()) 01620 mid_cuts = seg_it.data ()->cheap_cuts (); 01621 } 01622 seg_list.clear (); 01623 return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10; 01624 } 01625 01626 01627 /********************************************************************** 01628 * print_pitch_sd 01629 * 01630 * Use a dp algorithm to fit the character cells and return the sd of 01631 * the cell size over the row. 01632 **********************************************************************/ 01633 01634 void print_pitch_sd( //find fp cells 01635 TO_ROW *row, //row to do 01636 STATS *projection, //vertical projection 01637 inT16 projection_left, //edges //size of blank 01638 inT16 projection_right, 01639 float space_size, 01640 float initial_pitch //guess at pitch 01641 ) { 01642 const char *res2; //pitch result 01643 inT16 occupation; //used cells 01644 float sp_sd; //space sd 01645 //blobs 01646 BLOBNBOX_IT blob_it = row->blob_list (); 01647 BLOBNBOX_IT start_it; //start of word 01648 BLOBNBOX_IT row_start; //start of row 01649 inT16 blob_count; //no of blobs 01650 inT16 total_blob_count; //total blobs in line 01651 TBOX blob_box; //bounding box 01652 TBOX prev_box; //of super blob 01653 inT32 prev_right; //of word sync 01654 int scale_factor; //on scores for big words 01655 inT32 sp_count; //spaces 01656 FPSEGPT_LIST seg_list; //char cells 01657 FPSEGPT_IT seg_it; //iterator 01658 double sqsum; //sum of squares 01659 double spsum; //of spaces 01660 double sp_var; //space error 01661 double word_sync; //result for word 01662 double total_count; //total cuts 01663 01664 if (blob_it.empty ()) 01665 return; 01666 row_start = blob_it; 01667 total_blob_count = 0; 01668 01669 total_count = 0; 01670 sqsum = 0; 01671 sp_count = 0; 01672 spsum = 0; 01673 prev_right = -1; 01674 blob_it = row_start; 01675 start_it = blob_it; 01676 blob_count = 0; 01677 blob_box = box_next (&blob_it);//first blob 01678 blob_it.mark_cycle_pt (); 01679 do { 01680 for (; blob_count > 0; blob_count--) 01681 box_next(&start_it); 01682 do { 01683 prev_box = blob_box; 01684 blob_count++; 01685 blob_box = box_next (&blob_it); 01686 } 01687 while (!blob_it.cycled_list () 01688 && blob_box.left () - prev_box.right () < space_size); 01689 word_sync = 01690 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01691 projection, projection_left, projection_right, 01692 row->xheight * textord_projection_scale, 01693 occupation, &seg_list, 0, 0); 01694 total_blob_count += blob_count; 01695 seg_it.set_to_list (&seg_list); 01696 if (prev_right >= 0) { 01697 sp_var = seg_it.data ()->position () - prev_right; 01698 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01699 sp_var *= sp_var; 01700 spsum += sp_var; 01701 sp_count++; 01702 } 01703 seg_it.move_to_last (); 01704 prev_right = seg_it.data ()->position (); 01705 if (textord_pitch_scalebigwords) { 01706 scale_factor = (seg_list.length () - 2) / 2; 01707 if (scale_factor < 1) 01708 scale_factor = 1; 01709 } 01710 else 01711 scale_factor = 1; 01712 sqsum += word_sync * scale_factor; 01713 total_count += (seg_list.length () - 1) * scale_factor; 01714 seg_list.clear (); 01715 } 01716 while (!blob_it.cycled_list ()); 01717 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01718 word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01719 tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", 01720 word_sync, word_sync / initial_pitch, sp_sd, 01721 word_sync < textord_words_pitchsd_threshold * initial_pitch 01722 ? 'F' : 'P'); 01723 01724 start_it = row_start; 01725 blob_it = row_start; 01726 word_sync = 01727 check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2, 01728 projection, projection_left, projection_right, 01729 row->xheight * textord_projection_scale, occupation, 01730 &seg_list, 0, 0); 01731 if (occupation > 1) 01732 word_sync /= occupation; 01733 word_sync = sqrt (word_sync); 01734 01735 #ifndef GRAPHICS_DISABLED 01736 if (textord_show_row_cuts && to_win != NULL) 01737 plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list); 01738 #endif 01739 seg_list.clear (); 01740 if (word_sync < textord_words_pitchsd_threshold * initial_pitch) { 01741 if (word_sync < textord_words_def_fixed * initial_pitch 01742 && !row->all_caps) 01743 res2 = "DF"; 01744 else 01745 res2 = "MF"; 01746 } 01747 else 01748 res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP"; 01749 tprintf 01750 ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n", 01751 word_sync, word_sync / initial_pitch, 01752 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', 01753 occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps); 01754 } 01755 01756 /********************************************************************** 01757 * find_repeated_chars 01758 * 01759 * Extract marked leader blobs and put them 01760 * into words in advance of fixed pitch checking and word generation. 01761 **********************************************************************/ 01762 void find_repeated_chars(TO_BLOCK *block, // Block to search. 01763 BOOL8 testing_on) { // Debug mode. 01764 POLY_BLOCK* pb = block->block->poly_block(); 01765 if (pb != NULL && !pb->IsText()) 01766 return; // Don't find repeated chars in non-text blocks. 01767 01768 TO_ROW *row; 01769 BLOBNBOX_IT box_it; 01770 BLOBNBOX_IT search_it; // forward search 01771 WERD_IT word_it; // new words 01772 WERD *word; // new word 01773 TBOX word_box; // for plotting 01774 int blobcount, repeated_set; 01775 01776 TO_ROW_IT row_it = block->get_rows(); 01777 if (row_it.empty()) return; // empty block 01778 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 01779 row = row_it.data(); 01780 box_it.set_to_list(row->blob_list()); 01781 if (box_it.empty()) continue; // no blobs in this row 01782 if (!row->rep_chars_marked()) { 01783 mark_repeated_chars(row); 01784 } 01785 if (row->num_repeated_sets() == 0) continue; // nothing to do for this row 01786 word_it.set_to_list(&row->rep_words); 01787 do { 01788 if (box_it.data()->repeated_set() != 0 && 01789 !box_it.data()->joined_to_prev()) { 01790 blobcount = 1; 01791 repeated_set = box_it.data()->repeated_set(); 01792 search_it = box_it; 01793 search_it.forward(); 01794 while (!search_it.at_first() && 01795 search_it.data()->repeated_set() == repeated_set) { 01796 blobcount++; 01797 search_it.forward(); 01798 } 01799 // After the call to make_real_word() all the blobs from this 01800 // repeated set will be removed from the blob list. box_it will be 01801 // set to point to the blob after the end of the extracted sequence. 01802 word = make_real_word(&box_it, blobcount, box_it.at_first(), 1); 01803 if (!box_it.empty() && box_it.data()->joined_to_prev()) { 01804 tprintf("Bad box joined to prev at"); 01805 box_it.data()->bounding_box().print(); 01806 tprintf("After repeated word:"); 01807 word->bounding_box().print(); 01808 } 01809 ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev()); 01810 word->set_flag(W_REP_CHAR, true); 01811 word->set_flag(W_DONT_CHOP, true); 01812 word_it.add_after_then_move(word); 01813 } else { 01814 box_it.forward(); 01815 } 01816 } while (!box_it.at_first()); 01817 } 01818 } 01819 01820 01821 /********************************************************************** 01822 * plot_fp_word 01823 * 01824 * Plot a block of words as if fixed pitch. 01825 **********************************************************************/ 01826 01827 #ifndef GRAPHICS_DISABLED 01828 void plot_fp_word( //draw block of words 01829 TO_BLOCK *block, //block to draw 01830 float pitch, //pitch to draw with 01831 float nonspace //for space threshold 01832 ) { 01833 TO_ROW *row; //current row 01834 TO_ROW_IT row_it = block->get_rows (); 01835 01836 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 01837 row = row_it.data (); 01838 row->min_space = (inT32) ((pitch + nonspace) / 2); 01839 row->max_nonspace = row->min_space; 01840 row->space_threshold = row->min_space; 01841 plot_word_decisions (to_win, (inT16) pitch, row); 01842 } 01843 } 01844 #endif