Tesseract
3.02
|
00001 /********************************************************************** 00002 * tospace.cpp 00003 * 00004 * Compute fuzzy word spacing thresholds for each row. 00005 * I.e. set : max_nonspace 00006 * space_threshold 00007 * min_space 00008 * kern_size 00009 * space_size 00010 * for each row. 00011 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE 00012 * 00013 * Note: functions in this file were originally not members of any 00014 * class or enclosed by any namespace. Now they are all static members 00015 * of the Textord class. 00016 * 00017 **********************************************************************/ 00018 00019 #include "textord.h" 00020 #include "mfcpch.h" 00021 #include "tovars.h" 00022 #include "drawtord.h" 00023 #include "textord.h" 00024 #include "ndminx.h" 00025 #include "statistc.h" 00026 00027 // Include automatically generated configuration file if running autoconf. 00028 #ifdef HAVE_CONFIG_H 00029 #include "config_auto.h" 00030 #endif 00031 00032 #define MAXSPACING 128 /*max expected spacing in pix */ 00033 00034 namespace tesseract { 00035 void Textord::to_spacing( 00036 ICOORD page_tr, //topright of page 00037 TO_BLOCK_LIST *blocks //blocks on page 00038 ) { 00039 TO_BLOCK_IT block_it; //iterator 00040 TO_BLOCK *block; //current block; 00041 TO_ROW_IT row_it; //row iterator 00042 TO_ROW *row; //current row 00043 int block_index; //block number 00044 int row_index; //row number 00045 //estimated width of real spaces for whole block 00046 inT16 block_space_gap_width; 00047 //estimated width of non space gaps for whole block 00048 inT16 block_non_space_gap_width; 00049 BOOL8 old_text_ord_proportional;//old fixed/prop result 00050 GAPMAP *gapmap = NULL; //map of big vert gaps in blk 00051 00052 block_it.set_to_list (blocks); 00053 block_index = 1; 00054 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00055 block_it.forward ()) { 00056 block = block_it.data (); 00057 gapmap = new GAPMAP (block); 00058 block_spacing_stats(block, 00059 gapmap, 00060 old_text_ord_proportional, 00061 block_space_gap_width, 00062 block_non_space_gap_width); 00063 // Make sure relative values of block-level space and non-space gap 00064 // widths are reasonable. The ratio of 1:3 is also used in 00065 // block_spacing_stats, to corrrect the block_space_gap_width 00066 // Useful for arabic and hindi, when the non-space gap width is 00067 // often over-estimated and should not be trusted. A similar ratio 00068 // is found in block_spacing_stats. 00069 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && 00070 (float) block_space_gap_width / block_non_space_gap_width < 3.0) { 00071 block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0); 00072 } 00073 row_it.set_to_list (block->get_rows ()); 00074 row_index = 1; 00075 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00076 row = row_it.data (); 00077 if ((row->pitch_decision == PITCH_DEF_PROP) || 00078 (row->pitch_decision == PITCH_CORR_PROP)) { 00079 if ((tosp_debug_level > 0) && !old_text_ord_proportional) 00080 tprintf ("Block %d Row %d: Now Proportional\n", 00081 block_index, row_index); 00082 row_spacing_stats(row, 00083 gapmap, 00084 block_index, 00085 row_index, 00086 block_space_gap_width, 00087 block_non_space_gap_width); 00088 } 00089 else { 00090 if ((tosp_debug_level > 0) && old_text_ord_proportional) 00091 tprintf 00092 ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", 00093 block_index, row_index, row->pitch_decision, 00094 row->fixed_pitch); 00095 } 00096 #ifndef GRAPHICS_DISABLED 00097 if (textord_show_initial_words) 00098 plot_word_decisions (to_win, (inT16) row->fixed_pitch, row); 00099 #endif 00100 row_index++; 00101 } 00102 delete gapmap; 00103 block_index++; 00104 } 00105 } 00106 00107 00108 /************************************************************************* 00109 * block_spacing_stats() 00110 *************************************************************************/ 00111 00112 void Textord::block_spacing_stats( 00113 TO_BLOCK *block, 00114 GAPMAP *gapmap, 00115 BOOL8 &old_text_ord_proportional, 00116 inT16 &block_space_gap_width, //resulting estimate 00117 inT16 &block_non_space_gap_width //resulting estimate 00118 ) { 00119 TO_ROW_IT row_it; //row iterator 00120 TO_ROW *row; //current row 00121 BLOBNBOX_IT blob_it; //iterator 00122 00123 STATS centre_to_centre_stats (0, MAXSPACING); 00124 //DEBUG USE ONLY 00125 STATS all_gap_stats (0, MAXSPACING); 00126 STATS space_gap_stats (0, MAXSPACING); 00127 inT16 minwidth = MAX_INT16; //narrowest blob 00128 TBOX blob_box; 00129 TBOX prev_blob_box; 00130 inT16 centre_to_centre; 00131 inT16 gap_width; 00132 float real_space_threshold; 00133 float iqr_centre_to_centre; //DEBUG USE ONLY 00134 float iqr_all_gap_stats; //DEBUG USE ONLY 00135 inT32 end_of_row; 00136 inT32 row_length; 00137 00138 row_it.set_to_list (block->get_rows ()); 00139 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00140 row = row_it.data (); 00141 if (!row->blob_list ()->empty () && 00142 (!tosp_only_use_prop_rows || 00143 (row->pitch_decision == PITCH_DEF_PROP) || 00144 (row->pitch_decision == PITCH_CORR_PROP))) { 00145 blob_it.set_to_list (row->blob_list ()); 00146 blob_it.mark_cycle_pt (); 00147 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00148 if (tosp_use_pre_chopping) 00149 blob_box = box_next_pre_chopped (&blob_it); 00150 else if (tosp_stats_use_xht_gaps) 00151 blob_box = reduced_box_next (row, &blob_it); 00152 else 00153 blob_box = box_next (&blob_it); 00154 row_length = end_of_row - blob_box.left (); 00155 if (blob_box.width () < minwidth) 00156 minwidth = blob_box.width (); 00157 prev_blob_box = blob_box; 00158 while (!blob_it.cycled_list ()) { 00159 if (tosp_use_pre_chopping) 00160 blob_box = box_next_pre_chopped (&blob_it); 00161 else if (tosp_stats_use_xht_gaps) 00162 blob_box = reduced_box_next (row, &blob_it); 00163 else 00164 blob_box = box_next (&blob_it); 00165 if (blob_box.width () < minwidth) 00166 minwidth = blob_box.width (); 00167 gap_width = blob_box.left () - prev_blob_box.right (); 00168 if (!ignore_big_gap (row, row_length, gapmap, 00169 prev_blob_box.right (), blob_box.left ())) { 00170 all_gap_stats.add (gap_width, 1); 00171 00172 centre_to_centre = (blob_box.left () + blob_box.right () - 00173 (prev_blob_box.left () + 00174 prev_blob_box.right ())) / 2; 00175 //DEBUG 00176 centre_to_centre_stats.add (centre_to_centre, 1); 00177 // DEBUG 00178 } 00179 prev_blob_box = blob_box; 00180 } 00181 } 00182 } 00183 00184 //Inadequate samples 00185 if (all_gap_stats.get_total () <= 1) { 00186 block_non_space_gap_width = minwidth; 00187 block_space_gap_width = -1; //No est. space width 00188 //DEBUG 00189 old_text_ord_proportional = TRUE; 00190 } 00191 else { 00192 /* For debug only ..... */ 00193 iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) - 00194 centre_to_centre_stats.ile (0.25); 00195 iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25); 00196 old_text_ord_proportional = 00197 iqr_centre_to_centre * 2 > iqr_all_gap_stats; 00198 /* .......For debug only */ 00199 00200 /* 00201 The median of the gaps is used as an estimate of the NON-SPACE gap width. 00202 This RELIES on the assumption that there are more gaps WITHIN words than 00203 BETWEEN words in a block 00204 00205 Now try to estimate the width of a real space for all real spaces in the 00206 block. Do this by using a crude threshold to ignore "narrow" gaps, then 00207 find the median of the "wide" gaps and use this. 00208 */ 00209 block_non_space_gap_width = (inT16) floor (all_gap_stats.median ()); 00210 // median gap 00211 00212 row_it.set_to_list (block->get_rows ()); 00213 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00214 row = row_it.data (); 00215 if (!row->blob_list ()->empty () && 00216 (!tosp_only_use_prop_rows || 00217 (row->pitch_decision == PITCH_DEF_PROP) || 00218 (row->pitch_decision == PITCH_CORR_PROP))) { 00219 real_space_threshold = 00220 MAX (tosp_init_guess_kn_mult * block_non_space_gap_width, 00221 tosp_init_guess_xht_mult * row->xheight); 00222 blob_it.set_to_list (row->blob_list ()); 00223 blob_it.mark_cycle_pt (); 00224 end_of_row = 00225 blob_it.data_relative (-1)->bounding_box ().right (); 00226 if (tosp_use_pre_chopping) 00227 blob_box = box_next_pre_chopped (&blob_it); 00228 else if (tosp_stats_use_xht_gaps) 00229 blob_box = reduced_box_next (row, &blob_it); 00230 else 00231 blob_box = box_next (&blob_it); 00232 row_length = blob_box.left () - end_of_row; 00233 prev_blob_box = blob_box; 00234 while (!blob_it.cycled_list ()) { 00235 if (tosp_use_pre_chopping) 00236 blob_box = box_next_pre_chopped (&blob_it); 00237 else if (tosp_stats_use_xht_gaps) 00238 blob_box = reduced_box_next (row, &blob_it); 00239 else 00240 blob_box = box_next (&blob_it); 00241 gap_width = blob_box.left () - prev_blob_box.right (); 00242 if ((gap_width > real_space_threshold) && 00243 !ignore_big_gap (row, row_length, gapmap, 00244 prev_blob_box.right (), 00245 blob_box.left ())) { 00246 /* 00247 If tosp_use_cert_spaces is enabled, the estimate of the space gap is 00248 restricted to obvious spaces - those wider than half the xht or those 00249 with wide blobs on both sides - i.e not things that are suspect 1's or 00250 punctuation that is sometimes widely spaced. 00251 */ 00252 if (!tosp_block_use_cert_spaces || 00253 (gap_width > 00254 tosp_fuzzy_space_factor2 * row->xheight) 00255 || 00256 ((gap_width > 00257 tosp_fuzzy_space_factor1 * row->xheight) 00258 && (!tosp_narrow_blobs_not_cert 00259 || (!narrow_blob (row, prev_blob_box) 00260 && !narrow_blob (row, blob_box)))) 00261 || (wide_blob (row, prev_blob_box) 00262 && wide_blob (row, blob_box))) 00263 space_gap_stats.add (gap_width, 1); 00264 } 00265 prev_blob_box = blob_box; 00266 } 00267 } 00268 } 00269 //Inadequate samples 00270 if (space_gap_stats.get_total () <= 2) 00271 block_space_gap_width = -1;//No est. space width 00272 else 00273 block_space_gap_width = 00274 MAX ((inT16) floor (space_gap_stats.median ()), 00275 3 * block_non_space_gap_width); 00276 } 00277 } 00278 00279 00280 /************************************************************************* 00281 * row_spacing_stats() 00282 * Set values for min_space, max_non_space based on row stats only 00283 * If failure - return 0 values. 00284 *************************************************************************/ 00285 void Textord::row_spacing_stats( 00286 TO_ROW *row, 00287 GAPMAP *gapmap, 00288 inT16 block_idx, 00289 inT16 row_idx, 00290 inT16 block_space_gap_width, //estimate for block 00291 inT16 block_non_space_gap_width //estimate for block 00292 ) { 00293 //iterator 00294 BLOBNBOX_IT blob_it = row->blob_list (); 00295 STATS all_gap_stats (0, MAXSPACING); 00296 STATS cert_space_gap_stats (0, MAXSPACING); 00297 STATS all_space_gap_stats (0, MAXSPACING); 00298 STATS small_gap_stats (0, MAXSPACING); 00299 TBOX blob_box; 00300 TBOX prev_blob_box; 00301 inT16 gap_width; 00302 inT16 real_space_threshold = 0; 00303 inT16 max = 0; 00304 inT16 index; 00305 inT16 large_gap_count = 0; 00306 BOOL8 suspected_table; 00307 inT32 max_max_nonspace; //upper bound 00308 BOOL8 good_block_space_estimate = block_space_gap_width > 0; 00309 inT32 end_of_row; 00310 inT32 row_length = 0; 00311 float sane_space; 00312 inT32 sane_threshold; 00313 00314 /* Collect first pass stats for row */ 00315 00316 if (!good_block_space_estimate) 00317 block_space_gap_width = inT16 (floor (row->xheight / 2)); 00318 if (!row->blob_list ()->empty ()) { 00319 if (tosp_threshold_bias1 > 0) 00320 real_space_threshold = 00321 block_non_space_gap_width + 00322 inT16 (floor (0.5 + 00323 tosp_threshold_bias1 * (block_space_gap_width - 00324 block_non_space_gap_width))); 00325 else 00326 real_space_threshold = //Old TO method 00327 (block_space_gap_width + block_non_space_gap_width) / 2; 00328 blob_it.set_to_list (row->blob_list ()); 00329 blob_it.mark_cycle_pt (); 00330 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00331 if (tosp_use_pre_chopping) 00332 blob_box = box_next_pre_chopped (&blob_it); 00333 else if (tosp_stats_use_xht_gaps) 00334 blob_box = reduced_box_next (row, &blob_it); 00335 else 00336 blob_box = box_next (&blob_it); 00337 row_length = end_of_row - blob_box.left (); 00338 prev_blob_box = blob_box; 00339 while (!blob_it.cycled_list ()) { 00340 if (tosp_use_pre_chopping) 00341 blob_box = box_next_pre_chopped (&blob_it); 00342 else if (tosp_stats_use_xht_gaps) 00343 blob_box = reduced_box_next (row, &blob_it); 00344 else 00345 blob_box = box_next (&blob_it); 00346 gap_width = blob_box.left () - prev_blob_box.right (); 00347 if (ignore_big_gap (row, row_length, gapmap, 00348 prev_blob_box.right (), blob_box.left ())) 00349 large_gap_count++; 00350 else { 00351 if (gap_width >= real_space_threshold) { 00352 if (!tosp_row_use_cert_spaces || 00353 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || 00354 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) 00355 && (!tosp_narrow_blobs_not_cert 00356 || (!narrow_blob (row, prev_blob_box) 00357 && !narrow_blob (row, blob_box)))) 00358 || (wide_blob (row, prev_blob_box) 00359 && wide_blob (row, blob_box))) 00360 cert_space_gap_stats.add (gap_width, 1); 00361 all_space_gap_stats.add (gap_width, 1); 00362 } 00363 else 00364 small_gap_stats.add (gap_width, 1); 00365 all_gap_stats.add (gap_width, 1); 00366 } 00367 prev_blob_box = blob_box; 00368 } 00369 } 00370 suspected_table = (large_gap_count > 1) || 00371 ((large_gap_count > 0) && 00372 (all_gap_stats.get_total () <= tosp_few_samples)); 00373 00374 /* Now determine row kern size, space size and threshold */ 00375 00376 if ((cert_space_gap_stats.get_total () >= 00377 tosp_enough_space_samples_for_median) || 00378 ((suspected_table || 00379 all_gap_stats.get_total () <= tosp_short_row) && 00380 cert_space_gap_stats.get_total () > 0)) { 00381 old_to_method(row, 00382 &all_gap_stats, 00383 &cert_space_gap_stats, 00384 &small_gap_stats, 00385 block_space_gap_width, 00386 block_non_space_gap_width); 00387 } else { 00388 if (!tosp_recovery_isolated_row_stats || 00389 !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table, 00390 block_idx, row_idx)) { 00391 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) 00392 tprintf ("B:%d R:%d -- Inadequate certain spaces.\n", 00393 block_idx, row_idx); 00394 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { 00395 //Use block default 00396 row->space_size = block_space_gap_width; 00397 if (all_gap_stats.get_total () > tosp_redo_kern_limit) 00398 row->kern_size = all_gap_stats.median (); 00399 else 00400 row->kern_size = block_non_space_gap_width; 00401 row->space_threshold = 00402 inT32 (floor ((row->space_size + row->kern_size) / 00403 tosp_old_sp_kn_th_factor)); 00404 } 00405 else 00406 old_to_method(row, 00407 &all_gap_stats, 00408 &all_space_gap_stats, 00409 &small_gap_stats, 00410 block_space_gap_width, 00411 block_non_space_gap_width); 00412 } 00413 } 00414 00415 if (tosp_improve_thresh && !suspected_table) 00416 improve_row_threshold(row, &all_gap_stats); 00417 00418 /* Now lets try to be careful not to do anything silly with tables when we 00419 are ignoring big gaps*/ 00420 if (tosp_sanity_method == 0) { 00421 if (suspected_table && 00422 (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { 00423 if (tosp_debug_level > 5) 00424 tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n", 00425 block_idx, row_idx, 00426 row->kern_size, row->space_threshold, row->space_size); 00427 row->space_threshold = 00428 (inT32) (tosp_table_kn_sp_ratio * row->kern_size); 00429 row->space_size = MAX (row->space_threshold + 1, row->xheight); 00430 } 00431 } 00432 else if (tosp_sanity_method == 1) { 00433 sane_space = row->space_size; 00434 /* NEVER let space size get too close to kern size */ 00435 if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) 00436 || ((row->space_size - row->kern_size) < 00437 (tosp_silly_kn_sp_gap * row->xheight))) { 00438 if (good_block_space_estimate && 00439 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) 00440 sane_space = block_space_gap_width; 00441 else 00442 sane_space = 00443 MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5), 00444 row->xheight / 2); 00445 if (tosp_debug_level > 5) 00446 tprintf 00447 ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", 00448 block_idx, row_idx, row->kern_size, row->space_threshold, 00449 row->space_size, sane_space); 00450 row->space_size = sane_space; 00451 row->space_threshold = 00452 inT32 (floor ((row->space_size + row->kern_size) / 00453 tosp_old_sp_kn_th_factor)); 00454 } 00455 /* NEVER let threshold get VERY far away from kern */ 00456 sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh * 00457 MAX (row->kern_size, 2.5))); 00458 if (row->space_threshold > sane_threshold) { 00459 if (tosp_debug_level > 5) 00460 tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n", 00461 block_idx, row_idx, 00462 row->kern_size, 00463 row->space_threshold, row->space_size, sane_threshold); 00464 row->space_threshold = sane_threshold; 00465 if (row->space_size <= sane_threshold) 00466 row->space_size = row->space_threshold + 1.0f; 00467 } 00468 /* Beware of tables - there may be NO spaces */ 00469 if (suspected_table) { 00470 sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size, 00471 tosp_table_xht_sp_ratio * row->xheight); 00472 sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2)); 00473 00474 if ((row->space_size < sane_space) || 00475 (row->space_threshold < sane_threshold)) { 00476 if (tosp_debug_level > 5) 00477 tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", 00478 block_idx, row_idx, 00479 row->kern_size, 00480 row->space_threshold, row->space_size); 00481 //the minimum sane value 00482 row->space_threshold = (inT32) sane_space; 00483 row->space_size = MAX (row->space_threshold + 1, row->xheight); 00484 } 00485 } 00486 } 00487 00488 /* Now lets try to put some error limits on the threshold */ 00489 00490 if (tosp_old_to_method) { 00491 /* Old textord made a space if gap >= threshold */ 00492 //NO FUZZY SPACES YET 00493 row->max_nonspace = row->space_threshold; 00494 //NO FUZZY SPACES YET 00495 row->min_space = row->space_threshold + 1; 00496 } 00497 else { 00498 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ 00499 row->min_space = 00500 MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)), 00501 inT32 (row->space_size)); 00502 if (row->min_space <= row->space_threshold) 00503 //Dont be silly 00504 row->min_space = row->space_threshold + 1; 00505 /* 00506 Lets try to guess the max certain kern gap by looking at the cluster of 00507 kerns for the row. The row is proportional so the kerns should cluster 00508 tightly at the bottom of the distribution. We also expect most gaps to be 00509 kerns. Find the maximum of the kern piles between 0 and twice the kern 00510 estimate. Piles before the first one with less than 1/10 the maximum 00511 number of samples can be taken as certain kerns. 00512 00513 Of course, there are some cases where the kern peak and space peaks merge, 00514 so we will put an UPPER limit on the max certain kern gap of some fraction 00515 below the threshold. 00516 */ 00517 00518 max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2); 00519 00520 //default 00521 row->max_nonspace = max_max_nonspace; 00522 for (index = 0; index <= max_max_nonspace; index++) { 00523 if (all_gap_stats.pile_count (index) > max) 00524 max = all_gap_stats.pile_count (index); 00525 if ((index > row->kern_size) && 00526 (all_gap_stats.pile_count (index) < 0.1 * max)) { 00527 row->max_nonspace = index; 00528 break; 00529 } 00530 } 00531 } 00532 00533 /* Yet another algorithm - simpler this time - just choose a fraction of the 00534 threshold to space range */ 00535 00536 if ((tosp_fuzzy_sp_fraction > 0) && 00537 (row->space_size > row->space_threshold)) 00538 row->min_space = MAX (row->min_space, 00539 (inT32) ceil (row->space_threshold + 00540 tosp_fuzzy_sp_fraction * 00541 (row->space_size - 00542 row->space_threshold))); 00543 00544 /* Ensure that ANY space less than some multiplier times the kern size is 00545 fuzzy. In tables there is a risk of erroneously setting a small space size 00546 when there are no real spaces. Sometimes tables have text squashed into 00547 columns so that the kn->sp ratio is small anyway - this means that we cant 00548 use this to force a wider separation - hence we rely on context to join any 00549 dubious breaks. */ 00550 00551 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && 00552 (suspected_table || tosp_fuzzy_limit_all)) 00553 row->min_space = MAX (row->min_space, 00554 (inT32) ceil (tosp_table_fuzzy_kn_sp_ratio * 00555 row->kern_size)); 00556 00557 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { 00558 row->max_nonspace = (inT32) floor (0.5 + row->kern_size + 00559 tosp_fuzzy_kn_fraction * 00560 (row->space_threshold - 00561 row->kern_size)); 00562 } 00563 if (row->max_nonspace > row->space_threshold) { 00564 //Dont be silly 00565 row->max_nonspace = row->space_threshold; 00566 } 00567 00568 if (tosp_debug_level > 5) 00569 tprintf 00570 ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n", 00571 block_idx, row_idx, row_length, block_non_space_gap_width, 00572 block_space_gap_width, real_space_threshold, row->kern_size, 00573 row->max_nonspace, row->space_threshold, row->min_space, 00574 row->space_size); 00575 if (tosp_debug_level > 10) 00576 tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, " 00577 "row->space_threshold = %d\n", 00578 row->kern_size, row->space_size, row->space_threshold); 00579 } 00580 00581 void Textord::old_to_method( 00582 TO_ROW *row, 00583 STATS *all_gap_stats, 00584 STATS *space_gap_stats, 00585 STATS *small_gap_stats, 00586 inT16 block_space_gap_width, //estimate for block 00587 inT16 block_non_space_gap_width //estimate for block 00588 ) { 00589 /* First, estimate row space size */ 00590 /* Old to condition was > 2 */ 00591 if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) { 00592 //Adequate samples 00593 /* Set space size to median of spaces BUT limits it if it seems wildly out */ 00594 row->space_size = space_gap_stats->median (); 00595 if (row->space_size > block_space_gap_width * 1.5) { 00596 if (tosp_old_to_bug_fix) 00597 row->space_size = block_space_gap_width * 1.5; 00598 else 00599 //BUG??? should be *1.5 00600 row->space_size = block_space_gap_width; 00601 } 00602 if (row->space_size < (block_non_space_gap_width * 2) + 1) 00603 row->space_size = (block_non_space_gap_width * 2) + 1; 00604 } 00605 //Only 1 or 2 samples 00606 else if (space_gap_stats->get_total () >= 1) { 00607 //hence mean not median 00608 row->space_size = space_gap_stats->mean (); 00609 if (row->space_size > block_space_gap_width * 1.5) { 00610 if (tosp_old_to_bug_fix) 00611 row->space_size = block_space_gap_width * 1.5; 00612 else 00613 //BUG??? should be *1.5 00614 row->space_size = block_space_gap_width; 00615 } 00616 if (row->space_size < (block_non_space_gap_width * 3) + 1) 00617 row->space_size = (block_non_space_gap_width * 3) + 1; 00618 } 00619 else { 00620 //Use block default 00621 row->space_size = block_space_gap_width; 00622 } 00623 00624 /* Next, estimate row kern size */ 00625 if ((tosp_only_small_gaps_for_kern) && 00626 (small_gap_stats->get_total () > tosp_redo_kern_limit)) 00627 row->kern_size = small_gap_stats->median (); 00628 else if (all_gap_stats->get_total () > tosp_redo_kern_limit) 00629 row->kern_size = all_gap_stats->median (); 00630 else //old TO -SAME FOR ALL ROWS 00631 row->kern_size = block_non_space_gap_width; 00632 00633 /* Finally, estimate row space threshold */ 00634 if (tosp_threshold_bias2 > 0) { 00635 row->space_threshold = 00636 inT32 (floor (0.5 + row->kern_size + 00637 tosp_threshold_bias2 * (row->space_size - 00638 row->kern_size))); 00639 } else { 00640 /* 00641 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold 00642 and holds this in a float. The use is with a >= test 00643 NEW textord uses an integer threshold and a > test 00644 It comes to the same thing. 00645 (Though there is a difference in that old textor has integer space_size 00646 and kern_size.) 00647 */ 00648 row->space_threshold = 00649 inT32 (floor ((row->space_size + row->kern_size) / 2)); 00650 } 00651 00652 // Apply the same logic and ratios as in row_spacing_stats to 00653 // restrict relative values of the row's space_size, kern_size, and 00654 // space_threshold 00655 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 && 00656 ((row->space_size < 00657 tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) || 00658 ((row->space_size - row->kern_size) < 00659 tosp_silly_kn_sp_gap * row->xheight))) { 00660 if (row->kern_size > 2.5) 00661 row->kern_size = row->space_size / tosp_min_sane_kn_sp; 00662 row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) / 00663 tosp_old_sp_kn_th_factor)); 00664 } 00665 } 00666 00667 00668 /************************************************************************* 00669 * isolated_row_stats() 00670 * Set values for min_space, max_non_space based on row stats only 00671 *************************************************************************/ 00672 BOOL8 Textord::isolated_row_stats(TO_ROW *row, 00673 GAPMAP *gapmap, 00674 STATS *all_gap_stats, 00675 BOOL8 suspected_table, 00676 inT16 block_idx, 00677 inT16 row_idx) { 00678 float kern_estimate; 00679 float crude_threshold_estimate; 00680 inT16 small_gaps_count; 00681 inT16 total; 00682 //iterator 00683 BLOBNBOX_IT blob_it = row->blob_list (); 00684 STATS cert_space_gap_stats (0, MAXSPACING); 00685 STATS all_space_gap_stats (0, MAXSPACING); 00686 STATS small_gap_stats (0, MAXSPACING); 00687 TBOX blob_box; 00688 TBOX prev_blob_box; 00689 inT16 gap_width; 00690 inT32 end_of_row; 00691 inT32 row_length; 00692 00693 kern_estimate = all_gap_stats->median (); 00694 crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate, 00695 tosp_init_guess_xht_mult * row->xheight); 00696 small_gaps_count = stats_count_under (all_gap_stats, 00697 (inT16) 00698 ceil (crude_threshold_estimate)); 00699 total = all_gap_stats->get_total (); 00700 00701 if ((total <= tosp_redo_kern_limit) || 00702 ((small_gaps_count / (float) total) < tosp_enough_small_gaps) || 00703 (total - small_gaps_count < 1)) { 00704 if (tosp_debug_level > 5) 00705 tprintf ("B:%d R:%d -- Cant do isolated row stats.\n", 00706 block_idx, row_idx); 00707 return FALSE; 00708 } 00709 blob_it.set_to_list (row->blob_list ()); 00710 blob_it.mark_cycle_pt (); 00711 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00712 if (tosp_use_pre_chopping) 00713 blob_box = box_next_pre_chopped (&blob_it); 00714 else if (tosp_stats_use_xht_gaps) 00715 blob_box = reduced_box_next (row, &blob_it); 00716 else 00717 blob_box = box_next (&blob_it); 00718 row_length = end_of_row - blob_box.left (); 00719 prev_blob_box = blob_box; 00720 while (!blob_it.cycled_list ()) { 00721 if (tosp_use_pre_chopping) 00722 blob_box = box_next_pre_chopped (&blob_it); 00723 else if (tosp_stats_use_xht_gaps) 00724 blob_box = reduced_box_next (row, &blob_it); 00725 else 00726 blob_box = box_next (&blob_it); 00727 gap_width = blob_box.left () - prev_blob_box.right (); 00728 if (!ignore_big_gap (row, row_length, gapmap, 00729 prev_blob_box.right (), blob_box.left ()) && 00730 (gap_width > crude_threshold_estimate)) { 00731 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || 00732 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && 00733 (!tosp_narrow_blobs_not_cert || 00734 (!narrow_blob (row, prev_blob_box) && 00735 !narrow_blob (row, blob_box)))) || 00736 (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box))) 00737 cert_space_gap_stats.add (gap_width, 1); 00738 all_space_gap_stats.add (gap_width, 1); 00739 } 00740 if (gap_width < crude_threshold_estimate) 00741 small_gap_stats.add (gap_width, 1); 00742 00743 prev_blob_box = blob_box; 00744 } 00745 if (cert_space_gap_stats.get_total () >= 00746 tosp_enough_space_samples_for_median) 00747 //median 00748 row->space_size = cert_space_gap_stats.median (); 00749 else if (suspected_table && (cert_space_gap_stats.get_total () > 0)) 00750 //to avoid spaced 00751 row->space_size = cert_space_gap_stats.mean (); 00752 // 1's in tables 00753 else if (all_space_gap_stats.get_total () >= 00754 tosp_enough_space_samples_for_median) 00755 //median 00756 row->space_size = all_space_gap_stats.median (); 00757 else 00758 row->space_size = all_space_gap_stats.mean (); 00759 00760 if (tosp_only_small_gaps_for_kern) 00761 row->kern_size = small_gap_stats.median (); 00762 else 00763 row->kern_size = all_gap_stats->median (); 00764 row->space_threshold = 00765 inT32 (floor ((row->space_size + row->kern_size) / 2)); 00766 /* Sanity check */ 00767 if ((row->kern_size >= row->space_threshold) || 00768 (row->space_threshold >= row->space_size) || 00769 (row->space_threshold <= 0)) { 00770 if (tosp_debug_level > 5) 00771 tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", 00772 block_idx, row_idx, 00773 row->kern_size, row->space_threshold, row->space_size); 00774 row->kern_size = 0.0f; 00775 row->space_threshold = 0; 00776 row->space_size = 0.0f; 00777 return FALSE; 00778 } 00779 00780 if (tosp_debug_level > 5) 00781 tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n", 00782 block_idx, row_idx, 00783 row->kern_size, row->space_threshold, row->space_size); 00784 return TRUE; 00785 } 00786 00787 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) { 00788 inT16 index; 00789 inT16 total = 0; 00790 00791 for (index = 0; index < threshold; index++) 00792 total += stats->pile_count (index); 00793 return total; 00794 } 00795 00796 00797 /************************************************************************* 00798 * improve_row_threshold() 00799 * Try to recognise a "normal line" - 00800 * > 25 gaps 00801 * && space > 3 * kn && space > 10 00802 * (I.e. reasonably large space and kn:sp ratio) 00803 * && > 3/4 # gaps < kn + (sp - kn)/3 00804 * (I.e. most gaps are well away from space estimate) 00805 * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found 00806 * somewhere in the histogram between kn and sp 00807 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies 00808 * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!! 00809 * try moving the default threshold to within this band but leave the 00810 * fuzzy limit calculation as at present. 00811 *************************************************************************/ 00812 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { 00813 float sp = row->space_size; 00814 float kn = row->kern_size; 00815 inT16 reqd_zero_width = 0; 00816 inT16 zero_width = 0; 00817 inT16 zero_start = 0; 00818 inT16 index = 0; 00819 00820 if (tosp_debug_level > 10) 00821 tprintf ("Improve row threshold 0"); 00822 if ((all_gap_stats->get_total () <= 25) || 00823 (sp <= 10) || 00824 (sp <= 3 * kn) || 00825 (stats_count_under (all_gap_stats, 00826 (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) < 00827 (0.75 * all_gap_stats->get_total ()))) 00828 return; 00829 if (tosp_debug_level > 10) 00830 tprintf (" 1"); 00831 /* 00832 Look for the first region of all 0's in the histogram which is wider than 00833 max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current 00834 threshold is not within it, move the threshold so that is is just inside it. 00835 */ 00836 reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5); 00837 if (reqd_zero_width < 3) 00838 reqd_zero_width = 3; 00839 00840 for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) { 00841 if (all_gap_stats->pile_count (index) == 0) { 00842 if (zero_width == 0) 00843 zero_start = index; 00844 zero_width++; 00845 } 00846 else { 00847 if (zero_width >= reqd_zero_width) 00848 break; 00849 else { 00850 zero_width = 0; 00851 } 00852 } 00853 } 00854 index--; 00855 if (tosp_debug_level > 10) 00856 tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", 00857 reqd_zero_width, zero_width, zero_start, row->space_threshold); 00858 if ((zero_width < reqd_zero_width) || 00859 ((row->space_threshold >= zero_start) && 00860 (row->space_threshold <= index))) 00861 return; 00862 if (tosp_debug_level > 10) 00863 tprintf (" 2"); 00864 if (row->space_threshold < zero_start) { 00865 if (tosp_debug_level > 5) 00866 tprintf 00867 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", 00868 kn, sp, zero_start, index, row->space_threshold, zero_start); 00869 row->space_threshold = zero_start; 00870 } 00871 if (row->space_threshold > index) { 00872 if (tosp_debug_level > 5) 00873 tprintf 00874 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", 00875 kn, sp, zero_start, index, row->space_threshold, index); 00876 row->space_threshold = index; 00877 } 00878 } 00879 00880 00881 /********************************************************************** 00882 * make_prop_words 00883 * 00884 * Convert a TO_BLOCK to a BLOCK. 00885 **********************************************************************/ 00886 ROW *Textord::make_prop_words( 00887 TO_ROW *row, // row to make 00888 FCOORD rotation // for drawing 00889 ) { 00890 BOOL8 bol; //start of line 00891 /* prev_ values are for start of word being built. non prev_ values are for 00892 the gap between the word being built and the next one. */ 00893 BOOL8 prev_fuzzy_sp; //probably space 00894 BOOL8 prev_fuzzy_non; //probably not 00895 uinT8 prev_blanks; //in front of word 00896 BOOL8 fuzzy_sp; //probably space 00897 BOOL8 fuzzy_non; //probably not 00898 uinT8 blanks; //in front of word 00899 BOOL8 prev_gap_was_a_space = FALSE; 00900 BOOL8 break_at_next_gap = FALSE; 00901 ROW *real_row; //output row 00902 C_OUTLINE_IT cout_it; 00903 C_BLOB_LIST cblobs; 00904 C_BLOB_IT cblob_it = &cblobs; 00905 WERD_LIST words; 00906 WERD_IT word_it; //new words 00907 WERD *word; //new word 00908 WERD_IT rep_char_it; //repeated char words 00909 inT32 next_rep_char_word_right = MAX_INT32; 00910 float repetition_spacing; //gap between repetitions 00911 inT32 xstarts[2]; //row ends 00912 double coeffs[3]; //quadratic 00913 inT32 prev_x; //end of prev blob 00914 BLOBNBOX *bblob; //current blob 00915 TBOX blob_box; //bounding box 00916 BLOBNBOX_IT box_it; //iterator 00917 TBOX prev_blob_box; 00918 TBOX next_blob_box; 00919 inT16 prev_gap = MAX_INT16; 00920 inT16 current_gap = MAX_INT16; 00921 inT16 next_gap = MAX_INT16; 00922 inT16 prev_within_xht_gap = MAX_INT16; 00923 inT16 current_within_xht_gap = MAX_INT16; 00924 inT16 next_within_xht_gap = MAX_INT16; 00925 inT16 word_count = 0; 00926 00927 rep_char_it.set_to_list (&(row->rep_words)); 00928 if (!rep_char_it.empty ()) { 00929 next_rep_char_word_right = 00930 rep_char_it.data ()->bounding_box ().right (); 00931 } 00932 00933 prev_x = -MAX_INT16; 00934 cblob_it.set_to_list (&cblobs); 00935 box_it.set_to_list (row->blob_list ()); 00936 word_it.set_to_list (&words); 00937 bol = TRUE; 00938 prev_blanks = 0; 00939 prev_fuzzy_sp = FALSE; 00940 prev_fuzzy_non = FALSE; 00941 if (!box_it.empty ()) { 00942 xstarts[0] = box_it.data ()->bounding_box ().left (); 00943 if (xstarts[0] > next_rep_char_word_right) { 00944 /* We need to insert a repeated char word at the start of the row */ 00945 word = rep_char_it.extract (); 00946 word_it.add_after_then_move (word); 00947 /* Set spaces before repeated char word */ 00948 word->set_flag (W_BOL, TRUE); 00949 bol = FALSE; 00950 word->set_blanks (0); 00951 //NO uncertainty 00952 word->set_flag (W_FUZZY_SP, FALSE); 00953 word->set_flag (W_FUZZY_NON, FALSE); 00954 xstarts[0] = word->bounding_box ().left (); 00955 /* Set spaces after repeated char word (and leave current word set) */ 00956 repetition_spacing = find_mean_blob_spacing (word); 00957 current_gap = box_it.data ()->bounding_box ().left () - 00958 next_rep_char_word_right; 00959 current_within_xht_gap = current_gap; 00960 if (current_gap > tosp_rep_space * repetition_spacing) { 00961 prev_blanks = (uinT8) floor (current_gap / row->space_size); 00962 if (prev_blanks < 1) 00963 prev_blanks = 1; 00964 } 00965 else 00966 prev_blanks = 0; 00967 if (tosp_debug_level > 5) 00968 tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", 00969 box_it.data ()->bounding_box ().left (), 00970 box_it.data ()->bounding_box ().bottom (), 00971 repetition_spacing, current_gap); 00972 prev_fuzzy_sp = FALSE; 00973 prev_fuzzy_non = FALSE; 00974 if (rep_char_it.empty ()) { 00975 next_rep_char_word_right = MAX_INT32; 00976 } 00977 else { 00978 rep_char_it.forward (); 00979 next_rep_char_word_right = 00980 rep_char_it.data ()->bounding_box ().right (); 00981 } 00982 } 00983 00984 peek_at_next_gap(row, 00985 box_it, 00986 next_blob_box, 00987 next_gap, 00988 next_within_xht_gap); 00989 do { 00990 bblob = box_it.data (); 00991 blob_box = bblob->bounding_box (); 00992 if (bblob->joined_to_prev ()) { 00993 if (bblob->cblob () != NULL) { 00994 cout_it.set_to_list (cblob_it.data ()->out_list ()); 00995 cout_it.move_to_last (); 00996 cout_it.add_list_after (bblob->cblob ()->out_list ()); 00997 delete bblob->cblob (); 00998 } 00999 } else { 01000 if (bblob->cblob() != NULL) 01001 cblob_it.add_after_then_move (bblob->cblob ()); 01002 prev_x = blob_box.right (); 01003 } 01004 box_it.forward (); //next one 01005 bblob = box_it.data (); 01006 blob_box = bblob->bounding_box (); 01007 01008 if (!bblob->joined_to_prev() && bblob->cblob() != NULL) { 01009 /* Real Blob - not multiple outlines or pre-chopped */ 01010 prev_gap = current_gap; 01011 prev_within_xht_gap = current_within_xht_gap; 01012 prev_blob_box = next_blob_box; 01013 current_gap = next_gap; 01014 current_within_xht_gap = next_within_xht_gap; 01015 peek_at_next_gap(row, 01016 box_it, 01017 next_blob_box, 01018 next_gap, 01019 next_within_xht_gap); 01020 01021 inT16 prev_gap_arg = prev_gap; 01022 inT16 next_gap_arg = next_gap; 01023 if (tosp_only_use_xht_gaps) { 01024 prev_gap_arg = prev_within_xht_gap; 01025 next_gap_arg = next_within_xht_gap; 01026 } 01027 // Decide if a word-break should be inserted 01028 if (blob_box.left () > next_rep_char_word_right || 01029 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, 01030 current_gap, current_within_xht_gap, 01031 next_blob_box, next_gap_arg, 01032 blanks, fuzzy_sp, fuzzy_non, 01033 prev_gap_was_a_space, 01034 break_at_next_gap) || 01035 box_it.at_first()) { 01036 /* Form a new word out of the blobs collected */ 01037 word = new WERD (&cblobs, prev_blanks, NULL); 01038 word_count++; 01039 word_it.add_after_then_move (word); 01040 if (bol) { 01041 word->set_flag (W_BOL, TRUE); 01042 bol = FALSE; 01043 } 01044 if (prev_fuzzy_sp) 01045 //probably space 01046 word->set_flag (W_FUZZY_SP, TRUE); 01047 else if (prev_fuzzy_non) 01048 word->set_flag (W_FUZZY_NON, TRUE); 01049 //probably not 01050 01051 if (blob_box.left () > next_rep_char_word_right) { 01052 /* We need to insert a repeated char word */ 01053 word = rep_char_it.extract (); 01054 word_it.add_after_then_move (word); 01055 01056 /* Set spaces before repeated char word */ 01057 repetition_spacing = find_mean_blob_spacing (word); 01058 current_gap = word->bounding_box ().left () - prev_x; 01059 current_within_xht_gap = current_gap; 01060 if (current_gap > tosp_rep_space * repetition_spacing) { 01061 blanks = 01062 (uinT8) floor (current_gap / row->space_size); 01063 if (blanks < 1) 01064 blanks = 1; 01065 } 01066 else 01067 blanks = 0; 01068 if (tosp_debug_level > 5) 01069 tprintf 01070 ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);", 01071 word->bounding_box ().left (), 01072 word->bounding_box ().bottom (), 01073 repetition_spacing, current_gap, blanks); 01074 word->set_blanks (blanks); 01075 //NO uncertainty 01076 word->set_flag (W_FUZZY_SP, FALSE); 01077 word->set_flag (W_FUZZY_NON, FALSE); 01078 01079 /* Set spaces after repeated char word (and leave current word set) */ 01080 current_gap = 01081 blob_box.left () - next_rep_char_word_right; 01082 if (current_gap > tosp_rep_space * repetition_spacing) { 01083 blanks = (uinT8) (current_gap / row->space_size); 01084 if (blanks < 1) 01085 blanks = 1; 01086 } 01087 else 01088 blanks = 0; 01089 if (tosp_debug_level > 5) 01090 tprintf (" Rgap:%d (%d blanks)\n", 01091 current_gap, blanks); 01092 fuzzy_sp = FALSE; 01093 fuzzy_non = FALSE; 01094 01095 if (rep_char_it.empty ()) { 01096 next_rep_char_word_right = MAX_INT32; 01097 } 01098 else { 01099 rep_char_it.forward (); 01100 next_rep_char_word_right = 01101 rep_char_it.data ()->bounding_box ().right (); 01102 } 01103 } 01104 01105 if (box_it.at_first () && rep_char_it.empty ()) { 01106 //at end of line 01107 word->set_flag (W_EOL, TRUE); 01108 xstarts[1] = prev_x; 01109 } 01110 else { 01111 prev_blanks = blanks; 01112 prev_fuzzy_sp = fuzzy_sp; 01113 prev_fuzzy_non = fuzzy_non; 01114 } 01115 } 01116 } 01117 } 01118 while (!box_it.at_first ()); //until back at start 01119 01120 /* Insert any further repeated char words */ 01121 while (!rep_char_it.empty ()) { 01122 word = rep_char_it.extract (); 01123 word_it.add_after_then_move (word); 01124 01125 /* Set spaces before repeated char word */ 01126 repetition_spacing = find_mean_blob_spacing (word); 01127 current_gap = word->bounding_box ().left () - prev_x; 01128 if (current_gap > tosp_rep_space * repetition_spacing) { 01129 blanks = (uinT8) floor (current_gap / row->space_size); 01130 if (blanks < 1) 01131 blanks = 1; 01132 } 01133 else 01134 blanks = 0; 01135 if (tosp_debug_level > 5) 01136 tprintf 01137 ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n", 01138 word->bounding_box ().left (), word->bounding_box ().bottom (), 01139 repetition_spacing, current_gap, blanks); 01140 word->set_blanks (blanks); 01141 //NO uncertainty 01142 word->set_flag (W_FUZZY_SP, FALSE); 01143 word->set_flag (W_FUZZY_NON, FALSE); 01144 prev_x = word->bounding_box ().right (); 01145 if (rep_char_it.empty ()) { 01146 //at end of line 01147 word->set_flag (W_EOL, TRUE); 01148 xstarts[1] = prev_x; 01149 } 01150 else { 01151 rep_char_it.forward (); 01152 } 01153 } 01154 coeffs[0] = 0; 01155 coeffs[1] = row->line_m (); 01156 coeffs[2] = row->line_c (); 01157 real_row = new ROW (row, 01158 (inT16) row->kern_size, (inT16) row->space_size); 01159 word_it.set_to_list (real_row->word_list ()); 01160 //put words in row 01161 word_it.add_list_after (&words); 01162 real_row->recalc_bounding_box (); 01163 01164 if (tosp_debug_level > 4) { 01165 tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n", 01166 word_count, 01167 real_row->bounding_box ().left (), 01168 real_row->bounding_box ().bottom (), 01169 real_row->bounding_box ().right (), 01170 real_row->bounding_box ().top ()); 01171 } 01172 return real_row; 01173 } 01174 return NULL; 01175 } 01176 01177 /********************************************************************** 01178 * make_blob_words 01179 * 01180 * Converts words into blobs so that each blob is a single character. 01181 * Used for chopper test. 01182 **********************************************************************/ 01183 ROW *Textord::make_blob_words( 01184 TO_ROW *row, // row to make 01185 FCOORD rotation // for drawing 01186 ) { 01187 bool bol; // start of line 01188 ROW *real_row; // output row 01189 C_OUTLINE_IT cout_it; 01190 C_BLOB_LIST cblobs; 01191 C_BLOB_IT cblob_it = &cblobs; 01192 WERD_LIST words; 01193 WERD_IT word_it; // new words 01194 WERD *word; // new word 01195 double coeffs[3]; // quadratic 01196 BLOBNBOX *bblob; // current blob 01197 TBOX blob_box; // bounding box 01198 BLOBNBOX_IT box_it; // iterator 01199 inT16 word_count = 0; 01200 01201 cblob_it.set_to_list(&cblobs); 01202 box_it.set_to_list(row->blob_list()); 01203 word_it.set_to_list(&words); 01204 bol = TRUE; 01205 if (!box_it.empty()) { 01206 01207 do { 01208 bblob = box_it.data(); 01209 blob_box = bblob->bounding_box(); 01210 if (bblob->joined_to_prev()) { 01211 if (bblob->cblob() != NULL) { 01212 cout_it.set_to_list(cblob_it.data()->out_list()); 01213 cout_it.move_to_last(); 01214 cout_it.add_list_after(bblob->cblob()->out_list()); 01215 delete bblob->cblob(); 01216 } 01217 } else { 01218 if (bblob->cblob() != NULL) 01219 cblob_it.add_after_then_move(bblob->cblob()); 01220 } 01221 box_it.forward(); // next one 01222 bblob = box_it.data(); 01223 blob_box = bblob->bounding_box(); 01224 01225 if (!bblob->joined_to_prev() && !cblobs.empty()) { 01226 word = new WERD(&cblobs, 1, NULL); 01227 word_count++; 01228 word_it.add_after_then_move(word); 01229 if (bol) { 01230 word->set_flag(W_BOL, TRUE); 01231 bol = FALSE; 01232 } 01233 if (box_it.at_first()) { // at end of line 01234 word->set_flag(W_EOL, TRUE); 01235 } 01236 } 01237 } 01238 while (!box_it.at_first()); // until back at start 01239 /* Setup the row with created words. */ 01240 coeffs[0] = 0; 01241 coeffs[1] = row->line_m(); 01242 coeffs[2] = row->line_c(); 01243 real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size); 01244 word_it.set_to_list(real_row->word_list()); 01245 //put words in row 01246 word_it.add_list_after(&words); 01247 real_row->recalc_bounding_box(); 01248 if (tosp_debug_level > 4) { 01249 tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n", 01250 word_count, 01251 real_row->bounding_box().left(), 01252 real_row->bounding_box().bottom(), 01253 real_row->bounding_box().right(), 01254 real_row->bounding_box().top()); 01255 } 01256 return real_row; 01257 } 01258 return NULL; 01259 } 01260 01261 BOOL8 Textord::make_a_word_break( 01262 TO_ROW *row, // row being made 01263 TBOX blob_box, // for next_blob // how many blanks? 01264 inT16 prev_gap, 01265 TBOX prev_blob_box, 01266 inT16 real_current_gap, 01267 inT16 within_xht_current_gap, 01268 TBOX next_blob_box, 01269 inT16 next_gap, 01270 uinT8 &blanks, 01271 BOOL8 &fuzzy_sp, 01272 BOOL8 &fuzzy_non, 01273 BOOL8& prev_gap_was_a_space, 01274 BOOL8& break_at_next_gap) { 01275 BOOL8 space; 01276 inT16 current_gap; 01277 float fuzzy_sp_to_kn_limit; 01278 01279 if (break_at_next_gap) { 01280 break_at_next_gap = FALSE; 01281 return TRUE; 01282 } 01283 /* Inhibit using the reduced gap if 01284 The kerning is large - chars are not kerned and reducing "f"s can cause 01285 erroneous blanks 01286 OR The real gap is less than 0 01287 OR The real gap is less than the kerning estimate 01288 */ 01289 if ((row->kern_size > tosp_large_kerning * row->xheight) || 01290 ((tosp_dont_fool_with_small_kerns >= 0) && 01291 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) 01292 //Ignore the difference 01293 within_xht_current_gap = real_current_gap; 01294 01295 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) 01296 current_gap = within_xht_current_gap; 01297 else 01298 current_gap = real_current_gap; 01299 01300 if (tosp_old_to_method) { 01301 //Boring old method 01302 space = current_gap > row->max_nonspace; 01303 if (space && (current_gap < MAX_INT16)) { 01304 if (current_gap < row->min_space) { 01305 if (current_gap > row->space_threshold) { 01306 blanks = 1; 01307 fuzzy_sp = TRUE; 01308 fuzzy_non = FALSE; 01309 } 01310 else { 01311 blanks = 0; 01312 fuzzy_sp = FALSE; 01313 fuzzy_non = TRUE; 01314 } 01315 } 01316 else { 01317 blanks = (uinT8) (current_gap / row->space_size); 01318 if (blanks < 1) 01319 blanks = 1; 01320 fuzzy_sp = FALSE; 01321 fuzzy_non = FALSE; 01322 } 01323 } 01324 return space; 01325 } 01326 else { 01327 /* New exciting heuristic method */ 01328 if (prev_blob_box.null_box ()) // Beginning of row 01329 prev_gap_was_a_space = TRUE; 01330 01331 //Default as old TO 01332 space = current_gap > row->space_threshold; 01333 01334 /* Set defaults for the word break incase we find one. Currently there are 01335 no fuzzy spaces. Depending on the reliability of the different heuristics 01336 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY 01337 be used if the function returns TRUE - ie the word is to be broken. 01338 */ 01339 blanks = (uinT8) (current_gap / row->space_size); 01340 if (blanks < 1) 01341 blanks = 1; 01342 fuzzy_sp = FALSE; 01343 fuzzy_non = FALSE; 01344 /* 01345 If xht measure causes gap to flip one of the 3 thresholds act accordingly - 01346 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to 01347 context. 01348 */ 01349 if (tosp_use_xht_gaps && 01350 (real_current_gap <= row->max_nonspace) && 01351 (within_xht_current_gap > row->max_nonspace)) { 01352 space = TRUE; 01353 fuzzy_non = TRUE; 01354 #ifndef GRAPHICS_DISABLED 01355 mark_gap (blob_box, 20, 01356 prev_gap, prev_blob_box.width (), 01357 current_gap, next_blob_box.width (), next_gap); 01358 #endif 01359 } 01360 else if (tosp_use_xht_gaps && 01361 (real_current_gap <= row->space_threshold) && 01362 (within_xht_current_gap > row->space_threshold)) { 01363 space = TRUE; 01364 if (tosp_flip_fuzz_kn_to_sp) 01365 fuzzy_sp = TRUE; 01366 else 01367 fuzzy_non = TRUE; 01368 #ifndef GRAPHICS_DISABLED 01369 mark_gap (blob_box, 21, 01370 prev_gap, prev_blob_box.width (), 01371 current_gap, next_blob_box.width (), next_gap); 01372 #endif 01373 } 01374 else if (tosp_use_xht_gaps && 01375 (real_current_gap < row->min_space) && 01376 (within_xht_current_gap >= row->min_space)) { 01377 space = TRUE; 01378 #ifndef GRAPHICS_DISABLED 01379 mark_gap (blob_box, 22, 01380 prev_gap, prev_blob_box.width (), 01381 current_gap, next_blob_box.width (), next_gap); 01382 #endif 01383 } 01384 else if (tosp_force_wordbreak_on_punct && 01385 !suspected_punct_blob(row, prev_blob_box) && 01386 suspected_punct_blob(row, blob_box)) { 01387 break_at_next_gap = TRUE; 01388 } 01389 /* Now continue with normal heuristics */ 01390 else if ((current_gap < row->min_space) && 01391 (current_gap > row->space_threshold)) { 01392 /* Heuristics to turn dubious spaces to kerns */ 01393 if (tosp_pass_wide_fuzz_sp_to_context > 0) 01394 fuzzy_sp_to_kn_limit = row->kern_size + 01395 tosp_pass_wide_fuzz_sp_to_context * 01396 (row->space_size - row->kern_size); 01397 else 01398 fuzzy_sp_to_kn_limit = 99999.0f; 01399 01400 /* If current gap is significantly smaller than the previous space the other 01401 side of a narrow blob then this gap is a kern. */ 01402 if ((prev_blob_box.width () > 0) && 01403 narrow_blob (row, prev_blob_box) && 01404 prev_gap_was_a_space && 01405 (current_gap <= tosp_gap_factor * prev_gap)) { 01406 if ((tosp_all_flips_fuzzy) || 01407 (current_gap > fuzzy_sp_to_kn_limit)) { 01408 if (tosp_flip_fuzz_sp_to_kn) 01409 fuzzy_non = TRUE; 01410 else 01411 fuzzy_sp = TRUE; 01412 } 01413 else 01414 space = FALSE; 01415 #ifndef GRAPHICS_DISABLED 01416 mark_gap (blob_box, 1, 01417 prev_gap, prev_blob_box.width (), 01418 current_gap, next_blob_box.width (), next_gap); 01419 #endif 01420 } 01421 /* If current gap not much bigger than the previous kern the other side of a 01422 narrow blob then this gap is a kern as well */ 01423 else if ((prev_blob_box.width () > 0) && 01424 narrow_blob (row, prev_blob_box) && 01425 !prev_gap_was_a_space && 01426 (current_gap * tosp_gap_factor <= prev_gap)) { 01427 if ((tosp_all_flips_fuzzy) || 01428 (current_gap > fuzzy_sp_to_kn_limit)) { 01429 if (tosp_flip_fuzz_sp_to_kn) 01430 fuzzy_non = TRUE; 01431 else 01432 fuzzy_sp = TRUE; 01433 } 01434 else 01435 space = FALSE; 01436 #ifndef GRAPHICS_DISABLED 01437 mark_gap (blob_box, 2, 01438 prev_gap, prev_blob_box.width (), 01439 current_gap, next_blob_box.width (), next_gap); 01440 #endif 01441 } 01442 else if ((next_blob_box.width () > 0) && 01443 narrow_blob (row, next_blob_box) && 01444 (next_gap > row->space_threshold) && 01445 (current_gap <= tosp_gap_factor * next_gap)) { 01446 if ((tosp_all_flips_fuzzy) || 01447 (current_gap > fuzzy_sp_to_kn_limit)) { 01448 if (tosp_flip_fuzz_sp_to_kn) 01449 fuzzy_non = TRUE; 01450 else 01451 fuzzy_sp = TRUE; 01452 } 01453 else 01454 space = FALSE; 01455 #ifndef GRAPHICS_DISABLED 01456 mark_gap (blob_box, 3, 01457 prev_gap, prev_blob_box.width (), 01458 current_gap, next_blob_box.width (), next_gap); 01459 #endif 01460 } 01461 else if ((next_blob_box.width () > 0) && 01462 narrow_blob (row, next_blob_box) && 01463 (next_gap <= row->space_threshold) && 01464 (current_gap * tosp_gap_factor <= next_gap)) { 01465 if ((tosp_all_flips_fuzzy) || 01466 (current_gap > fuzzy_sp_to_kn_limit)) { 01467 if (tosp_flip_fuzz_sp_to_kn) 01468 fuzzy_non = TRUE; 01469 else 01470 fuzzy_sp = TRUE; 01471 } 01472 else 01473 space = FALSE; 01474 #ifndef GRAPHICS_DISABLED 01475 mark_gap (blob_box, 4, 01476 prev_gap, prev_blob_box.width (), 01477 current_gap, next_blob_box.width (), next_gap); 01478 #endif 01479 } 01480 else if ((((next_blob_box.width () > 0) && 01481 narrow_blob (row, next_blob_box)) || 01482 ((prev_blob_box.width () > 0) && 01483 narrow_blob (row, prev_blob_box)))) { 01484 fuzzy_sp = TRUE; 01485 #ifndef GRAPHICS_DISABLED 01486 mark_gap (blob_box, 6, 01487 prev_gap, prev_blob_box.width (), 01488 current_gap, next_blob_box.width (), next_gap); 01489 #endif 01490 } 01491 } 01492 else if ((current_gap > row->max_nonspace) && 01493 (current_gap <= row->space_threshold)) { 01494 01495 /* Heuristics to turn dubious kerns to spaces */ 01496 /* TRIED THIS BUT IT MADE THINGS WORSE 01497 if ( prev_gap == MAX_INT16 ) 01498 prev_gap = 0; // start of row 01499 if ( next_gap == MAX_INT16 ) 01500 next_gap = 0; // end of row 01501 */ 01502 if ((prev_blob_box.width () > 0) && 01503 (next_blob_box.width () > 0) && 01504 (current_gap >= 01505 tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) && 01506 wide_blob (row, prev_blob_box) && 01507 wide_blob (row, next_blob_box)) { 01508 01509 space = TRUE; 01510 /* 01511 tosp_flip_caution is an attempt to stop the default changing in cases 01512 where there is a large difference between the kern and space estimates. 01513 See problem in 'chiefs' where "have" gets split in the quotation. 01514 */ 01515 if ((tosp_flip_fuzz_kn_to_sp) && 01516 ((tosp_flip_caution <= 0) || 01517 (tosp_flip_caution * row->kern_size > row->space_size))) 01518 fuzzy_sp = TRUE; 01519 else 01520 fuzzy_non = TRUE; 01521 #ifndef GRAPHICS_DISABLED 01522 mark_gap (blob_box, 7, 01523 prev_gap, prev_blob_box.width (), 01524 current_gap, next_blob_box.width (), next_gap); 01525 #endif 01526 } else if (prev_blob_box.width() > 0 && 01527 next_blob_box.width() > 0 && 01528 current_gap > 5 && // Rule 9 handles small gap, big ratio. 01529 current_gap >= 01530 tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) && 01531 !(narrow_blob(row, prev_blob_box) || 01532 suspected_punct_blob(row, prev_blob_box)) && 01533 !(narrow_blob(row, next_blob_box) || 01534 suspected_punct_blob(row, next_blob_box))) { 01535 space = TRUE; 01536 fuzzy_non = TRUE; 01537 #ifndef GRAPHICS_DISABLED 01538 mark_gap (blob_box, 8, 01539 prev_gap, prev_blob_box.width (), 01540 current_gap, next_blob_box.width (), next_gap); 01541 #endif 01542 } 01543 else if ((tosp_kern_gap_factor3 > 0) && 01544 (prev_blob_box.width () > 0) && 01545 (next_blob_box.width () > 0) && 01546 (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) && 01547 (!tosp_rule_9_test_punct || 01548 (!suspected_punct_blob (row, prev_blob_box) && 01549 !suspected_punct_blob (row, next_blob_box)))) { 01550 space = TRUE; 01551 fuzzy_non = TRUE; 01552 #ifndef GRAPHICS_DISABLED 01553 mark_gap (blob_box, 9, 01554 prev_gap, prev_blob_box.width (), 01555 current_gap, next_blob_box.width (), next_gap); 01556 #endif 01557 } 01558 } 01559 if (tosp_debug_level > 10) 01560 tprintf("word break = %d current_gap = %d, prev_gap = %d, " 01561 "next_gap = %d\n", space ? 1 : 0, current_gap, 01562 prev_gap, next_gap); 01563 prev_gap_was_a_space = space && !(fuzzy_non); 01564 return space; 01565 } 01566 } 01567 01568 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { 01569 BOOL8 result; 01570 result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) || 01571 (((float) blob_box.width () / blob_box.height ()) <= 01572 tosp_narrow_aspect_ratio)); 01573 return result; 01574 } 01575 01576 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) { 01577 BOOL8 result; 01578 if (tosp_wide_fraction > 0) { 01579 if (tosp_wide_aspect_ratio > 0) 01580 result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) && 01581 (((float) blob_box.width () / blob_box.height ()) > 01582 tosp_wide_aspect_ratio)); 01583 else 01584 result = (blob_box.width () >= tosp_wide_fraction * row->xheight); 01585 } 01586 else 01587 result = !narrow_blob (row, blob_box); 01588 return result; 01589 } 01590 01591 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { 01592 BOOL8 result; 01593 float baseline; 01594 float blob_x_centre; 01595 /* Find baseline of centre of blob */ 01596 blob_x_centre = (box.right () + box.left ()) / 2.0; 01597 baseline = row->baseline.y (blob_x_centre); 01598 01599 result = (box.height () <= 0.66 * row->xheight) || 01600 (box.top () < baseline + row->xheight / 2.0) || 01601 (box.bottom () > baseline + row->xheight / 2.0); 01602 return result; 01603 } 01604 01605 01606 void Textord::peek_at_next_gap(TO_ROW *row, 01607 BLOBNBOX_IT box_it, 01608 TBOX &next_blob_box, 01609 inT16 &next_gap, 01610 inT16 &next_within_xht_gap) { 01611 TBOX next_reduced_blob_box; 01612 TBOX bit_beyond; 01613 BLOBNBOX_IT reduced_box_it = box_it; 01614 01615 next_blob_box = box_next (&box_it); 01616 next_reduced_blob_box = reduced_box_next (row, &reduced_box_it); 01617 if (box_it.at_first ()) { 01618 next_gap = MAX_INT16; 01619 next_within_xht_gap = MAX_INT16; 01620 } 01621 else { 01622 bit_beyond = box_it.data ()->bounding_box (); 01623 next_gap = bit_beyond.left () - next_blob_box.right (); 01624 bit_beyond = reduced_box_next (row, &reduced_box_it); 01625 next_within_xht_gap = 01626 bit_beyond.left () - next_reduced_blob_box.right (); 01627 } 01628 } 01629 01630 01631 #ifndef GRAPHICS_DISABLED 01632 void Textord::mark_gap( 01633 TBOX blob, // blob following gap 01634 inT16 rule, // heuristic id 01635 inT16 prev_gap, 01636 inT16 prev_blob_width, 01637 inT16 current_gap, 01638 inT16 next_blob_width, 01639 inT16 next_gap) { 01640 ScrollView::Color col; //of ellipse marking flipped gap 01641 01642 switch (rule) { 01643 case 1: 01644 col = ScrollView::RED; 01645 break; 01646 case 2: 01647 col = ScrollView::CYAN; 01648 break; 01649 case 3: 01650 col = ScrollView::GREEN; 01651 break; 01652 case 4: 01653 col = ScrollView::BLACK; 01654 break; 01655 case 5: 01656 col = ScrollView::MAGENTA; 01657 break; 01658 case 6: 01659 col = ScrollView::BLUE; 01660 break; 01661 01662 case 7: 01663 col = ScrollView::WHITE; 01664 break; 01665 case 8: 01666 col = ScrollView::YELLOW; 01667 break; 01668 case 9: 01669 col = ScrollView::BLACK; 01670 break; 01671 01672 case 20: 01673 col = ScrollView::CYAN; 01674 break; 01675 case 21: 01676 col = ScrollView::GREEN; 01677 break; 01678 case 22: 01679 col = ScrollView::MAGENTA; 01680 break; 01681 default: 01682 col = ScrollView::BLACK; 01683 } 01684 if (textord_show_initial_words) { 01685 to_win->Pen(col); 01686 /* if (rule < 20) 01687 //interior_style(to_win, INT_SOLID, FALSE); 01688 else 01689 //interior_style(to_win, INT_HOLLOW, TRUE);*/ 01690 //x radius 01691 to_win->Ellipse (current_gap / 2.0f, 01692 blob.height () / 2.0f, //y radius 01693 //x centre 01694 blob.left () - current_gap / 2.0f, 01695 //y centre 01696 blob.bottom () + blob.height () / 2.0f); 01697 } 01698 if (tosp_debug_level > 5) 01699 tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n", 01700 blob.left () - current_gap / 2, blob.bottom (), rule, 01701 prev_gap, prev_blob_width, current_gap, 01702 next_blob_width, next_gap); 01703 } 01704 #endif 01705 01706 float Textord::find_mean_blob_spacing(WERD *word) { 01707 C_BLOB_IT cblob_it; 01708 TBOX blob_box; 01709 inT32 gap_sum = 0; 01710 inT16 gap_count = 0; 01711 inT16 prev_right; 01712 01713 cblob_it.set_to_list (word->cblob_list ()); 01714 if (!cblob_it.empty ()) { 01715 cblob_it.mark_cycle_pt (); 01716 prev_right = cblob_it.data ()->bounding_box ().right (); 01717 //first blob 01718 cblob_it.forward (); 01719 for (; !cblob_it.cycled_list (); cblob_it.forward ()) { 01720 blob_box = cblob_it.data ()->bounding_box (); 01721 gap_sum += blob_box.left () - prev_right; 01722 gap_count++; 01723 prev_right = blob_box.right (); 01724 } 01725 } 01726 if (gap_count > 0) 01727 return (gap_sum / (float) gap_count); 01728 else 01729 return 0.0f; 01730 } 01731 01732 01733 BOOL8 Textord::ignore_big_gap(TO_ROW *row, 01734 inT32 row_length, 01735 GAPMAP *gapmap, 01736 inT16 left, 01737 inT16 right) { 01738 inT16 gap = right - left + 1; 01739 01740 if (tosp_ignore_big_gaps > 999) 01741 return FALSE; //Dont ignore 01742 if (tosp_ignore_big_gaps > 0) 01743 return (gap > tosp_ignore_big_gaps * row->xheight); 01744 if (gap > tosp_ignore_very_big_gaps * row->xheight) 01745 return TRUE; 01746 if (tosp_ignore_big_gaps == 0) { 01747 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) 01748 return TRUE; 01749 if ((gap > 1.75 * row->xheight) && 01750 ((row_length > 35 * row->xheight) || 01751 gapmap->table_gap (left, right))) 01752 return TRUE; 01753 } 01754 else { 01755 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */ 01756 if ((gap > gapmap_big_gaps * row->xheight) && 01757 gapmap->table_gap (left, right)) 01758 return TRUE; 01759 } 01760 return FALSE; 01761 } 01762 01763 01764 /********************************************************************** 01765 * reduced_box_next 01766 * 01767 * Compute the bounding box of this blob with merging of x overlaps 01768 * but no pre-chopping. 01769 * Then move the iterator on to the start of the next blob. 01770 * DONT reduce the box for small things - eg punctuation. 01771 **********************************************************************/ 01772 TBOX Textord::reduced_box_next( 01773 TO_ROW *row, // current row 01774 BLOBNBOX_IT *it // iterator to blobds 01775 ) { 01776 BLOBNBOX *blob; //current blob 01777 BLOBNBOX *head_blob; //place to store box 01778 TBOX full_box; //full blob boundg box 01779 TBOX reduced_box; //box of significant part 01780 inT16 left_above_xht; //ABOVE xht left limit 01781 inT16 new_left_above_xht; //ABOVE xht left limit 01782 01783 blob = it->data (); 01784 if (blob->red_box_set ()) { 01785 reduced_box = blob->reduced_box (); 01786 do { 01787 it->forward(); 01788 blob = it->data(); 01789 } 01790 while (blob->cblob() == NULL || blob->joined_to_prev()); 01791 return reduced_box; 01792 } 01793 head_blob = blob; 01794 full_box = blob->bounding_box (); 01795 reduced_box = reduced_box_for_blob (blob, row, &left_above_xht); 01796 do { 01797 it->forward (); 01798 blob = it->data (); 01799 if (blob->cblob() == NULL) 01800 //was pre-chopped 01801 full_box += blob->bounding_box (); 01802 else if (blob->joined_to_prev ()) { 01803 reduced_box += 01804 reduced_box_for_blob(blob, row, &new_left_above_xht); 01805 left_above_xht = MIN (left_above_xht, new_left_above_xht); 01806 } 01807 } 01808 //until next real blob 01809 while (blob->cblob() == NULL || blob->joined_to_prev()); 01810 01811 if ((reduced_box.width () > 0) && 01812 ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ()) 01813 < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) { 01814 #ifndef GRAPHICS_DISABLED 01815 if (textord_show_initial_words) 01816 reduced_box.plot (to_win, ScrollView::YELLOW, ScrollView::YELLOW); 01817 #endif 01818 } 01819 else 01820 reduced_box = full_box; 01821 head_blob->set_reduced_box (reduced_box); 01822 return reduced_box; 01823 } 01824 01825 01826 /************************************************************************* 01827 * reduced_box_for_blob() 01828 * Find box for blob which is the same height and y position as the whole blob, 01829 * but whose left limit is the left most position of the blob ABOVE the 01830 * baseline and whose right limit is the right most position of the blob BELOW 01831 * the xheight. 01832 * 01833 * 01834 * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on 01835 * "home". Perhaps we need something which say if the width ABOVE the 01836 * xht alone includes the whole of the reduced width, then use the full 01837 * blob box - Might still fail on italic F 01838 * 01839 * Alternatively we could be a little less severe and only reduce the 01840 * left and right edges by half the difference between the full box and 01841 * the reduced box. 01842 * 01843 * NOTE that we need to rotate all the coordinates as 01844 * find_blob_limits finds the y min and max within a specified x band 01845 *************************************************************************/ 01846 TBOX Textord::reduced_box_for_blob( 01847 BLOBNBOX *blob, 01848 TO_ROW *row, 01849 inT16 *left_above_xht) { 01850 float baseline; 01851 float blob_x_centre; 01852 float left_limit; 01853 float right_limit; 01854 float junk; 01855 TBOX blob_box; 01856 01857 /* Find baseline of centre of blob */ 01858 01859 blob_box = blob->bounding_box (); 01860 blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 01861 baseline = row->baseline.y (blob_x_centre); 01862 01863 /* 01864 Find LH limit of blob ABOVE the xht. This is so that we can detect certain 01865 caps ht chars which should NOT have their box reduced: T, Y, V, W etc 01866 */ 01867 left_limit = (float) MAX_INT32; 01868 junk = (float) -MAX_INT32; 01869 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), 01870 static_cast<float>(MAX_INT16), left_limit, junk); 01871 if (left_limit > junk) 01872 *left_above_xht = MAX_INT16; //No area above xht 01873 else 01874 *left_above_xht = (inT16) floor (left_limit); 01875 /* 01876 Find reduced LH limit of blob - the left extent of the region ABOVE the 01877 baseline. 01878 */ 01879 left_limit = (float) MAX_INT32; 01880 junk = (float) -MAX_INT32; 01881 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16), 01882 left_limit, junk); 01883 01884 if (left_limit > junk) 01885 return TBOX (); //no area within xht so return empty box 01886 /* 01887 Find reduced RH limit of blob - the right extent of the region BELOW the xht. 01888 */ 01889 junk = (float) MAX_INT32; 01890 right_limit = (float) -MAX_INT32; 01891 find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16), 01892 (baseline + row->xheight), junk, right_limit); 01893 if (junk > right_limit) 01894 return TBOX (); //no area within xht so return empty box 01895 01896 return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()), 01897 ICOORD ((inT16) ceil (right_limit), blob_box.top ())); 01898 } 01899 } // namespace tesseract