Tesseract  3.02
tesseract-ocr/textord/tospace.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * tospace.cpp
00003  *
00004  * Compute fuzzy word spacing thresholds for each row.
00005  * I.e. set :   max_nonspace
00006  *              space_threshold
00007  *              min_space
00008  *              kern_size
00009  *              space_size
00010  * for each row.
00011  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
00012  *
00013  * Note: functions in this file were originally not members of any
00014  * class or enclosed by any namespace. Now they are all static members
00015  * of the Textord class.
00016  *
00017  **********************************************************************/
00018 
00019 #include "textord.h"
00020 #include "mfcpch.h"
00021 #include "tovars.h"
00022 #include "drawtord.h"
00023 #include "textord.h"
00024 #include "ndminx.h"
00025 #include "statistc.h"
00026 
00027 // Include automatically generated configuration file if running autoconf.
00028 #ifdef HAVE_CONFIG_H
00029 #include "config_auto.h"
00030 #endif
00031 
00032 #define MAXSPACING      128      /*max expected spacing in pix */
00033 
00034 namespace tesseract {
00035 void Textord::to_spacing(
00036     ICOORD page_tr,        //topright of page
00037     TO_BLOCK_LIST *blocks  //blocks on page
00038                          ) {
00039   TO_BLOCK_IT block_it;          //iterator
00040   TO_BLOCK *block;               //current block;
00041   TO_ROW_IT row_it;              //row iterator
00042   TO_ROW *row;                   //current row
00043   int block_index;               //block number
00044   int row_index;                 //row number
00045   //estimated width of real spaces for whole block
00046   inT16 block_space_gap_width;
00047   //estimated width of non space gaps for whole block
00048   inT16 block_non_space_gap_width;
00049   BOOL8 old_text_ord_proportional;//old fixed/prop result
00050   GAPMAP *gapmap = NULL;          //map of big vert gaps in blk
00051 
00052   block_it.set_to_list (blocks);
00053   block_index = 1;
00054   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00055   block_it.forward ()) {
00056     block = block_it.data ();
00057     gapmap = new GAPMAP (block);
00058     block_spacing_stats(block,
00059                         gapmap,
00060                         old_text_ord_proportional,
00061                         block_space_gap_width,
00062                         block_non_space_gap_width);
00063     // Make sure relative values of block-level space and non-space gap
00064     // widths are reasonable. The ratio of 1:3 is also used in
00065     // block_spacing_stats, to corrrect the block_space_gap_width
00066     // Useful for arabic and hindi, when the non-space gap width is
00067     // often over-estimated and should not be trusted. A similar ratio
00068     // is found in block_spacing_stats.
00069     if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
00070         (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
00071       block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
00072     }
00073     row_it.set_to_list (block->get_rows ());
00074     row_index = 1;
00075     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00076       row = row_it.data ();
00077       if ((row->pitch_decision == PITCH_DEF_PROP) ||
00078       (row->pitch_decision == PITCH_CORR_PROP)) {
00079         if ((tosp_debug_level > 0) && !old_text_ord_proportional)
00080           tprintf ("Block %d Row %d: Now Proportional\n",
00081             block_index, row_index);
00082         row_spacing_stats(row,
00083                           gapmap,
00084                           block_index,
00085                           row_index,
00086                           block_space_gap_width,
00087                           block_non_space_gap_width);
00088       }
00089       else {
00090         if ((tosp_debug_level > 0) && old_text_ord_proportional)
00091           tprintf
00092             ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
00093             block_index, row_index, row->pitch_decision,
00094             row->fixed_pitch);
00095       }
00096 #ifndef GRAPHICS_DISABLED
00097       if (textord_show_initial_words)
00098         plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
00099 #endif
00100       row_index++;
00101     }
00102     delete gapmap;
00103     block_index++;
00104   }
00105 }
00106 
00107 
00108 /*************************************************************************
00109  * block_spacing_stats()
00110  *************************************************************************/
00111 
00112 void Textord::block_spacing_stats(
00113     TO_BLOCK *block,
00114     GAPMAP *gapmap,
00115     BOOL8 &old_text_ord_proportional,
00116     inT16 &block_space_gap_width,     //resulting estimate
00117     inT16 &block_non_space_gap_width  //resulting estimate
00118                                   ) {
00119   TO_ROW_IT row_it;              //row iterator
00120   TO_ROW *row;                   //current row
00121   BLOBNBOX_IT blob_it;           //iterator
00122 
00123   STATS centre_to_centre_stats (0, MAXSPACING);
00124   //DEBUG USE ONLY
00125   STATS all_gap_stats (0, MAXSPACING);
00126   STATS space_gap_stats (0, MAXSPACING);
00127   inT16 minwidth = MAX_INT16;    //narrowest blob
00128   TBOX blob_box;
00129   TBOX prev_blob_box;
00130   inT16 centre_to_centre;
00131   inT16 gap_width;
00132   float real_space_threshold;
00133   float iqr_centre_to_centre;    //DEBUG USE ONLY
00134   float iqr_all_gap_stats;       //DEBUG USE ONLY
00135   inT32 end_of_row;
00136   inT32 row_length;
00137 
00138   row_it.set_to_list (block->get_rows ());
00139   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00140     row = row_it.data ();
00141     if (!row->blob_list ()->empty () &&
00142       (!tosp_only_use_prop_rows ||
00143       (row->pitch_decision == PITCH_DEF_PROP) ||
00144     (row->pitch_decision == PITCH_CORR_PROP))) {
00145       blob_it.set_to_list (row->blob_list ());
00146       blob_it.mark_cycle_pt ();
00147       end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00148       if (tosp_use_pre_chopping)
00149         blob_box = box_next_pre_chopped (&blob_it);
00150       else if (tosp_stats_use_xht_gaps)
00151         blob_box = reduced_box_next (row, &blob_it);
00152       else
00153         blob_box = box_next (&blob_it);
00154       row_length = end_of_row - blob_box.left ();
00155       if (blob_box.width () < minwidth)
00156         minwidth = blob_box.width ();
00157       prev_blob_box = blob_box;
00158       while (!blob_it.cycled_list ()) {
00159         if (tosp_use_pre_chopping)
00160           blob_box = box_next_pre_chopped (&blob_it);
00161         else if (tosp_stats_use_xht_gaps)
00162           blob_box = reduced_box_next (row, &blob_it);
00163         else
00164           blob_box = box_next (&blob_it);
00165         if (blob_box.width () < minwidth)
00166           minwidth = blob_box.width ();
00167         gap_width = blob_box.left () - prev_blob_box.right ();
00168         if (!ignore_big_gap (row, row_length, gapmap,
00169                              prev_blob_box.right (), blob_box.left ())) {
00170           all_gap_stats.add (gap_width, 1);
00171 
00172           centre_to_centre = (blob_box.left () + blob_box.right () -
00173             (prev_blob_box.left () +
00174              prev_blob_box.right ())) / 2;
00175           //DEBUG
00176           centre_to_centre_stats.add (centre_to_centre, 1);
00177           // DEBUG
00178         }
00179         prev_blob_box = blob_box;
00180       }
00181     }
00182   }
00183 
00184                                  //Inadequate samples
00185   if (all_gap_stats.get_total () <= 1) {
00186     block_non_space_gap_width = minwidth;
00187     block_space_gap_width = -1;  //No est. space width
00188                                  //DEBUG
00189     old_text_ord_proportional = TRUE;
00190   }
00191   else {
00192     /* For debug only ..... */
00193     iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
00194       centre_to_centre_stats.ile (0.25);
00195     iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
00196     old_text_ord_proportional =
00197       iqr_centre_to_centre * 2 > iqr_all_gap_stats;
00198     /* .......For debug only */
00199 
00200     /*
00201     The median of the gaps is used as an estimate of the NON-SPACE gap width.
00202     This RELIES on the assumption that there are more gaps WITHIN words than
00203     BETWEEN words in a block
00204 
00205     Now try to estimate the width of a real space for all real spaces in the
00206     block. Do this by using a crude threshold to ignore "narrow" gaps, then
00207     find the median of the "wide" gaps and use this.
00208     */
00209     block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
00210     // median gap
00211 
00212     row_it.set_to_list (block->get_rows ());
00213     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00214       row = row_it.data ();
00215       if (!row->blob_list ()->empty () &&
00216         (!tosp_only_use_prop_rows ||
00217         (row->pitch_decision == PITCH_DEF_PROP) ||
00218       (row->pitch_decision == PITCH_CORR_PROP))) {
00219         real_space_threshold =
00220           MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
00221           tosp_init_guess_xht_mult * row->xheight);
00222         blob_it.set_to_list (row->blob_list ());
00223         blob_it.mark_cycle_pt ();
00224         end_of_row =
00225           blob_it.data_relative (-1)->bounding_box ().right ();
00226         if (tosp_use_pre_chopping)
00227           blob_box = box_next_pre_chopped (&blob_it);
00228         else if (tosp_stats_use_xht_gaps)
00229           blob_box = reduced_box_next (row, &blob_it);
00230         else
00231           blob_box = box_next (&blob_it);
00232         row_length = blob_box.left () - end_of_row;
00233         prev_blob_box = blob_box;
00234         while (!blob_it.cycled_list ()) {
00235           if (tosp_use_pre_chopping)
00236             blob_box = box_next_pre_chopped (&blob_it);
00237           else if (tosp_stats_use_xht_gaps)
00238             blob_box = reduced_box_next (row, &blob_it);
00239           else
00240             blob_box = box_next (&blob_it);
00241           gap_width = blob_box.left () - prev_blob_box.right ();
00242           if ((gap_width > real_space_threshold) &&
00243             !ignore_big_gap (row, row_length, gapmap,
00244             prev_blob_box.right (),
00245           blob_box.left ())) {
00246             /*
00247             If tosp_use_cert_spaces is enabled, the estimate of the space gap is
00248             restricted to obvious spaces - those wider than half the xht or those
00249             with wide blobs on both sides - i.e not things that are suspect 1's or
00250             punctuation that is sometimes widely spaced.
00251             */
00252             if (!tosp_block_use_cert_spaces ||
00253               (gap_width >
00254               tosp_fuzzy_space_factor2 * row->xheight)
00255               ||
00256               ((gap_width >
00257               tosp_fuzzy_space_factor1 * row->xheight)
00258               && (!tosp_narrow_blobs_not_cert
00259               || (!narrow_blob (row, prev_blob_box)
00260               && !narrow_blob (row, blob_box))))
00261               || (wide_blob (row, prev_blob_box)
00262               && wide_blob (row, blob_box)))
00263               space_gap_stats.add (gap_width, 1);
00264           }
00265           prev_blob_box = blob_box;
00266         }
00267       }
00268     }
00269                                  //Inadequate samples
00270     if (space_gap_stats.get_total () <= 2)
00271       block_space_gap_width = -1;//No est. space width
00272     else
00273       block_space_gap_width =
00274         MAX ((inT16) floor (space_gap_stats.median ()),
00275         3 * block_non_space_gap_width);
00276   }
00277 }
00278 
00279 
00280 /*************************************************************************
00281  * row_spacing_stats()
00282  * Set values for min_space, max_non_space based on row stats only
00283  * If failure - return 0 values.
00284  *************************************************************************/
// Estimates the spacing parameters of a single row.
//
// Phase 1 measures the gap between each pair of consecutive blob boxes.
// Gaps rejected by ignore_big_gap() are only counted (large_gap_count); the
// rest are histogrammed into all_gap_stats and, depending on whether they
// reach real_space_threshold, into the space or small-gap histograms.
// Phase 2 picks an estimator (old_to_method, isolated_row_stats, or the
// block defaults) for row->kern_size, row->space_size and
// row->space_threshold. Later phases optionally refine the threshold, apply
// the sanity rules selected by tosp_sanity_method, and finally derive
// row->min_space and row->max_nonspace.
//
// Parameters:
//   row                        row to analyse; its spacing fields are written.
//   gapmap                     passed through to ignore_big_gap() and
//                              isolated_row_stats().
//   block_idx, row_idx         used for debug output and passed to
//                              isolated_row_stats().
//   block_space_gap_width      block space estimate; values <= 0 are treated
//                              as "no estimate" and replaced by half the row
//                              x-height.
//   block_non_space_gap_width  block non-space (kern) estimate.
//
// NOTE(review): the banner above says "If failure - return 0 values", but this
// function is void and no such path is visible here - the remark looks stale.
00285 void Textord::row_spacing_stats(
00286     TO_ROW *row,
00287     GAPMAP *gapmap,
00288     inT16 block_idx,
00289     inT16 row_idx,
00290     inT16 block_space_gap_width,    //estimate for block
00291     inT16 block_non_space_gap_width //estimate for block
00292                                 ) {
00293   //iterator
00294   BLOBNBOX_IT blob_it = row->blob_list ();
00295   STATS all_gap_stats (0, MAXSPACING);         // every non-ignored gap
00296   STATS cert_space_gap_stats (0, MAXSPACING);  // gaps >= threshold passing the certainty test
00297   STATS all_space_gap_stats (0, MAXSPACING);   // every gap >= threshold
00298   STATS small_gap_stats (0, MAXSPACING);       // every gap < threshold
00299   TBOX blob_box;
00300   TBOX prev_blob_box;
00301   inT16 gap_width;
00302   inT16 real_space_threshold = 0;
00303   inT16 max = 0;                 // tallest pile seen so far in the max_nonspace search
00304   inT16 index;
00305   inT16 large_gap_count = 0;     // gaps rejected by ignore_big_gap()
00306   BOOL8 suspected_table;
00307   inT32 max_max_nonspace;        //upper bound
00308   BOOL8 good_block_space_estimate = block_space_gap_width > 0;
00309   inT32 end_of_row;
00310   inT32 row_length = 0;
00311   float sane_space;
00312   inT32 sane_threshold;
00313 
00314   /* Collect first pass stats for row */
00315 
  // No usable block estimate: fall back to half the row x-height.
00316   if (!good_block_space_estimate)
00317     block_space_gap_width = inT16 (floor (row->xheight / 2));
00318   if (!row->blob_list ()->empty ()) {
    // Initial space/kern split: biased interpolation between the two block
    // estimates when tosp_threshold_bias1 > 0, otherwise their midpoint.
00319     if (tosp_threshold_bias1 > 0)
00320       real_space_threshold =
00321         block_non_space_gap_width +
00322         inT16 (floor (0.5 +
00323         tosp_threshold_bias1 * (block_space_gap_width -
00324                                 block_non_space_gap_width)));
00325     else
00326       real_space_threshold =     //Old TO method
00327         (block_space_gap_width + block_non_space_gap_width) / 2;
00328     blob_it.set_to_list (row->blob_list ());
00329     blob_it.mark_cycle_pt ();
    // Right edge of the blob one step before the iterator start
    // (presumably the last blob of the row - TODO confirm list is circular).
00330     end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00331     if (tosp_use_pre_chopping)
00332       blob_box = box_next_pre_chopped (&blob_it);
00333     else if (tosp_stats_use_xht_gaps)
00334       blob_box = reduced_box_next (row, &blob_it);
00335     else
00336       blob_box = box_next (&blob_it);
00337     row_length = end_of_row - blob_box.left ();
00338     prev_blob_box = blob_box;
00339     while (!blob_it.cycled_list ()) {
00340       if (tosp_use_pre_chopping)
00341         blob_box = box_next_pre_chopped (&blob_it);
00342       else if (tosp_stats_use_xht_gaps)
00343         blob_box = reduced_box_next (row, &blob_it);
00344       else
00345         blob_box = box_next (&blob_it);
00346       gap_width = blob_box.left () - prev_blob_box.right ();
      // Ignored gaps are counted but kept out of every histogram.
00347       if (ignore_big_gap (row, row_length, gapmap,
00348         prev_blob_box.right (), blob_box.left ()))
00349         large_gap_count++;
00350       else {
00351         if (gap_width >= real_space_threshold) {
          // "Certain" space: certainty test disabled, or gap wide relative to
          // the x-height (with optional narrow-blob veto), or wide blobs on
          // both sides.
00352           if (!tosp_row_use_cert_spaces ||
00353             (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00354             ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
00355             && (!tosp_narrow_blobs_not_cert
00356             || (!narrow_blob (row, prev_blob_box)
00357             && !narrow_blob (row, blob_box))))
00358             || (wide_blob (row, prev_blob_box)
00359             && wide_blob (row, blob_box)))
00360             cert_space_gap_stats.add (gap_width, 1);
00361           all_space_gap_stats.add (gap_width, 1);
00362         }
00363         else
00364           small_gap_stats.add (gap_width, 1);
00365         all_gap_stats.add (gap_width, 1);
00366       }
00367       prev_blob_box = blob_box;
00368     }
00369   }
  // A row is treated as a possible table row when more than one big gap was
  // ignored, or when one was ignored and there are few gap samples.
00370   suspected_table = (large_gap_count > 1) ||
00371       ((large_gap_count > 0) &&
00372        (all_gap_stats.get_total () <= tosp_few_samples));
00373 
00374   /* Now determine row kern size, space size and threshold */
00375 
  // Estimator choice:
  //  1. enough certain spaces (or a table/short row with at least one):
  //     old_to_method on the certain-space histogram;
  //  2. otherwise isolated_row_stats, when enabled and it succeeds;
  //  3. otherwise block defaults (tosp_row_use_cert_spaces1 with a good block
  //     estimate) or old_to_method on the all-space histogram.
00376   if ((cert_space_gap_stats.get_total () >=
00377     tosp_enough_space_samples_for_median) ||
00378     ((suspected_table ||
00379     all_gap_stats.get_total () <= tosp_short_row) &&
00380     cert_space_gap_stats.get_total () > 0)) {
00381     old_to_method(row,
00382                   &all_gap_stats,
00383                   &cert_space_gap_stats,
00384                   &small_gap_stats,
00385                   block_space_gap_width,
00386                   block_non_space_gap_width);
00387   } else {
00388     if (!tosp_recovery_isolated_row_stats ||
00389         !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
00390                              block_idx, row_idx)) {
00391       if (tosp_row_use_cert_spaces && (tosp_debug_level > 5))
00392         tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
00393           block_idx, row_idx);
00394       if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
00395                                  //Use block default
00396         row->space_size = block_space_gap_width;
00397         if (all_gap_stats.get_total () > tosp_redo_kern_limit)
00398           row->kern_size = all_gap_stats.median ();
00399         else
00400           row->kern_size = block_non_space_gap_width;
00401         row->space_threshold =
00402           inT32 (floor ((row->space_size + row->kern_size) /
00403                         tosp_old_sp_kn_th_factor));
00404       }
00405       else
00406         old_to_method(row,
00407                       &all_gap_stats,
00408                       &all_space_gap_stats,
00409                       &small_gap_stats,
00410                       block_space_gap_width,
00411                       block_non_space_gap_width);
00412     }
00413   }
00414 
00415   if (tosp_improve_thresh && !suspected_table)
00416     improve_row_threshold(row, &all_gap_stats);
00417 
00418   /* Now lets try to be careful not to do anything silly with tables when we
00419   are ignoring big gaps*/
00420   if (tosp_sanity_method == 0) {
    // Method 0: only suspected table rows are adjusted, and only when the
    // space size is below tosp_table_kn_sp_ratio times the kern size.
00421     if (suspected_table &&
00422     (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
00423       if (tosp_debug_level > 5)
00424         tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
00425           block_idx, row_idx,
00426           row->kern_size, row->space_threshold, row->space_size);
00427       row->space_threshold =
00428         (inT32) (tosp_table_kn_sp_ratio * row->kern_size);
00429       row->space_size = MAX (row->space_threshold + 1, row->xheight);
00430     }
00431   }
00432   else if (tosp_sanity_method == 1) {
    // Method 1: three independent checks applied in sequence - space vs kern
    // separation, threshold vs kern distance, then the table-specific floor.
00433     sane_space = row->space_size;
00434     /* NEVER let space size get too close to kern size */
00435     if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
00436       || ((row->space_size - row->kern_size) <
00437     (tosp_silly_kn_sp_gap * row->xheight))) {
00438       if (good_block_space_estimate &&
00439         (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
00440         sane_space = block_space_gap_width;
00441       else
00442         sane_space =
00443           MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
00444           row->xheight / 2);
00445       if (tosp_debug_level > 5)
00446         tprintf
00447           ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
00448           block_idx, row_idx, row->kern_size, row->space_threshold,
00449           row->space_size, sane_space);
00450       row->space_size = sane_space;
00451       row->space_threshold =
00452         inT32 (floor ((row->space_size + row->kern_size) /
00453                       tosp_old_sp_kn_th_factor));
00454     }
00455     /* NEVER let threshold get VERY far away from kern */
00456     sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
00457       MAX (row->kern_size, 2.5)));
00458     if (row->space_threshold > sane_threshold) {
00459       if (tosp_debug_level > 5)
00460         tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
00461           block_idx, row_idx,
00462           row->kern_size,
00463           row->space_threshold, row->space_size, sane_threshold);
00464       row->space_threshold = sane_threshold;
      // Keep space_size strictly above the clamped threshold.
00465       if (row->space_size <= sane_threshold)
00466         row->space_size = row->space_threshold + 1.0f;
00467     }
00468     /* Beware of tables - there may be NO spaces */
00469     if (suspected_table) {
00470       sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
00471         tosp_table_xht_sp_ratio * row->xheight);
00472       sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
00473 
00474       if ((row->space_size < sane_space) ||
00475       (row->space_threshold < sane_threshold)) {
00476         if (tosp_debug_level > 5)
00477           tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
00478             block_idx, row_idx,
00479             row->kern_size,
00480             row->space_threshold, row->space_size);
00481                                  //the minimum sane value
00482         row->space_threshold = (inT32) sane_space;
00483         row->space_size = MAX (row->space_threshold + 1, row->xheight);
00484       }
00485     }
00486   }
00487 
00488   /* Now lets try to put some error limits on the threshold */
00489 
00490   if (tosp_old_to_method) {
00491     /* Old textord made a space if gap >= threshold */
00492                                  //NO FUZZY SPACES YET
00493     row->max_nonspace = row->space_threshold;
00494                                  //NO FUZZY SPACES       YET
00495     row->min_space = row->space_threshold + 1;
00496   }
00497   else {
00498     /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
00499     row->min_space =
00500       MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
00501       inT32 (row->space_size));
00502     if (row->min_space <= row->space_threshold)
00503                                  //Dont be silly
00504       row->min_space = row->space_threshold + 1;
00505     /*
00506     Lets try to guess the max certain kern gap by looking at the cluster of
00507     kerns for the row. The row is proportional so the kerns should cluster
00508     tightly at the bottom of the distribution. We also expect most gaps to be
00509     kerns. Find the maximum of the kern piles between 0 and twice the kern
00510     estimate. Piles before the first one with less than 1/10 the maximum
00511     number of samples can be taken as certain kerns.
00512 
00513       Of course, there are some cases where the kern peak and space peaks merge,
00514       so we will put an UPPER limit on the max certain kern gap of some fraction
00515       below the threshold.
00516     */
00517 
    // Upper limit for the search: midway between kern size and threshold.
00518     max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
00519 
00520                                  //default
00521     row->max_nonspace = max_max_nonspace;
    // Scan piles upward, tracking the tallest one; stop at the first pile
    // above kern_size holding fewer than a tenth of that maximum.
00522     for (index = 0; index <= max_max_nonspace; index++) {
00523       if (all_gap_stats.pile_count (index) > max)
00524         max = all_gap_stats.pile_count (index);
00525       if ((index > row->kern_size) &&
00526       (all_gap_stats.pile_count (index) < 0.1 * max)) {
00527         row->max_nonspace = index;
00528         break;
00529       }
00530     }
00531   }
00532 
00533   /* Yet another algorithm - simpler this time - just choose a fraction of the
00534   threshold to space range */
00535 
00536   if ((tosp_fuzzy_sp_fraction > 0) &&
00537     (row->space_size > row->space_threshold))
00538     row->min_space = MAX (row->min_space,
00539       (inT32) ceil (row->space_threshold +
00540       tosp_fuzzy_sp_fraction *
00541       (row->space_size -
00542       row->space_threshold)));
00543 
00544   /* Ensure that ANY space less than some multiplier times the kern size is
00545   fuzzy.  In tables there is a risk of erroneously setting a small space size
00546   when there are no real spaces. Sometimes tables have text squashed into
00547   columns so that the kn->sp ratio is small anyway - this means that we cant
00548   use this to force a wider separation - hence we rely on context to join any
00549   dubious breaks. */
00550 
00551   if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
00552     (suspected_table || tosp_fuzzy_limit_all))
00553     row->min_space = MAX (row->min_space,
00554       (inT32) ceil (tosp_table_fuzzy_kn_sp_ratio *
00555       row->kern_size));
00556 
  // When enabled, this overrides whichever max_nonspace was chosen above
  // with a fraction of the kern-to-threshold range.
00557   if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
00558     row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
00559       tosp_fuzzy_kn_fraction *
00560       (row->space_threshold -
00561       row->kern_size));
00562   }
00563   if (row->max_nonspace > row->space_threshold) {
00564                                  //Dont be silly
00565     row->max_nonspace = row->space_threshold;
00566   }
00567 
  // Debug summary: block-level inputs first, then the row-level results.
00568   if (tosp_debug_level > 5)
00569     tprintf
00570       ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
00571       block_idx, row_idx, row_length, block_non_space_gap_width,
00572       block_space_gap_width, real_space_threshold, row->kern_size,
00573       row->max_nonspace, row->space_threshold, row->min_space,
00574       row->space_size);
00575   if (tosp_debug_level > 10)
00576     tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
00577             "row->space_threshold = %d\n",
00578             row->kern_size, row->space_size, row->space_threshold);
00579 }
00580 
00581 void Textord::old_to_method(
00582     TO_ROW *row,
00583     STATS *all_gap_stats,
00584     STATS *space_gap_stats,
00585     STATS *small_gap_stats,
00586     inT16 block_space_gap_width,     //estimate for block
00587     inT16 block_non_space_gap_width  //estimate for block
00588                             ) {
00589   /* First, estimate row space size */
00590   /* Old to condition was > 2 */
00591   if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
00592   //Adequate samples
00593     /* Set space size to median of spaces BUT limits it if it seems wildly out */
00594     row->space_size = space_gap_stats->median ();
00595     if (row->space_size > block_space_gap_width * 1.5) {
00596       if (tosp_old_to_bug_fix)
00597         row->space_size = block_space_gap_width * 1.5;
00598       else
00599                                  //BUG??? should be *1.5
00600         row->space_size = block_space_gap_width;
00601     }
00602     if (row->space_size < (block_non_space_gap_width * 2) + 1)
00603       row->space_size = (block_non_space_gap_width * 2) + 1;
00604   }
00605                                  //Only 1 or 2 samples
00606   else if (space_gap_stats->get_total () >= 1) {
00607                                  //hence mean not median
00608     row->space_size = space_gap_stats->mean ();
00609     if (row->space_size > block_space_gap_width * 1.5) {
00610       if (tosp_old_to_bug_fix)
00611         row->space_size = block_space_gap_width * 1.5;
00612       else
00613                                  //BUG??? should be *1.5
00614         row->space_size = block_space_gap_width;
00615     }
00616     if (row->space_size < (block_non_space_gap_width * 3) + 1)
00617       row->space_size = (block_non_space_gap_width * 3) + 1;
00618   }
00619   else {
00620                                  //Use block default
00621     row->space_size = block_space_gap_width;
00622   }
00623 
00624   /* Next, estimate row kern size */
00625   if ((tosp_only_small_gaps_for_kern) &&
00626     (small_gap_stats->get_total () > tosp_redo_kern_limit))
00627     row->kern_size = small_gap_stats->median ();
00628   else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
00629     row->kern_size = all_gap_stats->median ();
00630   else                          //old TO -SAME FOR ALL ROWS
00631     row->kern_size = block_non_space_gap_width;
00632 
00633   /* Finally, estimate row space threshold */
00634   if (tosp_threshold_bias2 > 0) {
00635     row->space_threshold =
00636         inT32 (floor (0.5 + row->kern_size +
00637                       tosp_threshold_bias2 * (row->space_size -
00638                                               row->kern_size)));
00639   } else {
00640     /*
00641       NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold
00642     and holds this in a float. The use is with a >= test
00643     NEW textord uses an integer threshold and a > test
00644     It comes to the same thing.
00645       (Though there is a difference in that old textor has integer space_size
00646       and kern_size.)
00647     */
00648     row->space_threshold =
00649         inT32 (floor ((row->space_size + row->kern_size) / 2));
00650   }
00651 
00652   // Apply the same logic and ratios as in row_spacing_stats to
00653   // restrict relative values of the row's space_size, kern_size, and
00654   // space_threshold
00655   if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
00656       ((row->space_size <
00657         tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) ||
00658        ((row->space_size - row->kern_size) <
00659         tosp_silly_kn_sp_gap * row->xheight))) {
00660     if (row->kern_size > 2.5)
00661       row->kern_size = row->space_size / tosp_min_sane_kn_sp;
00662     row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) /
00663                                          tosp_old_sp_kn_th_factor));
00664   }
00665 }
00666 
00667 
00668 /*************************************************************************
00669  * isolated_row_stats()
00670  * Set values for min_space, max_non_space based on row stats only
00671  *************************************************************************/
00672 BOOL8 Textord::isolated_row_stats(TO_ROW *row,
00673                                   GAPMAP *gapmap,
00674                                   STATS *all_gap_stats,
00675                                   BOOL8 suspected_table,
00676                                   inT16 block_idx,
00677                                   inT16 row_idx) {
00678   float kern_estimate;
00679   float crude_threshold_estimate;
00680   inT16 small_gaps_count;
00681   inT16 total;
00682   //iterator
00683   BLOBNBOX_IT blob_it = row->blob_list ();
00684   STATS cert_space_gap_stats (0, MAXSPACING);
00685   STATS all_space_gap_stats (0, MAXSPACING);
00686   STATS small_gap_stats (0, MAXSPACING);
00687   TBOX blob_box;
00688   TBOX prev_blob_box;
00689   inT16 gap_width;
00690   inT32 end_of_row;
00691   inT32 row_length;
00692 
00693   kern_estimate = all_gap_stats->median ();
00694   crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
00695     tosp_init_guess_xht_mult * row->xheight);
00696   small_gaps_count = stats_count_under (all_gap_stats,
00697     (inT16)
00698     ceil (crude_threshold_estimate));
00699   total = all_gap_stats->get_total ();
00700 
00701   if ((total <= tosp_redo_kern_limit) ||
00702     ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
00703   (total - small_gaps_count < 1)) {
00704     if (tosp_debug_level > 5)
00705       tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
00706         block_idx, row_idx);
00707     return FALSE;
00708   }
00709   blob_it.set_to_list (row->blob_list ());
00710   blob_it.mark_cycle_pt ();
00711   end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00712   if (tosp_use_pre_chopping)
00713     blob_box = box_next_pre_chopped (&blob_it);
00714   else if (tosp_stats_use_xht_gaps)
00715     blob_box = reduced_box_next (row, &blob_it);
00716   else
00717     blob_box = box_next (&blob_it);
00718   row_length = end_of_row - blob_box.left ();
00719   prev_blob_box = blob_box;
00720   while (!blob_it.cycled_list ()) {
00721     if (tosp_use_pre_chopping)
00722       blob_box = box_next_pre_chopped (&blob_it);
00723     else if (tosp_stats_use_xht_gaps)
00724       blob_box = reduced_box_next (row, &blob_it);
00725     else
00726       blob_box = box_next (&blob_it);
00727     gap_width = blob_box.left () - prev_blob_box.right ();
00728     if (!ignore_big_gap (row, row_length, gapmap,
00729       prev_blob_box.right (), blob_box.left ()) &&
00730     (gap_width > crude_threshold_estimate)) {
00731       if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00732         ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
00733         (!tosp_narrow_blobs_not_cert ||
00734         (!narrow_blob (row, prev_blob_box) &&
00735         !narrow_blob (row, blob_box)))) ||
00736         (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
00737         cert_space_gap_stats.add (gap_width, 1);
00738       all_space_gap_stats.add (gap_width, 1);
00739     }
00740     if (gap_width < crude_threshold_estimate)
00741       small_gap_stats.add (gap_width, 1);
00742 
00743     prev_blob_box = blob_box;
00744   }
00745   if (cert_space_gap_stats.get_total () >=
00746     tosp_enough_space_samples_for_median)
00747                                  //median
00748     row->space_size = cert_space_gap_stats.median ();
00749   else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
00750                                  //to avoid spaced
00751     row->space_size = cert_space_gap_stats.mean ();
00752   //      1's in tables
00753   else if (all_space_gap_stats.get_total () >=
00754     tosp_enough_space_samples_for_median)
00755                                  //median
00756     row->space_size = all_space_gap_stats.median ();
00757   else
00758     row->space_size = all_space_gap_stats.mean ();
00759 
00760   if (tosp_only_small_gaps_for_kern)
00761     row->kern_size = small_gap_stats.median ();
00762   else
00763     row->kern_size = all_gap_stats->median ();
00764   row->space_threshold =
00765     inT32 (floor ((row->space_size + row->kern_size) / 2));
00766   /* Sanity check */
00767   if ((row->kern_size >= row->space_threshold) ||
00768     (row->space_threshold >= row->space_size) ||
00769   (row->space_threshold <= 0)) {
00770     if (tosp_debug_level > 5)
00771       tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
00772         block_idx, row_idx,
00773         row->kern_size, row->space_threshold, row->space_size);
00774     row->kern_size = 0.0f;
00775     row->space_threshold = 0;
00776     row->space_size = 0.0f;
00777     return FALSE;
00778   }
00779 
00780   if (tosp_debug_level > 5)
00781     tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
00782       block_idx, row_idx,
00783       row->kern_size, row->space_threshold, row->space_size);
00784   return TRUE;
00785 }
00786 
00787 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) {
00788   inT16 index;
00789   inT16 total = 0;
00790 
00791   for (index = 0; index < threshold; index++)
00792     total += stats->pile_count (index);
00793   return total;
00794 }
00795 
00796 
00797 /*************************************************************************
00798  * improve_row_threshold()
00799  *    Try to recognise a "normal line" -
00800  *           > 25 gaps
00801  *     &&    space > 3 * kn  && space > 10
00802  *              (I.e. reasonably large space and kn:sp ratio)
00803  *     &&    > 3/4 # gaps < kn + (sp - kn)/3
00804  *              (I.e. most gaps are well away from space estimate)
00805  *     &&    a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
00806  *           somewhere in the histogram between kn and sp
00807  *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
00808  *          NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
00809  *          try moving the default threshold to within this band but leave the
00810  *          fuzzy limit calculation as at present.
00811  *************************************************************************/
00812 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
00813   float sp = row->space_size;
00814   float kn = row->kern_size;
00815   inT16 reqd_zero_width = 0;
00816   inT16 zero_width = 0;
00817   inT16 zero_start = 0;
00818   inT16 index = 0;
00819 
00820   if (tosp_debug_level > 10)
00821     tprintf ("Improve row threshold 0");
00822   if ((all_gap_stats->get_total () <= 25) ||
00823     (sp <= 10) ||
00824     (sp <= 3 * kn) ||
00825     (stats_count_under (all_gap_stats,
00826     (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
00827     (0.75 * all_gap_stats->get_total ())))
00828     return;
00829   if (tosp_debug_level > 10)
00830     tprintf (" 1");
00831   /*
00832   Look for the first region of all 0's in the histogram which is wider than
00833   max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
00834   threshold is not within it, move the threshold so that is is just inside it.
00835   */
00836   reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
00837   if (reqd_zero_width < 3)
00838     reqd_zero_width = 3;
00839 
00840   for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
00841     if (all_gap_stats->pile_count (index) == 0) {
00842       if (zero_width == 0)
00843         zero_start = index;
00844       zero_width++;
00845     }
00846     else {
00847       if (zero_width >= reqd_zero_width)
00848         break;
00849       else {
00850         zero_width = 0;
00851       }
00852     }
00853   }
00854   index--;
00855   if (tosp_debug_level > 10)
00856     tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
00857       reqd_zero_width, zero_width, zero_start, row->space_threshold);
00858   if ((zero_width < reqd_zero_width) ||
00859     ((row->space_threshold >= zero_start) &&
00860     (row->space_threshold <= index)))
00861     return;
00862   if (tosp_debug_level > 10)
00863     tprintf (" 2");
00864   if (row->space_threshold < zero_start) {
00865     if (tosp_debug_level > 5)
00866       tprintf
00867         ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n",
00868         kn, sp, zero_start, index, row->space_threshold, zero_start);
00869     row->space_threshold = zero_start;
00870   }
00871   if (row->space_threshold > index) {
00872     if (tosp_debug_level > 5)
00873       tprintf
00874         ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n",
00875         kn, sp, zero_start, index, row->space_threshold, index);
00876     row->space_threshold = index;
00877   }
00878 }
00879 
00880 
00881 /**********************************************************************
00882  * make_prop_words
00883  *
00884  * Convert a TO_BLOCK to a BLOCK.
00885  **********************************************************************/
00886 ROW *Textord::make_prop_words(
00887     TO_ROW *row,     // row to make
00888     FCOORD rotation  // for drawing
00889                               ) {
00890   BOOL8 bol;                     //start of line
00891   /* prev_ values are for start of word being built. non prev_ values are for
00892   the gap between the word being built and the next one. */
00893   BOOL8 prev_fuzzy_sp;           //probably space
00894   BOOL8 prev_fuzzy_non;          //probably not
00895   uinT8 prev_blanks;             //in front of word
00896   BOOL8 fuzzy_sp;                //probably space
00897   BOOL8 fuzzy_non;               //probably not
00898   uinT8 blanks;                  //in front of word
00899   BOOL8 prev_gap_was_a_space = FALSE;
00900   BOOL8 break_at_next_gap = FALSE;
00901   ROW *real_row;                 //output row
00902   C_OUTLINE_IT cout_it;
00903   C_BLOB_LIST cblobs;
00904   C_BLOB_IT cblob_it = &cblobs;
00905   WERD_LIST words;
00906   WERD_IT word_it;               //new words
00907   WERD *word;                    //new word
00908   WERD_IT rep_char_it;           //repeated char words
00909   inT32 next_rep_char_word_right = MAX_INT32;
00910   float repetition_spacing;      //gap between repetitions
00911   inT32 xstarts[2];              //row ends
00912   double coeffs[3];              //quadratic
00913   inT32 prev_x;                  //end of prev blob
00914   BLOBNBOX *bblob;               //current blob
00915   TBOX blob_box;                  //bounding box
00916   BLOBNBOX_IT box_it;            //iterator
00917   TBOX prev_blob_box;
00918   TBOX next_blob_box;
00919   inT16 prev_gap = MAX_INT16;
00920   inT16 current_gap = MAX_INT16;
00921   inT16 next_gap = MAX_INT16;
00922   inT16 prev_within_xht_gap = MAX_INT16;
00923   inT16 current_within_xht_gap = MAX_INT16;
00924   inT16 next_within_xht_gap = MAX_INT16;
00925   inT16 word_count = 0;
00926 
00927   rep_char_it.set_to_list (&(row->rep_words));
00928   if (!rep_char_it.empty ()) {
00929     next_rep_char_word_right =
00930       rep_char_it.data ()->bounding_box ().right ();
00931   }
00932 
00933   prev_x = -MAX_INT16;
00934   cblob_it.set_to_list (&cblobs);
00935   box_it.set_to_list (row->blob_list ());
00936   word_it.set_to_list (&words);
00937   bol = TRUE;
00938   prev_blanks = 0;
00939   prev_fuzzy_sp = FALSE;
00940   prev_fuzzy_non = FALSE;
00941   if (!box_it.empty ()) {
00942     xstarts[0] = box_it.data ()->bounding_box ().left ();
00943     if (xstarts[0] > next_rep_char_word_right) {
00944       /* We need to insert a repeated char word at the start of the row */
00945       word = rep_char_it.extract ();
00946       word_it.add_after_then_move (word);
00947       /* Set spaces before repeated char word */
00948       word->set_flag (W_BOL, TRUE);
00949       bol = FALSE;
00950       word->set_blanks (0);
00951                                  //NO uncertainty
00952       word->set_flag (W_FUZZY_SP, FALSE);
00953       word->set_flag (W_FUZZY_NON, FALSE);
00954       xstarts[0] = word->bounding_box ().left ();
00955       /* Set spaces after repeated char word (and leave current word set) */
00956       repetition_spacing = find_mean_blob_spacing (word);
00957       current_gap = box_it.data ()->bounding_box ().left () -
00958         next_rep_char_word_right;
00959       current_within_xht_gap = current_gap;
00960       if (current_gap > tosp_rep_space * repetition_spacing) {
00961         prev_blanks = (uinT8) floor (current_gap / row->space_size);
00962         if (prev_blanks < 1)
00963           prev_blanks = 1;
00964       }
00965       else
00966         prev_blanks = 0;
00967       if (tosp_debug_level > 5)
00968         tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
00969           box_it.data ()->bounding_box ().left (),
00970           box_it.data ()->bounding_box ().bottom (),
00971           repetition_spacing, current_gap);
00972       prev_fuzzy_sp = FALSE;
00973       prev_fuzzy_non = FALSE;
00974       if (rep_char_it.empty ()) {
00975         next_rep_char_word_right = MAX_INT32;
00976       }
00977       else {
00978         rep_char_it.forward ();
00979         next_rep_char_word_right =
00980           rep_char_it.data ()->bounding_box ().right ();
00981       }
00982     }
00983 
00984     peek_at_next_gap(row,
00985                      box_it,
00986                      next_blob_box,
00987                      next_gap,
00988                      next_within_xht_gap);
00989     do {
00990       bblob = box_it.data ();
00991       blob_box = bblob->bounding_box ();
00992       if (bblob->joined_to_prev ()) {
00993         if (bblob->cblob () != NULL) {
00994           cout_it.set_to_list (cblob_it.data ()->out_list ());
00995           cout_it.move_to_last ();
00996           cout_it.add_list_after (bblob->cblob ()->out_list ());
00997           delete bblob->cblob ();
00998         }
00999       } else {
01000         if (bblob->cblob() != NULL)
01001           cblob_it.add_after_then_move (bblob->cblob ());
01002         prev_x = blob_box.right ();
01003       }
01004       box_it.forward ();         //next one
01005       bblob = box_it.data ();
01006       blob_box = bblob->bounding_box ();
01007 
01008       if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
01009         /* Real Blob - not multiple outlines or pre-chopped */
01010         prev_gap = current_gap;
01011         prev_within_xht_gap = current_within_xht_gap;
01012         prev_blob_box = next_blob_box;
01013         current_gap = next_gap;
01014         current_within_xht_gap = next_within_xht_gap;
01015         peek_at_next_gap(row,
01016                          box_it,
01017                          next_blob_box,
01018                          next_gap,
01019                          next_within_xht_gap);
01020 
01021         inT16 prev_gap_arg = prev_gap;
01022         inT16 next_gap_arg = next_gap;
01023         if (tosp_only_use_xht_gaps) {
01024           prev_gap_arg = prev_within_xht_gap;
01025           next_gap_arg = next_within_xht_gap;
01026         }
01027         // Decide if a word-break should be inserted
01028         if (blob_box.left () > next_rep_char_word_right ||
01029             make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
01030                               current_gap, current_within_xht_gap,
01031                               next_blob_box, next_gap_arg,
01032                               blanks, fuzzy_sp, fuzzy_non,
01033                               prev_gap_was_a_space,
01034                               break_at_next_gap) ||
01035             box_it.at_first()) {
01036           /* Form a new word out of the blobs collected */
01037           word = new WERD (&cblobs, prev_blanks, NULL);
01038           word_count++;
01039           word_it.add_after_then_move (word);
01040           if (bol) {
01041             word->set_flag (W_BOL, TRUE);
01042             bol = FALSE;
01043           }
01044           if (prev_fuzzy_sp)
01045                                  //probably space
01046             word->set_flag (W_FUZZY_SP, TRUE);
01047           else if (prev_fuzzy_non)
01048             word->set_flag (W_FUZZY_NON, TRUE);
01049           //probably not
01050 
01051           if (blob_box.left () > next_rep_char_word_right) {
01052             /* We need to insert a repeated char word */
01053             word = rep_char_it.extract ();
01054             word_it.add_after_then_move (word);
01055 
01056             /* Set spaces before repeated char word */
01057             repetition_spacing = find_mean_blob_spacing (word);
01058             current_gap = word->bounding_box ().left () - prev_x;
01059             current_within_xht_gap = current_gap;
01060             if (current_gap > tosp_rep_space * repetition_spacing) {
01061               blanks =
01062                 (uinT8) floor (current_gap / row->space_size);
01063               if (blanks < 1)
01064                 blanks = 1;
01065             }
01066             else
01067               blanks = 0;
01068             if (tosp_debug_level > 5)
01069               tprintf
01070                 ("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
01071                 word->bounding_box ().left (),
01072                 word->bounding_box ().bottom (),
01073                 repetition_spacing, current_gap, blanks);
01074             word->set_blanks (blanks);
01075                                  //NO uncertainty
01076             word->set_flag (W_FUZZY_SP, FALSE);
01077             word->set_flag (W_FUZZY_NON, FALSE);
01078 
01079             /* Set spaces after repeated char word (and leave current word set) */
01080             current_gap =
01081               blob_box.left () - next_rep_char_word_right;
01082             if (current_gap > tosp_rep_space * repetition_spacing) {
01083               blanks = (uinT8) (current_gap / row->space_size);
01084               if (blanks < 1)
01085                 blanks = 1;
01086             }
01087             else
01088               blanks = 0;
01089             if (tosp_debug_level > 5)
01090               tprintf (" Rgap:%d (%d blanks)\n",
01091                 current_gap, blanks);
01092             fuzzy_sp = FALSE;
01093             fuzzy_non = FALSE;
01094 
01095             if (rep_char_it.empty ()) {
01096               next_rep_char_word_right = MAX_INT32;
01097             }
01098             else {
01099               rep_char_it.forward ();
01100               next_rep_char_word_right =
01101                 rep_char_it.data ()->bounding_box ().right ();
01102             }
01103           }
01104 
01105           if (box_it.at_first () && rep_char_it.empty ()) {
01106                                  //at end of line
01107             word->set_flag (W_EOL, TRUE);
01108             xstarts[1] = prev_x;
01109           }
01110           else {
01111             prev_blanks = blanks;
01112             prev_fuzzy_sp = fuzzy_sp;
01113             prev_fuzzy_non = fuzzy_non;
01114           }
01115         }
01116       }
01117     }
01118     while (!box_it.at_first ()); //until back at start
01119 
01120     /* Insert any further repeated char words */
01121     while (!rep_char_it.empty ()) {
01122       word = rep_char_it.extract ();
01123       word_it.add_after_then_move (word);
01124 
01125       /* Set spaces before repeated char word */
01126       repetition_spacing = find_mean_blob_spacing (word);
01127       current_gap = word->bounding_box ().left () - prev_x;
01128       if (current_gap > tosp_rep_space * repetition_spacing) {
01129         blanks = (uinT8) floor (current_gap / row->space_size);
01130         if (blanks < 1)
01131           blanks = 1;
01132       }
01133       else
01134         blanks = 0;
01135       if (tosp_debug_level > 5)
01136         tprintf
01137           ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
01138           word->bounding_box ().left (), word->bounding_box ().bottom (),
01139           repetition_spacing, current_gap, blanks);
01140       word->set_blanks (blanks);
01141                                  //NO uncertainty
01142       word->set_flag (W_FUZZY_SP, FALSE);
01143       word->set_flag (W_FUZZY_NON, FALSE);
01144       prev_x = word->bounding_box ().right ();
01145       if (rep_char_it.empty ()) {
01146                                  //at end of line
01147         word->set_flag (W_EOL, TRUE);
01148         xstarts[1] = prev_x;
01149       }
01150       else {
01151         rep_char_it.forward ();
01152       }
01153     }
01154     coeffs[0] = 0;
01155     coeffs[1] = row->line_m ();
01156     coeffs[2] = row->line_c ();
01157     real_row = new ROW (row,
01158       (inT16) row->kern_size, (inT16) row->space_size);
01159     word_it.set_to_list (real_row->word_list ());
01160                                  //put words in row
01161     word_it.add_list_after (&words);
01162     real_row->recalc_bounding_box ();
01163 
01164     if (tosp_debug_level > 4) {
01165       tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
01166         word_count,
01167         real_row->bounding_box ().left (),
01168         real_row->bounding_box ().bottom (),
01169         real_row->bounding_box ().right (),
01170         real_row->bounding_box ().top ());
01171     }
01172     return real_row;
01173   }
01174   return NULL;
01175 }
01176 
01177 /**********************************************************************
01178  * make_blob_words
01179  *
01180  * Converts words into blobs so that each blob is a single character.
01181  *  Used for chopper test.
01182  **********************************************************************/
01183 ROW *Textord::make_blob_words(
01184     TO_ROW *row,     // row to make
01185     FCOORD rotation  // for drawing
01186                               ) {
01187   bool bol;                      // start of line
01188   ROW *real_row;                 // output row
01189   C_OUTLINE_IT cout_it;
01190   C_BLOB_LIST cblobs;
01191   C_BLOB_IT cblob_it = &cblobs;
01192   WERD_LIST words;
01193   WERD_IT word_it;               // new words
01194   WERD *word;                    // new word
01195   double coeffs[3];              // quadratic
01196   BLOBNBOX *bblob;               // current blob
01197   TBOX blob_box;                 // bounding box
01198   BLOBNBOX_IT box_it;            // iterator
01199   inT16 word_count = 0;
01200 
01201   cblob_it.set_to_list(&cblobs);
01202   box_it.set_to_list(row->blob_list());
01203   word_it.set_to_list(&words);
01204   bol = TRUE;
01205   if (!box_it.empty()) {
01206 
01207     do {
01208       bblob = box_it.data();
01209       blob_box = bblob->bounding_box();
01210       if (bblob->joined_to_prev()) {
01211         if (bblob->cblob() != NULL) {
01212           cout_it.set_to_list(cblob_it.data()->out_list());
01213           cout_it.move_to_last();
01214           cout_it.add_list_after(bblob->cblob()->out_list());
01215           delete bblob->cblob();
01216         }
01217       } else {
01218         if (bblob->cblob() != NULL)
01219           cblob_it.add_after_then_move(bblob->cblob());
01220       }
01221       box_it.forward();         // next one
01222       bblob = box_it.data();
01223       blob_box = bblob->bounding_box();
01224 
01225       if (!bblob->joined_to_prev() && !cblobs.empty()) {
01226         word = new WERD(&cblobs, 1, NULL);
01227         word_count++;
01228         word_it.add_after_then_move(word);
01229         if (bol) {
01230           word->set_flag(W_BOL, TRUE);
01231           bol = FALSE;
01232         }
01233         if (box_it.at_first()) { // at end of line
01234           word->set_flag(W_EOL, TRUE);
01235         }
01236       }
01237     }
01238     while (!box_it.at_first()); // until back at start
01239     /* Setup the row with created words. */
01240     coeffs[0] = 0;
01241     coeffs[1] = row->line_m();
01242     coeffs[2] = row->line_c();
01243     real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
01244     word_it.set_to_list(real_row->word_list());
01245                                  //put words in row
01246     word_it.add_list_after(&words);
01247     real_row->recalc_bounding_box();
01248     if (tosp_debug_level > 4) {
01249       tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
01250         word_count,
01251         real_row->bounding_box().left(),
01252         real_row->bounding_box().bottom(),
01253         real_row->bounding_box().right(),
01254         real_row->bounding_box().top());
01255     }
01256     return real_row;
01257   }
01258   return NULL;
01259 }
01260 
01261 BOOL8 Textord::make_a_word_break(
01262     TO_ROW *row,   // row being made
01263     TBOX blob_box, // for next_blob // how many blanks?
01264     inT16 prev_gap,
01265     TBOX prev_blob_box,
01266     inT16 real_current_gap,
01267     inT16 within_xht_current_gap,
01268     TBOX next_blob_box,
01269     inT16 next_gap,
01270     uinT8 &blanks,
01271     BOOL8 &fuzzy_sp,
01272     BOOL8 &fuzzy_non,
01273     BOOL8& prev_gap_was_a_space,
01274     BOOL8& break_at_next_gap) {
01275   BOOL8 space;
01276   inT16 current_gap;
01277   float fuzzy_sp_to_kn_limit;
01278 
01279   if (break_at_next_gap) {
01280     break_at_next_gap = FALSE;
01281     return TRUE;
01282   }
01283   /* Inhibit using the reduced gap if
01284     The kerning is large - chars are not kerned and reducing "f"s can cause
01285     erroneous blanks
01286   OR  The real gap is less than 0
01287   OR  The real gap is less than the kerning estimate
01288   */
01289   if ((row->kern_size > tosp_large_kerning * row->xheight) ||
01290       ((tosp_dont_fool_with_small_kerns >= 0) &&
01291        (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
01292                                  //Ignore the difference
01293     within_xht_current_gap = real_current_gap;
01294 
01295   if (tosp_use_xht_gaps && tosp_only_use_xht_gaps)
01296     current_gap = within_xht_current_gap;
01297   else
01298     current_gap = real_current_gap;
01299 
01300   if (tosp_old_to_method) {
01301                                  //Boring old method
01302     space = current_gap > row->max_nonspace;
01303     if (space && (current_gap < MAX_INT16)) {
01304       if (current_gap < row->min_space) {
01305         if (current_gap > row->space_threshold) {
01306           blanks = 1;
01307           fuzzy_sp = TRUE;
01308           fuzzy_non = FALSE;
01309         }
01310         else {
01311           blanks = 0;
01312           fuzzy_sp = FALSE;
01313           fuzzy_non = TRUE;
01314         }
01315       }
01316       else {
01317         blanks = (uinT8) (current_gap / row->space_size);
01318         if (blanks < 1)
01319           blanks = 1;
01320         fuzzy_sp = FALSE;
01321         fuzzy_non = FALSE;
01322       }
01323     }
01324     return space;
01325   }
01326   else {
01327   /* New exciting heuristic method */
01328     if (prev_blob_box.null_box ())  // Beginning of row
01329       prev_gap_was_a_space = TRUE;
01330 
01331                                  //Default as old TO
01332     space = current_gap > row->space_threshold;
01333 
01334     /* Set defaults for the word break incase we find one.  Currently there are
01335     no fuzzy spaces. Depending on the reliability of the different heuristics
01336     we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
01337     be used if the function returns TRUE - ie the word is to be broken.
01338     */
01339     blanks = (uinT8) (current_gap / row->space_size);
01340     if (blanks < 1)
01341       blanks = 1;
01342     fuzzy_sp = FALSE;
01343     fuzzy_non = FALSE;
01344     /*
01345     If xht measure causes gap to flip one of the 3 thresholds act accordingly -
01346     despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
01347     context.
01348     */
01349     if (tosp_use_xht_gaps &&
01350       (real_current_gap <= row->max_nonspace) &&
01351     (within_xht_current_gap > row->max_nonspace)) {
01352       space = TRUE;
01353       fuzzy_non = TRUE;
01354 #ifndef GRAPHICS_DISABLED
01355       mark_gap (blob_box, 20,
01356         prev_gap, prev_blob_box.width (),
01357         current_gap, next_blob_box.width (), next_gap);
01358 #endif
01359     }
01360     else if (tosp_use_xht_gaps &&
01361       (real_current_gap <= row->space_threshold) &&
01362     (within_xht_current_gap > row->space_threshold)) {
01363       space = TRUE;
01364       if (tosp_flip_fuzz_kn_to_sp)
01365         fuzzy_sp = TRUE;
01366       else
01367         fuzzy_non = TRUE;
01368 #ifndef GRAPHICS_DISABLED
01369       mark_gap (blob_box, 21,
01370         prev_gap, prev_blob_box.width (),
01371         current_gap, next_blob_box.width (), next_gap);
01372 #endif
01373     }
01374     else if (tosp_use_xht_gaps &&
01375       (real_current_gap < row->min_space) &&
01376     (within_xht_current_gap >= row->min_space)) {
01377       space = TRUE;
01378 #ifndef GRAPHICS_DISABLED
01379       mark_gap (blob_box, 22,
01380         prev_gap, prev_blob_box.width (),
01381         current_gap, next_blob_box.width (), next_gap);
01382 #endif
01383     }
01384     else if (tosp_force_wordbreak_on_punct &&
01385              !suspected_punct_blob(row, prev_blob_box) &&
01386              suspected_punct_blob(row, blob_box)) {
01387       break_at_next_gap = TRUE;
01388     }
01389     /* Now continue with normal heuristics */
01390     else if ((current_gap < row->min_space) &&
01391     (current_gap > row->space_threshold)) {
01392       /* Heuristics to turn dubious spaces to kerns */
01393       if (tosp_pass_wide_fuzz_sp_to_context > 0)
01394         fuzzy_sp_to_kn_limit = row->kern_size +
01395           tosp_pass_wide_fuzz_sp_to_context *
01396           (row->space_size - row->kern_size);
01397       else
01398         fuzzy_sp_to_kn_limit = 99999.0f;
01399 
01400       /* If current gap is significantly smaller than the previous space the other
01401       side of a narrow blob then this gap is a kern. */
01402       if ((prev_blob_box.width () > 0) &&
01403         narrow_blob (row, prev_blob_box) &&
01404         prev_gap_was_a_space &&
01405       (current_gap <= tosp_gap_factor * prev_gap)) {
01406         if ((tosp_all_flips_fuzzy) ||
01407         (current_gap > fuzzy_sp_to_kn_limit)) {
01408           if (tosp_flip_fuzz_sp_to_kn)
01409             fuzzy_non = TRUE;
01410           else
01411             fuzzy_sp = TRUE;
01412         }
01413         else
01414           space = FALSE;
01415 #ifndef GRAPHICS_DISABLED
01416         mark_gap (blob_box, 1,
01417           prev_gap, prev_blob_box.width (),
01418           current_gap, next_blob_box.width (), next_gap);
01419 #endif
01420       }
01421       /* If current gap not much bigger than the previous kern the other side of a
01422       narrow blob then this gap is a kern as well */
01423       else if ((prev_blob_box.width () > 0) &&
01424         narrow_blob (row, prev_blob_box) &&
01425         !prev_gap_was_a_space &&
01426       (current_gap * tosp_gap_factor <= prev_gap)) {
01427         if ((tosp_all_flips_fuzzy) ||
01428         (current_gap > fuzzy_sp_to_kn_limit)) {
01429           if (tosp_flip_fuzz_sp_to_kn)
01430             fuzzy_non = TRUE;
01431           else
01432             fuzzy_sp = TRUE;
01433         }
01434         else
01435           space = FALSE;
01436 #ifndef GRAPHICS_DISABLED
01437         mark_gap (blob_box, 2,
01438           prev_gap, prev_blob_box.width (),
01439           current_gap, next_blob_box.width (), next_gap);
01440 #endif
01441       }
01442       else if ((next_blob_box.width () > 0) &&
01443         narrow_blob (row, next_blob_box) &&
01444         (next_gap > row->space_threshold) &&
01445       (current_gap <= tosp_gap_factor * next_gap)) {
01446         if ((tosp_all_flips_fuzzy) ||
01447         (current_gap > fuzzy_sp_to_kn_limit)) {
01448           if (tosp_flip_fuzz_sp_to_kn)
01449             fuzzy_non = TRUE;
01450           else
01451             fuzzy_sp = TRUE;
01452         }
01453         else
01454           space = FALSE;
01455 #ifndef GRAPHICS_DISABLED
01456         mark_gap (blob_box, 3,
01457           prev_gap, prev_blob_box.width (),
01458           current_gap, next_blob_box.width (), next_gap);
01459 #endif
01460       }
01461       else if ((next_blob_box.width () > 0) &&
01462         narrow_blob (row, next_blob_box) &&
01463         (next_gap <= row->space_threshold) &&
01464       (current_gap * tosp_gap_factor <= next_gap)) {
01465         if ((tosp_all_flips_fuzzy) ||
01466         (current_gap > fuzzy_sp_to_kn_limit)) {
01467           if (tosp_flip_fuzz_sp_to_kn)
01468             fuzzy_non = TRUE;
01469           else
01470             fuzzy_sp = TRUE;
01471         }
01472         else
01473           space = FALSE;
01474 #ifndef GRAPHICS_DISABLED
01475         mark_gap (blob_box, 4,
01476           prev_gap, prev_blob_box.width (),
01477           current_gap, next_blob_box.width (), next_gap);
01478 #endif
01479       }
01480       else if ((((next_blob_box.width () > 0) &&
01481         narrow_blob (row, next_blob_box)) ||
01482         ((prev_blob_box.width () > 0) &&
01483       narrow_blob (row, prev_blob_box)))) {
01484         fuzzy_sp = TRUE;
01485 #ifndef GRAPHICS_DISABLED
01486         mark_gap (blob_box, 6,
01487           prev_gap, prev_blob_box.width (),
01488           current_gap, next_blob_box.width (), next_gap);
01489 #endif
01490       }
01491     }
01492     else if ((current_gap > row->max_nonspace) &&
01493              (current_gap <= row->space_threshold)) {
01494 
01495       /* Heuristics to turn dubious kerns to spaces */
01496       /* TRIED THIS BUT IT MADE THINGS WORSE
01497           if ( prev_gap == MAX_INT16 )
01498             prev_gap = 0;  // start of row
01499           if ( next_gap == MAX_INT16 )
01500             next_gap = 0;  // end of row
01501       */
01502       if ((prev_blob_box.width () > 0) &&
01503         (next_blob_box.width () > 0) &&
01504         (current_gap >=
01505         tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
01506         wide_blob (row, prev_blob_box) &&
01507       wide_blob (row, next_blob_box)) {
01508 
01509         space = TRUE;
01510         /*
01511         tosp_flip_caution is an attempt to stop the default changing in cases
01512         where there is a large difference between the kern and space estimates.
01513           See problem in 'chiefs' where "have" gets split in the quotation.
01514         */
01515         if ((tosp_flip_fuzz_kn_to_sp) &&
01516           ((tosp_flip_caution <= 0) ||
01517           (tosp_flip_caution * row->kern_size > row->space_size)))
01518           fuzzy_sp = TRUE;
01519         else
01520           fuzzy_non = TRUE;
01521 #ifndef GRAPHICS_DISABLED
01522         mark_gap (blob_box, 7,
01523           prev_gap, prev_blob_box.width (),
01524           current_gap, next_blob_box.width (), next_gap);
01525 #endif
01526       } else if (prev_blob_box.width() > 0 &&
01527                  next_blob_box.width() > 0 &&
01528                  current_gap > 5 &&  // Rule 9 handles small gap, big ratio.
01529                  current_gap >=
01530                    tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) &&
01531                  !(narrow_blob(row, prev_blob_box) ||
01532                    suspected_punct_blob(row, prev_blob_box)) &&
01533                  !(narrow_blob(row, next_blob_box) ||
01534                    suspected_punct_blob(row, next_blob_box))) {
01535         space = TRUE;
01536         fuzzy_non = TRUE;
01537 #ifndef GRAPHICS_DISABLED
01538         mark_gap (blob_box, 8,
01539           prev_gap, prev_blob_box.width (),
01540           current_gap, next_blob_box.width (), next_gap);
01541 #endif
01542       }
01543       else if ((tosp_kern_gap_factor3 > 0) &&
01544                (prev_blob_box.width () > 0) &&
01545                (next_blob_box.width () > 0) &&
01546                (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
01547                (!tosp_rule_9_test_punct ||
01548                 (!suspected_punct_blob (row, prev_blob_box) &&
01549                  !suspected_punct_blob (row, next_blob_box)))) {
01550         space = TRUE;
01551         fuzzy_non = TRUE;
01552 #ifndef GRAPHICS_DISABLED
01553         mark_gap (blob_box, 9,
01554           prev_gap, prev_blob_box.width (),
01555           current_gap, next_blob_box.width (), next_gap);
01556 #endif
01557       }
01558     }
01559     if (tosp_debug_level > 10)
01560       tprintf("word break = %d current_gap = %d, prev_gap = %d, "
01561               "next_gap = %d\n", space ? 1 : 0, current_gap,
01562               prev_gap, next_gap);
01563     prev_gap_was_a_space = space && !(fuzzy_non);
01564     return space;
01565   }
01566 }
01567 
01568 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
01569   BOOL8 result;
01570   result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
01571     (((float) blob_box.width () / blob_box.height ()) <=
01572     tosp_narrow_aspect_ratio));
01573   return result;
01574 }
01575 
01576 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
01577   BOOL8 result;
01578   if (tosp_wide_fraction > 0) {
01579     if (tosp_wide_aspect_ratio > 0)
01580       result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
01581         (((float) blob_box.width () / blob_box.height ()) >
01582         tosp_wide_aspect_ratio));
01583     else
01584       result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
01585   }
01586   else
01587     result = !narrow_blob (row, blob_box);
01588   return result;
01589 }
01590 
01591 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
01592   BOOL8 result;
01593   float baseline;
01594   float blob_x_centre;
01595   /* Find baseline of centre of blob */
01596   blob_x_centre = (box.right () + box.left ()) / 2.0;
01597   baseline = row->baseline.y (blob_x_centre);
01598 
01599   result = (box.height () <= 0.66 * row->xheight) ||
01600            (box.top () < baseline + row->xheight / 2.0) ||
01601            (box.bottom () > baseline + row->xheight / 2.0);
01602   return result;
01603 }
01604 
01605 
01606 void Textord::peek_at_next_gap(TO_ROW *row,
01607                                BLOBNBOX_IT box_it,
01608                                TBOX &next_blob_box,
01609                                inT16 &next_gap,
01610                                inT16 &next_within_xht_gap) {
01611   TBOX next_reduced_blob_box;
01612   TBOX bit_beyond;
01613   BLOBNBOX_IT reduced_box_it = box_it;
01614 
01615   next_blob_box = box_next (&box_it);
01616   next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
01617   if (box_it.at_first ()) {
01618     next_gap = MAX_INT16;
01619     next_within_xht_gap = MAX_INT16;
01620   }
01621   else {
01622     bit_beyond = box_it.data ()->bounding_box ();
01623     next_gap = bit_beyond.left () - next_blob_box.right ();
01624     bit_beyond = reduced_box_next (row, &reduced_box_it);
01625     next_within_xht_gap =
01626       bit_beyond.left () - next_reduced_blob_box.right ();
01627   }
01628 }
01629 
01630 
01631 #ifndef GRAPHICS_DISABLED
01632 void Textord::mark_gap(
01633     TBOX blob,   // blob following gap
01634     inT16 rule,  // heuristic id
01635     inT16 prev_gap,
01636     inT16 prev_blob_width,
01637     inT16 current_gap,
01638     inT16 next_blob_width,
01639     inT16 next_gap) {
01640   ScrollView::Color col;                    //of ellipse marking flipped gap
01641 
01642   switch (rule) {
01643     case 1:
01644       col = ScrollView::RED;
01645       break;
01646     case 2:
01647       col = ScrollView::CYAN;
01648       break;
01649     case 3:
01650       col = ScrollView::GREEN;
01651       break;
01652     case 4:
01653       col = ScrollView::BLACK;
01654       break;
01655     case 5:
01656       col = ScrollView::MAGENTA;
01657       break;
01658     case 6:
01659       col = ScrollView::BLUE;
01660       break;
01661 
01662     case 7:
01663       col = ScrollView::WHITE;
01664       break;
01665     case 8:
01666       col = ScrollView::YELLOW;
01667       break;
01668     case 9:
01669       col = ScrollView::BLACK;
01670       break;
01671 
01672     case 20:
01673       col = ScrollView::CYAN;
01674       break;
01675     case 21:
01676       col = ScrollView::GREEN;
01677       break;
01678     case 22:
01679       col = ScrollView::MAGENTA;
01680       break;
01681     default:
01682       col = ScrollView::BLACK;
01683   }
01684   if (textord_show_initial_words) {
01685     to_win->Pen(col);
01686   /*  if (rule < 20)
01687       //interior_style(to_win, INT_SOLID, FALSE);
01688     else
01689       //interior_style(to_win, INT_HOLLOW, TRUE);*/
01690                                  //x radius
01691     to_win->Ellipse (current_gap / 2.0f,
01692       blob.height () / 2.0f,     //y radius
01693                                  //x centre
01694       blob.left () - current_gap / 2.0f,
01695                                  //y centre
01696       blob.bottom () + blob.height () / 2.0f);
01697  }
01698   if (tosp_debug_level > 5)
01699     tprintf ("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
01700       blob.left () - current_gap / 2, blob.bottom (), rule,
01701       prev_gap, prev_blob_width, current_gap,
01702       next_blob_width, next_gap);
01703 }
01704 #endif
01705 
01706 float Textord::find_mean_blob_spacing(WERD *word) {
01707   C_BLOB_IT cblob_it;
01708   TBOX blob_box;
01709   inT32 gap_sum = 0;
01710   inT16 gap_count = 0;
01711   inT16 prev_right;
01712 
01713   cblob_it.set_to_list (word->cblob_list ());
01714   if (!cblob_it.empty ()) {
01715     cblob_it.mark_cycle_pt ();
01716     prev_right = cblob_it.data ()->bounding_box ().right ();
01717     //first blob
01718     cblob_it.forward ();
01719     for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
01720       blob_box = cblob_it.data ()->bounding_box ();
01721       gap_sum += blob_box.left () - prev_right;
01722       gap_count++;
01723       prev_right = blob_box.right ();
01724     }
01725   }
01726   if (gap_count > 0)
01727     return (gap_sum / (float) gap_count);
01728   else
01729     return 0.0f;
01730 }
01731 
01732 
01733 BOOL8 Textord::ignore_big_gap(TO_ROW *row,
01734                               inT32 row_length,
01735                               GAPMAP *gapmap,
01736                               inT16 left,
01737                               inT16 right) {
01738   inT16 gap = right - left + 1;
01739 
01740   if (tosp_ignore_big_gaps > 999)
01741     return FALSE;                //Dont ignore
01742   if (tosp_ignore_big_gaps > 0)
01743     return (gap > tosp_ignore_big_gaps * row->xheight);
01744   if (gap > tosp_ignore_very_big_gaps * row->xheight)
01745     return TRUE;
01746   if (tosp_ignore_big_gaps == 0) {
01747     if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
01748       return TRUE;
01749     if ((gap > 1.75 * row->xheight) &&
01750       ((row_length > 35 * row->xheight) ||
01751       gapmap->table_gap (left, right)))
01752       return TRUE;
01753   }
01754   else {
01755   /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
01756     if ((gap > gapmap_big_gaps * row->xheight) &&
01757       gapmap->table_gap (left, right))
01758       return TRUE;
01759   }
01760   return FALSE;
01761 }
01762 
01763 
01764 /**********************************************************************
01765  * reduced_box_next
01766  *
01767  * Compute the bounding box of this blob with merging of x overlaps
01768  * but no pre-chopping.
01769  * Then move the iterator on to the start of the next blob.
01770  * DONT reduce the box for small things - eg punctuation.
01771  **********************************************************************/
01772 TBOX Textord::reduced_box_next(
01773     TO_ROW *row,     // current row
01774     BLOBNBOX_IT *it  // iterator to blobds
01775                                ) {
01776   BLOBNBOX *blob;                //current blob
01777   BLOBNBOX *head_blob;           //place to store box
01778   TBOX full_box;                  //full blob boundg box
01779   TBOX reduced_box;               //box of significant part
01780   inT16 left_above_xht;          //ABOVE xht left limit
01781   inT16 new_left_above_xht;      //ABOVE xht left limit
01782 
01783   blob = it->data ();
01784   if (blob->red_box_set ()) {
01785     reduced_box = blob->reduced_box ();
01786     do {
01787       it->forward();
01788       blob = it->data();
01789     }
01790     while (blob->cblob() == NULL || blob->joined_to_prev());
01791     return reduced_box;
01792   }
01793   head_blob = blob;
01794   full_box = blob->bounding_box ();
01795   reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
01796   do {
01797     it->forward ();
01798     blob = it->data ();
01799     if (blob->cblob() == NULL)
01800                                  //was pre-chopped
01801       full_box += blob->bounding_box ();
01802     else if (blob->joined_to_prev ()) {
01803       reduced_box +=
01804         reduced_box_for_blob(blob, row, &new_left_above_xht);
01805       left_above_xht = MIN (left_above_xht, new_left_above_xht);
01806     }
01807   }
01808                                  //until next real blob
01809   while (blob->cblob() == NULL || blob->joined_to_prev());
01810 
01811   if ((reduced_box.width () > 0) &&
01812     ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
01813   < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
01814 #ifndef GRAPHICS_DISABLED
01815     if (textord_show_initial_words)
01816       reduced_box.plot (to_win, ScrollView::YELLOW, ScrollView::YELLOW);
01817 #endif
01818   }
01819   else
01820     reduced_box = full_box;
01821   head_blob->set_reduced_box (reduced_box);
01822   return reduced_box;
01823 }
01824 
01825 
01826 /*************************************************************************
01827  * reduced_box_for_blob()
01828  * Find box for blob which is the same height and y position as the whole blob,
01829  * but whose left limit is the left most position of the blob ABOVE the
01830  * baseline and whose right limit is the right most position of the blob BELOW
01831  * the xheight.
01832  *
01833  *
01834  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
01835  *         "home".  Perhaps we need something which say if the width ABOVE the
01836  *         xht alone includes the whole of the reduced width, then use the full
01837  *         blob box - Might still fail on italic F
01838  *
01839  *         Alternatively we could be a little less severe and only reduce the
01840  *         left and right edges by half the difference between the full box and
01841  *         the reduced box.
01842  *
01843  * NOTE that we need to rotate all the coordinates as
01844  * find_blob_limits finds the y min and max within a specified x band
01845  *************************************************************************/
01846 TBOX Textord::reduced_box_for_blob(
01847     BLOBNBOX *blob,
01848     TO_ROW *row,
01849     inT16 *left_above_xht) {
01850   float baseline;
01851   float blob_x_centre;
01852   float left_limit;
01853   float right_limit;
01854   float junk;
01855   TBOX blob_box;
01856 
01857   /* Find baseline of centre of blob */
01858 
01859   blob_box = blob->bounding_box ();
01860   blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
01861   baseline = row->baseline.y (blob_x_centre);
01862 
01863   /*
01864   Find LH limit of blob ABOVE the xht. This is so that we can detect certain
01865   caps ht chars which should NOT have their box reduced: T, Y, V, W etc
01866   */
01867   left_limit = (float) MAX_INT32;
01868   junk = (float) -MAX_INT32;
01869   find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
01870                      static_cast<float>(MAX_INT16), left_limit, junk);
01871   if (left_limit > junk)
01872     *left_above_xht = MAX_INT16; //No area above xht
01873   else
01874     *left_above_xht = (inT16) floor (left_limit);
01875   /*
01876   Find reduced LH limit of blob - the left extent of the region ABOVE the
01877   baseline.
01878   */
01879   left_limit = (float) MAX_INT32;
01880   junk = (float) -MAX_INT32;
01881   find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16),
01882                      left_limit, junk);
01883 
01884   if (left_limit > junk)
01885     return TBOX ();               //no area within xht so return empty box
01886   /*
01887   Find reduced RH limit of blob - the right extent of the region BELOW the xht.
01888   */
01889   junk = (float) MAX_INT32;
01890   right_limit = (float) -MAX_INT32;
01891   find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16),
01892                      (baseline + row->xheight), junk, right_limit);
01893   if (junk > right_limit)
01894     return TBOX ();               //no area within xht so return empty box
01895 
01896   return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
01897     ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
01898 }
01899 }  // namespace tesseract