Tesseract  3.02
tesseract-ocr/textord/topitch.h File Reference
#include "blobbox.h"
#include "notdll.h"

Go to the source code of this file.

Namespaces

namespace  tesseract

Functions

void compute_fixed_pitch (ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, BOOL8 testing_on)
void fix_row_pitch (TO_ROW *bad_row, TO_BLOCK *bad_block, TO_BLOCK_LIST *blocks, inT32 row_target, inT32 block_target)
void compute_block_pitch (TO_BLOCK *block, FCOORD rotation, inT32 block_index, BOOL8 testing_on)
BOOL8 compute_rows_pitch (TO_BLOCK *block, inT32 block_index, BOOL8 testing_on)
BOOL8 try_doc_fixed (ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient)
BOOL8 try_block_fixed (TO_BLOCK *block, inT32 block_index)
BOOL8 try_rows_fixed (TO_BLOCK *block, inT32 block_index, BOOL8 testing_on)
void print_block_counts (TO_BLOCK *block, inT32 block_index)
void count_block_votes (TO_BLOCK *block, inT32 &def_fixed, inT32 &def_prop, inT32 &maybe_fixed, inT32 &maybe_prop, inT32 &corr_fixed, inT32 &corr_prop, inT32 &dunno)
BOOL8 row_pitch_stats (TO_ROW *row, inT32 maxwidth, BOOL8 testing_on)
BOOL8 find_row_pitch (TO_ROW *row, inT32 maxwidth, inT32 dm_gap, TO_BLOCK *block, inT32 block_index, inT32 row_index, BOOL8 testing_on)
BOOL8 fixed_pitch_row (TO_ROW *row, BLOCK *block, inT32 block_index)
BOOL8 count_pitch_stats (TO_ROW *row, STATS *gap_stats, STATS *pitch_stats, float initial_pitch, float min_space, BOOL8 ignore_outsize, BOOL8 split_outsize, inT32 dm_gap)
float tune_row_pitch (TO_ROW *row, STATS *projection, inT16 projection_left, inT16 projection_right, float space_size, float &initial_pitch, float &best_sp_sd, inT16 &best_mid_cuts, ICOORDELT_LIST *best_cells, BOOL8 testing_on)
float tune_row_pitch2 (TO_ROW *row, STATS *projection, inT16 projection_left, inT16 projection_right, float space_size, float &initial_pitch, float &best_sp_sd, inT16 &best_mid_cuts, ICOORDELT_LIST *best_cells, BOOL8 testing_on)
float compute_pitch_sd (TO_ROW *row, STATS *projection, inT16 projection_left, inT16 projection_right, float space_size, float initial_pitch, float &sp_sd, inT16 &mid_cuts, ICOORDELT_LIST *row_cells, BOOL8 testing_on, inT16 start=0, inT16 end=0)
float compute_pitch_sd2 (TO_ROW *row, STATS *projection, inT16 projection_left, inT16 projection_right, float initial_pitch, inT16 &occupation, inT16 &mid_cuts, ICOORDELT_LIST *row_cells, BOOL8 testing_on, inT16 start=0, inT16 end=0)
void print_pitch_sd (TO_ROW *row, STATS *projection, inT16 projection_left, inT16 projection_right, float space_size, float initial_pitch)
void find_repeated_chars (TO_BLOCK *block, BOOL8 testing_on)
void plot_fp_word (TO_BLOCK *block, float pitch, float nonspace)

Variables

bool textord_debug_pitch_test = 0
bool textord_debug_pitch_metric = 0
bool textord_show_row_cuts = 0
bool textord_show_page_cuts = 0
bool textord_pitch_cheat = 0
bool textord_blockndoc_fixed = 1
bool textord_fast_pitch_test = 0
double textord_projection_scale = 0.125
double textord_balance_factor = 2.0

Function Documentation

void compute_block_pitch ( TO_BLOCK *  block,
FCOORD  rotation,
inT32  block_index,
BOOL8  testing_on 
)

Definition at line 311 of file topitch.cpp.

                                           {  // correct orientation
  // Seeds the block's default spacing fields from its x-height and, when the
  // block has rows, marks repeated characters and measures each row's pitch.
  // The rotation argument is not referenced in this body.
  TBOX outline = block->block->bounding_box ();

  if (testing_on && textord_debug_pitch_test) {
    tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
             block_index,
             outline.left (), outline.bottom (),
             outline.right (), outline.top ());
  }

  // Default thresholds, every one of them scaled from the block x-height.
  block->min_space =
    static_cast<inT32>(floor (block->xheight * textord_words_default_minspace));
  block->max_nonspace =
    static_cast<inT32>(ceil (block->xheight * textord_words_default_nonspace));
  block->fixed_pitch = 0.0f;
  block->space_size = static_cast<float>(block->min_space);
  block->kern_size = static_cast<float>(block->max_nonspace);
  block->pr_nonsp = block->xheight * words_default_prop_nonspace;
  block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;

  if (block->get_rows ()->empty ())
    return;                      // no rows: the defaults above are all we set

  ASSERT_HOST (block->xheight > 0);
  const bool show_words = textord_show_initial_words && testing_on;
  find_repeated_chars(block, show_words);
#ifndef GRAPHICS_DISABLED
  if (show_words) {
    ScrollView::Update();
  }
#endif
  compute_rows_pitch(block, block_index,
                     testing_on && textord_debug_pitch_test);
}
void compute_fixed_pitch ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 75 of file topitch.cpp.

                                           {          // correct orientation
  // Three passes over port_blocks:
  //   1. compute_block_pitch on every block,
  //   2. try a document-wide fixed-pitch decision, falling back to a
  //      per-block and then a per-row decision,
  //   3. fix_row_pitch on every row of every text block.
#ifndef GRAPHICS_DISABLED
  // Make sure the debug window exists before anything tries to draw into it.
  if (textord_show_initial_words && testing_on && to_win == NULL)
    create_to_win(page_tr);
#endif

  TO_BLOCK_IT blk_it;            // walks port_blocks in every pass
  blk_it.set_to_list (port_blocks);

  // Pass 1: block numbers are 1-based.
  int blk_no = 1;
  for (blk_it.mark_cycle_pt (); !blk_it.cycled_list (); blk_it.forward ())
    compute_block_pitch(blk_it.data (), rotation, blk_no++, testing_on);

  // Pass 2.
  if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
    blk_no = 1;
    for (blk_it.mark_cycle_pt (); !blk_it.cycled_list (); blk_it.forward ()) {
      TO_BLOCK *blk = blk_it.data ();
      const bool block_decided = try_block_fixed (blk, blk_no);
      if (!block_decided)
        try_rows_fixed(blk, blk_no, testing_on);
      blk_no++;
    }
  }

  // Pass 3: only text blocks are visited, and only they consume a block
  // number.
  TO_ROW_IT rows;
  blk_no = 1;
  for (blk_it.mark_cycle_pt(); !blk_it.cycled_list(); blk_it.forward()) {
    TO_BLOCK *blk = blk_it.data ();
    POLY_BLOCK* poly = blk->block->poly_block();
    if (poly == NULL || poly->IsText()) {
      rows.set_to_list (blk->get_rows ());
      int row_no = 1;
      for (rows.mark_cycle_pt (); !rows.cycled_list (); rows.forward ()) {
        fix_row_pitch(rows.data (), blk, port_blocks, row_no, blk_no);
        row_no++;
      }
      blk_no++;
    }
  }
#ifndef GRAPHICS_DISABLED
  if (textord_show_initial_words && testing_on)
    ScrollView::Update();
#endif
}
float compute_pitch_sd ( TO_ROW *  row,
STATS *  projection,
inT16  projection_left,
inT16  projection_right,
float  space_size,
float  initial_pitch,
float &  sp_sd,
inT16 &  mid_cuts,
ICOORDELT_LIST *  row_cells,
BOOL8  testing_on,
inT16  start = 0,
inT16  end = 0 
)

Definition at line 1379 of file topitch.cpp.

                        {
  // Scores how well initial_pitch fits the row.
  //
  // When (pitsync_linear_version & 3) > 1 the whole job is delegated to
  // compute_pitch_sd2 and sp_sd receives the occupation count it reports.
  //
  // Otherwise the row is cut into words wherever the gap between successive
  // blob boxes is at least space_size. Each word is scored by
  // check_pitch_sync2 or check_pitch_sync, the resulting segmentation points
  // are merged into row_cells, and the per-word scores are accumulated.
  //   sp_sd   - set to the RMS of the inter-word offsets taken modulo
  //             initial_pitch (0 if there were no inter-word gaps).
  //   returns - sqrt(sum of word scores / total cell count), or
  //             space_size * 10 when the row has no blobs or no cells.
  // In this path mid_cuts is only ever reset to 0.
  inT16 occupation;              //no of cells in word.
                                 //blobs
  BLOBNBOX_IT blob_it = row->blob_list ();
  BLOBNBOX_IT start_it;          //start of word
  BLOBNBOX_IT plot_it;           //for plotting
  inT16 blob_count;              //no of blobs
  TBOX blob_box;                  //bounding box
  TBOX prev_box;                  //of super blob
  inT32 prev_right;              //of word sync
  int scale_factor;              //on scores for big words
  inT32 sp_count;                //spaces
  FPSEGPT_LIST seg_list;         //char cells
  FPSEGPT_IT seg_it;             //iterator
  inT16 segpos;                  //position of segment
  inT16 cellpos;                 //previous cell boundary
                                 //iterator
  ICOORDELT_IT cell_it = row_cells;
  ICOORDELT *cell;               //new cell
  double sqsum;                  //sum of squares
  double spsum;                  //of spaces
  double sp_var;                 //space error
  double word_sync;              //result for word
  inT32 total_count;             //total blobs

  if ((pitsync_linear_version & 3) > 1) {
    word_sync = compute_pitch_sd2 (row, projection, projection_left,
      projection_right, initial_pitch,
      occupation, mid_cuts, row_cells,
      testing_on, start, end);
    sp_sd = occupation;
    return word_sync;
  }
  mid_cuts = 0;
  cellpos = 0;
  total_count = 0;
  sqsum = 0;
  sp_count = 0;
  spsum = 0;
  prev_right = -1;               //negative means "no previous word yet"
  if (blob_it.empty ())
    return space_size * 10;
#ifndef GRAPHICS_DISABLED
  // to_win is a window pointer: test it against NULL rather than with an
  // ordered comparison against 0, which is ill-formed in C++11 and later.
  if (testing_on && to_win != NULL) {
    blob_box = blob_it.data ()->bounding_box ();
    projection->plot (to_win, projection_left,
      row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
  }
#endif
  start_it = blob_it;
  blob_count = 0;
  blob_box = box_next (&blob_it);//first blob
  blob_it.mark_cycle_pt ();
  do {
    // Advance start_it past the blobs of the previous word.
    for (; blob_count > 0; blob_count--)
      box_next(&start_it);
    // Extend the current word until the list wraps or a gap of at least
    // space_size is found.
    do {
      prev_box = blob_box;
      blob_count++;
      blob_box = box_next (&blob_it);
    }
    while (!blob_it.cycled_list ()
      && blob_box.left () - prev_box.right () < space_size);
    plot_it = start_it;
    if (pitsync_linear_version & 3)
      word_sync =
        check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
        projection, projection_left, projection_right,
        row->xheight * textord_projection_scale,
        occupation, &seg_list, start, end);
    else
      word_sync =
        check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2,
        projection, &seg_list);
    if (testing_on) {
      tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
        prev_box.right (), prev_box.top (),
        seg_list.length () - 1, word_sync);
      seg_it.set_to_list (&seg_list);
      for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
      seg_it.forward ()) {
        if (seg_it.data ()->faked)
          tprintf ("(F)");
        tprintf ("%d, ", seg_it.data ()->position ());
        //                              tprintf("C=%g, s=%g, sq=%g\n",
        //                                      seg_it.data()->cost_function(),
        //                                      seg_it.data()->sum(),
        //                                      seg_it.data()->squares());
      }
      tprintf ("\n");
    }
#ifndef GRAPHICS_DISABLED
    if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
      plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
#endif
    seg_it.set_to_list (&seg_list);
    if (prev_right >= 0) {
      // Distance from the end of the previous word to the start of this
      // one, reduced to its offset from the nearest multiple of the pitch.
      sp_var = seg_it.data ()->position () - prev_right;
      sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
      sp_var *= sp_var;
      spsum += sp_var;
      sp_count++;
    }
    for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
      segpos = seg_it.data ()->position ();
      if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
                                 //big gap
        // Pad with evenly spaced cells until the new cut is within 1.5
        // pitches of the last cell.
        while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
          cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0);
          cell_it.add_after_then_move (cell);
          cellpos += (inT16) initial_pitch;
        }
                                 //make new one
        cell = new ICOORDELT (segpos, 0);
        cell_it.add_after_then_move (cell);
        cellpos = segpos;
      }
      else if (segpos > cellpos - initial_pitch / 2) {
        cell = cell_it.data ();
                                 //average positions
        cell->set_x ((cellpos + segpos) / 2);
        cellpos = cell->x ();
      }
    }
    seg_it.move_to_last ();
    prev_right = seg_it.data ()->position ();
    if (textord_pitch_scalebigwords) {
      scale_factor = (seg_list.length () - 2) / 2;
      if (scale_factor < 1)
        scale_factor = 1;
    }
    else
      scale_factor = 1;
    sqsum += word_sync * scale_factor;
    total_count += (seg_list.length () - 1) * scale_factor;
    seg_list.clear ();
  }
  while (!blob_it.cycled_list ());
  sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
  return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
}
float compute_pitch_sd2 ( TO_ROW *  row,
STATS *  projection,
inT16  projection_left,
inT16  projection_right,
float  initial_pitch,
inT16 &  occupation,
inT16 &  mid_cuts,
ICOORDELT_LIST *  row_cells,
BOOL8  testing_on,
inT16  start = 0,
inT16  end = 0 
)

Definition at line 1542 of file topitch.cpp.

                         {
  // Scores how well initial_pitch fits the row, treating the whole row as
  // one unit: every blob is counted, check_pitch_sync2 is run once over the
  // full list, and each segmentation point it produces is appended to
  // row_cells.
  //   occupation - written by check_pitch_sync2 (0 if the row has no blobs).
  //   mid_cuts   - reset to 0, then set from cheap_cuts() of the last
  //                segmentation point.
  //   returns    - sqrt(word_sync / occupation), or initial_pitch * 10 when
  //                the row is empty or occupation is not positive.
                                 //blobs
  BLOBNBOX_IT blob_it = row->blob_list ();
  BLOBNBOX_IT plot_it;
  inT16 blob_count;              //no of blobs
  TBOX blob_box;                  //bounding box
  FPSEGPT_LIST seg_list;         //char cells
  FPSEGPT_IT seg_it;             //iterator
  inT16 segpos;                  //position of segment
                                 //iterator
  ICOORDELT_IT cell_it = row_cells;
  ICOORDELT *cell;               //new cell
  double word_sync;              //result for word

  mid_cuts = 0;
  if (blob_it.empty ()) {
    occupation = 0;
    return initial_pitch * 10;
  }
#ifndef GRAPHICS_DISABLED
  // to_win is a window pointer: test it against NULL rather than with an
  // ordered comparison against 0, which is ill-formed in C++11 and later.
  if (testing_on && to_win != NULL) {
    projection->plot (to_win, projection_left,
      row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
  }
#endif
  // Count the boxes in the row; blob_box ends up holding the last one.
  blob_count = 0;
  blob_it.mark_cycle_pt ();
  do {
                                 //first blob
    blob_box = box_next (&blob_it);
    blob_count++;
  }
  while (!blob_it.cycled_list ());
  plot_it = blob_it;
  word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch,
    2, projection, projection_left,
    projection_right,
    row->xheight * textord_projection_scale,
    occupation, &seg_list, start, end);
  if (testing_on) {
    tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
      blob_box.right (), blob_box.top (),
      seg_list.length () - 1, word_sync);
    seg_it.set_to_list (&seg_list);
    for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
      if (seg_it.data ()->faked)
        tprintf ("(F)");
      tprintf ("%d, ", seg_it.data ()->position ());
      //                              tprintf("C=%g, s=%g, sq=%g\n",
      //                                      seg_it.data()->cost_function(),
      //                                      seg_it.data()->sum(),
      //                                      seg_it.data()->squares());
    }
    tprintf ("\n");
  }
#ifndef GRAPHICS_DISABLED
  if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
    plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
#endif
  seg_it.set_to_list (&seg_list);
  for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
    segpos = seg_it.data ()->position ();
                                 //make new one
    cell = new ICOORDELT (segpos, 0);
    cell_it.add_after_then_move (cell);
    if (seg_it.at_last ())
      mid_cuts = seg_it.data ()->cheap_cuts ();
  }
  seg_list.clear ();
  return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
}
BOOL8 compute_rows_pitch ( TO_BLOCK *  block,
inT32  block_index,
BOOL8  testing_on 
)

Definition at line 354 of file topitch.cpp.

                          {
  // Measures the pitch of every row in the block. Rows whose statistics or
  // pitch estimate cannot be obtained are marked PITCH_DUNNO with a zero
  // fixed pitch. Always returns FALSE.
  TO_ROW_IT rows = block->get_rows ();
  inT32 row_no = 1;              // 1-based, passed through to find_row_pitch

  for (rows.mark_cycle_pt (); !rows.cycled_list (); rows.forward ()) {
    TO_ROW *cur = rows.data ();
    ASSERT_HOST (cur->xheight > 0);
    cur->compute_vertical_projection ();

    // Widest gap considered, scaled from the row x-height.
    const inT32 max_gap =
      static_cast<inT32>(ceil (cur->xheight * textord_words_maxspace));
    const bool measured =
      row_pitch_stats (cur, max_gap, testing_on)
      && find_row_pitch (cur, max_gap, textord_dotmatrix_gap + 1,
                         block, block_index, row_no, testing_on);

    if (!measured) {
      cur->fixed_pitch = 0.0f;   // insufficient data
      cur->pitch_decision = PITCH_DUNNO;
    }
    else if (cur->fixed_pitch == 0) {
      // No fixed pitch found: use the row's pr_* values as its thresholds.
      const float kern = cur->pr_nonsp;
      const float space = cur->pr_space;
      cur->space_size = space;
      cur->kern_size = kern;
    }
    row_no++;
  }
  return FALSE;
}
void count_block_votes ( TO_BLOCK *  block,
inT32 &  def_fixed,
inT32 &  def_prop,
inT32 &  maybe_fixed,
inT32 &  maybe_prop,
inT32 &  corr_fixed,
inT32 &  corr_prop,
inT32 &  dunno 
)

Definition at line 659 of file topitch.cpp.

                                     {
  // Adds one to the counter matching each row's pitch_decision. The counters
  // are only incremented here, never reset.
  TO_ROW_IT rows = block->get_rows ();

  for (rows.mark_cycle_pt (); !rows.cycled_list (); rows.forward ()) {
    inT32 *tally = NULL;         // counter chosen for this row, if any
    switch (rows.data ()->pitch_decision) {
      case PITCH_DUNNO:       tally = &dunno;       break;
      case PITCH_DEF_PROP:    tally = &def_prop;    break;
      case PITCH_MAYBE_PROP:  tally = &maybe_prop;  break;
      case PITCH_DEF_FIXED:   tally = &def_fixed;   break;
      case PITCH_MAYBE_FIXED: tally = &maybe_fixed; break;
      case PITCH_CORR_PROP:   tally = &corr_prop;   break;
      case PITCH_CORR_FIXED:  tally = &corr_fixed;  break;
    }
    if (tally != NULL)
      ++*tally;
  }
}
BOOL8 count_pitch_stats ( TO_ROW *  row,
STATS *  gap_stats,
STATS *  pitch_stats,
float  initial_pitch,
float  min_space,
BOOL8  ignore_outsize,
BOOL8  split_outsize,
inT32  dm_gap 
)

Definition at line 1062 of file topitch.cpp.

                         {
  // Fills gap_stats and pitch_stats from the blobs of the row.
  //
  // Blobs whose gap to the accumulated box is below dm_gap, or that have no
  // cblob, are merged into one "joined" box. Each time a joined box is
  // completed, and the previous box was marked valid, the gap to the
  // previous box goes into gap_stats and the centre-to-centre distance goes
  // into pitch_stats.
  //   split_outsize  - treat a wide box as several cells of initial_pitch.
  //   ignore_outsize - skip boxes whose width/initial_pitch is outside
  //                    1 +/- words_default_fixed_limit.
  // Both stats are cleared first. Returns FALSE for an empty row, otherwise
  // TRUE when gap_stats holds at least 3 samples.
  BOOL8 prev_valid;              //not word broken
  BLOBNBOX *blob;                //current blob
                                 //blobs
  BLOBNBOX_IT blob_it = row->blob_list ();
  inT32 prev_right;              //end of prev blob
  inT32 prev_centre;             //centre of previous blob
  inT32 x_centre;                //centre of this blob
  inT32 blob_width;              //width of blob
  inT32 width_units;             //no of widths in blob
  float width;                   //blob width
  TBOX blob_box;                  //bounding box
  TBOX joined_box;                //of super blob

  gap_stats->clear ();
  pitch_stats->clear ();
  if (blob_it.empty ())
    return FALSE;
  prev_valid = FALSE;
  prev_centre = 0;
  prev_right = 0;                //stop compiler warning
  joined_box = blob_it.data ()->bounding_box ();
  // The loop moves forward before reading, so it starts at the second blob
  // and ends after coming back round to the first one. That last visit only
  // flushes the final joined box: the at_first() test below blocks merging
  // across the wrap-around.
  do {
    blob_it.forward ();
    blob = blob_it.data ();
    if (!blob->joined_to_prev ()) {
      blob_box = blob->bounding_box ();
      if ((blob_box.left () - joined_box.right () < dm_gap
        && !blob_it.at_first ())
        || blob->cblob() == NULL)
        joined_box += blob_box;  //merge blobs
      else {
        blob_width = joined_box.width ();
        if (split_outsize) {
          // Nearest whole number of pitches (at least 1), minus one: the
          // number of extra cells beyond the first.
          width_units =
            (inT32) floor ((float) blob_width / initial_pitch + 0.5);
          if (width_units < 1)
            width_units = 1;
          width_units--;
        }
        else if (ignore_outsize) {
          // 0 for a box about one pitch wide, -1 to reject it.
          width = (float) blob_width / initial_pitch;
          width_units = width < 1 + words_default_fixed_limit
            && width > 1 - words_default_fixed_limit ? 0 : -1;
        }
        else
          width_units = 0;       //everything in
        // Centre of the first cell of the joined box.
        x_centre = (inT32) (joined_box.left ()
          + (blob_width -
          width_units * initial_pitch) / 2);
        if (prev_valid && width_units >= 0) {
          //                                              if (width_units>0)
          //                                              {
          //                                                      tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
          //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);
          //                                              }
          gap_stats->add (joined_box.left () - prev_right, 1);
          pitch_stats->add (x_centre - prev_centre, 1);
        }
        // Centre of the last cell of the joined box.
        prev_centre = (inT32) (x_centre + width_units * initial_pitch);
        prev_right = joined_box.right ();
        // The next sample counts only if the following gap is below
        // min_space and this box was not rejected.
        prev_valid = blob_box.left () - joined_box.right () < min_space;
        prev_valid = prev_valid && width_units >= 0;
        joined_box = blob_box;
      }
    }
  }
  while (!blob_it.at_first ());
  return gap_stats->get_total () >= 3;
}
void find_repeated_chars ( TO_BLOCK *  block,
BOOL8  testing_on 
)

Definition at line 1762 of file topitch.cpp.

                                           {    // Debug mode.
  // For every row of a text block, turns each run of blobs sharing a
  // non-zero repeated_set() into a word, flags it W_REP_CHAR and
  // W_DONT_CHOP, and appends it to row->rep_words. Non-text blocks (a
  // poly_block that is not text) are left untouched.
  // testing_on is not referenced in this body.
  POLY_BLOCK* pb = block->block->poly_block();
  if (pb != NULL && !pb->IsText())
    return;  // Don't find repeated chars in non-text blocks.

  TO_ROW *row;
  BLOBNBOX_IT box_it;
  BLOBNBOX_IT search_it;         // forward search
  WERD_IT word_it;               // new words
  WERD *word;                    // new word
  TBOX word_box;                 // for plotting (unused in this body)
  int blobcount, repeated_set;

  TO_ROW_IT row_it = block->get_rows();
  if (row_it.empty()) return;  // empty block
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    box_it.set_to_list(row->blob_list());
    if (box_it.empty())  continue; // no blobs in this row
    // Mark the repeated sets once per row.
    if (!row->rep_chars_marked()) {
      mark_repeated_chars(row);
    }
    if (row->num_repeated_sets() == 0) continue;  // nothing to do for this row
    word_it.set_to_list(&row->rep_words);
    do {
      if (box_it.data()->repeated_set() != 0 &&
          !box_it.data()->joined_to_prev()) {
        // Count the consecutive blobs belonging to the same repeated set,
        // stopping if the search wraps back to the start of the list.
        blobcount = 1;
        repeated_set = box_it.data()->repeated_set();
        search_it = box_it;
        search_it.forward();
        while (!search_it.at_first() &&
               search_it.data()->repeated_set() == repeated_set) {
          blobcount++;
          search_it.forward();
        }
        // After the call to make_real_word() all the blobs from this
        // repeated set will be removed from the blob list. box_it will be
        // set to point to the blob after the end of the extracted sequence.
        word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
        // Report the offending boxes before the assertion below fires.
        if (!box_it.empty() && box_it.data()->joined_to_prev()) {
          tprintf("Bad box joined to prev at");
          box_it.data()->bounding_box().print();
          tprintf("After repeated word:");
          word->bounding_box().print();
        }
        ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
        word->set_flag(W_REP_CHAR, true);
        word->set_flag(W_DONT_CHOP, true);
        word_it.add_after_then_move(word);
      } else {
        box_it.forward();
      }
    } while (!box_it.at_first());
  }
}
BOOL8 find_row_pitch ( TO_ROW *  row,
inT32  maxwidth,
inT32  dm_gap,
TO_BLOCK *  block,
inT32  block_index,
inT32  row_index,
BOOL8  testing_on 
)

Definition at line 839 of file topitch.cpp.

                      {
  // Estimates the pitch of one row and records a tentative decision.
  //
  // Gap and pitch statistics are gathered twice through count_pitch_stats:
  // once merging blobs closer than dm_gap (the "dm" model) and once with no
  // merging. The model with the smaller pitch IQR relative to its gap IQR is
  // chosen. The row is then marked PITCH_MAYBE_FIXED or PITCH_MAYBE_PROP and
  // its fixed_pitch, kern_size, min_space, max_nonspace, space_size,
  // space_threshold and used_dm_model fields are filled in.
  //
  // Returns FALSE, with pitch_decision = PITCH_DUNNO, when both pitch IQRs
  // exceed maxwidth; TRUE otherwise.
  BOOL8 used_dm_model;           //looks like dot matrix
  float min_space;               //estimate threshold
  float non_space;               //gap size
  float gap_iqr;                 //interquartile range
  float pitch_iqr;
  float dm_gap_iqr;              //interquartile range
  float dm_pitch_iqr;
  float dm_pitch;                //pitch with dm on
  float pitch;                   //revised estimate
  float initial_pitch;           //guess at pitch
  STATS gap_stats (0, maxwidth);
                                 //centre-centre
  STATS pitch_stats (0, maxwidth);

  row->fixed_pitch = 0.0f;
  initial_pitch = row->fp_space;
  if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
    initial_pitch = row->xheight;//keep pitch decent
  non_space = row->fp_nonsp;
  if (non_space > initial_pitch)
    non_space = initial_pitch;
  min_space = (initial_pitch + non_space) / 2;

  // Pass 1: dot-matrix model (blobs closer than dm_gap are merged). On
  // failure dm_pitch_iqr is set above maxwidth so that it reads as
  // "insufficient data" below.
  if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
  initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
    dm_gap_iqr = 0.0001;
    dm_pitch_iqr = maxwidth * 2.0f;
    dm_pitch = initial_pitch;
  }
  else {
    dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
    dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
    dm_pitch = pitch_stats.ile (0.5);
  }
  gap_stats.clear ();
  pitch_stats.clear ();
  // Pass 2: same measurement with no merging (dm_gap of 0).
  if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
  initial_pitch, min_space, TRUE, FALSE, 0)) {
    gap_iqr = 0.0001;
    pitch_iqr = maxwidth * 3.0f;
  }
  else {
    gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
    pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
    if (testing_on)
      tprintf
        ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
        initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
    initial_pitch = pitch_stats.ile (0.5);
    // If the median pitch came out below min_space, re-measure using the
    // new pitch as the space threshold as well.
    if (min_space > initial_pitch
      && count_pitch_stats (row, &gap_stats, &pitch_stats,
    initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
      min_space = initial_pitch;
      gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
      pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
      if (testing_on)
        tprintf
          ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
          initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
      initial_pitch = pitch_stats.ile (0.5);
    }
  }
  // Metric trace: the last field is 'D' (no decision), 'S' (non-dm model)
  // or 'M' (dm model), using the same tests as the code below.
  if (textord_debug_pitch_metric)
    tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
            block_index, row_index, 'X',
            pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
            pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' :
              (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
  if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
    row->pitch_decision = PITCH_DUNNO;
    if (textord_debug_pitch_metric)
      tprintf ("\n");
    return FALSE;                //insufficient data
  }
  // Cross-multiplied comparison of pitch_iqr/gap_iqr against
  // dm_pitch_iqr/dm_gap_iqr; a tie goes to the non-dm model.
  if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
    if (testing_on)
      tprintf
        ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
        pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
    // Re-read from the stats objects, which hold the last non-dm pass.
    gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
    pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
    pitch = pitch_stats.ile (0.5);
    used_dm_model = FALSE;
  }
  else {
    if (testing_on)
      tprintf
        ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
        pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
    gap_iqr = dm_gap_iqr;
    pitch_iqr = dm_pitch_iqr;
    pitch = dm_pitch;
    used_dm_model = TRUE;
  }
  if (textord_debug_pitch_metric) {
    tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
      pitch_iqr, gap_iqr, pitch);
    tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
      pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
      pitch_iqr < gap_iqr * textord_fpiqr_ratio
      && pitch_iqr < block->xheight * textord_max_pitch_iqr
      && pitch < block->xheight * textord_words_default_maxspace
      ? 'F' : 'P');
  }
  // Tentatively fixed pitch only if the pitch spread is small relative to
  // both the gap spread and the block x-height, and the pitch itself is
  // below the block's maximum-space limit.
  if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
    && pitch_iqr < block->xheight * textord_max_pitch_iqr
    && pitch < block->xheight * textord_words_default_maxspace)
    row->pitch_decision = PITCH_MAYBE_FIXED;
  else
    row->pitch_decision = PITCH_MAYBE_PROP;
  row->fixed_pitch = pitch;
  // NOTE(review): gap_stats was cleared after the dm pass, so this median
  // comes from the non-dm pass even when used_dm_model is TRUE - confirm
  // that is intended.
  row->kern_size = gap_stats.ile (0.5);
  // The cast binds to the sum, so the halving is an integer division.
  row->min_space = (inT32) (row->fixed_pitch + non_space) / 2;
  if (row->min_space > row->fixed_pitch)
    row->min_space = (inT32) row->fixed_pitch;
  row->max_nonspace = row->min_space;
  row->space_size = row->fixed_pitch;
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->used_dm_model = used_dm_model;
  return TRUE;
}
void fix_row_pitch ( TO_ROW *  bad_row,
TO_BLOCK *  bad_block,
TO_BLOCK_LIST *  blocks,
inT32  row_target,
inT32  block_target 
)

Definition at line 144 of file topitch.cpp.

                                       {
  // Re-decides the pitch of bad_row by polling the pitch decisions of the
  // rows in every text block of `blocks`.  Rows whose size is within
  // textord_pitch_rowsimilarity of bad_row vote either as part of the target
  // block (block_index == block_target) or as "like" rows elsewhere on the
  // page; all remaining rows are only counted in other_votes.
  // row_target and block_target are also used in the warning messages;
  // bad_block is not read here.
  inT16 cheap_cuts;              // mid-cut count written by tune_row_pitch
  float space_sd;                // space deviation written by tune_row_pitch
  int block_votes = 0;           // tally of similar rows in the target block
  int like_votes = 0;            // tally of similar rows in other blocks
  int other_votes = 0;           // tally of dissimilar rows
  STATS block_stats;             // fixed pitches voted in the target block
  STATS like_stats;              // fixed pitches voted elsewhere
  TO_BLOCK_IT block_it = blocks;
  TO_ROW_IT row_it;

  const int maxwidth =
    (inT32) ceil (bad_row->xheight * textord_words_maxspace);

  // Rows that are already definite (fixed or prop) are not re-polled.
  if (bad_row->pitch_decision != PITCH_DEF_FIXED
      && bad_row->pitch_decision != PITCH_DEF_PROP) {
    block_stats.set_range (0, maxwidth);
    like_stats.set_range (0, maxwidth);
    int block_index = 1;
    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
         block_it.forward ()) {
      TO_BLOCK *cur_block = block_it.data ();
      POLY_BLOCK *pb = cur_block->block->poly_block ();
      if (pb != NULL && !pb->IsText ())
        continue;                // non-text blocks do not consume an index
      row_it.set_to_list (cur_block->get_rows ());
      for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
           row_it.forward ()) {
        TO_ROW *cur_row = row_it.data ();

        // Is this row about the same size as bad_row?  All-caps rows are
        // compared on xheight + ascrise, the rest on xheight alone.
        bool similar;
        if (bad_row->all_caps) {
          similar =
            cur_row->xheight + cur_row->ascrise <
              (bad_row->xheight + bad_row->ascrise) *
              (1 + textord_pitch_rowsimilarity)
            && cur_row->xheight + cur_row->ascrise >
              (bad_row->xheight + bad_row->ascrise) *
              (1 - textord_pitch_rowsimilarity);
        }
        else {
          similar =
            cur_row->xheight <
              bad_row->xheight * (1 + textord_pitch_rowsimilarity)
            && cur_row->xheight >
              bad_row->xheight * (1 - textord_pitch_rowsimilarity);
        }

        // Turn the row's decision into a signed vote: positive for fixed,
        // negative for proportional, weighted by the veto power when the
        // decision is definite.
        int vote = 0;
        bool votes_fixed = false;
        switch (cur_row->pitch_decision) {
          case PITCH_DEF_FIXED:
            vote = textord_words_veto_power;
            votes_fixed = true;
            break;
          case PITCH_MAYBE_FIXED:
          case PITCH_CORR_FIXED:
            vote = 1;
            votes_fixed = true;
            break;
          case PITCH_DEF_PROP:
            vote = -textord_words_veto_power;
            break;
          case PITCH_MAYBE_PROP:
          case PITCH_CORR_PROP:
            vote = -1;
            break;
          default:
            break;               // any other decision carries no vote
        }

        if (!similar) {
          other_votes += vote;
        }
        else if (block_index == block_target) {
          block_votes += vote;
          if (votes_fixed)
            block_stats.add ((inT32) cur_row->fixed_pitch, vote);
        }
        else {
          like_votes += vote;
          if (votes_fixed)
            like_stats.add ((inT32) cur_row->fixed_pitch, vote);
        }
      }
      block_index++;
    }

    if (block_votes > textord_words_veto_power) {
      // The target block itself is convincingly fixed pitch.
      bad_row->fixed_pitch = block_stats.ile (0.5);
      bad_row->pitch_decision = PITCH_CORR_FIXED;
    }
    else if (like_votes > 0) {
      // Otherwise fall back on similar rows from the rest of the page.
      bad_row->fixed_pitch = like_stats.ile (0.5);
      bad_row->pitch_decision = PITCH_CORR_FIXED;
    }
    else {
      bad_row->pitch_decision = PITCH_CORR_PROP;
      #ifndef SECURE_NAMES
      if (block_votes == 0 && like_votes == 0 && other_votes > 0
        && (textord_debug_pitch_test || textord_debug_pitch_metric))
        tprintf
          ("Warning:row %d of block %d set prop with no like rows against trend\n",
          row_target, block_target);
      #endif
    }
  }

  if (textord_debug_pitch_metric) {
    tprintf(":b_votes=%d:l_votes=%d:o_votes=%d",
            block_votes, like_votes, other_votes);
    tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
  }

  // Proportional outcome: no pitch and no character cells.
  if (bad_row->pitch_decision == PITCH_CORR_PROP
      || bad_row->pitch_decision == PITCH_DEF_PROP) {
    bad_row->fixed_pitch = 0.0f;
    bad_row->char_cells.clear ();
    return;
  }
  if (bad_row->pitch_decision != PITCH_CORR_FIXED)
    return;

  // Corrected-to-fixed outcome: make sure the pitch is usable, then derive
  // the spacing thresholds from it.
  if (bad_row->fixed_pitch < textord_min_xheight) {
    if (block_votes > 0) {
      bad_row->fixed_pitch = block_stats.ile (0.5);
    }
    else if (block_votes == 0 && like_votes > 0) {
      bad_row->fixed_pitch = like_stats.ile (0.5);
    }
    else {
      tprintf
        ("Warning:guessing pitch as xheight on row %d, block %d\n",
        row_target, block_target);
      bad_row->fixed_pitch = bad_row->xheight;
    }
  }
  if (bad_row->fixed_pitch < textord_min_xheight)
    bad_row->fixed_pitch = (float) textord_min_xheight;
  bad_row->kern_size = bad_row->fixed_pitch / 4;
  bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6);
  bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4);
  bad_row->space_threshold =
    (bad_row->min_space + bad_row->max_nonspace) / 2;
  bad_row->space_size = bad_row->fixed_pitch;
  // Only build character cells if the row does not already have some.
  if (bad_row->char_cells.empty ())
    tune_row_pitch (bad_row, &bad_row->projection,
      bad_row->projection_left, bad_row->projection_right,
      (bad_row->fixed_pitch +
      bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
      space_sd, cheap_cuts, &bad_row->char_cells, FALSE);
}
BOOL8 fixed_pitch_row ( TO_ROW *  row,
BLOCK *  block,
inT32  block_index 
)

Definition at line 979 of file topitch.cpp.

                       {
  // Sets row->pitch_decision for a row that already has a candidate
  // fixed_pitch.  When textord_all_prop is set, or the block's poly_block
  // exists and is not text, the row is marked definitely proportional.
  // Otherwise tune_row_pitch() is run and its result (pitch_sd) together
  // with sp_sd and mid_cuts is compared against the textord_words_*
  // thresholds scaled by row->fixed_pitch.
  // block may be NULL; block_index only selects debug output.
  // Always returns TRUE.
  const char *res_string;        //pitch result
  inT16 mid_cuts = 0;            //no of cheap cuts
  float non_space;               //gap size
  float pitch_sd;                //error on pitch
  // Initialised because the debug print below reads it even on the
  // definitely-proportional path, where tune_row_pitch() never sets it.
  float sp_sd = 0.0f;            //space sd

  // The non-space gap estimate is capped at the pitch itself.
  non_space = row->fp_nonsp;
  if (non_space > row->fixed_pitch)
    non_space = row->fixed_pitch;
  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
  if (textord_all_prop || (pb != NULL && !pb->IsText())) {
    // Set the decision to definitely proportional.
    pitch_sd = textord_words_def_prop * row->fixed_pitch;
    row->pitch_decision = PITCH_DEF_PROP;
  } else {
    // row->fixed_pitch is passed by reference and may be updated by tuning.
    pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
                               row->projection_right,
                               (row->fixed_pitch + non_space * 3) / 4,
                               row->fixed_pitch, sp_sd, mid_cuts,
                               &row->char_cells,
                               block_index == textord_debug_block);
    if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
      && ((pitsync_linear_version & 3) < 3
      || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model
      || sp_sd > 20
    || (pitch_sd == 0 && sp_sd > 10))))) {
      // Good sync: fixed, and definitely so if the error is very small and
      // the row is not all caps.
      if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
        && !row->all_caps
        && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
        row->pitch_decision = PITCH_DEF_FIXED;
      else
        row->pitch_decision = PITCH_MAYBE_FIXED;
    }
    else if ((pitsync_linear_version & 3) < 3
      || sp_sd > 20
      || mid_cuts > 0
      || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
      // Poor sync: proportional, definitely so beyond the def_prop threshold.
      if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
        row->pitch_decision = PITCH_MAYBE_PROP;
      else
        row->pitch_decision = PITCH_DEF_PROP;
    }
    else
      row->pitch_decision = PITCH_DUNNO;
  }

  if (textord_debug_pitch_metric) {
    switch (row->pitch_decision) {
      case PITCH_DEF_PROP:
        res_string = "DP";
        break;
      case PITCH_MAYBE_PROP:
        res_string = "MP";
        break;
      case PITCH_DEF_FIXED:
        res_string = "DF";
        break;
      case PITCH_MAYBE_FIXED:
        res_string = "MF";
        break;                   // was missing: "MF" fell through to "??"
      default:
        res_string = "??";
        break;
    }
    tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
      pitch_sd / row->fixed_pitch, sp_sd, res_string);
  }
  return TRUE;
}
void plot_fp_word ( TO_BLOCK *  block,
float  pitch,
float  nonspace 
)

Definition at line 1828 of file topitch.cpp.

                   {
  // For every row of `block`, sets min_space, max_nonspace and
  // space_threshold to the midpoint of pitch and nonspace (truncated to an
  // integer), then plots the row's word decisions in to_win.
  const inT32 midpoint = (inT32) ((pitch + nonspace) / 2);
  TO_ROW_IT rows = block->get_rows ();

  rows.mark_cycle_pt ();
  while (!rows.cycled_list ()) {
    TO_ROW *current = rows.data ();
    current->min_space = midpoint;
    current->max_nonspace = current->min_space;
    current->space_threshold = current->min_space;
    plot_word_decisions (to_win, (inT16) pitch, current);
    rows.forward ();
  }
}
void print_block_counts ( TO_BLOCK *  block,
inT32  block_index 
)

Definition at line 622 of file topitch.cpp.

                         {
  // Prints one line with the per-decision row counts of `block`, as
  // collected by count_block_votes().  "(Wrongly)" is appended after the
  // fixed counts when textord_blocksall_prop is set and any are non-zero,
  // and after the prop counts when textord_blocksall_fixed is set and any
  // are non-zero.
  inT32 def_fixed = 0;
  inT32 maybe_fixed = 0;
  inT32 corr_fixed = 0;
  inT32 def_prop = 0;
  inT32 maybe_prop = 0;
  inT32 corr_prop = 0;
  inT32 dunno = 0;

  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop,
                    corr_fixed, corr_prop, dunno);

  const bool any_fixed = def_fixed != 0 || maybe_fixed != 0 || corr_fixed != 0;
  const bool any_prop = def_prop != 0 || maybe_prop != 0 || corr_prop != 0;

  tprintf ("Block %d has (%d,%d,%d)",
    block_index, def_fixed, maybe_fixed, corr_fixed);
  if (textord_blocksall_prop && any_fixed)
    tprintf (" (Wrongly)");
  tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
  if (textord_blocksall_fixed && any_prop)
    tprintf (" (Wrongly)");
  tprintf (" prop, %d dunno\n", dunno);
}
void print_pitch_sd ( TO_ROW *  row,
STATS *  projection,
inT16  projection_left,
inT16  projection_right,
float  space_size,
float  initial_pitch 
)

Definition at line 1634 of file topitch.cpp.

                     {
  // Prints, via tprintf, two pitch-sync summaries for `row` at the pitch
  // `initial_pitch`:
  //   1. a word-by-word figure, where a new word starts at any gap of at
  //      least space_size between consecutive blob boxes;
  //   2. a whole-row figure, with all blobs handed to check_pitch_sync2()
  //      as one run.
  // Returns without printing if the row has no blobs.
  // NOTE(review): check_pitch_sync2() is not visible here; it is presumably
  // a squared sync error, since the results are passed through sqrt().
  const char *res2;              //pitch result
  inT16 occupation;              //used cells
  float sp_sd;                   //space sd
                                 //blobs
  BLOBNBOX_IT blob_it = row->blob_list ();
  BLOBNBOX_IT start_it;          //start of word
  BLOBNBOX_IT row_start;         //start of row
  inT16 blob_count;              //no of blobs
  inT16 total_blob_count;        //total blobs in line
  TBOX blob_box;                  //bounding box
  TBOX prev_box;                  //of super blob
  inT32 prev_right;              //of word sync
  int scale_factor;              //on scores for big words
  inT32 sp_count;                //spaces
  FPSEGPT_LIST seg_list;         //char cells
  FPSEGPT_IT seg_it;             //iterator
  double sqsum;                  //sum of squares
  double spsum;                  //of spaces
  double sp_var;                 //space error
  double word_sync;              //result for word
  double total_count;            //total cuts

  if (blob_it.empty ())
    return;
  row_start = blob_it;
  total_blob_count = 0;

  total_count = 0;
  sqsum = 0;
  sp_count = 0;
  spsum = 0;
  prev_right = -1;               // -1 marks "no previous word yet"
  blob_it = row_start;
  start_it = blob_it;
  blob_count = 0;
  blob_box = box_next (&blob_it);//first blob
  blob_it.mark_cycle_pt ();
  // Outer loop: one iteration per word.
  do {
    // Advance start_it past the blobs of the previous word.
    for (; blob_count > 0; blob_count--)
      box_next(&start_it);
    // Extend the current word until the list wraps or a gap of at least
    // space_size is found.
    do {
      prev_box = blob_box;
      blob_count++;
      blob_box = box_next (&blob_it);
    }
    while (!blob_it.cycled_list ()
      && blob_box.left () - prev_box.right () < space_size);
    word_sync =
      check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2,
      projection, projection_left, projection_right,
      row->xheight * textord_projection_scale,
      occupation, &seg_list, 0, 0);
    total_blob_count += blob_count;
    seg_it.set_to_list (&seg_list);
    if (prev_right >= 0) {
      // Distance from the previous word's last segment point to this word's
      // first one, reduced to its offset from the nearest multiple of
      // initial_pitch, then squared.
      sp_var = seg_it.data ()->position () - prev_right;
      sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
      sp_var *= sp_var;
      spsum += sp_var;
      sp_count++;
    }
    seg_it.move_to_last ();
    prev_right = seg_it.data ()->position ();
    // Optionally weight words by half their segment count, minimum 1.
    if (textord_pitch_scalebigwords) {
      scale_factor = (seg_list.length () - 2) / 2;
      if (scale_factor < 1)
        scale_factor = 1;
    }
    else
      scale_factor = 1;
    sqsum += word_sync * scale_factor;
    total_count += (seg_list.length () - 1) * scale_factor;
    seg_list.clear ();
  }
  while (!blob_it.cycled_list ());
  // Root-mean figures; with nothing counted, fall back to 0 and
  // space_size * 10 respectively.
  sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
  word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
  tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
    word_sync, word_sync / initial_pitch, sp_sd,
    word_sync < textord_words_pitchsd_threshold * initial_pitch
    ? 'F' : 'P');

  // Second pass: the whole row as a single run of total_blob_count blobs.
  // start_it is reset here but not read again in this function.
  start_it = row_start;
  blob_it = row_start;
  word_sync =
    check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2,
    projection, projection_left, projection_right,
    row->xheight * textord_projection_scale, occupation,
    &seg_list, 0, 0);
  if (occupation > 1)
    word_sync /= occupation;
  word_sync = sqrt (word_sync);

#ifndef GRAPHICS_DISABLED
  if (textord_show_row_cuts && to_win != NULL)
    plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
#endif
  seg_list.clear ();
  // Same DF/MF/MP/DP thresholds as used for the row decision, for display
  // only: nothing is written back to the row.
  if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
    if (word_sync < textord_words_def_fixed * initial_pitch
      && !row->all_caps)
      res2 = "DF";
    else
      res2 = "MF";
  }
  else
    res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
  tprintf
    ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
    word_sync, word_sync / initial_pitch,
    word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
    occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
}
BOOL8 row_pitch_stats ( TO_ROW *  row,
inT32  maxwidth,
BOOL8  testing_on 
)

Definition at line 706 of file topitch.cpp.

                       {
  // Derives initial gap estimates for `row` from the horizontal gaps between
  // its blobs and stores them in row->pr_nonsp / pr_space (proportional
  // model) and row->fp_nonsp / fp_space (fixed-pitch model).
  // maxwidth:   gaps of this size or more are not recorded.
  // testing_on: enables tprintf tracing.
  // Returns FALSE if no gap was recorded or clustering found no cluster,
  // TRUE once the four estimates have been written.
  BLOBNBOX *blob;                //current blob
  int gap_index;                 //current gap
  inT32 prev_x;                  //end of prev blob
  inT32 cluster_count;           //no of clusters
  inT32 prev_count;              //of clusters
  inT32 smooth_factor;           //for smoothing stats
  TBOX blob_box;                  //bounding box
  float lower, upper;            //cluster thresholds
                                 //gap sizes
  float gaps[BLOCK_STATS_CLUSTERS];
                                 //blobs
  BLOBNBOX_IT blob_it = row->blob_list ();
  STATS gap_stats (0, maxwidth);
  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
  //clusters

  smooth_factor =
    (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
  if (!blob_it.empty ()) {
    // Start from the right edge of the first blob and walk the list until
    // the iterator is back at the first element.
    prev_x = blob_it.data ()->bounding_box ().right ();
    blob_it.forward ();
    while (!blob_it.at_first ()) {
      blob = blob_it.data ();
      // Blobs flagged joined_to_prev contribute no gap and do not move
      // prev_x.
      if (!blob->joined_to_prev ()) {
        blob_box = blob->bounding_box ();
        if (blob_box.left () - prev_x < maxwidth)
          gap_stats.add (blob_box.left () - prev_x, 1);
        prev_x = blob_box.right ();
      }
      blob_it.forward ();
    }
  }
  if (gap_stats.get_total () == 0) {
    return FALSE;
  }
  cluster_count = 0;
  lower = row->xheight * words_initial_lower;
  upper = row->xheight * words_initial_upper;
  gap_stats.smooth (smooth_factor);
  // Re-run clustering while each call reports more clusters than the last
  // and the BLOCK_STATS_CLUSTERS limit has not been reached.
  do {
    prev_count = cluster_count;
    cluster_count = gap_stats.cluster (lower, upper,
      textord_spacesize_ratioprop,
      BLOCK_STATS_CLUSTERS, cluster_stats);
  }
  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
  if (cluster_count < 1) {
    return FALSE;
  }
  // Cluster i is read from cluster_stats[i + 1]; element 0 is not used here.
  for (gap_index = 0; gap_index < cluster_count; gap_index++)
    gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
  //get medians
  if (testing_on) {
    tprintf ("cluster_count=%d:", cluster_count);
    for (gap_index = 0; gap_index < cluster_count; gap_index++)
      tprintf (" %g(%d)", gaps[gap_index],
        cluster_stats[gap_index + 1].get_total ());
    tprintf ("\n");
  }
  // NOTE(review): the scans below assume sort_floats orders ascending;
  // confirm against its definition.
  qsort (gaps, cluster_count, sizeof (float), sort_floats);

  //Try to find proportional non-space and space for row.
  lower = row->xheight * words_default_prop_nonspace;
  upper = row->xheight * textord_words_min_minspace;
  // Find the first cluster median that is not below `lower`.
  for (gap_index = 0; gap_index < cluster_count
    && gaps[gap_index] < lower; gap_index++);
  if (gap_index == 0) {
    if (testing_on)
      tprintf ("No clusters below nonspace threshold!!\n");
    if (cluster_count > 1) {
      row->pr_nonsp = gaps[0];
      row->pr_space = gaps[1];
    }
    else {
      row->pr_nonsp = lower;
      row->pr_space = gaps[0];
    }
  }
  else {
    // Largest median below `lower` is the non-space; the first median not
    // below `upper` is the space.
    row->pr_nonsp = gaps[gap_index - 1];
    while (gap_index < cluster_count && gaps[gap_index] < upper)
      gap_index++;
    if (gap_index == cluster_count) {
      if (testing_on)
        tprintf ("No clusters above nonspace threshold!!\n");
      row->pr_space = lower * textord_spacesize_ratioprop;
    }
    else
      row->pr_space = gaps[gap_index];
  }

  //Now try to find the fixed pitch space and non-space.
  upper = row->xheight * words_default_fixed_space;
  for (gap_index = 0; gap_index < cluster_count
    && gaps[gap_index] < upper; gap_index++);
  if (gap_index == 0) {
    if (testing_on)
      tprintf ("No clusters below space threshold!!\n");
    row->fp_nonsp = upper;
    row->fp_space = gaps[0];
  }
  else {
    row->fp_nonsp = gaps[gap_index - 1];
    if (gap_index == cluster_count) {
      if (testing_on)
        tprintf ("No clusters above space threshold!!\n");
      row->fp_space = row->xheight;
    }
    else
      row->fp_space = gaps[gap_index];
  }
  if (testing_on) {
    tprintf
      ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
      row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
  }
  return TRUE;                   //computed some stats
}
BOOL8 try_block_fixed ( TO_BLOCK *  block,
inT32  block_index 
)

Definition at line 538 of file topitch.cpp.

                       {
  // No block-level fixed-pitch test is performed here: neither parameter
  // is read and the result is always FALSE.
  return FALSE;
}
BOOL8 try_doc_fixed ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient 
)

Definition at line 398 of file topitch.cpp.

                     {
  // Attempts a single page-wide fixed-pitch fit.  All row projections are
  // summed into one page projection (each row shifted by
  // shift_factor * (master_y - row_y) relative to the first row of the first
  // block), the median of the rows' positive fixed_pitch values is taken as
  // the page pitch, and tune_row_pitch() is run on the summed projection.
  // The outcome is only reported (debug print / optional plot); the function
  // returns FALSE on every path.
  // Does nothing when port_blocks is empty, textord_blockndoc_fixed is off,
  // the first block has no rows, or no row has a positive fixed_pitch.
  // page_tr is not read here.
  inT16 master_x;                //uniform shifts
  inT16 pitch;                   //median pitch.
  int x;                         //profile coord
  int prop_blocks;               //correct counts
  int fixed_blocks;
  int total_row_count;           //total in page
                                 //iterator
  TO_BLOCK_IT block_it = port_blocks;
  TO_BLOCK *block;               //current block;
  TO_ROW_IT row_it;              //row iterator
  TO_ROW *row;                   //current row
  TO_ROW *master_row;            //first row of first block
  inT16 projection_left;         //edges
  inT16 projection_right;
  inT16 row_left;                //edges of row
  inT16 row_right;
  float master_y;                //uniform shifts
  float shift_factor;            //page skew correction
  float final_pitch;             //output pitch
  float row_y;                   //baseline
  STATS projection;              //entire page
  STATS pitches (0, MAX_ALLOWED_PITCH);
  //for median
  float sp_sd;                   //space sd
  inT16 mid_cuts;                //no of cheap cuts
  float pitch_sd;                //sync rating

  if (block_it.empty ()
    //      || block_it.data()==block_it.data_relative(1)
    || !textord_blockndoc_fixed)
    return FALSE;
  shift_factor = gradient / (gradient * gradient + 1);
  row_it.set_to_list (block_it.data ()->get_rows ());
  if (row_it.empty ())
    return FALSE;                //no reference row to anchor the page on
  // The reference row also receives the page-level cells from
  // tune_row_pitch() below, so keep hold of it: `row` is reused as a loop
  // variable later on.
  master_row = row_it.data ();
  master_x = master_row->projection_left;
  master_y = master_row->baseline.y (master_x);
  projection_left = MAX_INT16;
  projection_right = -MAX_INT16;
  prop_blocks = 0;               //never updated; printed as 0 below
  fixed_blocks = 0;              //never updated; printed as 0 below
  total_row_count = 0;

  // Pass 1: collect the pitches and the horizontal extent of the page
  // projection.
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
  block_it.forward ()) {
    block = block_it.data ();
    row_it.set_to_list (block->get_rows ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      total_row_count++;
      if (row->fixed_pitch > 0)
        pitches.add ((inT32) (row->fixed_pitch), 1);
      //find median
      row_y = row->baseline.y (master_x);
      row_left =
        (inT16) (row->projection_left -
        shift_factor * (master_y - row_y));
      row_right =
        (inT16) (row->projection_right -
        shift_factor * (master_y - row_y));
      if (row_left < projection_left)
        projection_left = row_left;
      if (row_right > projection_right)
        projection_right = row_right;
    }
  }
  if (pitches.get_total () == 0)
    return FALSE;
  projection.set_range (projection_left, projection_right);

  // Pass 2: add every row's projection, shifted, into the page projection.
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
  block_it.forward ()) {
    block = block_it.data ();
    row_it.set_to_list (block->get_rows ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      row_y = row->baseline.y (master_x);
      row_left =
        (inT16) (row->projection_left -
        shift_factor * (master_y - row_y));
      for (x = row->projection_left; x < row->projection_right;
      x++, row_left++) {
        projection.add (row_left, row->projection.pile_count (x));
      }
    }
  }

#ifndef GRAPHICS_DISABLED
  if (textord_show_page_cuts && to_win != NULL)
    projection.plot (to_win, projection_left,
      master_row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
#endif
  final_pitch = pitches.ile (0.5);
  pitch = (inT16) final_pitch;
  pitch_sd =
    tune_row_pitch (master_row, &projection, projection_left,
    projection_right, pitch * 0.75, final_pitch, sp_sd, mid_cuts,
    &master_row->char_cells, FALSE);

  if (textord_debug_pitch_metric)
    tprintf
      ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
      prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
      pitch_sd / total_row_count, pitch_sd / pitch,
      pitch_sd / total_row_count / pitch);

#ifndef GRAPHICS_DISABLED
  if (textord_show_page_cuts && to_win != NULL) {
    ICOORDELT_LIST *master_cells = &master_row->char_cells;
    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
    block_it.forward ()) {
      block = block_it.data ();
      row_it.set_to_list (block->get_rows ());
      for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
      row_it.forward ()) {
        row = row_it.data ();
        row_y = row->baseline.y (master_x);
        const float row_shift = shift_factor * (master_y - row_y);
        plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
      }
    }
  }
#endif
  // Discard the page-level cells from the row they were stored in.  This
  // used to go through `row`, which after the plotting loop above points at
  // the last row visited rather than the reference row.
  master_row->char_cells.clear ();
  return FALSE;
}
BOOL8 try_rows_fixed ( TO_BLOCK *  block,
inT32  block_index,
BOOL8  testing_on 
)

Definition at line 552 of file topitch.cpp.

                      {
  // Runs fixed_pitch_row() on every row of `block` that has a positive
  // fixed_pitch, then sets block->pitch_decision from the per-decision row
  // counts returned by count_block_votes().  Always returns FALSE.
  // testing_on (together with the debug/blocksall flags) only enables the
  // "Initially:" count print-out.
  inT32 def_fixed = 0;
  inT32 maybe_fixed = 0;
  inT32 corr_fixed = 0;
  inT32 def_prop = 0;
  inT32 maybe_prop = 0;
  inT32 corr_prop = 0;
  inT32 dunno = 0;
  TO_ROW_IT rows = block->get_rows ();

  rows.mark_cycle_pt ();
  while (!rows.cycled_list ()) {
    TO_ROW *current = rows.data ();
    ASSERT_HOST (current->xheight > 0);
    // fixed_pitch_row() may change current->fixed_pitch (it is handed to
    // tune_row_pitch() by reference), hence the re-test against zero.
    if (current->fixed_pitch > 0
        && fixed_pitch_row(current, block->block, block_index)
        && current->fixed_pitch == 0) {
      const float prop_kern = current->pr_nonsp;
      const float prop_space = current->pr_space;
      current->space_size = prop_space;
      current->kern_size = prop_kern;
    }
    rows.forward ();
  }

  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop,
                    corr_fixed, corr_prop, dunno);
  if (testing_on
      && (textord_debug_pitch_test
          || textord_blocksall_prop || textord_blocksall_fixed)) {
    tprintf ("Initially:");
    print_block_counts(block, block_index);
  }

  // Definite rows decide first; "maybe" rows are consulted only when the
  // block has no definite rows at all.
  const bool no_definite_rows = !(def_fixed > 0 || def_prop > 0);
  if (def_fixed > def_prop * textord_words_veto_power) {
    block->pitch_decision = PITCH_DEF_FIXED;
  }
  else if (def_prop > def_fixed * textord_words_veto_power) {
    block->pitch_decision = PITCH_DEF_PROP;
  }
  else if (no_definite_rows
           && maybe_fixed > maybe_prop * textord_words_veto_power) {
    block->pitch_decision = PITCH_MAYBE_FIXED;
  }
  else if (no_definite_rows
           && maybe_prop > maybe_fixed * textord_words_veto_power) {
    block->pitch_decision = PITCH_MAYBE_PROP;
  }
  else {
    block->pitch_decision = PITCH_DUNNO;
  }
  return FALSE;
}
float tune_row_pitch ( TO_ROW *  row,
STATS *  projection,
inT16  projection_left,
inT16  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
inT16 &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
BOOL8  testing_on 
)

Definition at line 1150 of file topitch.cpp.

                      {
  // Searches integer offsets around initial_pitch for the pitch giving the
  // lowest compute_pitch_sd() result.  On return initial_pitch holds the
  // best pitch found, best_sp_sd / best_mid_cuts / best_cells hold the
  // outputs of the winning evaluation, and the return value is its sd.
  // Two flags short-circuit the search:
  //   textord_fast_pitch_test    - delegate entirely to tune_row_pitch2();
  //   textord_disable_pitch_test - return initial_pitch unchanged (also
  //                                copied into best_sp_sd).
  ICOORDELT_LIST trial_cells;    // cells produced by the pitch under test
  ICOORDELT_IT best_it;          // used to splice trial_cells into best_cells

  if (textord_fast_pitch_test)
    return tune_row_pitch2 (row, projection, projection_left,
      projection_right, space_size, initial_pitch,
      best_sp_sd, best_mid_cuts, best_cells, testing_on);
  if (textord_disable_pitch_test) {
    best_sp_sd = initial_pitch;
    return initial_pitch;
  }

  // Baseline: the starting pitch writes straight into the "best" outputs.
  const float initial_sd =
    compute_pitch_sd(row, projection, projection_left, projection_right,
                     space_size, initial_pitch, best_sp_sd, best_mid_cuts,
                     best_cells, testing_on);
  float best_sd = initial_sd;
  float best_pitch = initial_pitch;
  if (testing_on)
    tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);

  // Try larger pitches first (direction +1), then smaller ones (-1).  Each
  // direction is abandoned as soon as a result is worse than the baseline.
  for (int direction = 1; direction >= -1; direction -= 2) {
    for (int pitch_delta = 1; pitch_delta <= textord_pitch_range;
         pitch_delta++) {
      const float trial_pitch = initial_pitch + direction * pitch_delta;
      inT16 trial_mid_cuts;
      float trial_sp_sd;
      const float trial_sd =
        compute_pitch_sd (row, projection, projection_left, projection_right,
        space_size, trial_pitch, trial_sp_sd,
        trial_mid_cuts, &trial_cells, testing_on);
      if (testing_on)
        tprintf ("testing pitch at %g, sd=%g\n", trial_pitch, trial_sd);
      if (trial_sd < best_sd) {
        best_sd = trial_sd;
        best_mid_cuts = trial_mid_cuts;
        best_sp_sd = trial_sp_sd;
        best_pitch = trial_pitch;
        // Replace the best cells with the ones just computed.
        best_cells->clear ();
        best_it.set_to_list (best_cells);
        best_it.add_list_after (&trial_cells);
      }
      else {
        trial_cells.clear ();
      }
      if (trial_sd > initial_sd)
        break;                   //getting worse
    }
  }
  initial_pitch = best_pitch;

  if (textord_debug_pitch_metric)
    print_pitch_sd(row, projection, projection_left, projection_right,
                   space_size, best_pitch);

  return best_sd;
}
float tune_row_pitch2 ( TO_ROW *  row,
STATS *  projection,
inT16  projection_left,
inT16  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
inT16 &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
BOOL8  testing_on 
)

Definition at line 1262 of file topitch.cpp.

                       {
  // Faster pitch tuner.  For every candidate period in
  // [pitch - textord_pitch_range, pitch + textord_pitch_range] the row
  // projection is folded modulo that period into a histogram; the period
  // and phase with the smallest folded count win.  initial_pitch is updated
  // to the winning period and compute_pitch_sd() is run once on it, limited
  // to the [start, end] window found around the winning phase.
  // Returns that sd.  With textord_disable_pitch_test set, initial_pitch is
  // returned unchanged; best_sp_sd is preset to initial_pitch in all cases.
  int pitch_delta;               //offset pitch
  inT16 pixel;                   //pixel coord
  inT16 best_pixel;              //pixel coord
  inT16 best_delta;              //best pitch
  inT16 best_pitch;              //best pitch
  inT16 start;                   //of good range
  inT16 end;                     //of good range
  inT32 best_count;              //lowest sum
  float best_sd;                 //best result
  STATS *sum_proj;               //summed projection

  best_sp_sd = initial_pitch;

  if (textord_disable_pitch_test) {
    return initial_pitch;
  }
  sum_proj = new STATS[textord_pitch_range * 2 + 1];
  if (sum_proj == NULL)
    return initial_pitch;
  best_pitch = (inT32) initial_pitch;

  // sum_proj[textord_pitch_range + d] holds the histogram for period
  // best_pitch + d.
  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
    pitch_delta++)
  sum_proj[textord_pitch_range + pitch_delta].set_range (0,
      best_pitch +
      pitch_delta + 1);
  for (pixel = projection_left; pixel <= projection_right; pixel++) {
    for (pitch_delta = -textord_pitch_range;
      pitch_delta <= textord_pitch_range; pitch_delta++) {
      const int period = best_pitch + pitch_delta;
      // A period of zero would be a modulo by zero (undefined behaviour)
      // and a negative one is meaningless.  Such histograms are never
      // scanned below, so skipping them changes no result.
      if (period <= 0)
        continue;
      sum_proj[textord_pitch_range +
          pitch_delta].add ((pixel - projection_left) % period,
          projection->pile_count (pixel));
    }
  }
  // Look for the (period, phase) pair with the lowest folded count.
  best_count = sum_proj[textord_pitch_range].pile_count (0);
  best_delta = 0;
  best_pixel = 0;
  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
  pitch_delta++) {
    for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
      if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
      < best_count) {
        best_count =
          sum_proj[textord_pitch_range +
          pitch_delta].pile_count (pixel);
        best_delta = pitch_delta;
        best_pixel = pixel;
      }
    }
  }
  if (testing_on)
    tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
      initial_pitch, best_delta, best_count);
  best_pitch += best_delta;
  initial_pitch = best_pitch;
  // Acceptance level for the window search: twice (minimum count + 1).
  best_count++;
  best_count += best_count;
  // Grow the window left and right from the best phase while the folded
  // count stays at or below that level, at most one period each way.
  for (start = best_pixel - 2; start > best_pixel - best_pitch
    && sum_proj[textord_pitch_range +
    best_delta].pile_count (start % best_pitch) <= best_count;
    start--);
  for (end = best_pixel + 2;
    end < best_pixel + best_pitch
    && sum_proj[textord_pitch_range +
    best_delta].pile_count (end % best_pitch) <= best_count;
    end++);

  best_sd =
    compute_pitch_sd(row,
                     projection,
                     projection_left,
                     projection_right,
                     space_size,
                     initial_pitch,
                     best_sp_sd,
                     best_mid_cuts,
                     best_cells,
                     testing_on,
                     start,
                     end);
  if (testing_on)
    tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
      best_sd);

  if (textord_debug_pitch_metric)
    print_pitch_sd(row,
                   projection,
                   projection_left,
                   projection_right,
                   space_size,
                   initial_pitch);

  delete[]sum_proj;

  return best_sd;
}

Variable Documentation

double textord_balance_factor = 2.0

"Ding rate for unbalanced char cells"

Definition at line 61 of file topitch.cpp.

"Attempt whole doc/block fixed pitch"

Definition at line 58 of file topitch.cpp.

"Write full metric stuff"

Definition at line 52 of file topitch.cpp.

"Debug on fixed pitch test"

Definition at line 46 of file topitch.cpp.

"Do even faster pitch algorithm"

Definition at line 50 of file topitch.cpp.

"Use correct answer for fixed/prop"

Definition at line 56 of file topitch.cpp.

double textord_projection_scale = 0.125

"Ding rate for mid-cuts"

Definition at line 59 of file topitch.cpp.

"Draw page-level cuts"

Definition at line 54 of file topitch.cpp.

"Draw row-level cuts"

Definition at line 53 of file topitch.cpp.