Tesseract  3.02
tesseract::Textord Class Reference

#include <textord.h>

List of all members.

Public Member Functions

 Textord (CCStruct *ccstruct)
 ~Textord ()
void TextordPage (PageSegMode pageseg_mode, int width, int height, Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void CleanupSingleRowResult (PageSegMode pageseg_mode, PAGE_RES *page_res)
bool use_cjk_fp_model () const
void set_use_cjk_fp_model (bool flag)
void to_spacing (ICOORD page_tr, TO_BLOCK_LIST *blocks)
ROW * make_prop_words (TO_ROW *row, FCOORD rotation)
ROW * make_blob_words (TO_ROW *row, FCOORD rotation)
void find_components (Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
void filter_blobs (ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on)

Public Attributes

bool textord_single_height_mode = false
bool tosp_old_to_method = false
bool tosp_old_to_constrain_sp_kn = false
bool tosp_only_use_prop_rows = true
bool tosp_force_wordbreak_on_punct = false
bool tosp_use_pre_chopping = false
bool tosp_old_to_bug_fix = false
bool tosp_block_use_cert_spaces = true
bool tosp_row_use_cert_spaces = true
bool tosp_narrow_blobs_not_cert = true
bool tosp_row_use_cert_spaces1 = true
bool tosp_recovery_isolated_row_stats = true
bool tosp_only_small_gaps_for_kern = false
bool tosp_all_flips_fuzzy = false
bool tosp_fuzzy_limit_all = true
bool tosp_stats_use_xht_gaps = true
bool tosp_use_xht_gaps = true
bool tosp_only_use_xht_gaps = false
bool tosp_rule_9_test_punct = false
bool tosp_flip_fuzz_kn_to_sp = true
bool tosp_flip_fuzz_sp_to_kn = true
bool tosp_improve_thresh = false
int tosp_debug_level = 0
int tosp_enough_space_samples_for_median = 3
int tosp_redo_kern_limit = 10
int tosp_few_samples = 40
int tosp_short_row = 20
int tosp_sanity_method = 1
double tosp_old_sp_kn_th_factor = 2.0
double tosp_threshold_bias1 = 0
double tosp_threshold_bias2 = 0
double tosp_narrow_fraction = 0.3
double tosp_narrow_aspect_ratio = 0.48
double tosp_wide_fraction = 0.52
double tosp_wide_aspect_ratio = 0.0
double tosp_fuzzy_space_factor = 0.6
double tosp_fuzzy_space_factor1 = 0.5
double tosp_fuzzy_space_factor2 = 0.72
double tosp_gap_factor = 0.83
double tosp_kern_gap_factor1 = 2.0
double tosp_kern_gap_factor2 = 1.3
double tosp_kern_gap_factor3 = 2.5
double tosp_ignore_big_gaps = -1
double tosp_ignore_very_big_gaps = 3.5
double tosp_rep_space = 1.6
double tosp_enough_small_gaps = 0.65
double tosp_table_kn_sp_ratio = 2.25
double tosp_table_xht_sp_ratio = 0.33
double tosp_table_fuzzy_kn_sp_ratio = 3.0
double tosp_fuzzy_kn_fraction = 0.5
double tosp_fuzzy_sp_fraction = 0.5
double tosp_min_sane_kn_sp = 1.5
double tosp_init_guess_kn_mult = 2.2
double tosp_init_guess_xht_mult = 0.28
double tosp_max_sane_kn_thresh = 5.0
double tosp_flip_caution = 0.0
double tosp_large_kerning = 0.19
double tosp_dont_fool_with_small_kerns = -1
double tosp_near_lh_edge = 0
double tosp_silly_kn_sp_gap = 0.2
double tosp_pass_wide_fuzz_sp_to_context = 0.75
bool textord_no_rejects = false
bool textord_show_blobs = false
bool textord_show_boxes = false
int textord_max_noise_size = 7
double textord_blob_size_bigile = 95
double textord_noise_area_ratio = 0.7
double textord_blob_size_smallile = 20
double textord_initialx_ile = 0.75
double textord_initialasc_ile = 0.90
int textord_noise_sizefraction = 10
double textord_noise_sizelimit = 0.5
int textord_noise_translimit = 16
double textord_noise_normratio = 2.0
bool textord_noise_rejwords = true
bool textord_noise_rejrows = true
double textord_noise_syfract = 0.2
double textord_noise_sxfract = 0.4
double textord_noise_hfract = 1.0/64
int textord_noise_sncount = 1
double textord_noise_rowratio = 6.0
bool textord_noise_debug = 0
double textord_blshift_maxshift = 0.00
double textord_blshift_xfraction = 9.99

Detailed Description

Definition at line 39 of file textord.h.


Constructor & Destructor Documentation

tesseract::Textord::Textord ( CCStruct *  ccstruct) [explicit]

Definition at line 29 of file textord.cpp.

    // Constructor initializer list. Stores the ccstruct pointer, clears the
    // CJK fixed-pitch flag, and then initializes every tunable parameter
    // through BOOL_MEMBER / INT_MEMBER / double_MEMBER. Each macro invocation
    // takes (member name, default value, help string, ccstruct_->params()).
    // The constructor body itself is empty.
    // NOTE(review): the macros are defined outside this listing; presumably
    // they register the parameter in the supplied params collection - confirm.
    : ccstruct_(ccstruct), use_cjk_fp_model_(false),
      // makerow.cpp ///////////////////////////////////////////
      BOOL_MEMBER(textord_single_height_mode, false,
                  "Script has no xheight, so use a single mode",
                  ccstruct_->params()),
      // tospace.cpp ///////////////////////////////////////////
      // NOTE(review): this help string is identical to the one given to
      // tosp_use_pre_chopping further down - confirm it describes this flag.
      BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false,
                  "Constrain relative values of inter and intra-word gaps for "
                  "old_to_method.",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_only_use_prop_rows, true,
                  "Block stats to use fixed pitch rows?",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_force_wordbreak_on_punct, false,
                  "Force word breaks on punct to break long lines in non-space "
                  "delimited langs",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_use_pre_chopping, false,
                  "Space stats use prechopping?",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code",
                  ccstruct_->params()),
      // NOTE(review): the next four flags all share the help string
      // "Only stat OBVIOUS spaces" - confirm each description is intended.
      BOOL_MEMBER(tosp_block_use_cert_spaces, true,
                  "Only stat OBVIOUS spaces",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_narrow_blobs_not_cert, true,
            "Only stat OBVIOUS spaces",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_recovery_isolated_row_stats, true,
                  "Use row alone when inadequate cert spaces",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_fuzzy_limit_all, true,
                  "Dont restrict kn->sp fuzzy limit to tables",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_stats_use_xht_gaps, true,
                  "Use within xht gap for wd breaks",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_only_use_xht_gaps, false,
                  "Only use within xht gap for wd breaks",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_rule_9_test_punct, false,
                  "Dont chng kn to space next to punct",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip",
                  ccstruct_->params()),
      BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic",
                  ccstruct_->params()),
      INT_MEMBER(tosp_debug_level, 0, "Debug data",
                 ccstruct_->params()),
      INT_MEMBER(tosp_enough_space_samples_for_median, 3,
           "or should we use mean",
                 ccstruct_->params()),
      INT_MEMBER(tosp_redo_kern_limit, 10,
                 "No.samples reqd to reestimate for row",
                 ccstruct_->params()),
      INT_MEMBER(tosp_few_samples, 40,
                 "No.gaps reqd with 1 large gap to treat as a table",
                 ccstruct_->params()),
      INT_MEMBER(tosp_short_row, 20,
                 "No.gaps reqd with few cert spaces to use certs",
                 ccstruct_->params()),
      INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly",
                 ccstruct_->params()),
      double_MEMBER(tosp_old_sp_kn_th_factor, 2.0,
                    "Factor for defining space threshold in terms of space and "
                    "kern sizes",
                    ccstruct_->params()),
      double_MEMBER(tosp_threshold_bias1, 0,
                    "how far between kern and space?",
                    ccstruct_->params()),
      double_MEMBER(tosp_threshold_bias2, 0,
                    "how far between kern and space?",
                    ccstruct_->params()),
      double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow",
                    ccstruct_->params()),
      double_MEMBER(tosp_narrow_aspect_ratio, 0.48,
                    "narrow if w/h less than this",
                    ccstruct_->params()),
      double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide",
                    ccstruct_->params()),
      // NOTE(review): the help string says "less than", the same wording as
      // the narrow ratio above - verify whether "more than" was meant.
      double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this",
                    ccstruct_->params()),
      double_MEMBER(tosp_fuzzy_space_factor, 0.6,
                    "Fract of xheight for fuzz sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_fuzzy_space_factor1, 0.5,
                    "Fract of xheight for fuzz sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_fuzzy_space_factor2, 0.72,
                    "Fract of xheight for fuzz sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern",
                    ccstruct_->params()),
      double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp",
                    ccstruct_->params()),
      double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier",
                    ccstruct_->params()),
      double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier",
                    ccstruct_->params()),
      double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space",
                    ccstruct_->params()),
      double_MEMBER(tosp_enough_small_gaps, 0.65,
                    "Fract of kerns reqd for isolated row stats",
                    ccstruct_->params()),
      double_MEMBER(tosp_table_kn_sp_ratio, 2.25,
                    "Min difference of kn & sp in table",
                    ccstruct_->params()),
      double_MEMBER(tosp_table_xht_sp_ratio, 0.33,
                    "Expect spaces bigger than this",
                    ccstruct_->params()),
      double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0,
                    "Fuzzy if less than this",
                    ccstruct_->params()),
      double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg",
                    ccstruct_->params()),
      double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg",
                    ccstruct_->params()),
      double_MEMBER(tosp_min_sane_kn_sp, 1.5,
                    "Dont trust spaces less than this time kn",
                    ccstruct_->params()),
      double_MEMBER(tosp_init_guess_kn_mult, 2.2,
                    "Thresh guess - mult kn by this",
                    ccstruct_->params()),
      double_MEMBER(tosp_init_guess_xht_mult, 0.28,
                    "Thresh guess - mult xht by this",
                    ccstruct_->params()),
      double_MEMBER(tosp_max_sane_kn_thresh, 5.0,
                    "Multiplier on kn to limit thresh",
                    ccstruct_->params()),
      double_MEMBER(tosp_flip_caution, 0.0,
                    "Dont autoflip kn to sp when large separation",
                    ccstruct_->params()),
      double_MEMBER(tosp_large_kerning, 0.19,
                    "Limit use of xht gap with large kns",
                    ccstruct_->params()),
      double_MEMBER(tosp_dont_fool_with_small_kerns, -1,
                    "Limit use of xht gap with odd small kns",
                    ccstruct_->params()),
      double_MEMBER(tosp_near_lh_edge, 0,
                    "Dont reduce box if the top left is non blank",
                    ccstruct_->params()),
      double_MEMBER(tosp_silly_kn_sp_gap, 0.2,
                    "Dont let sp minus kn get too small",
                    ccstruct_->params()),
      double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75,
                    "How wide fuzzies need context",
                    ccstruct_->params()),
      // tordmain.cpp ///////////////////////////////////////////
      BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs",
                  ccstruct_->params()),
      BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs",
                  ccstruct_->params()),
      // NOTE(review): same help string as textord_show_blobs above, although
      // filter_blobs uses this flag to plot box lists - confirm the wording.
      BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs",
                  ccstruct_->params()),
      INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise",
                  ccstruct_->params()),
      double_MEMBER(textord_blob_size_bigile, 95, "Percentile for large blobs",
                    ccstruct_->params()),
      double_MEMBER(textord_noise_area_ratio, 0.7,
                    "Fraction of bounding box for noise",
                    ccstruct_->params()),
      double_MEMBER(textord_blob_size_smallile, 20,
                    "Percentile for small blobs",
                    ccstruct_->params()),
      double_MEMBER(textord_initialx_ile, 0.75,
                    "Ile of sizes for xheight guess",
                    ccstruct_->params()),
      // NOTE(review): help string repeats the xheight wording used for
      // textord_initialx_ile above - confirm it fits the "asc" parameter.
      double_MEMBER(textord_initialasc_ile, 0.90,
                    "Ile of sizes for xheight guess",
                    ccstruct_->params()),
      INT_MEMBER(textord_noise_sizefraction, 10,
                 "Fraction of size for maxima",
                 ccstruct_->params()),
      double_MEMBER(textord_noise_sizelimit, 0.5,
                    "Fraction of x for big t count",
                    ccstruct_->params()),
      INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob",
                 ccstruct_->params()),
      double_MEMBER(textord_noise_normratio, 2.0,
                    "Dot to norm ratio for deletion",
                    ccstruct_->params()),
      BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words",
                  ccstruct_->params()),
      BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows",
                  ccstruct_->params()),
      double_MEMBER(textord_noise_syfract, 0.2,
                    "xh fract height error for norm blobs",
                    ccstruct_->params()),
      double_MEMBER(textord_noise_sxfract, 0.4,
                    "xh fract width error for norm blobs",
                    ccstruct_->params()),
      double_MEMBER(textord_noise_hfract, 1.0/64,
                    "Height fraction to discard outlines as speckle noise",
                    ccstruct_->params()),
      INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row",
                 ccstruct_->params()),
      // NOTE(review): same help string as textord_noise_normratio above.
      double_MEMBER(textord_noise_rowratio, 6.0,
                    "Dot to norm ratio for deletion",
                    ccstruct_->params()),
      BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector",
                  ccstruct_->params()),
      double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift",
                    ccstruct_->params()),
      double_MEMBER(textord_blshift_xfraction, 9.99,
                    "Min size of baseline shift",
                    ccstruct_->params()) {
}
tesseract::Textord::~Textord ( )

Definition at line 256 of file textord.cpp.

                  {
  // Empty body: no explicit cleanup is performed here.
}

Member Function Documentation

void tesseract::Textord::CleanupSingleRowResult ( PageSegMode  pageseg_mode,
PAGE_RES *  page_res 
)

Definition at line 334 of file textord.cpp.

                                                         {
  // Nothing is pruned when the page segmentation mode has line finding on.
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    return;
  }

  PAGE_RES_IT res_it(page_res);

  // Pass 1: locate the row whose words have the highest mean certainty.
  ROW_RES* winning_row = NULL;
  float winning_conf = 0.0f;
  float conf_sum = 0.0f;
  int words_seen = 0;
  res_it.restart_page();
  while (res_it.word() != NULL) {
    WERD_RES* current = res_it.word();
    conf_sum += current->best_choice->certainty();
    ++words_seen;
    if (res_it.next_row() != res_it.row()) {
      // The row changes after this word: convert the running sum to a mean
      // and compare it with the best row found so far.
      const float row_mean = conf_sum / words_seen;
      if (winning_row == NULL || winning_conf < row_mean) {
        winning_row = res_it.row();
        winning_conf = row_mean;
      }
      conf_sum = 0.0f;
      words_seen = 0;
    }
    res_it.forward();
  }

  // Pass 2: delete every word that does not belong to the winning row.
  res_it.restart_page();
  while (res_it.word() != NULL) {
    if (res_it.row() != winning_row) {
      res_it.DeleteCurrentWord();
    }
    res_it.forward();
  }
}
void tesseract::Textord::filter_blobs ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks,
BOOL8  testing_on 
)

Definition at line 239 of file tordmain.cpp.

                                             {     // for plotting
  // Walks every TO_BLOCK in |blocks|, splits its blobs with
  // filter_noise_blobs(), derives the block's line metrics from the result
  // and, unless graphics are compiled out, optionally plots the blob lists.
  TO_BLOCK_IT to_block_it(blocks);

  #ifndef GRAPHICS_DISABLED
  // Start from a clean window if one already exists.
  if (to_win != NULL) {
    to_win->Clear();
  }
  #endif  // GRAPHICS_DISABLED

  to_block_it.mark_cycle_pt();
  while (!to_block_it.cycled_list()) {
    TO_BLOCK* current = to_block_it.data();

    current->line_size = filter_noise_blobs(&current->blobs,
                                            &current->noise_blobs,
                                            &current->small_blobs,
                                            &current->large_blobs);
    // Line spacing is computed from the unscaled line size, so this must
    // happen before line_size is multiplied by textord_min_linesize.
    current->line_spacing = current->line_size *
        (tesseract::CCStruct::kDescenderFraction +
         tesseract::CCStruct::kXHeightFraction +
         2 * tesseract::CCStruct::kAscenderFraction) /
         tesseract::CCStruct::kXHeightFraction;
    current->line_size *= textord_min_linesize;
    current->max_blob_size = current->line_size * textord_excess_blobsize;

    #ifndef GRAPHICS_DISABLED
    if (textord_show_blobs && testing_on) {
      if (to_win == NULL) {
        create_to_win(page_tr);
      }
      current->plot_graded_blobs(to_win);
    }
    if (textord_show_boxes && testing_on) {
      if (to_win == NULL) {
        create_to_win(page_tr);
      }
      plot_box_list(to_win, &current->noise_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &current->small_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &current->large_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &current->blobs, ScrollView::WHITE);
    }
    #endif  // GRAPHICS_DISABLED

    to_block_it.forward();
  }
}
void tesseract::Textord::find_components ( Pix *  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 208 of file tordmain.cpp.

                                                        {
  // Extracts edges for every text (or untyped) block, assigns the resulting
  // blobs to TO_BLOCKs and then runs filter_blobs() over them.
  const int img_width = pixGetWidth(pix);
  const int img_height = pixGetHeight(pix);

  // Either dimension exceeding MAX_INT16 is rejected up front.
  const bool oversized = img_width > MAX_INT16 || img_height > MAX_INT16;
  if (oversized) {
    tprintf("Input image too large! (%d, %d)\n", img_width, img_height);
    return;  // Can't handle it.
  }

  set_global_loc_code(LOC_EDGE_PROG);

  BLOCK_IT blk_it(blocks);
  blk_it.mark_cycle_pt();
  while (!blk_it.cycled_list()) {
    BLOCK* current = blk_it.data();
    // Blocks without a poly_block are processed as well as text blocks.
    if (current->poly_block() == NULL || current->poly_block()->IsText()) {
      extract_edges(pix, current);
    }
    blk_it.forward();
  }

  assign_blobs_to_blocks2(pix, blocks, to_blocks);

  ICOORD page_tr(img_width, img_height);
  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
}
ROW * tesseract::Textord::make_blob_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 1183 of file tospace.cpp.

                                {
  // Builds a ROW in which every blob (together with any following blobs that
  // are flagged joined_to_prev()) becomes its own WERD.
  // Returns the new ROW, or NULL when the row has no blobs.
  // |rotation| is not read anywhere in this body.
  bool at_line_start;            // next word made is the first of the line
  ROW *out_row;                  // row handed back to the caller
  C_OUTLINE_IT outline_it;
  C_BLOB_LIST pending_blobs;     // blobs collected for the word being built
  C_BLOB_IT pending_it = &pending_blobs;
  WERD_LIST made_words;
  WERD_IT made_it;
  WERD *new_word;
  double coeffs[3];              // quadratic
  BLOBNBOX *cur_blob;
  TBOX cur_box;
  BLOBNBOX_IT blob_it;
  inT16 words_made = 0;

  pending_it.set_to_list(&pending_blobs);
  blob_it.set_to_list(row->blob_list());
  made_it.set_to_list(&made_words);
  at_line_start = TRUE;

  if (blob_it.empty()) {
    return NULL;
  }

  do {
    cur_blob = blob_it.data();
    cur_box = cur_blob->bounding_box();
    if (cur_blob->joined_to_prev()) {
      if (cur_blob->cblob() != NULL) {
        // Append this blob's outlines to the blob collected last, then
        // discard the emptied blob.
        outline_it.set_to_list(pending_it.data()->out_list());
        outline_it.move_to_last();
        outline_it.add_list_after(cur_blob->cblob()->out_list());
        delete cur_blob->cblob();
      }
    } else if (cur_blob->cblob() != NULL) {
      pending_it.add_after_then_move(cur_blob->cblob());
    }

    // Look at the following blob to decide whether the word is finished.
    blob_it.forward();
    cur_blob = blob_it.data();
    cur_box = cur_blob->bounding_box();

    if (!cur_blob->joined_to_prev() && !pending_blobs.empty()) {
      new_word = new WERD(&pending_blobs, 1, NULL);
      words_made++;
      made_it.add_after_then_move(new_word);
      if (at_line_start) {
        new_word->set_flag(W_BOL, TRUE);
        at_line_start = FALSE;
      }
      if (blob_it.at_first()) {
        // The iterator wrapped round, so this word ends the line.
        new_word->set_flag(W_EOL, TRUE);
      }
    }
  } while (!blob_it.at_first());  // until back at start

  /* Setup the row with created words. */
  coeffs[0] = 0;
  coeffs[1] = row->line_m();
  coeffs[2] = row->line_c();
  out_row = new ROW(row,
                    static_cast<inT16>(row->kern_size),
                    static_cast<inT16>(row->space_size));
  made_it.set_to_list(out_row->word_list());
  // Move the collected words into the row.
  made_it.add_list_after(&made_words);
  out_row->recalc_bounding_box();

  if (tosp_debug_level > 4) {
    tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
      words_made,
      out_row->bounding_box().left(),
      out_row->bounding_box().bottom(),
      out_row->bounding_box().right(),
      out_row->bounding_box().top());
  }
  return out_row;
}
ROW * tesseract::Textord::make_prop_words ( TO_ROW *  row,
FCOORD  rotation 
)

Definition at line 886 of file tospace.cpp.

                                {
  // Builds a ROW of words from the blobs in row->blob_list().
  //
  // Blobs flagged joined_to_prev() have their outlines appended to the blob
  // collected before them. A new WERD is formed from the collected blobs when
  // the next blob lies to the right of the next repeated-char word, when
  // make_a_word_break() asks for a break, or when the blob iterator wraps
  // back to the first blob. Words taken from row->rep_words are inserted
  // before the first blob, between words, and after the last blob as their
  // position requires.
  //
  // Returns the newly allocated ROW, or NULL when the row has no blobs.
  // |rotation| is not read anywhere in this body.
  BOOL8 bol;                     //start of line
  /* prev_ values are for start of word being built. non prev_ values are for
  the gap between the word being built and the next one. */
  BOOL8 prev_fuzzy_sp;           //probably space
  BOOL8 prev_fuzzy_non;          //probably not
  uinT8 prev_blanks;             //in front of word
  BOOL8 fuzzy_sp;                //probably space
  BOOL8 fuzzy_non;               //probably not
  uinT8 blanks;                  //in front of word
  BOOL8 prev_gap_was_a_space = FALSE;
  BOOL8 break_at_next_gap = FALSE;
  ROW *real_row;                 //output row
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD_IT word_it;               //new words
  WERD *word;                    //new word
  WERD_IT rep_char_it;           //repeated char words
  inT32 next_rep_char_word_right = MAX_INT32;
  float repetition_spacing;      //gap between repetitions
  inT32 xstarts[2];              //row ends
  double coeffs[3];              //quadratic
  inT32 prev_x;                  //end of prev blob
  BLOBNBOX *bblob;               //current blob
  TBOX blob_box;                  //bounding box
  BLOBNBOX_IT box_it;            //iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  inT16 prev_gap = MAX_INT16;
  inT16 current_gap = MAX_INT16;
  inT16 next_gap = MAX_INT16;
  inT16 prev_within_xht_gap = MAX_INT16;
  inT16 current_within_xht_gap = MAX_INT16;
  inT16 next_within_xht_gap = MAX_INT16;
  inT16 word_count = 0;

  // MAX_INT32 acts as "no repeated-char word pending" so that the
  // blob_box.left() > next_rep_char_word_right tests below stay false.
  rep_char_it.set_to_list (&(row->rep_words));
  if (!rep_char_it.empty ()) {
    next_rep_char_word_right =
      rep_char_it.data ()->bounding_box ().right ();
  }

  prev_x = -MAX_INT16;
  cblob_it.set_to_list (&cblobs);
  box_it.set_to_list (row->blob_list ());
  word_it.set_to_list (&words);
  bol = TRUE;
  prev_blanks = 0;
  prev_fuzzy_sp = FALSE;
  prev_fuzzy_non = FALSE;
  if (!box_it.empty ()) {
    xstarts[0] = box_it.data ()->bounding_box ().left ();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract ();
      word_it.add_after_then_move (word);
      /* Set spaces before repeated char word */
      word->set_flag (W_BOL, TRUE);
      bol = FALSE;
      word->set_blanks (0);
                                 //NO uncertainty
      word->set_flag (W_FUZZY_SP, FALSE);
      word->set_flag (W_FUZZY_NON, FALSE);
      xstarts[0] = word->bounding_box ().left ();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing (word);
      current_gap = box_it.data ()->bounding_box ().left () -
        next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        prev_blanks = (uinT8) floor (current_gap / row->space_size);
        if (prev_blanks < 1)
          prev_blanks = 1;
      }
      else
        prev_blanks = 0;
      if (tosp_debug_level > 5)
        tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
          box_it.data ()->bounding_box ().left (),
          box_it.data ()->bounding_box ().bottom (),
          repetition_spacing, current_gap);
      prev_fuzzy_sp = FALSE;
      prev_fuzzy_non = FALSE;
      if (rep_char_it.empty ()) {
        next_rep_char_word_right = MAX_INT32;
      }
      else {
        rep_char_it.forward ();
        next_rep_char_word_right =
          rep_char_it.data ()->bounding_box ().right ();
      }
    }

    peek_at_next_gap(row,
                     box_it,
                     next_blob_box,
                     next_gap,
                     next_within_xht_gap);
    do {
      bblob = box_it.data ();
      blob_box = bblob->bounding_box ();
      if (bblob->joined_to_prev ()) {
        if (bblob->cblob () != NULL) {
          // Merge this blob's outlines into the blob collected last.
          cout_it.set_to_list (cblob_it.data ()->out_list ());
          cout_it.move_to_last ();
          cout_it.add_list_after (bblob->cblob ()->out_list ());
          delete bblob->cblob ();
        }
      } else {
        if (bblob->cblob() != NULL)
          cblob_it.add_after_then_move (bblob->cblob ());
        prev_x = blob_box.right ();
      }
      box_it.forward ();         //next one
      bblob = box_it.data ();
      blob_box = bblob->bounding_box ();

      if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
        /* Real Blob - not multiple outlines or pre-chopped */
        // Shift the prev/current/next gap window along by one blob.
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row,
                         box_it,
                         next_blob_box,
                         next_gap,
                         next_within_xht_gap);

        inT16 prev_gap_arg = prev_gap;
        inT16 next_gap_arg = next_gap;
        if (tosp_only_use_xht_gaps) {
          prev_gap_arg = prev_within_xht_gap;
          next_gap_arg = next_within_xht_gap;
        }
        // Decide if a word-break should be inserted
        if (blob_box.left () > next_rep_char_word_right ||
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
                              current_gap, current_within_xht_gap,
                              next_blob_box, next_gap_arg,
                              blanks, fuzzy_sp, fuzzy_non,
                              prev_gap_was_a_space,
                              break_at_next_gap) ||
            box_it.at_first()) {
          /* Form a new word out of the blobs collected */
          word = new WERD (&cblobs, prev_blanks, NULL);
          word_count++;
          word_it.add_after_then_move (word);
          if (bol) {
            word->set_flag (W_BOL, TRUE);
            bol = FALSE;
          }
          if (prev_fuzzy_sp)
                                 //probably space
            word->set_flag (W_FUZZY_SP, TRUE);
          else if (prev_fuzzy_non)
            word->set_flag (W_FUZZY_NON, TRUE);
          //probably not

          if (blob_box.left () > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            word = rep_char_it.extract ();
            word_it.add_after_then_move (word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing (word);
            current_gap = word->bounding_box ().left () - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks =
                (uinT8) floor (current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            }
            else
              blanks = 0;
            if (tosp_debug_level > 5)
              tprintf
                ("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
                word->bounding_box ().left (),
                word->bounding_box ().bottom (),
                repetition_spacing, current_gap, blanks);
            word->set_blanks (blanks);
                                 //NO uncertainty
            word->set_flag (W_FUZZY_SP, FALSE);
            word->set_flag (W_FUZZY_NON, FALSE);

            /* Set spaces after repeated char word (and leave current word set) */
            current_gap =
              blob_box.left () - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = (uinT8) (current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            }
            else
              blanks = 0;
            if (tosp_debug_level > 5)
              tprintf (" Rgap:%d (%d blanks)\n",
                current_gap, blanks);
            fuzzy_sp = FALSE;
            fuzzy_non = FALSE;

            if (rep_char_it.empty ()) {
              next_rep_char_word_right = MAX_INT32;
            }
            else {
              rep_char_it.forward ();
              next_rep_char_word_right =
                rep_char_it.data ()->bounding_box ().right ();
            }
          }

          if (box_it.at_first () && rep_char_it.empty ()) {
                                 //at end of line
            word->set_flag (W_EOL, TRUE);
            xstarts[1] = prev_x;
          }
          else {
            // Carry the gap decision forward to the word that starts here.
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    }
    while (!box_it.at_first ()); //until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty ()) {
      word = rep_char_it.extract ();
      word_it.add_after_then_move (word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing (word);
      current_gap = word->bounding_box ().left () - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = (uinT8) floor (current_gap / row->space_size);
        if (blanks < 1)
          blanks = 1;
      }
      else
        blanks = 0;
      // repetition_spacing is a float, so it is printed with %5.2f like the
      // other repeated-char debug messages in this function; a %d conversion
      // would not match the argument type passed through the varargs call.
      if (tosp_debug_level > 5)
        tprintf
          ("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
          word->bounding_box ().left (), word->bounding_box ().bottom (),
          repetition_spacing, current_gap, blanks);
      word->set_blanks (blanks);
                                 //NO uncertainty
      word->set_flag (W_FUZZY_SP, FALSE);
      word->set_flag (W_FUZZY_NON, FALSE);
      prev_x = word->bounding_box ().right ();
      if (rep_char_it.empty ()) {
                                 //at end of line
        word->set_flag (W_EOL, TRUE);
        xstarts[1] = prev_x;
      }
      else {
        rep_char_it.forward ();
      }
    }
    coeffs[0] = 0;
    coeffs[1] = row->line_m ();
    coeffs[2] = row->line_c ();
    real_row = new ROW (row,
      (inT16) row->kern_size, (inT16) row->space_size);
    word_it.set_to_list (real_row->word_list ());
                                 //put words in row
    word_it.add_list_after (&words);
    real_row->recalc_bounding_box ();

    if (tosp_debug_level > 4) {
      tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
        word_count,
        real_row->bounding_box ().left (),
        real_row->bounding_box ().bottom (),
        real_row->bounding_box ().right (),
        real_row->bounding_box ().top ());
    }
    return real_row;
  }
  return NULL;
}
void tesseract::Textord::set_use_cjk_fp_model ( bool  flag) [inline]

Definition at line 56 of file textord.h.

                                       {
    // Setter: stores the caller's flag in the use_cjk_fp_model_ member.
    // The value can be read back through use_cjk_fp_model().
    use_cjk_fp_model_ = flag;
  }
void tesseract::Textord::TextordPage ( PageSegMode  pageseg_mode,
int  width,
int  height,
Pix *  pix,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks 
)

Definition at line 260 of file textord.cpp.

                                                                        {
  page_tr_.set_x(width);
  page_tr_.set_y(height);
  if (to_blocks->empty()) {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(pix, blocks, to_blocks);
  } else {
    // AutoPageSeg does not need to find_components as it did that already.
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_tr_, to_blocks, true);
  }

  ASSERT_HOST(!to_blocks->empty());
  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
    const FCOORD anticlockwise90(0.0f, 1.0f);
    const FCOORD clockwise90(0.0f, -1.0f);
    TO_BLOCK_IT it(to_blocks);
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      TO_BLOCK* to_block = it.data();
      BLOCK* block = to_block->block;
      // Create a fake poly_block in block from its bounding box.
      block->set_poly_block(new POLY_BLOCK(block->bounding_box(),
                                           PT_VERTICAL_TEXT));
      // Rotate the to_block along with its contained block and blobnbox lists.
      to_block->rotate(anticlockwise90);
      // Set the block's rotation values to obey the convention followed in
      // layout analysis for vertical text.
      block->set_re_rotation(clockwise90);
      block->set_classify_rotation(clockwise90);
    }
  }

  TO_BLOCK_IT to_block_it(to_blocks);
  TO_BLOCK* to_block = to_block_it.data();
  // Make the rows in the block.
  float gradient;
  // Do it the old fashioned way.
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    gradient = make_rows(page_tr_, to_blocks);
  } else {
    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    gradient = make_single_row(page_tr_, to_block, to_blocks);
  }
  // Now fit baselines. For now only old mode is available.
  fit_rows(gradient, page_tr_, to_blocks);
  // Now make the words in the lines.
  if (PSM_WORD_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_LINE uses the old word maker on the single line.
    make_words(this, page_tr_, gradient, blocks, to_blocks);
  } else {
    // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
    // single word, and in SINGLE_CHAR mode, all the outlines
    // go in a single blob.
    TO_BLOCK* to_block = to_block_it.data();
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     to_block->get_rows(), to_block->block->row_list());
  }
  cleanup_blocks(blocks);  // Remove empties.

  // Compute the margins for each row in the block, to be used later for
  // paragraph detection.
  BLOCK_IT b_it(blocks);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    b_it.data()->compute_row_margins();
  }
#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif
}
void tesseract::Textord::to_spacing ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks 
)

Definition at line 35 of file tospace.cpp.

                           {
  TO_BLOCK_IT block_it;          //iterator
  TO_BLOCK *block;               //current block;
  TO_ROW_IT row_it;              //row iterator
  TO_ROW *row;                   //current row
  int block_index;               //block number
  int row_index;                 //row number
  //estimated width of real spaces for whole block
  inT16 block_space_gap_width;
  //estimated width of non space gaps for whole block
  inT16 block_non_space_gap_width;
  BOOL8 old_text_ord_proportional;//old fixed/prop result
  GAPMAP *gapmap = NULL;          //map of big vert gaps in blk

  block_it.set_to_list (blocks);
  block_index = 1;
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
  block_it.forward ()) {
    block = block_it.data ();
    gapmap = new GAPMAP (block);
    block_spacing_stats(block,
                        gapmap,
                        old_text_ord_proportional,
                        block_space_gap_width,
                        block_non_space_gap_width);
    // Make sure relative values of block-level space and non-space gap
    // widths are reasonable. The ratio of 1:3 is also used in
    // block_spacing_stats, to corrrect the block_space_gap_width
    // Useful for arabic and hindi, when the non-space gap width is
    // often over-estimated and should not be trusted. A similar ratio
    // is found in block_spacing_stats.
    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
        (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
      block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
    }
    row_it.set_to_list (block->get_rows ());
    row_index = 1;
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      if ((row->pitch_decision == PITCH_DEF_PROP) ||
      (row->pitch_decision == PITCH_CORR_PROP)) {
        if ((tosp_debug_level > 0) && !old_text_ord_proportional)
          tprintf ("Block %d Row %d: Now Proportional\n",
            block_index, row_index);
        row_spacing_stats(row,
                          gapmap,
                          block_index,
                          row_index,
                          block_space_gap_width,
                          block_non_space_gap_width);
      }
      else {
        if ((tosp_debug_level > 0) && old_text_ord_proportional)
          tprintf
            ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
            block_index, row_index, row->pitch_decision,
            row->fixed_pitch);
      }
#ifndef GRAPHICS_DISABLED
      if (textord_show_initial_words)
        plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
#endif
      row_index++;
    }
    delete gapmap;
    block_index++;
  }
}
bool tesseract::Textord::use_cjk_fp_model ( ) const [inline]

Definition at line 53 of file textord.h.

                                {
    // Getter: returns the current value of the use_cjk_fp_model_ member,
    // which set_use_cjk_fp_model() assigns.
    return use_cjk_fp_model_;
  }

Member Data Documentation

"Percentile for large blobs"

Definition at line 330 of file textord.h.

"Percentile for small blobs"

Definition at line 333 of file textord.h.

"Max baseline shift"

Definition at line 350 of file textord.h.

"Min size of baseline shift"

Definition at line 351 of file textord.h.

"Ile of sizes for xheight guess"

Definition at line 335 of file textord.h.

"Ile of sizes for xheight guess"

Definition at line 334 of file textord.h.

"Pixel size of noise"

Definition at line 329 of file textord.h.

"Don't remove noise blobs"

Definition at line 326 of file textord.h.

"Fraction of bounding box for noise"

Definition at line 332 of file textord.h.

"Debug row garbage detector"

Definition at line 349 of file textord.h.

"Height fraction to discard outlines as speckle noise"

Definition at line 346 of file textord.h.

"Dot to norm ratio for deletion"

Definition at line 339 of file textord.h.

"Reject noise-like rows"

Definition at line 341 of file textord.h.

"Reject noise-like words"

Definition at line 340 of file textord.h.

"Dot to norm ratio for deletion"

Definition at line 348 of file textord.h.

"Fraction of size for maxima"

Definition at line 336 of file textord.h.

"Fraction of x for big t count"

Definition at line 337 of file textord.h.

"super norm blobs to save row"

Definition at line 347 of file textord.h.

"xh fract width error for norm blobs"

Definition at line 344 of file textord.h.

"xh fract error for norm blobs"

Definition at line 342 of file textord.h.

"Transitions for normal blob"

Definition at line 338 of file textord.h.

"Display unsorted blobs"

Definition at line 327 of file textord.h.

"Display boxes"

Definition at line 328 of file textord.h.

"Script has no xheight, so use a single mode for horizontal text"

Definition at line 214 of file textord.h.

"Pass ANY flip to context?"

Definition at line 240 of file textord.h.

"Only stat OBVIOUS spaces"

Definition at line 230 of file textord.h.

"Debug data"

Definition at line 255 of file textord.h.

"Limit use of xht gap with odd small kns"

Definition at line 318 of file textord.h.

"Fract of kerns reqd for isolated row stats"

Definition at line 296 of file textord.h.

"or should we use mean"

Definition at line 257 of file textord.h.

"No.gaps reqd with 1 large gap to treat as a table"

Definition at line 261 of file textord.h.

"Dont autoflip kn to sp when large separation"

Definition at line 314 of file textord.h.

"Default flip"

Definition at line 251 of file textord.h.

"Default flip"

Definition at line 252 of file textord.h.

"Force word breaks on punct to break long lines in non-space " "delimited langs"

Definition at line 224 of file textord.h.

"New fuzzy kn alg"

Definition at line 303 of file textord.h.

"Dont restrict kn->sp fuzzy limit to tables"

Definition at line 242 of file textord.h.

"New fuzzy sp alg"

Definition at line 304 of file textord.h.

"Fract of xheight for fuzz sp"

Definition at line 280 of file textord.h.

"Fract of xheight for fuzz sp"

Definition at line 282 of file textord.h.

"Fract of xheight for fuzz sp"

Definition at line 284 of file textord.h.

"gap ratio to flip sp->kern"

Definition at line 285 of file textord.h.

"xht multiplier"

Definition at line 292 of file textord.h.

"xht multiplier"

Definition at line 293 of file textord.h.

"Enable improvement heuristic"

Definition at line 254 of file textord.h.

"Thresh guess - mult kn by this"

Definition at line 308 of file textord.h.

"Thresh guess - mult xht by this"

Definition at line 310 of file textord.h.

"gap ratio to flip kern->sp"

Definition at line 287 of file textord.h.

"gap ratio to flip kern->sp"

Definition at line 289 of file textord.h.

"gap ratio to flip kern->sp"

Definition at line 291 of file textord.h.

"Limit use of xht gap with large kns"

Definition at line 316 of file textord.h.

"Multiplier on kn to limit thresh"

Definition at line 312 of file textord.h.

"Dont trust spaces less than this time kn"

Definition at line 306 of file textord.h.

"narrow if w/h less than this"

Definition at line 275 of file textord.h.

"Only stat OBVIOUS spaces"

Definition at line 234 of file textord.h.

"Fract of xheight for narrow"

Definition at line 273 of file textord.h.

"Dont reduce box if the top left is non blank"

Definition at line 320 of file textord.h.

"Factor for defining space threshold in terms of space and " "kern sizes"

Definition at line 267 of file textord.h.

"Fix suspected bug in old code"

Definition at line 228 of file textord.h.

"Constrain relative values of inter and intra-word gaps for " "old_to_method."

Definition at line 219 of file textord.h.

"Space stats use prechopping?"

Definition at line 216 of file textord.h.

"Better guess"

Definition at line 239 of file textord.h.

"Block stats to use fixed pitch rows?"

Definition at line 221 of file textord.h.

"Only use within xht gap for wd breaks"

Definition at line 248 of file textord.h.

"How wide fuzzies need context"

Definition at line 324 of file textord.h.

"Use row alone when inadequate cert spaces"

Definition at line 238 of file textord.h.

"No.samples reqd to reestimate for row"

Definition at line 259 of file textord.h.

"rep gap multiplier for space"

Definition at line 294 of file textord.h.

"Only stat OBVIOUS spaces"

Definition at line 232 of file textord.h.

"Only stat OBVIOUS spaces"

Definition at line 236 of file textord.h.

"Dont chng kn to space next to punct"

Definition at line 250 of file textord.h.

"How to avoid being silly"

Definition at line 264 of file textord.h.

"No.gaps reqd with few cert spaces to use certs"

Definition at line 263 of file textord.h.

"Dont let sp minus kn get too small"

Definition at line 322 of file textord.h.

"Use within xht gap for wd breaks"

Definition at line 244 of file textord.h.

"Fuzzy if less than this"

Definition at line 302 of file textord.h.

"Min difference of kn & sp in table"

Definition at line 298 of file textord.h.

"Expect spaces bigger than this"

Definition at line 300 of file textord.h.

"how far between kern and space?"

Definition at line 269 of file textord.h.

"how far between kern and space?"

Definition at line 271 of file textord.h.

"Space stats use prechopping?"

Definition at line 226 of file textord.h.

"Use within xht gap for wd breaks"

Definition at line 246 of file textord.h.

"wide if w/h less than this"

Definition at line 278 of file textord.h.

"Fract of xheight for wide"

Definition at line 276 of file textord.h.


The documentation for this class was generated from the following files: