Tesseract
3.02
|
#include <textord.h>
tesseract::Textord::Textord | ( | CCStruct * | ccstruct | ) | [explicit] |
Definition at line 29 of file textord.cpp.
// Member-initializer list and (empty) body of the Textord constructor.
//
// Stores the ccstruct pointer in ccstruct_, starts with use_cjk_fp_model_
// switched off, and then initializes every tunable parameter member through
// the BOOL_MEMBER / INT_MEMBER / double_MEMBER macros. Each macro invocation
// is given, in order: the member name, its default value, a help string and
// ccstruct_->params().
// NOTE(review): the macros are defined outside this excerpt; presumably they
// register the parameter in that params collection -- confirm against the
// macro definitions.
// The banner comments (makerow.cpp, tospace.cpp, tordmain.cpp) group the
// entries by file name.
: ccstruct_(ccstruct), use_cjk_fp_model_(false),
  // makerow.cpp ///////////////////////////////////////////
  BOOL_MEMBER(textord_single_height_mode, false,
              "Script has no xheight, so use a single mode",
              ccstruct_->params()),
  // tospace.cpp ///////////////////////////////////////////
  // NOTE(review): the help text below is identical to the one used for
  // tosp_use_pre_chopping -- looks like a copy/paste; confirm intent.
  BOOL_MEMBER(tosp_old_to_method, false, "Space stats use prechopping?",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_old_to_constrain_sp_kn, false,
              "Constrain relative values of inter and intra-word gaps for "
              "old_to_method.",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_only_use_prop_rows, true,
              "Block stats to use fixed pitch rows?",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_force_wordbreak_on_punct, false,
              "Force word breaks on punct to break long lines in non-space "
              "delimited langs",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_use_pre_chopping, false,
              "Space stats use prechopping?",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code",
              ccstruct_->params()),
  // The next four parameters all share the same help string.
  BOOL_MEMBER(tosp_block_use_cert_spaces, true,
              "Only stat OBVIOUS spaces",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_narrow_blobs_not_cert, true,
              "Only stat OBVIOUS spaces",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_recovery_isolated_row_stats, true,
              "Use row alone when inadequate cert spaces",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_only_small_gaps_for_kern, false, "Better guess",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_fuzzy_limit_all, true,
              "Dont restrict kn->sp fuzzy limit to tables",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_stats_use_xht_gaps, true,
              "Use within xht gap for wd breaks",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_only_use_xht_gaps, false,
              "Only use within xht gap for wd breaks",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_rule_9_test_punct, false,
              "Dont chng kn to space next to punct",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip",
              ccstruct_->params()),
  BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic",
              ccstruct_->params()),
  INT_MEMBER(tosp_debug_level, 0, "Debug data",
             ccstruct_->params()),
  INT_MEMBER(tosp_enough_space_samples_for_median, 3,
             "or should we use mean",
             ccstruct_->params()),
  INT_MEMBER(tosp_redo_kern_limit, 10,
             "No.samples reqd to reestimate for row",
             ccstruct_->params()),
  INT_MEMBER(tosp_few_samples, 40,
             "No.gaps reqd with 1 large gap to treat as a table",
             ccstruct_->params()),
  INT_MEMBER(tosp_short_row, 20,
             "No.gaps reqd with few cert spaces to use certs",
             ccstruct_->params()),
  INT_MEMBER(tosp_sanity_method, 1, "How to avoid being silly",
             ccstruct_->params()),
  double_MEMBER(tosp_old_sp_kn_th_factor, 2.0,
                "Factor for defining space threshold in terms of space and "
                "kern sizes",
                ccstruct_->params()),
  double_MEMBER(tosp_threshold_bias1, 0,
                "how far between kern and space?",
                ccstruct_->params()),
  double_MEMBER(tosp_threshold_bias2, 0,
                "how far between kern and space?",
                ccstruct_->params()),
  double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow",
                ccstruct_->params()),
  double_MEMBER(tosp_narrow_aspect_ratio, 0.48,
                "narrow if w/h less than this",
                ccstruct_->params()),
  double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide",
                ccstruct_->params()),
  // NOTE(review): help text says "less than" exactly like the narrow
  // variant above -- possibly a copy/paste; confirm against the user of
  // this parameter.
  double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this",
                ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor, 0.6,
                "Fract of xheight for fuzz sp",
                ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor1, 0.5,
                "Fract of xheight for fuzz sp",
                ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_space_factor2, 0.72,
                "Fract of xheight for fuzz sp",
                ccstruct_->params()),
  double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern",
                ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp",
                ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp",
                ccstruct_->params()),
  double_MEMBER(tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp",
                ccstruct_->params()),
  double_MEMBER(tosp_ignore_big_gaps, -1, "xht multiplier",
                ccstruct_->params()),
  double_MEMBER(tosp_ignore_very_big_gaps, 3.5, "xht multiplier",
                ccstruct_->params()),
  double_MEMBER(tosp_rep_space, 1.6, "rep gap multiplier for space",
                ccstruct_->params()),
  double_MEMBER(tosp_enough_small_gaps, 0.65,
                "Fract of kerns reqd for isolated row stats",
                ccstruct_->params()),
  double_MEMBER(tosp_table_kn_sp_ratio, 2.25,
                "Min difference of kn & sp in table",
                ccstruct_->params()),
  double_MEMBER(tosp_table_xht_sp_ratio, 0.33,
                "Expect spaces bigger than this",
                ccstruct_->params()),
  double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0,
                "Fuzzy if less than this",
                ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg",
                ccstruct_->params()),
  double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg",
                ccstruct_->params()),
  double_MEMBER(tosp_min_sane_kn_sp, 1.5,
                "Dont trust spaces less than this time kn",
                ccstruct_->params()),
  double_MEMBER(tosp_init_guess_kn_mult, 2.2,
                "Thresh guess - mult kn by this",
                ccstruct_->params()),
  double_MEMBER(tosp_init_guess_xht_mult, 0.28,
                "Thresh guess - mult xht by this",
                ccstruct_->params()),
  double_MEMBER(tosp_max_sane_kn_thresh, 5.0,
                "Multiplier on kn to limit thresh",
                ccstruct_->params()),
  double_MEMBER(tosp_flip_caution, 0.0,
                "Dont autoflip kn to sp when large separation",
                ccstruct_->params()),
  double_MEMBER(tosp_large_kerning, 0.19,
                "Limit use of xht gap with large kns",
                ccstruct_->params()),
  double_MEMBER(tosp_dont_fool_with_small_kerns, -1,
                "Limit use of xht gap with odd small kns",
                ccstruct_->params()),
  double_MEMBER(tosp_near_lh_edge, 0,
                "Dont reduce box if the top left is non blank",
                ccstruct_->params()),
  double_MEMBER(tosp_silly_kn_sp_gap, 0.2,
                "Dont let sp minus kn get too small",
                ccstruct_->params()),
  double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75,
                "How wide fuzzies need context",
                ccstruct_->params()),
  // tordmain.cpp ///////////////////////////////////////////
  BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs",
              ccstruct_->params()),
  BOOL_MEMBER(textord_show_blobs, false, "Display unsorted blobs",
              ccstruct_->params()),
  // NOTE(review): same help text as textord_show_blobs -- possibly a
  // copy/paste; confirm intent.
  BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs",
              ccstruct_->params()),
  INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise",
             ccstruct_->params()),
  double_MEMBER(textord_blob_size_bigile, 95, "Percentile for large blobs",
                ccstruct_->params()),
  double_MEMBER(textord_noise_area_ratio, 0.7,
                "Fraction of bounding box for noise",
                ccstruct_->params()),
  double_MEMBER(textord_blob_size_smallile, 20,
                "Percentile for small blobs",
                ccstruct_->params()),
  double_MEMBER(textord_initialx_ile, 0.75,
                "Ile of sizes for xheight guess",
                ccstruct_->params()),
  double_MEMBER(textord_initialasc_ile, 0.90,
                "Ile of sizes for xheight guess",
                ccstruct_->params()),
  INT_MEMBER(textord_noise_sizefraction, 10, "Fraction of size for maxima",
             ccstruct_->params()),
  double_MEMBER(textord_noise_sizelimit, 0.5,
                "Fraction of x for big t count",
                ccstruct_->params()),
  INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob",
             ccstruct_->params()),
  double_MEMBER(textord_noise_normratio, 2.0,
                "Dot to norm ratio for deletion",
                ccstruct_->params()),
  BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words",
              ccstruct_->params()),
  BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows",
              ccstruct_->params()),
  double_MEMBER(textord_noise_syfract, 0.2,
                "xh fract height error for norm blobs",
                ccstruct_->params()),
  double_MEMBER(textord_noise_sxfract, 0.4,
                "xh fract width error for norm blobs",
                ccstruct_->params()),
  double_MEMBER(textord_noise_hfract, 1.0/64,
                "Height fraction to discard outlines as speckle noise",
                ccstruct_->params()),
  INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row",
             ccstruct_->params()),
  double_MEMBER(textord_noise_rowratio, 6.0,
                "Dot to norm ratio for deletion",
                ccstruct_->params()),
  BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector",
              ccstruct_->params()),
  double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift",
                ccstruct_->params()),
  double_MEMBER(textord_blshift_xfraction, 9.99,
                "Min size of baseline shift",
                ccstruct_->params()) {
  // Nothing else to do: all state is set up by the initializer list.
}
tesseract::Textord::~Textord | ( | ) |
Definition at line 256 of file textord.cpp.
{ }
void tesseract::Textord::CleanupSingleRowResult | ( | PageSegMode | pageseg_mode, |
PAGE_RES * | page_res | ||
) |
Definition at line 334 of file textord.cpp.
{
  // Keeps only the words of the single best row in page_res.
  //
  // pageseg_mode: when PSM_LINE_FIND_ENABLED() is true for it, the function
  //               returns immediately and page_res is left untouched.
  // page_res:     result structure that is scanned and then pruned in place.
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    return;  // No cleanup required.
  }

  PAGE_RES_IT res_it(page_res);

  // Pass 1: locate the row with the greatest mean word certainty.
  ROW_RES* winning_row = NULL;
  float winning_mean = 0.0f;
  float certainty_sum = 0.0f;  // running total for the row being scanned
  int words_in_row = 0;        // number of words folded into certainty_sum
  res_it.restart_page();
  while (res_it.word() != NULL) {
    WERD_RES* current = res_it.word();
    certainty_sum += current->best_choice->certainty();
    ++words_in_row;
    if (res_it.next_row() != res_it.row()) {
      // This was the last word of its row: finish the average and compare.
      certainty_sum /= words_in_row;
      if (winning_row == NULL || winning_mean < certainty_sum) {
        winning_row = res_it.row();
        winning_mean = certainty_sum;
      }
      // Reset the accumulators for the next row.
      certainty_sum = 0.0f;
      words_in_row = 0;
    }
    res_it.forward();
  }

  // Pass 2: drop every word that does not belong to the winning row.
  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
    if (res_it.row() == winning_row) {
      continue;
    }
    res_it.DeleteCurrentWord();
  }
}
Definition at line 239 of file tordmain.cpp.
{
  // Body defined at tordmain.cpp line 239; the function header is missing
  // from this extract. It uses three names that are not declared here:
  //   page_tr    - passed to create_to_win() when a debug window is needed
  //   blocks     - list whose TO_BLOCKs are processed one by one
  //   testing_on - must be true for any of the debug plotting to happen
  //
  // For every block: splits its blobs via filter_noise_blobs(), derives
  // line_spacing / line_size / max_blob_size from the returned size, and
  // optionally plots the resulting blob lists.
  TO_BLOCK_IT to_block_it(blocks);

#ifndef GRAPHICS_DISABLED
  // Start from an empty debug window if one already exists.
  if (to_win != NULL) {
    to_win->Clear();
  }
#endif  // GRAPHICS_DISABLED

  for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list();
       to_block_it.forward()) {
    TO_BLOCK* cur = to_block_it.data();

    // filter_noise_blobs() distributes blobs over the noise/small/large lists
    // and returns the value stored as the initial line size.
    cur->line_size = filter_noise_blobs(&cur->blobs,
                                        &cur->noise_blobs,
                                        &cur->small_blobs,
                                        &cur->large_blobs);

    // Spacing = size scaled by (descender + xheight + 2 * ascender) / xheight.
    cur->line_spacing =
        cur->line_size *
        (tesseract::CCStruct::kDescenderFraction +
         tesseract::CCStruct::kXHeightFraction +
         2 * tesseract::CCStruct::kAscenderFraction) /
        tesseract::CCStruct::kXHeightFraction;

    // The spacing above uses the unscaled size; only now apply the minimum
    // line size factor, then derive the blob size limit from the result.
    cur->line_size *= textord_min_linesize;
    cur->max_blob_size = cur->line_size * textord_excess_blobsize;

#ifndef GRAPHICS_DISABLED
    if (textord_show_blobs && testing_on) {
      if (to_win == NULL) {
        create_to_win(page_tr);
      }
      cur->plot_graded_blobs(to_win);
    }
    if (textord_show_boxes && testing_on) {
      if (to_win == NULL) {
        create_to_win(page_tr);
      }
      // All four lists are drawn in the same colour.
      plot_box_list(to_win, &cur->noise_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &cur->small_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &cur->large_blobs, ScrollView::WHITE);
      plot_box_list(to_win, &cur->blobs, ScrollView::WHITE);
    }
#endif  // GRAPHICS_DISABLED
  }
}
void tesseract::Textord::find_components | ( | Pix * | pix, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks | ||
) |
Definition at line 208 of file tordmain.cpp.
{
  // Extracts edges for the text blocks of an image, assigns the resulting
  // blobs to blocks and filters them.
  //
  // pix:       source image; its width and height must both fit in an inT16.
  // blocks:    blocks to scan; only blocks without a poly_block, or whose
  //            poly_block reports IsText(), get extract_edges() called.
  // to_blocks: handed to assign_blobs_to_blocks2() and filter_blobs().
  //
  // When the image is too large a message is printed and nothing else is
  // done.
  const int width = pixGetWidth(pix);
  const int height = pixGetHeight(pix);
  const bool fits_in_int16 = width <= MAX_INT16 && height <= MAX_INT16;
  if (!fits_in_int16) {
    tprintf("Input image too large! (%d, %d)\n", width, height);
    return;  // Can't handle it.
  }

  set_global_loc_code(LOC_EDGE_PROG);

  BLOCK_IT blk_it(blocks);
  for (blk_it.mark_cycle_pt(); !blk_it.cycled_list(); blk_it.forward()) {
    BLOCK* blk = blk_it.data();
    // Skip blocks that carry a non-text polygon.
    if (blk->poly_block() != NULL && !blk->poly_block()->IsText()) {
      continue;
    }
    extract_edges(pix, blk);
  }

  assign_blobs_to_blocks2(pix, blocks, to_blocks);

  // Top-right corner of the page, built from the image dimensions.
  ICOORD page_tr(width, height);
  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
}
Definition at line 1183 of file tospace.cpp.
{
  // Body defined at tospace.cpp line 1183; the function header is missing
  // from this extract. `row` is used below but declared outside this view.
  //
  // Walks the circular blob list of `row` and turns every blob (plus any
  // following blobs flagged joined_to_prev) into its own WERD with one
  // blank in front. Returns a freshly allocated ROW holding those words, or
  // NULL when the row has no blobs.
  C_OUTLINE_IT outline_it;                 // used to merge outline lists
  C_BLOB_LIST pending_blobs;               // blobs gathered for the next word
  C_BLOB_IT pending_it(&pending_blobs);
  WERD_LIST made_words;                    // words built so far
  WERD_IT made_it;
  double coeffs[3];                        // quadratic; written, never read here
  TBOX cur_box;                            // bounding box of the current blob
  BLOBNBOX_IT blob_it;                     // iterator over the row's blobs
  inT16 words_made = 0;
  bool at_line_start = true;               // next word built is the first one

  pending_it.set_to_list(&pending_blobs);
  blob_it.set_to_list(row->blob_list());
  made_it.set_to_list(&made_words);

  if (blob_it.empty()) {
    return NULL;
  }

  do {
    BLOBNBOX* cur = blob_it.data();
    cur_box = cur->bounding_box();
    if (!cur->joined_to_prev()) {
      // Independent blob: queue it for the word under construction.
      if (cur->cblob() != NULL) {
        pending_it.add_after_then_move(cur->cblob());
      }
    } else if (cur->cblob() != NULL) {
      // Continuation blob: splice its outlines onto the previously queued
      // blob and discard the now-empty container.
      outline_it.set_to_list(pending_it.data()->out_list());
      outline_it.move_to_last();
      outline_it.add_list_after(cur->cblob()->out_list());
      delete cur->cblob();
    }

    blob_it.forward();  // look at the following blob
    cur = blob_it.data();
    cur_box = cur->bounding_box();

    // Keep collecting while the next blob continues the current one or
    // nothing has been queued yet.
    if (cur->joined_to_prev() || pending_blobs.empty()) {
      continue;
    }

    WERD* fresh_word = new WERD(&pending_blobs, 1, NULL);
    words_made++;
    made_it.add_after_then_move(fresh_word);
    if (at_line_start) {
      fresh_word->set_flag(W_BOL, TRUE);
      at_line_start = false;
    }
    if (blob_it.at_first()) {
      // Wrapped around to the first blob: this word ends the line.
      fresh_word->set_flag(W_EOL, TRUE);
    }
  } while (!blob_it.at_first());  // until back at start

  /* Setup the row with created words. */
  coeffs[0] = 0;
  coeffs[1] = row->line_m();
  coeffs[2] = row->line_c();
  ROW* result_row = new ROW(row, (inT16) row->kern_size,
                            (inT16) row->space_size);
  // Move the collected words into the new row.
  made_it.set_to_list(result_row->word_list());
  made_it.add_list_after(&made_words);
  result_row->recalc_bounding_box();

  if (tosp_debug_level > 4) {
    tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
             words_made,
             result_row->bounding_box().left(),
             result_row->bounding_box().bottom(),
             result_row->bounding_box().right(),
             result_row->bounding_box().top());
  }
  return result_row;
}
Definition at line 886 of file tospace.cpp.
{
  // Body defined at tospace.cpp line 886; the function header is missing
  // from this extract. `row` is used below but declared outside this view.
  //
  // Converts the blobs of `row` into words. A new word is started whenever
  //   * the next blob lies to the right of the next pending repeated-char
  //     word taken from row->rep_words,
  //   * make_a_word_break() asks for a break, or
  //   * the blob iterator has wrapped around to the start of the list.
  // Repeated-char words are merged into the word sequence at the matching
  // positions, with their blank counts derived from tosp_rep_space and
  // find_mean_blob_spacing().
  //
  // Returns a freshly allocated ROW containing the words, or NULL when the
  // row has no blobs.
  //
  // Fix relative to the previous revision: the "Repch wd at EOL" debug
  // message printed the float repetition_spacing with %d (format/argument
  // mismatch); it now uses %5.2f like the other two "Repch wd" messages.
  BOOL8 bol;                     // start of line
  /* prev_ values are for start of word being built. non prev_ values are for
     the gap between the word being built and the next one. */
  BOOL8 prev_fuzzy_sp;           // probably space
  BOOL8 prev_fuzzy_non;          // probably not
  uinT8 prev_blanks;             // in front of word
  BOOL8 fuzzy_sp;                // probably space
  BOOL8 fuzzy_non;               // probably not
  uinT8 blanks;                  // in front of word
  BOOL8 prev_gap_was_a_space = FALSE;
  BOOL8 break_at_next_gap = FALSE;
  ROW *real_row;                 // output row
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD_IT word_it;               // new words
  WERD *word;                    // new word
  WERD_IT rep_char_it;           // repeated char words
  inT32 next_rep_char_word_right = MAX_INT32;
  float repetition_spacing;      // gap between repetitions
  inT32 xstarts[2];              // row ends
  double coeffs[3];              // quadratic; written, never read here
  inT32 prev_x;                  // end of prev blob
  BLOBNBOX *bblob;               // current blob
  TBOX blob_box;                 // bounding box
  BLOBNBOX_IT box_it;            // iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  inT16 prev_gap = MAX_INT16;
  inT16 current_gap = MAX_INT16;
  inT16 next_gap = MAX_INT16;
  inT16 prev_within_xht_gap = MAX_INT16;
  inT16 current_within_xht_gap = MAX_INT16;
  inT16 next_within_xht_gap = MAX_INT16;
  inT16 word_count = 0;

  // Right edge of the first pending repeated-char word; stays at MAX_INT32
  // when there is none, so the "blob is beyond it" tests never fire.
  rep_char_it.set_to_list(&(row->rep_words));
  if (!rep_char_it.empty()) {
    next_rep_char_word_right =
        rep_char_it.data()->bounding_box().right();
  }

  prev_x = -MAX_INT16;
  cblob_it.set_to_list(&cblobs);
  box_it.set_to_list(row->blob_list());
  word_it.set_to_list(&words);
  bol = TRUE;
  prev_blanks = 0;
  prev_fuzzy_sp = FALSE;
  prev_fuzzy_non = FALSE;
  if (!box_it.empty()) {
    xstarts[0] = box_it.data()->bounding_box().left();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);
      /* Set spaces before repeated char word */
      word->set_flag(W_BOL, TRUE);
      bol = FALSE;
      word->set_blanks(0);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, FALSE);
      word->set_flag(W_FUZZY_NON, FALSE);
      xstarts[0] = word->bounding_box().left();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = box_it.data()->bounding_box().left() -
                    next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        prev_blanks = (uinT8) floor(current_gap / row->space_size);
        if (prev_blanks < 1)
          prev_blanks = 1;
      } else {
        prev_blanks = 0;
      }
      if (tosp_debug_level > 5)
        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
                box_it.data()->bounding_box().left(),
                box_it.data()->bounding_box().bottom(),
                repetition_spacing, current_gap);
      prev_fuzzy_sp = FALSE;
      prev_fuzzy_non = FALSE;
      if (rep_char_it.empty()) {
        next_rep_char_word_right = MAX_INT32;
      } else {
        rep_char_it.forward();
        next_rep_char_word_right =
            rep_char_it.data()->bounding_box().right();
      }
    }

    peek_at_next_gap(row, box_it, next_blob_box, next_gap,
                     next_within_xht_gap);

    do {
      bblob = box_it.data();
      blob_box = bblob->bounding_box();
      if (bblob->joined_to_prev()) {
        if (bblob->cblob() != NULL) {
          // Continuation blob: splice its outlines onto the previously
          // collected blob and discard the now-empty container.
          cout_it.set_to_list(cblob_it.data()->out_list());
          cout_it.move_to_last();
          cout_it.add_list_after(bblob->cblob()->out_list());
          delete bblob->cblob();
        }
      } else {
        if (bblob->cblob() != NULL)
          cblob_it.add_after_then_move(bblob->cblob());
        prev_x = blob_box.right();
      }
      box_it.forward();          // next one
      bblob = box_it.data();
      blob_box = bblob->bounding_box();

      if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
        /* Real Blob - not multiple outlines or pre-chopped */
        // Shift the gap window (prev <- current <- next) and look ahead.
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row, box_it, next_blob_box, next_gap,
                         next_within_xht_gap);

        inT16 prev_gap_arg = prev_gap;
        inT16 next_gap_arg = next_gap;
        if (tosp_only_use_xht_gaps) {
          prev_gap_arg = prev_within_xht_gap;
          next_gap_arg = next_within_xht_gap;
        }
        // Decide if a word-break should be inserted
        if (blob_box.left() > next_rep_char_word_right ||
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
                              current_gap, current_within_xht_gap,
                              next_blob_box, next_gap_arg,
                              blanks, fuzzy_sp, fuzzy_non,
                              prev_gap_was_a_space,
                              break_at_next_gap) ||
            box_it.at_first()) {
          /* Form a new word out of the blobs collected */
          word = new WERD(&cblobs, prev_blanks, NULL);
          word_count++;
          word_it.add_after_then_move(word);
          if (bol) {
            word->set_flag(W_BOL, TRUE);
            bol = FALSE;
          }
          if (prev_fuzzy_sp)
            // probably space
            word->set_flag(W_FUZZY_SP, TRUE);
          else if (prev_fuzzy_non)
            // probably not
            word->set_flag(W_FUZZY_NON, TRUE);

          if (blob_box.left() > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            word = rep_char_it.extract();
            word_it.add_after_then_move(word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing(word);
            current_gap = word->bounding_box().left() - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = (uinT8) floor(current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5)
              tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
                      word->bounding_box().left(),
                      word->bounding_box().bottom(),
                      repetition_spacing, current_gap, blanks);
            word->set_blanks(blanks);
            // NO uncertainty
            word->set_flag(W_FUZZY_SP, FALSE);
            word->set_flag(W_FUZZY_NON, FALSE);

            /* Set spaces after repeated char word
               (and leave current word set) */
            current_gap = blob_box.left() - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = (uinT8) (current_gap / row->space_size);
              if (blanks < 1)
                blanks = 1;
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5)
              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
            fuzzy_sp = FALSE;
            fuzzy_non = FALSE;

            if (rep_char_it.empty()) {
              next_rep_char_word_right = MAX_INT32;
            } else {
              rep_char_it.forward();
              next_rep_char_word_right =
                  rep_char_it.data()->bounding_box().right();
            }
          }

          if (box_it.at_first() && rep_char_it.empty()) {
            // at end of line
            word->set_flag(W_EOL, TRUE);
            xstarts[1] = prev_x;
          } else {
            // Carry the decision for this gap over to the next word built.
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    } while (!box_it.at_first());  // until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty()) {
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = word->bounding_box().left() - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = (uinT8) floor(current_gap / row->space_size);
        if (blanks < 1)
          blanks = 1;
      } else {
        blanks = 0;
      }
      if (tosp_debug_level > 5)
        // repetition_spacing is a float, so it must be printed with a
        // floating-point conversion (was %d).
        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
                word->bounding_box().left(),
                word->bounding_box().bottom(),
                repetition_spacing, current_gap, blanks);
      word->set_blanks(blanks);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, FALSE);
      word->set_flag(W_FUZZY_NON, FALSE);
      prev_x = word->bounding_box().right();
      if (rep_char_it.empty()) {
        // at end of line
        word->set_flag(W_EOL, TRUE);
        xstarts[1] = prev_x;
      } else {
        rep_char_it.forward();
      }
    }

    coeffs[0] = 0;
    coeffs[1] = row->line_m();
    coeffs[2] = row->line_c();
    real_row = new ROW(row, (inT16) row->kern_size,
                       (inT16) row->space_size);
    word_it.set_to_list(real_row->word_list());
    // put words in row
    word_it.add_list_after(&words);
    real_row->recalc_bounding_box();

    if (tosp_debug_level > 4) {
      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
              word_count,
              real_row->bounding_box().left(),
              real_row->bounding_box().bottom(),
              real_row->bounding_box().right(),
              real_row->bounding_box().top());
    }
    return real_row;
  }
  return NULL;
}
void tesseract::Textord::set_use_cjk_fp_model | ( | bool | flag | ) | [inline] |
void tesseract::Textord::TextordPage | ( | PageSegMode | pageseg_mode, |
int | width, | ||
int | height, | ||
Pix * | pix, | ||
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks | ||
) |
Definition at line 260 of file textord.cpp.
{
  // Runs row finding, baseline fitting and word making over a page.
  //
  // pageseg_mode:  selects vertical-text handling and whether the general
  //                or the single-row / single-word code paths are used.
  // width, height: stored as the x and y of page_tr_.
  // pix:           only used when to_blocks arrives empty and
  //                find_components() has to be run.
  // blocks:        passed to find_components(), make_words() and
  //                cleanup_blocks(); row margins are computed for each entry.
  // to_blocks:     working blocks; must be non-empty after the first step
  //                (enforced with ASSERT_HOST).
  page_tr_.set_x(width);
  page_tr_.set_y(height);

  if (!to_blocks->empty()) {
    // AutoPageSeg does not need to find_components as it did that already.
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_tr_, to_blocks, true);
  } else {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(pix, blocks, to_blocks);
  }
  ASSERT_HOST(!to_blocks->empty());

  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
    const FCOORD anticlockwise90(0.0f, 1.0f);
    const FCOORD clockwise90(0.0f, -1.0f);
    TO_BLOCK_IT rotate_it(to_blocks);
    for (rotate_it.mark_cycle_pt(); !rotate_it.cycled_list();
         rotate_it.forward()) {
      TO_BLOCK* vert_to_block = rotate_it.data();
      BLOCK* vert_block = vert_to_block->block;
      // Give the block a fake poly_block built from its bounding box.
      vert_block->set_poly_block(
          new POLY_BLOCK(vert_block->bounding_box(), PT_VERTICAL_TEXT));
      // Rotate the to_block along with its contained block and blobnbox
      // lists.
      vert_to_block->rotate(anticlockwise90);
      // Set the block's rotation values to obey the convention followed in
      // layout analysis for vertical text.
      vert_block->set_re_rotation(clockwise90);
      vert_block->set_classify_rotation(clockwise90);
    }
  }

  TO_BLOCK_IT to_block_it(to_blocks);
  TO_BLOCK* first_to_block = to_block_it.data();

  // Make the rows in the block.
  float gradient;
  if (!PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    gradient = make_single_row(page_tr_, first_to_block, to_blocks);
  } else {
    // Do it the old fashioned way.
    gradient = make_rows(page_tr_, to_blocks);
  }

  // Now fit baselines. For now only old mode is available.
  fit_rows(gradient, page_tr_, to_blocks);

  // Now make the words in the lines.
  if (!PSM_WORD_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
    // single word, and in SINGLE_CHAR mode, all the outlines
    // go in a single blob.
    // The iterator is read again here rather than reusing first_to_block.
    TO_BLOCK* word_to_block = to_block_it.data();
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     word_to_block->get_rows(),
                     word_to_block->block->row_list());
  } else {
    // SINGLE_LINE uses the old word maker on the single line.
    make_words(this, page_tr_, gradient, blocks, to_blocks);
  }

  cleanup_blocks(blocks);  // Remove empties.

  // Compute the margins for each row in the block, to be used later for
  // paragraph detection.
  BLOCK_IT margin_it(blocks);
  for (margin_it.mark_cycle_pt(); !margin_it.cycled_list();
       margin_it.forward()) {
    margin_it.data()->compute_row_margins();
  }

#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif
}
void tesseract::Textord::to_spacing | ( | ICOORD | page_tr, |
TO_BLOCK_LIST * | blocks | ||
) |
Definition at line 35 of file tospace.cpp.
{
  // Computes spacing statistics for every block and for each of its
  // proportional-pitch rows.
  //
  // page_tr: not referenced inside this body.
  // blocks:  list of TO_BLOCKs to process.
  //
  // Per block a GAPMAP is built, block_spacing_stats() fills in the
  // block-level gap widths, and row_spacing_stats() is then run for every
  // row whose pitch_decision is PITCH_DEF_PROP or PITCH_CORR_PROP.

  // Outputs of block_spacing_stats(); declared outside the loop so they
  // keep their previous contents between blocks, as before.
  inT16 block_space_gap_width;      // estimated width of real spaces
  inT16 block_non_space_gap_width;  // estimated width of non space gaps
  BOOL8 old_text_ord_proportional;  // old fixed/prop result
  GAPMAP *gapmap = NULL;            // map of big vert gaps in blk

  TO_BLOCK_IT blk_it;
  blk_it.set_to_list(blocks);
  int block_no = 1;  // 1-based, only used in debug output
  for (blk_it.mark_cycle_pt(); !blk_it.cycled_list();
       blk_it.forward(), block_no++) {
    TO_BLOCK *blk = blk_it.data();
    gapmap = new GAPMAP(blk);
    block_spacing_stats(blk, gapmap, old_text_ord_proportional,
                        block_space_gap_width,
                        block_non_space_gap_width);

    // Make sure relative values of block-level space and non-space gap
    // widths are reasonable. The ratio of 1:3 is also used in
    // block_spacing_stats, to correct the block_space_gap_width.
    // Useful for arabic and hindi, when the non-space gap width is
    // often over-estimated and should not be trusted. A similar ratio
    // is found in block_spacing_stats.
    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
        (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
      block_non_space_gap_width = (inT16) floor(block_space_gap_width / 3.0);
    }

    TO_ROW_IT line_it;
    line_it.set_to_list(blk->get_rows());
    int row_no = 1;  // 1-based, only used in debug output
    for (line_it.mark_cycle_pt(); !line_it.cycled_list();
         line_it.forward(), row_no++) {
      TO_ROW *line = line_it.data();
      const bool is_proportional =
          (line->pitch_decision == PITCH_DEF_PROP) ||
          (line->pitch_decision == PITCH_CORR_PROP);
      if (!is_proportional) {
        // Fixed pitch row: only report when the old result disagreed.
        if ((tosp_debug_level > 0) && old_text_ord_proportional)
          tprintf
            ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
             block_no, row_no, line->pitch_decision, line->fixed_pitch);
      } else {
        if ((tosp_debug_level > 0) && !old_text_ord_proportional)
          tprintf("Block %d Row %d: Now Proportional\n",
                  block_no, row_no);
        row_spacing_stats(line, gapmap, block_no, row_no,
                          block_space_gap_width,
                          block_non_space_gap_width);
      }
#ifndef GRAPHICS_DISABLED
      if (textord_show_initial_words)
        plot_word_decisions(to_win, (inT16) line->fixed_pitch, line);
#endif
    }
    delete gapmap;
  }
}
bool tesseract::Textord::use_cjk_fp_model | ( | ) | const [inline] |
double tesseract::Textord::textord_blob_size_bigile = 95 |
double tesseract::Textord::textord_blob_size_smallile = 20 |
double tesseract::Textord::textord_blshift_maxshift = 0.00 |
double tesseract::Textord::textord_blshift_xfraction = 9.99 |
double tesseract::Textord::textord_initialasc_ile = 0.90 |
double tesseract::Textord::textord_initialx_ile = 0.75 |
bool tesseract::Textord::textord_no_rejects = false |
double tesseract::Textord::textord_noise_area_ratio = 0.7 |
double tesseract::Textord::textord_noise_hfract = 1.0/64 |
double tesseract::Textord::textord_noise_normratio = 2.0 |
bool tesseract::Textord::textord_noise_rejrows = true |
bool tesseract::Textord::textord_noise_rejwords = true |
double tesseract::Textord::textord_noise_rowratio = 6.0 |
double tesseract::Textord::textord_noise_sizelimit = 0.5 |
double tesseract::Textord::textord_noise_sxfract = 0.4 |
double tesseract::Textord::textord_noise_syfract = 0.2 |
bool tesseract::Textord::textord_show_blobs = false |
bool tesseract::Textord::textord_show_boxes = false |
bool tesseract::Textord::textord_single_height_mode = false |
bool tesseract::Textord::tosp_all_flips_fuzzy = false |
bool tesseract::Textord::tosp_block_use_cert_spaces = true |
double tesseract::Textord::tosp_enough_small_gaps = 0.65 |
int tesseract::Textord::tosp_few_samples = 40 |
double tesseract::Textord::tosp_flip_caution = 0.0 |
bool tesseract::Textord::tosp_flip_fuzz_kn_to_sp = true |
bool tesseract::Textord::tosp_flip_fuzz_sp_to_kn = true |
bool tesseract::Textord::tosp_force_wordbreak_on_punct = false |
double tesseract::Textord::tosp_fuzzy_kn_fraction = 0.5 |
bool tesseract::Textord::tosp_fuzzy_limit_all = true |
double tesseract::Textord::tosp_fuzzy_sp_fraction = 0.5 |
double tesseract::Textord::tosp_fuzzy_space_factor = 0.6 |
double tesseract::Textord::tosp_fuzzy_space_factor1 = 0.5 |
double tesseract::Textord::tosp_fuzzy_space_factor2 = 0.72 |
double tesseract::Textord::tosp_gap_factor = 0.83 |
double tesseract::Textord::tosp_ignore_big_gaps = -1 |
double tesseract::Textord::tosp_ignore_very_big_gaps = 3.5 |
bool tesseract::Textord::tosp_improve_thresh = false |
double tesseract::Textord::tosp_init_guess_kn_mult = 2.2 |
double tesseract::Textord::tosp_init_guess_xht_mult = 0.28 |
double tesseract::Textord::tosp_kern_gap_factor1 = 2.0 |
double tesseract::Textord::tosp_kern_gap_factor2 = 1.3 |
double tesseract::Textord::tosp_kern_gap_factor3 = 2.5 |
double tesseract::Textord::tosp_large_kerning = 0.19 |
double tesseract::Textord::tosp_max_sane_kn_thresh = 5.0 |
double tesseract::Textord::tosp_min_sane_kn_sp = 1.5 |
double tesseract::Textord::tosp_narrow_aspect_ratio = 0.48 |
bool tesseract::Textord::tosp_narrow_blobs_not_cert = true |
double tesseract::Textord::tosp_narrow_fraction = 0.3 |
double tesseract::Textord::tosp_near_lh_edge = 0 |
double tesseract::Textord::tosp_old_sp_kn_th_factor = 2.0 |
bool tesseract::Textord::tosp_old_to_bug_fix = false |
bool tesseract::Textord::tosp_old_to_constrain_sp_kn = false |
bool tesseract::Textord::tosp_old_to_method = false |
bool tesseract::Textord::tosp_only_small_gaps_for_kern = false |
bool tesseract::Textord::tosp_only_use_prop_rows = true |
bool tesseract::Textord::tosp_only_use_xht_gaps = false |
double tesseract::Textord::tosp_pass_wide_fuzz_sp_to_context = 0.75 |
double tesseract::Textord::tosp_rep_space = 1.6 |
bool tesseract::Textord::tosp_row_use_cert_spaces = true |
bool tesseract::Textord::tosp_row_use_cert_spaces1 = true |
bool tesseract::Textord::tosp_rule_9_test_punct = false |
int tesseract::Textord::tosp_short_row = 20 |
double tesseract::Textord::tosp_silly_kn_sp_gap = 0.2 |
bool tesseract::Textord::tosp_stats_use_xht_gaps = true |
double tesseract::Textord::tosp_table_fuzzy_kn_sp_ratio = 3.0 |
double tesseract::Textord::tosp_table_kn_sp_ratio = 2.25 |
double tesseract::Textord::tosp_table_xht_sp_ratio = 0.33 |
double tesseract::Textord::tosp_threshold_bias1 = 0 |
double tesseract::Textord::tosp_threshold_bias2 = 0 |
bool tesseract::Textord::tosp_use_pre_chopping = false |
bool tesseract::Textord::tosp_use_xht_gaps = true |
double tesseract::Textord::tosp_wide_aspect_ratio = 0.0 |
double tesseract::Textord::tosp_wide_fraction = 0.52 |