Tesseract
3.02
|
#include <equationdetect.h>
Definition at line 42 of file equationdetect.h.
Definition at line 48 of file equationdetect.h.
tesseract::EquationDetect::EquationDetect | ( | const char * | equ_datapath, |
const char * | equ_language | ||
) |
Definition at line 105 of file equationdetect.cpp.
{ const char* default_name = "equ"; if (equ_name == NULL) { equ_name = default_name; } equ_tesseract_ = lang_tesseract_ = NULL; resolution_ = 0; page_count_ = 0; // Construct equ_tesseract_. equ_tesseract_ = new Tesseract(); if (equ_tesseract_->init_tesseract(equ_datapath, equ_name, OEM_TESSERACT_ONLY)) { tprintf("Warning: equation region detection requested," " but %s failed to load from %s\n", equ_name, equ_datapath); delete equ_tesseract_; equ_tesseract_ = NULL; } cps_super_bbox_ = NULL; }
tesseract::EquationDetect::~EquationDetect | ( | ) |
Definition at line 128 of file equationdetect.cpp.
{ if (equ_tesseract_) { delete (equ_tesseract_); } if (cps_super_bbox_) { delete(cps_super_bbox_); } }
bool tesseract::EquationDetect::CheckForSeed2 | ( | const GenericVector< int > & | indented_texts_left, |
const float | foreground_density_th, | ||
ColPartition * | part | ||
) | [protected] |
Definition at line 757 of file equationdetect.cpp.
{ ASSERT_HOST(part); const TBOX& box = part->bounding_box(); // Check if it is aligned with any indented_texts_left. if (!indented_texts_left.empty() && CountAlignment(indented_texts_left, box.left()) >= kLeftIndentAlignmentCountTh) { return false; } // Check the foreground density. if (ComputeForegroundDensity(box) > foreground_density_th) { return false; } return true; }
bool tesseract::EquationDetect::CheckSeedBlobsCount | ( | ColPartition * | part | ) | [protected] |
Definition at line 1004 of file equationdetect.cpp.
{ if (!part) { return false; } const int kSeedMathBlobsCount = 2; const int kSeedMathDigitBlobsCount = 5; int blobs = part->boxes_count(), math_blobs = part->SpecialBlobsCount(BSTT_MATH), digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT); if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount || math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) { return false; } return true; }
bool tesseract::EquationDetect::CheckSeedDensity | ( | const float | math_density_high, |
const float | math_density_low, | ||
const ColPartition * | part | ||
) | const [protected] |
Definition at line 1022 of file equationdetect.cpp.
{ ASSERT_HOST(part); float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH) + part->SpecialBlobsDensity(BSTT_DIGIT); float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC); if (math_digit_density > math_density_high) { return true; } if (math_digit_density + italic_density > kMathItalicDensityTh && math_digit_density > math_density_low) { return true; } return false; }
bool tesseract::EquationDetect::CheckSeedFgDensity | ( | const float | density_th, |
ColPartition * | part | ||
) | [protected] |
Definition at line 645 of file equationdetect.cpp.
{ ASSERT_HOST(part); // Split part horizontall, and check for each sub part. GenericVector<TBOX> sub_boxes; SplitCPHorLite(part, &sub_boxes); float parts_passed = 0.0; for (int i = 0; i < sub_boxes.size(); ++i) { float density = ComputeForegroundDensity(sub_boxes[i]); if (density < density_th) { parts_passed++; } } // If most sub parts passed, then we return true. const float kSeedPartRatioTh = 0.3; bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh); return retval; }
bool tesseract::EquationDetect::CheckSeedNeighborDensity | ( | const ColPartition * | part | ) | const [protected] |
Definition at line 1313 of file equationdetect.cpp.
{ ASSERT_HOST(part); if (part->boxes_count() < kSeedBlobsCountTh) { // Too few blobs, skip the check. return true; } // We check the math blobs density and the unclear blobs density. if (part->SpecialBlobsDensity(BSTT_MATH) + part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 || part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) { return true; } return false; }
void tesseract::EquationDetect::ComputeCPsSuperBBox | ( | ) | [protected] |
Definition at line 810 of file equationdetect.cpp.
{ ColPartitionGridSearch gsearch(part_grid_); ColPartition *part = NULL; gsearch.StartFullSearch(); if (cps_super_bbox_) { delete cps_super_bbox_; } cps_super_bbox_ = new TBOX(); while ((part = gsearch.NextFullSearch()) != NULL) { (*cps_super_bbox_) += part->bounding_box(); } }
float tesseract::EquationDetect::ComputeForegroundDensity | ( | const TBOX & | tbox | ) | [protected] |
Definition at line 626 of file equationdetect.cpp.
{ #if LIBLEPT_MINOR_VERSION < 69 && LIBLEPT_MAJOR_VERSION <= 1 // This will disable the detector because no seed will be identified. return 1.0f; #else Pix *pix_bi = lang_tesseract_->pix_binary(); int pix_height = pixGetHeight(pix_bi); Box* box = boxCreate(tbox.left(), pix_height - tbox.top(), tbox.width(), tbox.height()); Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL); l_float32 fract; pixForegroundFraction(pix_sub, &fract); pixDestroy(&pix_sub); boxDestroy(&box); return fract; #endif }
int tesseract::EquationDetect::CountAlignment | ( | const GenericVector< int > & | sorted_vec, |
const int | val | ||
) | const [protected] |
Definition at line 779 of file equationdetect.cpp.
{ if (sorted_vec.empty()) { return 0; } const int kDistTh = static_cast<int>(roundf(0.03 * resolution_)); int pos = sorted_vec.binary_search(val), count = 0; // Search left side. int index = pos; while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) { count++; } // Search right side. index = pos + 1; while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) { count++; } return count; }
int tesseract::EquationDetect::EstimateTextPartLineSpacing | ( | ) | [protected] |
Definition at line 888 of file equationdetect.cpp.
{ ColPartitionGridSearch gsearch(part_grid_); // Get the y gap between text partitions; ColPartition *current = NULL, *prev = NULL; gsearch.StartFullSearch(); GenericVector<int> ygaps; while ((current = gsearch.NextFullSearch()) != NULL) { if (!PTIsTextType(current->type())) { continue; } if (prev != NULL) { const TBOX ¤t_box = current->bounding_box(); const TBOX &prev_box = prev->bounding_box(); // prev and current should be x major overlap and non y overlap. if (current_box.major_x_overlap(prev_box) && !current_box.y_overlap(prev_box)) { int gap = current_box.y_gap(prev_box); if (gap < MIN(current_box.height(), prev_box.height())) { // The gap should be smaller than the height of the bounding boxes. ygaps.push_back(gap); } } } prev = current; } if (ygaps.size() < 8) { // We do not have enough data. return -1; } // Compute the line spacing from ygaps: use the mean of the first half. ygaps.sort(); int spacing = 0, count; for (count = 0; count < ygaps.size() / 2; count++) { spacing += ygaps[count]; } return spacing / count; }
BlobSpecialTextType tesseract::EquationDetect::EstimateTypeForUnichar | ( | const UNICHARSET & | unicharset, |
const UNICHAR_ID | id | ||
) | const [protected] |
Definition at line 239 of file equationdetect.cpp.
{ STRING s = unicharset.id_to_unichar(id); if (unicharset.get_isalpha(id)) { return BSTT_NONE; } if (unicharset.get_ispunctuation(id)) { // Exclude some special texts that are likely to be confused as math symbol. static GenericVector<UNICHAR_ID> ids_to_exclude; if (ids_to_exclude.empty()) { static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".", "〈", "〉", "《", "》", "」", "「", ""}; int i = 0; while (kCharsToEx[i] != "") { ids_to_exclude.push_back( unicharset.unichar_to_id(kCharsToEx[i++].string())); } ids_to_exclude.sort(); } return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH; } // Check if it is digit. In addition to the isdigit attribute, we also check // if this character belongs to those likely to be confused with a digit. static const STRING kDigitsChars = "|"; if (unicharset.get_isdigit(id) || (s.length() == 1 && kDigitsChars.contains(s[0]))) { return BSTT_DIGIT; } else { return BSTT_MATH; } }
bool tesseract::EquationDetect::ExpandSeed | ( | ColPartition * | seed | ) | [protected] |
Definition at line 1105 of file equationdetect.cpp.
{ if (seed == NULL || // This seed has been absorbed by other seeds. seed->IsVerticalType()) { // We skip vertical type right now. return false; } // Expand in four directions. GenericVector<ColPartition*> parts_to_merge; ExpandSeedHorizontal(true, seed, &parts_to_merge); ExpandSeedHorizontal(false, seed, &parts_to_merge); ExpandSeedVertical(true, seed, &parts_to_merge); ExpandSeedVertical(false, seed, &parts_to_merge); SearchByOverlap(seed, &parts_to_merge); if (parts_to_merge.empty()) { // We don't find any partition to merge. return false; } // Merge all partitions in parts_to_merge with seed. We first remove seed // from part_grid_ as its bounding box is going to expand. Then we add it // back after it aborbs all parts_to_merge parititions. part_grid_->RemoveBBox(seed); for (int i = 0; i < parts_to_merge.size(); ++i) { ColPartition* part = parts_to_merge[i]; if (part->type() == PT_EQUATION) { // If part is in cp_seeds_, then we mark it as NULL so that we won't // process it again. for (int j = 0; j < cp_seeds_.size(); ++j) { if (part == cp_seeds_[j]) { cp_seeds_[j] = NULL; break; } } } // part has already been removed from part_grid_ in function // ExpandSeedHorizontal/ExpandSeedVertical. seed->Absorb(part, NULL); } return true; }
void tesseract::EquationDetect::ExpandSeedHorizontal | ( | const bool | search_left, |
ColPartition * | seed, | ||
GenericVector< ColPartition * > * | parts_to_merge | ||
) | [protected] |
Definition at line 1148 of file equationdetect.cpp.
{ ASSERT_HOST(seed != NULL && parts_to_merge != NULL); const float kYOverlapTh = 0.6; const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_)); ColPartitionGridSearch search(part_grid_); const TBOX& seed_box(seed->bounding_box()); int x = search_left ? seed_box.left() : seed_box.right(); search.StartSideSearch(x, seed_box.bottom(), seed_box.top()); search.SetUniqueMode(true); // Search iteratively. ColPartition *part = NULL; while ((part = search.NextSideSearch(search_left)) != NULL) { if (part == seed) { continue; } const TBOX& part_box(part->bounding_box()); if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope. break; } // Check part location. if ((part_box.left() >= seed_box.left() && search_left) || (part_box.right() <= seed_box.right() && !search_left)) { continue; } if (part->type() != PT_EQUATION) { // Non-equation type. // Skip PT_LINLINE_EQUATION and non text type. if (part->type() == PT_INLINE_EQUATION || (!IsTextOrEquationType(part->type()) && part->blob_type() != BRT_HLINE)) { continue; } // For other types, it should be the near small neighbor of seed. if (!IsNearSmallNeighbor(seed_box, part_box) || !CheckSeedNeighborDensity(part)) { continue; } } else { // Equation type, check the y overlap. if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh && seed_box.y_overlap_fraction(part_box) < kYOverlapTh) { continue; } } // Passed the check, delete it from search and add into parts_to_merge. search.RemoveBBox(); parts_to_merge->push_back(part); } }
void tesseract::EquationDetect::ExpandSeedVertical | ( | const bool | search_bottom, |
ColPartition * | seed, | ||
GenericVector< ColPartition * > * | parts_to_merge | ||
) | [protected] |
Definition at line 1204 of file equationdetect.cpp.
{ ASSERT_HOST(seed != NULL && parts_to_merge != NULL && cps_super_bbox_ != NULL); const float kXOverlapTh = 0.4; const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_)); ColPartitionGridSearch search(part_grid_); const TBOX& seed_box(seed->bounding_box()); int y = search_bottom ? seed_box.bottom() : seed_box.top(); search.StartVerticalSearch( cps_super_bbox_->left(), cps_super_bbox_->right(), y); search.SetUniqueMode(true); // Search iteratively. ColPartition *part = NULL; GenericVector<ColPartition*> parts; int skipped_min_top = INT_MAX, skipped_max_bottom = -1; while ((part = search.NextVerticalSearch(search_bottom)) != NULL) { if (part == seed) { continue; } const TBOX& part_box(part->bounding_box()); if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope. break; } // Check part location. if ((part_box.bottom() >= seed_box.bottom() && search_bottom) || (part_box.top() <= seed_box.top() && !search_bottom)) { continue; } bool skip_part = false; if (part->type() != PT_EQUATION) { // Non-equation type. // Skip PT_LINLINE_EQUATION and non text type. if (part->type() == PT_INLINE_EQUATION || (!IsTextOrEquationType(part->type()) && part->blob_type() != BRT_HLINE)) { skip_part = true; } else if (!IsNearSmallNeighbor(seed_box, part_box) || !CheckSeedNeighborDensity(part)) { // For other types, it should be the near small neighbor of seed. skip_part = true; } } else { // Equation type, check the x overlap. if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh && seed_box.x_overlap_fraction(part_box) < kXOverlapTh) { skip_part = true; } } if (skip_part) { if (part->type() != PT_EQUATION) { if (skipped_min_top > part_box.top()) { skipped_min_top = part_box.top(); } if (skipped_max_bottom < part_box.bottom()) { skipped_max_bottom = part_box.bottom(); } } } else { parts.push_back(part); } } // For every part in parts, we need verify it is not above skipped_min_top // when search top, or not below skipped_max_bottom when search bottom. I.e., // we will skip a part if it looks like: // search bottom | search top // seed: ****************** | part: ********** // skipped: xxx | skipped: xxx // part: ********** | seed: *********** for (int i = 0; i < parts.size(); i++) { const TBOX& part_box(parts[i]->bounding_box()); if ((search_bottom && part_box.top() <= skipped_max_bottom) || (!search_bottom && part_box.bottom() >= skipped_min_top)) { continue; } // Add parts[i] into parts_to_merge, and delete it from part_grid_. parts_to_merge->push_back(parts[i]); part_grid_->RemoveBBox(parts[i]); } }
int tesseract::EquationDetect::FindEquationParts | ( | ColPartitionGrid * | part_grid, |
ColPartitionSet ** | best_columns | ||
) | [virtual] |
Implements tesseract::EquationDetectBase.
Definition at line 373 of file equationdetect.cpp.
{ if (!equ_tesseract_ || !lang_tesseract_) { tprintf("Warning: equ_tesseract_/lang_tesseract_ is NULL!\n"); return -1; } if (!part_grid || !best_columns) { tprintf("part_grid/best_columns is NULL!!\n"); return -1; } cp_seeds_.clear(); part_grid_ = part_grid; best_columns_ = best_columns; resolution_ = lang_tesseract_->source_resolution(); STRING outfile; page_count_++; if (equationdetect_save_bi_image) { GetOutputTiffName("_bi", &outfile); pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4); } // Pass 0: Compute special text type for blobs. IdentifySpecialText(); // Pass 1: Merge parts by overlap. MergePartsByLocation(); // Pass 2: compute the math blob density and find the seed partition. IdentifySeedParts(); // We still need separate seed into block seed and inline seed partition. IdentifyInlineParts(); if (equationdetect_save_seed_image) { GetOutputTiffName("_seed", &outfile); PaintColParts(outfile); } // Pass 3: expand block equation seeds. while (!cp_seeds_.empty()) { GenericVector<ColPartition*> seeds_expanded; for (int i = 0; i < cp_seeds_.size(); ++i) { if (ExpandSeed(cp_seeds_[i])) { // If this seed is expanded, then we add it into seeds_expanded. Note // this seed has been removed from part_grid_ if it is expanded. seeds_expanded.push_back(cp_seeds_[i]); } } // Add seeds_expanded back into part_grid_ and reset cp_seeds_. for (int i = 0; i < seeds_expanded.size(); ++i) { InsertPartAfterAbsorb(seeds_expanded[i]); } cp_seeds_ = seeds_expanded; } // Pass 4: find math block satellite text partitions and merge them. ProcessMathBlockSatelliteParts(); if (equationdetect_save_merged_image) { // For debug. GetOutputTiffName("_merged", &outfile); PaintColParts(outfile); } return 0; }
void tesseract::EquationDetect::GetOutputTiffName | ( | const char * | name, |
STRING * | image_name | ||
) | const [protected] |
Definition at line 1477 of file equationdetect.cpp.
{ ASSERT_HOST(image_name && name); char page[50]; snprintf(page, sizeof(page), "%04d", page_count_); *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif"; }
void tesseract::EquationDetect::IdentifyBlobsToSkip | ( | ColPartition * | part | ) | [protected] |
Definition at line 325 of file equationdetect.cpp.
{ ASSERT_HOST(part); BLOBNBOX_C_IT blob_it(part->boxes()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { // At this moment, no blob should have been joined. ASSERT_HOST(!blob_it.data()->joined_to_prev()); } for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) { continue; } TBOX blob_box = blob->bounding_box(); // Search if any blob can be merged into blob. If found, then we mark all // these blobs as BSTT_SKIP. BLOBNBOX_C_IT blob_it2 = blob_it; bool found = false; while (!blob_it2.at_last()) { BLOBNBOX* nextblob = blob_it2.forward(); const TBOX& nextblob_box = nextblob->bounding_box(); if (nextblob_box.left() >= blob_box.right()) { break; } const float kWidthR = 0.4, kHeightR = 0.3; bool xoverlap = blob_box.major_x_overlap(nextblob_box), yoverlap = blob_box.y_overlap(nextblob_box); float widthR = static_cast<float>( MIN(nextblob_box.width(), blob_box.width())) / MAX(nextblob_box.width(), blob_box.width()); float heightR = static_cast<float>( MIN(nextblob_box.height(), blob_box.height())) / MAX(nextblob_box.height(), blob_box.height()); if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) { // Found one, set nextblob type and recompute blob_box. found = true; nextblob->set_special_text_type(BSTT_SKIP); blob_box += nextblob_box; } } if (found) { blob->set_special_text_type(BSTT_SKIP); } } }
void tesseract::EquationDetect::IdentifyInlineParts | ( | ) | [protected] |
Definition at line 802 of file equationdetect.cpp.
{ ComputeCPsSuperBBox(); IdentifyInlinePartsHorizontal(); int textparts_linespacing = EstimateTextPartLineSpacing(); IdentifyInlinePartsVertical(true, textparts_linespacing); IdentifyInlinePartsVertical(false, textparts_linespacing); }
void tesseract::EquationDetect::IdentifyInlinePartsHorizontal | ( | ) | [protected] |
Definition at line 823 of file equationdetect.cpp.
{ ASSERT_HOST(cps_super_bbox_); GenericVector<ColPartition*> new_seeds; const int kMarginDiffTh = IntCastRounded( 0.5 * lang_tesseract_->source_resolution()); const int kGapTh = static_cast<int>(roundf( 1.0 * lang_tesseract_->source_resolution())); ColPartitionGridSearch search(part_grid_); search.SetUniqueMode(true); // The center x coordinate of the cp_super_bbox_. int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2; for (int i = 0; i < cp_seeds_.size(); ++i) { ColPartition* part = cp_seeds_[i]; const TBOX& part_box(part->bounding_box()); int left_margin = part_box.left() - cps_super_bbox_->left(), right_margin = cps_super_bbox_->right() - part_box.right(); bool right_to_left; if (left_margin + kMarginDiffTh < right_margin && left_margin < kMarginDiffTh) { // part is left aligned, so we search if it has any right neighbor. search.StartSideSearch( part_box.right(), part_box.top(), part_box.bottom()); right_to_left = false; } else if (left_margin > cps_cx) { // part locates on the right half on image, so search if it has any left // neighbor. search.StartSideSearch( part_box.left(), part_box.top(), part_box.bottom()); right_to_left = true; } else { // part is not an inline equation. new_seeds.push_back(part); continue; } ColPartition* neighbor = NULL; bool side_neighbor_found = false; while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) { const TBOX& neighbor_box(neighbor->bounding_box()); if (!IsTextOrEquationType(neighbor->type()) || part_box.x_gap(neighbor_box) > kGapTh || !part_box.major_y_overlap(neighbor_box) || part_box.major_x_overlap(neighbor_box)) { continue; } // We have found one. Set the side_neighbor_found flag. side_neighbor_found = true; break; } if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION. part->set_type(PT_INLINE_EQUATION); } else { // Check the geometric feature of neighbor. const TBOX& neighbor_box(neighbor->bounding_box()); if (neighbor_box.width() > part_box.width() && neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION. part->set_type(PT_INLINE_EQUATION); } else { // part is not an inline equation type. new_seeds.push_back(part); } } } // Reset the cp_seeds_ using the new_seeds. cp_seeds_ = new_seeds; }
void tesseract::EquationDetect::IdentifyInlinePartsVertical | ( | const bool | top_to_bottom, |
const int | textPartsLineSpacing | ||
) | [protected] |
Definition at line 928 of file equationdetect.cpp.
{ if (cp_seeds_.empty()) { return; } // Sort cp_seeds_. if (top_to_bottom) { // From top to bottom. cp_seeds_.sort(&SortCPByTopReverse); } else { // From bottom to top. cp_seeds_.sort(&SortCPByBottom); } GenericVector<ColPartition*> new_seeds; for (int i = 0; i < cp_seeds_.size(); ++i) { ColPartition* part = cp_seeds_[i]; // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look // for its top neighbors, so that if two/more inline regions are connected // to each other, then we will identify the top one, and then use it to // identify the bottom one. if (IsInline(!top_to_bottom, textparts_linespacing, part)) { part->set_type(PT_INLINE_EQUATION); } else { new_seeds.push_back(part); } } cp_seeds_ = new_seeds; }
void tesseract::EquationDetect::IdentifySeedParts | ( | ) | [protected] |
Definition at line 554 of file equationdetect.cpp.
{ ColPartitionGridSearch gsearch(part_grid_); ColPartition *part = NULL; gsearch.StartFullSearch(); GenericVector<ColPartition*> seeds1, seeds2; // The left coordinates of indented text partitions. GenericVector<int> indented_texts_left; // The foreground density of text partitions. GenericVector<float> texts_foreground_density; while ((part = gsearch.NextFullSearch()) != NULL) { if (!IsTextOrEquationType(part->type())) { continue; } part->ComputeSpecialBlobsDensity(); bool blobs_check = CheckSeedBlobsCount(part); const int kTextBlobsTh = 20; if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) && blobs_check) { // Passed high density threshold test, save into seeds1. seeds1.push_back(part); } else { IndentType indent = IsIndented(part); if (IsLeftIndented(indent) && blobs_check && CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) { // Passed low density threshold test and is indented, save into seeds2. seeds2.push_back(part); } else if (!IsRightIndented(indent) && part->boxes_count() > kTextBlobsTh) { // This is likely to be a text part, save the features. const TBOX&box = part->bounding_box(); if (IsLeftIndented(indent)) { indented_texts_left.push_back(box.left()); } texts_foreground_density.push_back(ComputeForegroundDensity(box)); } } } // Sort the features collected from text regions. indented_texts_left.sort(); texts_foreground_density.sort(); float foreground_density_th = 0.15; // Default value. if (!texts_foreground_density.empty()) { // Use the median of the texts_foreground_density. foreground_density_th = 0.8 * texts_foreground_density[ texts_foreground_density.size() / 2]; } for (int i = 0; i < seeds1.size(); ++i) { const TBOX& box = seeds1[i]->bounding_box(); if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) && !(IsLeftIndented(IsIndented(seeds1[i])) && CountAlignment(indented_texts_left, box.left()) >= kLeftIndentAlignmentCountTh)) { // Mark as PT_EQUATION type. seeds1[i]->set_type(PT_EQUATION); cp_seeds_.push_back(seeds1[i]); } else { // Mark as PT_INLINE_EQUATION type. seeds1[i]->set_type(PT_INLINE_EQUATION); } } for (int i = 0; i < seeds2.size(); ++i) { if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) { seeds2[i]->set_type(PT_EQUATION); cp_seeds_.push_back(seeds2[i]); } } }
void tesseract::EquationDetect::IdentifySpecialText | ( | BLOBNBOX * | blob, |
const int | height_th | ||
) | [protected] |
Definition at line 165 of file equationdetect.cpp.
{ ASSERT_HOST(blobnbox != NULL); if (blobnbox->bounding_box().height() < height_th && height_th > 0) { // For small blob, we simply set to BSTT_NONE. blobnbox->set_special_text_type(BSTT_NONE); return; } BLOB_CHOICE_LIST ratings_equ, ratings_lang; C_BLOB* blob = blobnbox->cblob(); TBLOB* tblob = TBLOB::PolygonalCopy(blob); const TBOX& box = tblob->bounding_box(); // Normalize the blob. Set the origin to the place we want to be the // bottom-middle, and scaling is to make the height the x-height. float scaling = static_cast<float>(kBlnXHeight) / box.height(); DENORM denorm; float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom(); denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0, x_orig, y_orig, scaling, scaling, 0.0f, static_cast<float>(kBlnBaselineOffset)); TBLOB* normed_blob = new TBLOB(*tblob); normed_blob->Normalize(denorm); equ_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_equ, NULL); lang_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL); delete normed_blob; delete tblob; // Get the best choice from ratings_lang and rating_equ. As the choice in the // list has already been sorted by the certainty, we simply use the first // choice. BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL; if (ratings_lang.length() > 0) { BLOB_CHOICE_IT choice_it(&ratings_lang); lang_choice = choice_it.data(); } if (ratings_equ.length() > 0) { BLOB_CHOICE_IT choice_it(&ratings_equ); equ_choice = choice_it.data(); } float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX; float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX; const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8; // The scores here are negative, so the max/min == fabs(min/max). // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score); float diff = fabs(lang_score - equ_score); BlobSpecialTextType type = BSTT_NONE; // Classification. if (fmax(lang_score, equ_score) < kConfScoreTh) { // If both score are very small, then mark it as unclear. type = BSTT_UNCLEAR; } else if (diff > kConfDiffTh && equ_score > lang_score) { // If equ_score is significantly higher, then we classify this character as // math symbol. type = BSTT_MATH; } else if (lang_choice) { // For other cases: lang_score is similar or significantly higher. type = EstimateTypeForUnichar( lang_tesseract_->unicharset, lang_choice->unichar_id()); } if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get( lang_choice->fontinfo_id()).is_italic()) { // For text symbol, we still check if it is italic. blobnbox->set_special_text_type(BSTT_ITALIC); } else { blobnbox->set_special_text_type(type); } }
void tesseract::EquationDetect::IdentifySpecialText | ( | ) | [protected] |
Definition at line 273 of file equationdetect.cpp.
{ // Set configuration for Tesseract::AdaptiveClassifier. equ_tesseract_->tess_cn_matching.set_value(true); // turn it on equ_tesseract_->tess_bn_matching.set_value(false); // Set the multiplier to zero for lang_tesseract_ to improve the accuracy. int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier; int classify_integer_matcher = lang_tesseract_->classify_integer_matcher_multiplier; lang_tesseract_->classify_class_pruner_multiplier.set_value(0); lang_tesseract_->classify_integer_matcher_multiplier.set_value(0); ColPartitionGridSearch gsearch(part_grid_); ColPartition *part = NULL; gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != NULL) { if (!IsTextOrEquationType(part->type())) { continue; } IdentifyBlobsToSkip(part); BLOBNBOX_C_IT bbox_it(part->boxes()); // Compute the height threshold. GenericVector<int> blob_heights; for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); bbox_it.forward()) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) { blob_heights.push_back(bbox_it.data()->bounding_box().height()); } } blob_heights.sort(); int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2; for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); bbox_it.forward()) { if (bbox_it.data()->special_text_type() != BSTT_SKIP) { IdentifySpecialText(bbox_it.data(), height_th); } } } // Set the multiplier values back. lang_tesseract_->classify_class_pruner_multiplier.set_value( classify_class_pruner); lang_tesseract_->classify_integer_matcher_multiplier.set_value( classify_integer_matcher); if (equationdetect_save_spt_image) { // For debug. STRING outfile; GetOutputTiffName("_spt", &outfile); PaintSpecialTexts(outfile); } }
void tesseract::EquationDetect::InsertPartAfterAbsorb | ( | ColPartition * | part | ) | [protected] |
Definition at line 527 of file equationdetect.cpp.
{ ASSERT_HOST(part); // Before insert part back into part_grid_, we will need re-compute some // of its attributes such as first_column_, last_column_. However, we still // want to preserve its type. BlobTextFlowType flow_type = part->flow(); PolyBlockType part_type = part->type(); BlobRegionType blob_type = part->blob_type(); // Call SetPartitionType to re-compute the attributes of part. const TBOX& part_box(part->bounding_box()); int grid_x, grid_y; part_grid_->GridCoords( part_box.left(), part_box.bottom(), &grid_x, &grid_y); part->SetPartitionType(resolution_, best_columns_[grid_y]); // Reset the types back. part->set_type(part_type); part->set_blob_type(blob_type); part->set_flow(flow_type); part->SetBlobTypes(); // Insert into part_grid_. part_grid_->InsertBBox(true, true, part); }
EquationDetect::IndentType tesseract::EquationDetect::IsIndented | ( | ColPartition * | part | ) | [protected] |
Definition at line 1041 of file equationdetect.cpp.
{ ASSERT_HOST(part); ColPartitionGridSearch search(part_grid_); ColPartition *neighbor = NULL; const TBOX& part_box(part->bounding_box()); const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_)); const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_)); const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_)); // Here we use a simple approximation algorithm: from the center of part, We // perform the radius search, and check if we can find a neighboring parition // that locates on the top/bottom left of part. search.StartRadSearch((part_box.left() + part_box.right()) / 2, (part_box.top() + part_box.bottom()) / 2, kRadiusTh); search.SetUniqueMode(true); bool left_indented = false, right_indented = false; while ((neighbor = search.NextRadSearch()) != NULL && (!left_indented || !right_indented)) { if (neighbor == part) { continue; } const TBOX& neighbor_box(neighbor->bounding_box()); if (part_box.major_y_overlap(neighbor_box) && part_box.x_gap(neighbor_box) < kXGapTh) { // When this happens, it is likely part is a fragment of an // over-segmented colpartition. So we return false. return NO_INDENT; } if (!IsTextOrEquationType(neighbor->type())) { continue; } // The neighbor should be above/below part, and overlap in x direction. if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) { continue; } if (part_box.y_gap(neighbor_box) < kYGapTh) { int left_gap = part_box.left() - neighbor_box.left(); int right_gap = neighbor_box.right() - part_box.right(); if (left_gap > kXGapTh) { left_indented = true; } if (right_gap > kXGapTh) { right_indented = true; } } } if (left_indented && right_indented) { return BOTH_INDENT; } if (left_indented) { return LEFT_INDENT; } if (right_indented) { return RIGHT_INDENT; } return NO_INDENT; }
bool tesseract::EquationDetect::IsInline | ( | const bool | search_bottom, |
const int | textPartsLineSpacing, | ||
ColPartition * | part | ||
) | [protected] |
Definition at line 957 of file equationdetect.cpp.
{ ASSERT_HOST(part != NULL); // Look for its nearest vertical neighbor that hardly overlaps in y but // largely overlaps in x. ColPartitionGridSearch search(part_grid_); ColPartition *neighbor = NULL; const TBOX& part_box(part->bounding_box()); const float kYGapRatioTh = 1.0; if (search_bottom) { search.StartVerticalSearch(part_box.left(), part_box.right(), part_box.bottom()); } else { search.StartVerticalSearch(part_box.left(), part_box.right(), part_box.top()); } search.SetUniqueMode(true); while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) { const TBOX& neighbor_box(neighbor->bounding_box()); if (part_box.y_gap(neighbor_box) > kYGapRatioTh * MIN(part_box.height(), neighbor_box.height())) { // Finished searching. break; } if (!PTIsTextType(neighbor->type())) { continue; } // Check if neighbor and part is inline similar. const float kHeightRatioTh = 0.5; const int kYGapTh = textparts_linespacing > 0 ? textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)): static_cast<int>(roundf(0.05 * resolution_)); // Default value. if (part_box.x_overlap(neighbor_box) && // Location feature. part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing. // Geo feature. static_cast<float>(MIN(part_box.height(), neighbor_box.height())) / MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) { return true; } } return false; }
bool tesseract::EquationDetect::IsMathBlockSatellite | ( | ColPartition * | part, |
GenericVector< ColPartition * > * | math_blocks | ||
) | [protected] |
Definition at line 1379 of file equationdetect.cpp.
{ ASSERT_HOST(part != NULL && math_blocks != NULL); math_blocks->clear(); const TBOX& part_box(part->bounding_box()); // Find the top/bottom nearest neighbor of part. ColPartition *neighbors[2]; int y_gaps[2] = {INT_MAX, INT_MAX}; // The horizontal boundary of the neighbors. int neighbors_left = INT_MAX, neighbors_right = 0; for (int i = 0; i < 2; ++i) { neighbors[i] = SearchNNVertical(i != 0, part); if (neighbors[i]) { const TBOX& neighbor_box = neighbors[i]->bounding_box(); y_gaps[i] = neighbor_box.y_gap(part_box); if (neighbor_box.left() < neighbors_left) { neighbors_left = neighbor_box.left(); } if (neighbor_box.right() > neighbors_right) { neighbors_right = neighbor_box.right(); } } } if (neighbors[0] == neighbors[1]) { // This happens when part is inside neighbor. neighbors[1] = NULL; y_gaps[1] = INT_MAX; } // Check if part is within [neighbors_left, neighbors_right]. if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) { return false; } // Get the index of the near one in neighbors. int index = y_gaps[0] < y_gaps[1] ? 0 : 1; // Check the near one. if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) { math_blocks->push_back(neighbors[index]); } else { // If the near one failed the check, then we skip checking the far one. return false; } // Check the far one. index = 1 - index; if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) { math_blocks->push_back(neighbors[index]); } return true; }
bool tesseract::EquationDetect::IsNearMathNeighbor | ( | const int | y_gap, |
const ColPartition * | neighbor | ||
) | const [protected] |
Definition at line 1468 of file equationdetect.cpp.
{ if (!neighbor) { return false; } const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1)); return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh; }
bool tesseract::EquationDetect::IsNearSmallNeighbor | ( | const TBOX & | seed_box, |
const TBOX & | part_box | ||
) | const [protected] |
Definition at line 1291 of file equationdetect.cpp.
{ const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_)); const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_)); // Check geometric feature. if (part_box.height() > seed_box.height() || part_box.width() > seed_box.width()) { return false; } // Check overlap and distance. if ((!part_box.major_x_overlap(seed_box) || part_box.y_gap(seed_box) > kYGapTh) && (!part_box.major_y_overlap(seed_box) || part_box.x_gap(seed_box) > kXGapTh)) { return false; } return true; }
int tesseract::EquationDetect::LabelSpecialText | ( | TO_BLOCK * | to_block | ) | [virtual] |
Implements tesseract::EquationDetectBase.
Definition at line 145 of file equationdetect.cpp.
{ if (to_block == NULL) { tprintf("Warning: input to_block is NULL!\n"); return -1; } GenericVector<BLOBNBOX_LIST*> blob_lists; blob_lists.push_back(&(to_block->blobs)); blob_lists.push_back(&(to_block->large_blobs)); for (int i = 0; i < blob_lists.size(); ++i) { BLOBNBOX_IT bbox_it(blob_lists[i]); for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list(); bbox_it.forward()) { bbox_it.data()->set_special_text_type(BSTT_NONE); } } return 0; }
void tesseract::EquationDetect::MergePartsByLocation | ( | ) | [protected] |
Definition at line 439 of file equationdetect.cpp.
{ while (true) { ColPartition* part = NULL; // partitions that have been updated. GenericVector<ColPartition*> parts_updated; ColPartitionGridSearch gsearch(part_grid_); gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != NULL) { if (!IsTextOrEquationType(part->type())) { continue; } GenericVector<ColPartition*> parts_to_merge; SearchByOverlap(part, &parts_to_merge); if (parts_to_merge.empty()) { continue; } // Merge parts_to_merge with part, and remove them from part_grid_. part_grid_->RemoveBBox(part); for (int i = 0; i < parts_to_merge.size(); ++i) { ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part); part->Absorb(parts_to_merge[i], NULL); } gsearch.RepositionIterator(); parts_updated.push_back(part); } if (parts_updated.empty()) { // Exit the loop break; } // Re-insert parts_updated into part_grid_. for (int i = 0; i < parts_updated.size(); ++i) { InsertPartAfterAbsorb(parts_updated[i]); } } }
void tesseract::EquationDetect::PaintColParts | ( | const STRING & | outfile | ) | const [protected] |
Definition at line 1502 of file equationdetect.cpp.
{ Pix *pix = pixConvertTo32(lang_tesseract_->BestPix()); ColPartitionGridSearch gsearch(part_grid_); gsearch.StartFullSearch(); ColPartition* part = NULL; while ((part = gsearch.NextFullSearch()) != NULL) { const TBOX& tbox = part->bounding_box(); Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(), tbox.width(), tbox.height()); if (part->type() == PT_EQUATION) { pixRenderBoxArb(pix, box, 5, 255, 0, 0); } else if (part->type() == PT_INLINE_EQUATION) { pixRenderBoxArb(pix, box, 5, 0, 255, 0); } else { pixRenderBoxArb(pix, box, 5, 0, 0, 255); } boxDestroy(&box); } pixWrite(outfile.string(), pix, IFF_TIFF_LZW); pixDestroy(&pix); }
void tesseract::EquationDetect::PaintSpecialTexts | ( | const STRING & | outfile | ) | const [protected] |
Definition at line 1485 of file equationdetect.cpp.
{ Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary(); pix = pixConvertTo32(pixBi); ColPartitionGridSearch gsearch(part_grid_); ColPartition* part = NULL; gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != NULL) { BLOBNBOX_C_IT blob_it(part->boxes()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { RenderSpecialText(pix, blob_it.data()); } } pixWrite(outfile.string(), pix, IFF_TIFF_LZW); pixDestroy(&pix); }
void tesseract::EquationDetect::PrintSpecialBlobsDensity | ( | const ColPartition * | part | ) | const [protected] |
Definition at line 1525 of file equationdetect.cpp.
{ ASSERT_HOST(part); TBOX box(part->bounding_box()); int h = pixGetHeight(lang_tesseract_->BestPix()); tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ", h - box.top(), h - box.bottom()); box.print(); tprintf("blobs count = %d, density = ", part->boxes_count()); for (int i = 0; i < BSTT_COUNT; ++i) { BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i); tprintf("%d:%f ", i, part->SpecialBlobsDensity(type)); } tprintf("\n"); }
void tesseract::EquationDetect::ProcessMathBlockSatelliteParts | ( | ) | [protected] |
Definition at line 1330 of file equationdetect.cpp.
{ // Iterate over part_grid_, and find all parts that are text type but not // equation type. ColPartition *part = NULL; GenericVector<ColPartition*> text_parts; ColPartitionGridSearch gsearch(part_grid_); gsearch.StartFullSearch(); while ((part = gsearch.NextFullSearch()) != NULL) { if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) { text_parts.push_back(part); } } if (text_parts.empty()) { return; } // Compute the medium height of the text_parts. text_parts.sort(&SortCPByHeight); const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box(); int med_height = text_box.height(); if (text_parts.size() % 2 == 0 && text_parts.size() > 1) { const TBOX& text_box = text_parts[text_parts.size() / 2 - 1]->bounding_box(); med_height = static_cast<int>(roundf( 0.5 * (text_box.height() + med_height))); } // Iterate every text_parts and check if it is a math block satellite. for (int i = 0; i < text_parts.size(); ++i) { const TBOX& text_box(text_parts[i]->bounding_box()); if (text_box.height() > med_height) { continue; } GenericVector<ColPartition*> math_blocks; if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) { continue; } // Found. merge text_parts[i] with math_blocks. part_grid_->RemoveBBox(text_parts[i]); text_parts[i]->set_type(PT_EQUATION); for (int j = 0; j < math_blocks.size(); ++j) { part_grid_->RemoveBBox(math_blocks[j]); text_parts[i]->Absorb(math_blocks[j], NULL); } InsertPartAfterAbsorb(text_parts[i]); } }
void tesseract::EquationDetect::SearchByOverlap | ( | ColPartition * | seed, |
GenericVector< ColPartition * > * | parts_overlap | ||
) | [protected] |
Definition at line 478 of file equationdetect.cpp.
{ ASSERT_HOST(seed != NULL && parts_overlap != NULL); if (!IsTextOrEquationType(seed->type())) { return; } ColPartitionGridSearch search(part_grid_); const TBOX& seed_box(seed->bounding_box()); const int kRadNeighborCells = 30; search.StartRadSearch((seed_box.left() + seed_box.right()) / 2, (seed_box.top() + seed_box.bottom()) / 2, kRadNeighborCells); search.SetUniqueMode(true); // Search iteratively. ColPartition *part; GenericVector<ColPartition*> parts; const float kLargeOverlapTh = 0.95; const float kEquXOverlap = 0.4, kEquYOverlap = 0.5; while ((part = search.NextRadSearch()) != NULL) { if (part == seed || !IsTextOrEquationType(part->type())) { continue; } const TBOX& part_box(part->bounding_box()); bool merge = false; float x_overlap_fraction = part_box.x_overlap_fraction(seed_box), y_overlap_fraction = part_box.y_overlap_fraction(seed_box); // If part is large overlapped with seed, then set merge to true. if (x_overlap_fraction >= kLargeOverlapTh && y_overlap_fraction >= kLargeOverlapTh) { merge = true; } else if (seed->type() == PT_EQUATION && IsTextOrEquationType(part->type())) { if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) || (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) { merge = true; } } if (merge) { // Remove the part from search and put it into parts. search.RemoveBBox(); parts_overlap->push_back(part); } } }
ColPartition * tesseract::EquationDetect::SearchNNVertical | ( | const bool | search_bottom, |
const ColPartition * | part | ||
) | [protected] |
Definition at line 1433 of file equationdetect.cpp.
{ ASSERT_HOST(part); ColPartition *nearest_neighbor = NULL, *neighbor = NULL; const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5)); ColPartitionGridSearch search(part_grid_); search.SetUniqueMode(true); const TBOX& part_box(part->bounding_box()); int y = search_bottom ? part_box.bottom() : part_box.top(); search.StartVerticalSearch(part_box.left(), part_box.right(), y); int min_y_gap = INT_MAX; while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) { if (neighbor == part || !IsTextOrEquationType(neighbor->type())) { continue; } const TBOX& neighbor_box(neighbor->bounding_box()); int y_gap = neighbor_box.y_gap(part_box); if (y_gap > kYGapTh) { // Out of scope. break; } if (!neighbor_box.major_x_overlap(part_box) || (search_bottom && neighbor_box.bottom() > part_box.bottom()) || (!search_bottom && neighbor_box.top() < part_box.top())) { continue; } if (y_gap < min_y_gap) { min_y_gap = y_gap; nearest_neighbor = neighbor; } } return nearest_neighbor; }
void tesseract::EquationDetect::SetLangTesseract | ( | Tesseract * | lang_tesseract | ) |
Definition at line 137 of file equationdetect.cpp.
{ lang_tesseract_ = lang_tesseract; }
void tesseract::EquationDetect::SetResolution | ( | const int | resolution | ) |
Definition at line 141 of file equationdetect.cpp.
{ resolution_ = resolution; }
void tesseract::EquationDetect::SplitCPHor | ( | ColPartition * | part, |
GenericVector< ColPartition * > * | parts_splitted | ||
) | [protected] |
Definition at line 667 of file equationdetect.cpp.
{ ASSERT_HOST(part && parts_splitted); if (part->median_width() == 0 || part->boxes_count() == 0) { return; } // Make a copy of part, and reset parts_splitted. ColPartition* right_part = part->CopyButDontOwnBlobs(); parts_splitted->delete_data_pointers(); parts_splitted->clear(); const double kThreshold = part->median_width() * 3.0; bool found_split = true; while (found_split) { found_split = false; BLOBNBOX_C_IT box_it(right_part->boxes()); // Blobs are sorted left side first. If blobs overlap, // the previous blob may have a "more right" right side. // Account for this by always keeping the largest "right" // so far. int previous_right = MIN_INT32; // Look for the next split in the partition. for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { const TBOX& box = box_it.data()->bounding_box(); if (previous_right != MIN_INT32 && box.left() - previous_right > kThreshold) { // We have a split position. Split the partition in two pieces. // Insert the left piece in the grid and keep processing the right. int mid_x = (box.left() + previous_right) / 2; ColPartition* left_part = right_part; right_part = left_part->SplitAt(mid_x); parts_splitted->push_back(left_part); left_part->ComputeSpecialBlobsDensity(); found_split = true; break; } // The right side of the previous blobs. previous_right = MAX(previous_right, box.right()); } } // Add the last piece. right_part->ComputeSpecialBlobsDensity(); parts_splitted->push_back(right_part); }
void tesseract::EquationDetect::SplitCPHorLite | ( | ColPartition * | part, |
GenericVector< TBOX > * | splitted_boxes | ||
) | [protected] |
Definition at line 717 of file equationdetect.cpp.
{ ASSERT_HOST(part && splitted_boxes); splitted_boxes->clear(); if (part->median_width() == 0) { return; } const double kThreshold = part->median_width() * 3.0; // Blobs are sorted left side first. If blobs overlap, // the previous blob may have a "more right" right side. // Account for this by always keeping the largest "right" // so far. TBOX union_box; int previous_right = MIN_INT32; BLOBNBOX_C_IT box_it(part->boxes()); for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { const TBOX& box = box_it.data()->bounding_box(); if (previous_right != MIN_INT32 && box.left() - previous_right > kThreshold) { // We have a split position. splitted_boxes->push_back(union_box); previous_right = MIN_INT32; } if (previous_right == MIN_INT32) { union_box = box; } else { union_box += box; } // The right side of the previous blobs. previous_right = MAX(previous_right, box.right()); } // Add the last piece. if (previous_right != MIN_INT32) { splitted_boxes->push_back(union_box); } }
ColPartitionSet** tesseract::EquationDetect::best_columns_ [protected] |
Definition at line 260 of file equationdetect.h.
GenericVector<ColPartition*> tesseract::EquationDetect::cp_seeds_ [protected] |
Definition at line 266 of file equationdetect.h.
TBOX* tesseract::EquationDetect::cps_super_bbox_ [protected] |
Definition at line 263 of file equationdetect.h.
Tesseract* tesseract::EquationDetect::equ_tesseract_ [protected] |
Definition at line 247 of file equationdetect.h.
Tesseract* tesseract::EquationDetect::lang_tesseract_ [protected] |
Definition at line 251 of file equationdetect.h.
int tesseract::EquationDetect::page_count_ [protected] |
Definition at line 272 of file equationdetect.h.
ColPartitionGrid* tesseract::EquationDetect::part_grid_ [protected] |
Definition at line 255 of file equationdetect.h.
int tesseract::EquationDetect::resolution_ [protected] |
Definition at line 269 of file equationdetect.h.