Tesseract
3.02
|
00001 00002 // File: tabvector.cpp 00003 // Description: Class to hold a near-vertical vector representing a tab-stop. 00004 // Author: Ray Smith 00005 // Created: Thu Apr 10 16:28:01 PST 2008 00006 // 00007 // (C) Copyright 2008, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "tabvector.h" 00025 #include "blobbox.h" 00026 #include "colfind.h" 00027 #include "colpartitionset.h" 00028 #include "detlinefit.h" 00029 #include "statistc.h" 00030 00031 // Include automatically generated configuration file if running autoconf. 00032 #ifdef HAVE_CONFIG_H 00033 #include "config_auto.h" 00034 #endif 00035 00036 namespace tesseract { 00037 00038 // Multiple of height used as a gutter for evaluation search. 00039 const int kGutterMultiple = 4; 00040 // Multiple of neighbour gap that we expect the gutter gap to be at minimum. 00041 const int kGutterToNeighbourRatio = 3; 00042 // Pixel distance for tab vectors to be considered the same. 00043 const int kSimilarVectorDist = 10; 00044 // Pixel distance for ragged tab vectors to be considered the same if there 00045 // is nothing in the overlap box 00046 const int kSimilarRaggedDist = 50; 00047 // Max multiple of height to allow filling in between blobs when evaluating. 00048 const int kMaxFillinMultiple = 11; 00049 // Min fraction of mean gutter size to allow a gutter on a good tab blob. 00050 const double kMinGutterFraction = 0.5; 00051 // Multiple of 1/n lines as a minimum gutter in evaluation. 00052 const double kLineCountReciprocal = 4.0; 00053 // Constant add-on for minimum gutter for aligned tabs. 00054 const double kMinAlignedGutter = 0.25; 00055 // Constant add-on for minimum gutter for ragged tabs. 00056 const double kMinRaggedGutter = 1.5; 00057 00058 double_VAR(textord_tabvector_vertical_gap_fraction, 0.5, 00059 "max fraction of mean blob width allowed for vertical gaps in vertical text"); 00060 00061 double_VAR(textord_tabvector_vertical_box_ratio, 0.5, 00062 "Fraction of box matches required to declare a line vertical"); 00063 00064 ELISTIZE(TabConstraint) 00065 00066 // Create a constraint for the top or bottom of this TabVector. 00067 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) { 00068 TabConstraint* constraint = new TabConstraint(vector, is_top); 00069 TabConstraint_LIST* constraints = new TabConstraint_LIST; 00070 TabConstraint_IT it(constraints); 00071 it.add_to_end(constraint); 00072 if (is_top) 00073 vector->set_top_constraints(constraints); 00074 else 00075 vector->set_bottom_constraints(constraints); 00076 } 00077 00078 // Test to see if the constraints are compatible enough to merge. 00079 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1, 00080 TabConstraint_LIST* list2) { 00081 if (list1 == list2) 00082 return false; 00083 int y_min = -MAX_INT32; 00084 int y_max = MAX_INT32; 00085 if (textord_debug_tabfind > 3) 00086 tprintf("Testing constraint compatibility\n"); 00087 GetConstraints(list1, &y_min, &y_max); 00088 GetConstraints(list2, &y_min, &y_max); 00089 if (textord_debug_tabfind > 3) 00090 tprintf("Resulting range = [%d,%d]\n", y_min, y_max); 00091 return y_max >= y_min; 00092 } 00093 00094 // Merge the lists of constraints and update the TabVector pointers. 00095 // The second list is deleted. 00096 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1, 00097 TabConstraint_LIST* list2) { 00098 if (list1 == list2) 00099 return; 00100 TabConstraint_IT it(list2); 00101 if (textord_debug_tabfind > 3) 00102 tprintf("Merging constraints\n"); 00103 // The vectors of all constraints on list2 are now going to be on list1. 00104 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00105 TabConstraint* constraint = it.data(); 00106 if (textord_debug_tabfind> 3) 00107 constraint->vector_->Print("Merge"); 00108 if (constraint->is_top_) 00109 constraint->vector_->set_top_constraints(list1); 00110 else 00111 constraint->vector_->set_bottom_constraints(list1); 00112 } 00113 it = list1; 00114 it.add_list_before(list2); 00115 delete list2; 00116 } 00117 00118 // Set all the tops and bottoms as appropriate to a mean of the 00119 // constrained range. Delete all the constraints and list. 00120 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) { 00121 int y_min = -MAX_INT32; 00122 int y_max = MAX_INT32; 00123 GetConstraints(constraints, &y_min, &y_max); 00124 int y = (y_min + y_max) / 2; 00125 TabConstraint_IT it(constraints); 00126 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00127 TabConstraint* constraint = it.data(); 00128 TabVector* v = constraint->vector_; 00129 if (constraint->is_top_) { 00130 v->SetYEnd(y); 00131 v->set_top_constraints(NULL); 00132 } else { 00133 v->SetYStart(y); 00134 v->set_bottom_constraints(NULL); 00135 } 00136 } 00137 delete constraints; 00138 } 00139 00140 TabConstraint::TabConstraint(TabVector* vector, bool is_top) 00141 : vector_(vector), is_top_(is_top) { 00142 if (is_top) { 00143 y_min_ = vector->endpt().y(); 00144 y_max_ = vector->extended_ymax(); 00145 } else { 00146 y_max_ = vector->startpt().y(); 00147 y_min_ = vector->extended_ymin(); 00148 } 00149 } 00150 00151 // Get the max of the mins and the min of the maxes. 00152 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints, 00153 int* y_min, int* y_max) { 00154 TabConstraint_IT it(constraints); 00155 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00156 TabConstraint* constraint = it.data(); 00157 if (textord_debug_tabfind > 3) { 00158 tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_); 00159 constraint->vector_->Print(" for"); 00160 } 00161 *y_min = MAX(*y_min, constraint->y_min_); 00162 *y_max = MIN(*y_max, constraint->y_max_); 00163 } 00164 } 00165 00166 ELIST2IZE(TabVector) 00167 CLISTIZE(TabVector) 00168 00169 // The constructor is private. See the bottom of the file... 00170 00171 TabVector::~TabVector() { 00172 } 00173 00174 00175 // Public factory to build a TabVector from a list of boxes. 00176 // The TabVector will be of the given alignment type. 00177 // The input vertical vector is used in fitting, and the output 00178 // vertical_x, vertical_y have the resulting line vector added to them 00179 // if the alignment is not ragged. 00180 // The extended_start_y and extended_end_y are the maximum possible 00181 // extension to the line segment that can be used to align with others. 00182 // The input CLIST of BLOBNBOX good_points is consumed and taken over. 00183 TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical, 00184 int extended_start_y, int extended_end_y, 00185 BLOBNBOX_CLIST* good_points, 00186 int* vertical_x, int* vertical_y) { 00187 TabVector* vector = new TabVector(extended_start_y, extended_end_y, 00188 alignment, good_points); 00189 if (!vector->Fit(vertical, false)) { 00190 delete vector; 00191 return NULL; 00192 } 00193 if (!vector->IsRagged()) { 00194 vertical = vector->endpt_ - vector->startpt_; 00195 int weight = vector->BoxCount(); 00196 *vertical_x += vertical.x() * weight; 00197 *vertical_y += vertical.y() * weight; 00198 } 00199 return vector; 00200 } 00201 00202 // Build a ragged TabVector by copying another's direction, shifting it 00203 // to match the given blob, and making its initial extent the height 00204 // of the blob, but its extended bounds from the bounds of the original. 00205 TabVector::TabVector(const TabVector& src, TabAlignment alignment, 00206 const ICOORD& vertical_skew, BLOBNBOX* blob) 00207 : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_), 00208 sort_key_(0), percent_score_(0), mean_width_(0), 00209 needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false), 00210 alignment_(alignment), 00211 top_constraints_(NULL), bottom_constraints_(NULL) { 00212 BLOBNBOX_C_IT it(&boxes_); 00213 it.add_to_end(blob); 00214 TBOX box = blob->bounding_box(); 00215 if (IsLeftTab()) { 00216 startpt_ = box.botleft(); 00217 endpt_ = box.topleft(); 00218 } else { 00219 startpt_ = box.botright(); 00220 endpt_ = box.topright(); 00221 } 00222 sort_key_ = SortKey(vertical_skew, 00223 (startpt_.x() + endpt_.x()) / 2, 00224 (startpt_.y() + endpt_.y()) / 2); 00225 if (textord_debug_tabfind > 3) 00226 Print("Constructed a new tab vector:"); 00227 } 00228 00229 // Copies basic attributes of a tab vector for simple operations. 00230 // Copies things such startpt, endpt, range. 00231 // Does not copy things such as partners, boxes, or constraints. 00232 // This is useful if you only need vector information for processing, such 00233 // as in the table detection code. 00234 TabVector* TabVector::ShallowCopy() const { 00235 TabVector* copy = new TabVector(); 00236 copy->startpt_ = startpt_; 00237 copy->endpt_ = endpt_; 00238 copy->alignment_ = alignment_; 00239 copy->extended_ymax_ = extended_ymax_; 00240 copy->extended_ymin_ = extended_ymin_; 00241 copy->intersects_other_lines_ = intersects_other_lines_; 00242 return copy; 00243 } 00244 00245 // Extend this vector to include the supplied blob if it doesn't 00246 // already have it. 00247 void TabVector::ExtendToBox(BLOBNBOX* new_blob) { 00248 TBOX new_box = new_blob->bounding_box(); 00249 BLOBNBOX_C_IT it(&boxes_); 00250 if (!it.empty()) { 00251 BLOBNBOX* blob = it.data(); 00252 TBOX box = blob->bounding_box(); 00253 while (!it.at_last() && box.top() <= new_box.top()) { 00254 if (blob == new_blob) 00255 return; // We have it already. 00256 it.forward(); 00257 blob = it.data(); 00258 box = blob->bounding_box(); 00259 } 00260 if (box.top() >= new_box.top()) { 00261 it.add_before_stay_put(new_blob); 00262 needs_refit_ = true; 00263 return; 00264 } 00265 } 00266 needs_refit_ = true; 00267 it.add_after_stay_put(new_blob); 00268 } 00269 00270 // Set the ycoord of the start and move the xcoord to match. 00271 void TabVector::SetYStart(int start_y) { 00272 startpt_.set_x(XAtY(start_y)); 00273 startpt_.set_y(start_y); 00274 } 00275 // Set the ycoord of the end and move the xcoord to match. 00276 void TabVector::SetYEnd(int end_y) { 00277 endpt_.set_x(XAtY(end_y)); 00278 endpt_.set_y(end_y); 00279 } 00280 00281 // Rotate the ends by the given vector. Auto flip start and end if needed. 00282 void TabVector::Rotate(const FCOORD& rotation) { 00283 startpt_.rotate(rotation); 00284 endpt_.rotate(rotation); 00285 int dx = endpt_.x() - startpt_.x(); 00286 int dy = endpt_.y() - startpt_.y(); 00287 if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) { 00288 // Need to flip start/end. 00289 ICOORD tmp = startpt_; 00290 startpt_ = endpt_; 00291 endpt_ = tmp; 00292 } 00293 } 00294 00295 // Setup the initial constraints, being the limits of 00296 // the vector and the extended ends. 00297 void TabVector::SetupConstraints() { 00298 TabConstraint::CreateConstraint(this, false); 00299 TabConstraint::CreateConstraint(this, true); 00300 } 00301 00302 // Setup the constraints between the partners of this TabVector. 00303 void TabVector::SetupPartnerConstraints() { 00304 // With the first and last partner, we want a common bottom and top, 00305 // respectively, and for each change of partner, we want a common 00306 // top of first with bottom of next. 00307 TabVector_C_IT it(&partners_); 00308 TabVector* prev_partner = NULL; 00309 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00310 TabVector* partner = it.data(); 00311 if (partner->top_constraints_ == NULL || 00312 partner->bottom_constraints_ == NULL) { 00313 partner->Print("Impossible: has no constraints"); 00314 Print("This vector has it as a partner"); 00315 continue; 00316 } 00317 if (prev_partner == NULL) { 00318 // This is the first partner, so common bottom. 00319 if (TabConstraint::CompatibleConstraints(bottom_constraints_, 00320 partner->bottom_constraints_)) 00321 TabConstraint::MergeConstraints(bottom_constraints_, 00322 partner->bottom_constraints_); 00323 } else { 00324 // We need prev top to be common with partner bottom. 00325 if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_, 00326 partner->bottom_constraints_)) 00327 TabConstraint::MergeConstraints(prev_partner->top_constraints_, 00328 partner->bottom_constraints_); 00329 } 00330 prev_partner = partner; 00331 if (it.at_last()) { 00332 // This is the last partner, so common top. 00333 if (TabConstraint::CompatibleConstraints(top_constraints_, 00334 partner->top_constraints_)) 00335 TabConstraint::MergeConstraints(top_constraints_, 00336 partner->top_constraints_); 00337 } 00338 } 00339 } 00340 00341 // Setup the constraints between this and its partner. 00342 void TabVector::SetupPartnerConstraints(TabVector* partner) { 00343 if (TabConstraint::CompatibleConstraints(bottom_constraints_, 00344 partner->bottom_constraints_)) 00345 TabConstraint::MergeConstraints(bottom_constraints_, 00346 partner->bottom_constraints_); 00347 if (TabConstraint::CompatibleConstraints(top_constraints_, 00348 partner->top_constraints_)) 00349 TabConstraint::MergeConstraints(top_constraints_, 00350 partner->top_constraints_); 00351 } 00352 00353 // Use the constraints to modify the top and bottom. 00354 void TabVector::ApplyConstraints() { 00355 if (top_constraints_ != NULL) 00356 TabConstraint::ApplyConstraints(top_constraints_); 00357 if (bottom_constraints_ != NULL) 00358 TabConstraint::ApplyConstraints(bottom_constraints_); 00359 } 00360 00361 // Merge close tab vectors of the same side that overlap. 00362 void TabVector::MergeSimilarTabVectors(const ICOORD& vertical, 00363 TabVector_LIST* vectors, 00364 BlobGrid* grid) { 00365 TabVector_IT it1(vectors); 00366 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) { 00367 TabVector* v1 = it1.data(); 00368 TabVector_IT it2(it1); 00369 for (it2.forward(); !it2.at_first(); it2.forward()) { 00370 TabVector* v2 = it2.data(); 00371 if (v2->SimilarTo(vertical, *v1, grid)) { 00372 // Merge into the forward one, in case the combined vector now 00373 // overlaps one in between. 00374 if (textord_debug_tabfind) { 00375 v2->Print("Merging"); 00376 v1->Print("by deleting"); 00377 } 00378 v2->MergeWith(vertical, it1.extract()); 00379 if (textord_debug_tabfind) { 00380 v2->Print("Producing"); 00381 } 00382 ICOORD merged_vector = v2->endpt(); 00383 merged_vector -= v2->startpt(); 00384 if (abs(merged_vector.x()) > 100) { 00385 v2->Print("Garbage result of merge?"); 00386 } 00387 break; 00388 } 00389 } 00390 } 00391 } 00392 00393 // Return true if this vector is the same side, overlaps, and close 00394 // enough to the other to be merged. 00395 bool TabVector::SimilarTo(const ICOORD& vertical, 00396 const TabVector& other, BlobGrid* grid) const { 00397 if ((IsRightTab() && other.IsRightTab()) || 00398 (IsLeftTab() && other.IsLeftTab())) { 00399 // If they don't overlap, at least in extensions, then there is no chance. 00400 if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0) 00401 return false; 00402 // A fast approximation to the scale factor of the sort_key_. 00403 int v_scale = abs(vertical.y()); 00404 if (v_scale == 0) 00405 v_scale = 1; 00406 // If they are close enough, then OK. 00407 if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ && 00408 sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_) 00409 return true; 00410 // Ragged tabs get a bigger threshold. 00411 if (!IsRagged() || !other.IsRagged() || 00412 sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ || 00413 sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_) 00414 return false; 00415 if (grid == NULL) { 00416 // There is nothing else to test! 00417 return true; 00418 } 00419 // If there is nothing in the rectangle between the vector that is going to 00420 // move, and the place it is moving to, then they can be merged. 00421 // Setup a vertical search for any blob. 00422 const TabVector* mover = (IsRightTab() && 00423 sort_key_ < other.sort_key_) ? this : &other; 00424 int top_y = mover->endpt_.y(); 00425 int bottom_y = mover->startpt_.y(); 00426 int left = MIN(mover->XAtY(top_y), mover->XAtY(bottom_y)); 00427 int right = MAX(mover->XAtY(top_y), mover->XAtY(bottom_y)); 00428 int shift = abs(sort_key_ - other.sort_key_) / v_scale; 00429 if (IsRightTab()) { 00430 right += shift; 00431 } else { 00432 left -= shift; 00433 } 00434 00435 GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> vsearch(grid); 00436 vsearch.StartVerticalSearch(left, right, top_y); 00437 BLOBNBOX* blob; 00438 while ((blob = vsearch.NextVerticalSearch(true)) != NULL) { 00439 TBOX box = blob->bounding_box(); 00440 if (box.top() > bottom_y) 00441 return true; // Nothing found. 00442 if (box.bottom() < top_y) 00443 continue; // Doesn't overlap. 00444 int left_at_box = XAtY(box.bottom()); 00445 int right_at_box = left_at_box; 00446 if (IsRightTab()) 00447 right_at_box += shift; 00448 else 00449 left_at_box -= shift; 00450 if (MIN(right_at_box, box.right()) > MAX(left_at_box, box.left())) 00451 return false; 00452 } 00453 return true; // Nothing found. 00454 } 00455 return false; 00456 } 00457 00458 // Eat the other TabVector into this and delete it. 00459 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) { 00460 extended_ymin_ = MIN(extended_ymin_, other->extended_ymin_); 00461 extended_ymax_ = MAX(extended_ymax_, other->extended_ymax_); 00462 if (other->IsRagged()) { 00463 alignment_ = other->alignment_; 00464 } 00465 // Merge sort the two lists of boxes. 00466 BLOBNBOX_C_IT it1(&boxes_); 00467 BLOBNBOX_C_IT it2(&other->boxes_); 00468 while (!it2.empty()) { 00469 BLOBNBOX* bbox2 = it2.extract(); 00470 it2.forward(); 00471 TBOX box2 = bbox2->bounding_box(); 00472 BLOBNBOX* bbox1 = it1.data(); 00473 TBOX box1 = bbox1->bounding_box(); 00474 while (box1.bottom() < box2.bottom() && !it1.at_last()) { 00475 it1.forward(); 00476 bbox1 = it1.data(); 00477 box1 = bbox1->bounding_box(); 00478 } 00479 if (box1.bottom() < box2.bottom()) { 00480 it1.add_to_end(bbox2); 00481 } else if (bbox1 != bbox2) { 00482 it1.add_before_stay_put(bbox2); 00483 } 00484 } 00485 Fit(vertical, true); 00486 other->Delete(this); 00487 } 00488 00489 // Add a new element to the list of partner TabVectors. 00490 // Partners must be added in order of increasing y coordinate of the text line 00491 // that makes them partners. 00492 // Groups of identical partners are merged into one. 00493 void TabVector::AddPartner(TabVector* partner) { 00494 if (IsSeparator() || partner->IsSeparator()) 00495 return; 00496 TabVector_C_IT it(&partners_); 00497 if (!it.empty()) { 00498 it.move_to_last(); 00499 if (it.data() == partner) 00500 return; 00501 } 00502 it.add_after_then_move(partner); 00503 } 00504 00505 // Return true if other is a partner of this. 00506 bool TabVector::IsAPartner(const TabVector* other) { 00507 TabVector_C_IT it(&partners_); 00508 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00509 if (it.data() == other) 00510 return true; 00511 } 00512 return false; 00513 } 00514 00515 // These names must be synced with the TabAlignment enum in tabvector.h. 00516 const char* kAlignmentNames[] = { 00517 "Left Aligned", 00518 "Left Ragged", 00519 "Center", 00520 "Right Aligned", 00521 "Right Ragged", 00522 "Separator" 00523 }; 00524 00525 // Print basic information about this tab vector. 00526 void TabVector::Print(const char* prefix) { 00527 if (this == NULL) { 00528 tprintf("%s <null>\n", prefix); 00529 } else { 00530 tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d," 00531 " partners=%d\n", 00532 prefix, kAlignmentNames[alignment_], 00533 startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(), 00534 mean_width_, percent_score_, sort_key_, 00535 boxes_.length(), partners_.length()); 00536 } 00537 } 00538 00539 // Print basic information about this tab vector and every box in it. 00540 void TabVector::Debug(const char* prefix) { 00541 Print(prefix); 00542 BLOBNBOX_C_IT it(&boxes_); 00543 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00544 BLOBNBOX* bbox = it.data(); 00545 const TBOX& box = bbox->bounding_box(); 00546 tprintf("Box at (%d,%d)->(%d,%d)\n", 00547 box.left(), box.bottom(), box.right(), box.top()); 00548 } 00549 } 00550 00551 // Draw this tabvector in place in the given window. 00552 void TabVector::Display(ScrollView* tab_win) { 00553 #ifndef GRAPHICS_DISABLED 00554 if (textord_debug_printable) 00555 tab_win->Pen(ScrollView::BLUE); 00556 else if (alignment_ == TA_LEFT_ALIGNED) 00557 tab_win->Pen(ScrollView::LIME_GREEN); 00558 else if (alignment_ == TA_LEFT_RAGGED) 00559 tab_win->Pen(ScrollView::DARK_GREEN); 00560 else if (alignment_ == TA_RIGHT_ALIGNED) 00561 tab_win->Pen(ScrollView::PINK); 00562 else if (alignment_ == TA_RIGHT_RAGGED) 00563 tab_win->Pen(ScrollView::CORAL); 00564 else 00565 tab_win->Pen(ScrollView::WHITE); 00566 tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y()); 00567 tab_win->Pen(ScrollView::GREY); 00568 tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_); 00569 tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y()); 00570 char score_buf[64]; 00571 snprintf(score_buf, sizeof(score_buf), "%d", percent_score_); 00572 tab_win->TextAttributes("Times", 50, false, false, false); 00573 tab_win->Text(startpt_.x(), startpt_.y(), score_buf); 00574 #endif 00575 } 00576 00577 // Refit the line and/or re-evaluate the vector if the dirty flags are set. 00578 void TabVector::FitAndEvaluateIfNeeded(const ICOORD& vertical, 00579 TabFind* finder) { 00580 if (needs_refit_) 00581 Fit(vertical, true); 00582 if (needs_evaluation_) 00583 Evaluate(vertical, finder); 00584 } 00585 00586 // Evaluate the vector in terms of coverage of its length by good-looking 00587 // box edges. A good looking box is one where its nearest neighbour on the 00588 // inside is nearer than half the distance its nearest neighbour on the 00589 // outside of the putative column. Bad boxes are removed from the line. 00590 // A second pass then further filters boxes by requiring that the gutter 00591 // width be a minimum fraction of the mean gutter along the line. 00592 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) { 00593 bool debug = false; 00594 needs_evaluation_ = false; 00595 int length = endpt_.y() - startpt_.y(); 00596 if (length == 0 || boxes_.empty()) { 00597 percent_score_ = 0; 00598 Print("Zero length in evaluate"); 00599 return; 00600 } 00601 // Compute the mean box height. 00602 BLOBNBOX_C_IT it(&boxes_); 00603 int mean_height = 0; 00604 int height_count = 0; 00605 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00606 BLOBNBOX* bbox = it.data(); 00607 const TBOX& box = bbox->bounding_box(); 00608 int height = box.height(); 00609 mean_height += height; 00610 ++height_count; 00611 } 00612 mean_height /= height_count; 00613 int max_gutter = kGutterMultiple * mean_height; 00614 if (IsRagged()) { 00615 // Ragged edges face a tougher test in that the gap must always be within 00616 // the height of the blob. 00617 max_gutter = kGutterToNeighbourRatio * mean_height; 00618 } 00619 00620 STATS gutters(0, max_gutter + 1); 00621 // Evaluate the boxes for their goodness, calculating the coverage as we go. 00622 // Remove boxes that are not good and shorten the list to the first and 00623 // last good boxes. 00624 int num_deleted_boxes = 0; 00625 bool text_on_image = false; 00626 int good_length = 0; 00627 const TBOX* prev_good_box = NULL; 00628 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00629 BLOBNBOX* bbox = it.data(); 00630 const TBOX& box = bbox->bounding_box(); 00631 int mid_y = (box.top() + box.bottom()) / 2; 00632 if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) { 00633 if (!debug) { 00634 tprintf("After already deleting %d boxes, ", num_deleted_boxes); 00635 Print("Starting evaluation"); 00636 } 00637 debug = true; 00638 } 00639 // A good box is one where the nearest neighbour on the inside is closer 00640 // than half the distance to the nearest neighbour on the outside 00641 // (of the putative column). 00642 bool left = IsLeftTab(); 00643 int tab_x = XAtY(mid_y); 00644 int gutter_width; 00645 int neighbour_gap; 00646 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, 00647 bbox, &gutter_width, &neighbour_gap); 00648 if (debug) { 00649 tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n", 00650 box.left(), box.bottom(), box.right(), box.top(), 00651 gutter_width, neighbour_gap); 00652 } 00653 // Now we can make the test. 00654 if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) { 00655 // A good box contributes its height to the good_length. 00656 good_length += box.top() - box.bottom(); 00657 gutters.add(gutter_width, 1); 00658 // Two good boxes together contribute the gap between them 00659 // to the good_length as well, as long as the gap is not 00660 // too big. 00661 if (prev_good_box != NULL) { 00662 int vertical_gap = box.bottom() - prev_good_box->top(); 00663 double size1 = sqrt(static_cast<double>(prev_good_box->area())); 00664 double size2 = sqrt(static_cast<double>(box.area())); 00665 if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2)) 00666 good_length += vertical_gap; 00667 if (debug) { 00668 tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n", 00669 vertical_gap, kMaxFillinMultiple * MIN(size1, size2), 00670 good_length); 00671 } 00672 } else { 00673 // Adjust the start to the first good box. 00674 SetYStart(box.bottom()); 00675 } 00676 prev_good_box = &box; 00677 if (bbox->flow() == BTFT_TEXT_ON_IMAGE) 00678 text_on_image = true; 00679 } else { 00680 // Get rid of boxes that are not good. 00681 if (debug) { 00682 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n", 00683 box.left(), box.bottom(), box.right(), box.top(), 00684 gutter_width, neighbour_gap); 00685 } 00686 it.extract(); 00687 ++num_deleted_boxes; 00688 } 00689 } 00690 if (debug) { 00691 Print("Evaluating:"); 00692 } 00693 // If there are any good boxes, do it again, except this time get rid of 00694 // boxes that have a gutter that is a small fraction of the mean gutter. 00695 // This filters out ends that run into a coincidental gap in the text. 00696 int search_top = endpt_.y(); 00697 int search_bottom = startpt_.y(); 00698 int median_gutter = IntCastRounded(gutters.median()); 00699 if (gutters.get_total() > 0) { 00700 prev_good_box = NULL; 00701 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00702 BLOBNBOX* bbox = it.data(); 00703 const TBOX& box = bbox->bounding_box(); 00704 int mid_y = (box.top() + box.bottom()) / 2; 00705 // A good box is one where the gutter width is at least some constant 00706 // fraction of the mean gutter width. 00707 bool left = IsLeftTab(); 00708 int tab_x = XAtY(mid_y); 00709 int max_gutter = kGutterMultiple * mean_height; 00710 if (IsRagged()) { 00711 // Ragged edges face a tougher test in that the gap must always be 00712 // within the height of the blob. 00713 max_gutter = kGutterToNeighbourRatio * mean_height; 00714 } 00715 int gutter_width; 00716 int neighbour_gap; 00717 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, 00718 bbox, &gutter_width, &neighbour_gap); 00719 // Now we can make the test. 00720 if (gutter_width >= median_gutter * kMinGutterFraction) { 00721 if (prev_good_box == NULL) { 00722 // Adjust the start to the first good box. 00723 SetYStart(box.bottom()); 00724 search_bottom = box.top(); 00725 } 00726 prev_good_box = &box; 00727 search_top = box.bottom(); 00728 } else { 00729 // Get rid of boxes that are not good. 00730 if (debug) { 00731 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n", 00732 box.left(), box.bottom(), box.right(), box.top(), 00733 gutter_width, median_gutter); 00734 } 00735 it.extract(); 00736 ++num_deleted_boxes = true; 00737 } 00738 } 00739 } 00740 // If there has been a good box, adjust the end. 00741 if (prev_good_box != NULL) { 00742 SetYEnd(prev_good_box->top()); 00743 // Compute the percentage of the vector that is occupied by good boxes. 00744 int length = endpt_.y() - startpt_.y(); 00745 percent_score_ = 100 * good_length / length; 00746 if (num_deleted_boxes > 0) { 00747 needs_refit_ = true; 00748 FitAndEvaluateIfNeeded(vertical, finder); 00749 if (boxes_.empty()) 00750 return; 00751 } 00752 // Test the gutter over the whole vector, instead of just at the boxes. 00753 int required_shift; 00754 if (search_bottom > search_top) { 00755 search_bottom = startpt_.y(); 00756 search_top = endpt_.y(); 00757 } 00758 double min_gutter_width = kLineCountReciprocal / boxes_.length(); 00759 min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter; 00760 min_gutter_width *= mean_height; 00761 int max_gutter_width = IntCastRounded(min_gutter_width) + 1; 00762 if (median_gutter > max_gutter_width) 00763 max_gutter_width = median_gutter; 00764 int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, 00765 text_on_image, max_gutter_width, 00766 &required_shift); 00767 if (gutter_width < min_gutter_width) { 00768 if (debug) { 00769 tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n", 00770 gutter_width, min_gutter_width); 00771 } 00772 boxes_.shallow_clear(); 00773 percent_score_ = 0; 00774 } else if (debug) { 00775 tprintf("Final gutter %d, vs limit of %g, required shift = %d\n", 00776 gutter_width, min_gutter_width, required_shift); 00777 } 00778 } else { 00779 // There are no good boxes left, so score is 0. 00780 percent_score_ = 0; 00781 } 00782 00783 if (debug) { 00784 Print("Evaluation complete:"); 00785 } 00786 } 00787 00788 // (Re)Fit a line to the stored points. Returns false if the line 00789 // is degenerate. Althougth the TabVector code mostly doesn't care about the 00790 // direction of lines, XAtY would give silly results for a horizontal line. 00791 // The class is mostly aimed at use for vertical lines representing 00792 // horizontal tab stops. 00793 bool TabVector::Fit(ICOORD vertical, bool force_parallel) { 00794 needs_refit_ = false; 00795 if (boxes_.empty()) { 00796 // Don't refit something with no boxes, as that only happens 00797 // in Evaluate, and we don't want to end up with a zero vector. 00798 if (!force_parallel) 00799 return false; 00800 // If we are forcing parallel, then we just need to set the sort_key_. 00801 ICOORD midpt = startpt_; 00802 midpt += endpt_; 00803 midpt /= 2; 00804 sort_key_ = SortKey(vertical, midpt.x(), midpt.y()); 00805 return startpt_.y() != endpt_.y(); 00806 } 00807 if (!force_parallel && !IsRagged()) { 00808 // Use a fitted line as the vertical. 00809 DetLineFit linepoints; 00810 BLOBNBOX_C_IT it(&boxes_); 00811 // Fit a line to all the boxes in the list. 00812 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00813 BLOBNBOX* bbox = it.data(); 00814 TBOX box = bbox->bounding_box(); 00815 int x1 = IsRightTab() ? box.right() : box.left(); 00816 ICOORD boxpt(x1, box.bottom()); 00817 linepoints.Add(boxpt); 00818 if (it.at_last()) { 00819 ICOORD top_pt(x1, box.top()); 00820 linepoints.Add(top_pt); 00821 } 00822 } 00823 linepoints.Fit(&startpt_, &endpt_); 00824 if (startpt_.y() != endpt_.y()) { 00825 vertical = endpt_; 00826 vertical -= startpt_; 00827 } 00828 } 00829 int start_y = startpt_.y(); 00830 int end_y = endpt_.y(); 00831 sort_key_ = IsLeftTab() ? MAX_INT32 : -MAX_INT32; 00832 BLOBNBOX_C_IT it(&boxes_); 00833 // Choose a line parallel to the vertical such that all boxes are on the 00834 // correct side of it. 00835 mean_width_ = 0; 00836 int width_count = 0; 00837 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00838 BLOBNBOX* bbox = it.data(); 00839 TBOX box = bbox->bounding_box(); 00840 mean_width_ += box.width(); 00841 ++width_count; 00842 int x1 = IsRightTab() ? box.right() : box.left(); 00843 // Test both the bottom and the top, as one will be more extreme, depending 00844 // on the direction of skew. 00845 int bottom_y = box.bottom(); 00846 int top_y = box.top(); 00847 int key = SortKey(vertical, x1, bottom_y); 00848 if (IsLeftTab() == (key < sort_key_)) { 00849 sort_key_ = key; 00850 startpt_ = ICOORD(x1, bottom_y); 00851 } 00852 key = SortKey(vertical, x1, top_y); 00853 if (IsLeftTab() == (key < sort_key_)) { 00854 sort_key_ = key; 00855 startpt_ = ICOORD(x1, top_y); 00856 } 00857 if (it.at_first()) 00858 start_y = bottom_y; 00859 if (it.at_last()) 00860 end_y = top_y; 00861 } 00862 if (width_count > 0) { 00863 mean_width_ = (mean_width_ + width_count - 1) / width_count; 00864 } 00865 endpt_ = startpt_ + vertical; 00866 needs_evaluation_ = true; 00867 if (start_y != end_y) { 00868 // Set the ends of the vector to fully include the first and last blobs. 00869 startpt_.set_x(XAtY(vertical, sort_key_, start_y)); 00870 startpt_.set_y(start_y); 00871 endpt_.set_x(XAtY(vertical, sort_key_, end_y)); 00872 endpt_.set_y(end_y); 00873 return true; 00874 } 00875 return false; 00876 } 00877 00878 // Returns the singleton partner if there is one, or NULL otherwise. 00879 TabVector* TabVector::GetSinglePartner() { 00880 if (!partners_.singleton()) 00881 return NULL; 00882 TabVector_C_IT partner_it(&partners_); 00883 TabVector* partner = partner_it.data(); 00884 return partner; 00885 } 00886 00887 // Return the partner of this TabVector if the vector qualifies as 00888 // being a vertical text line, otherwise NULL. 00889 TabVector* TabVector::VerticalTextlinePartner() { 00890 if (!partners_.singleton()) 00891 return NULL; 00892 TabVector_C_IT partner_it(&partners_); 00893 TabVector* partner = partner_it.data(); 00894 BLOBNBOX_C_IT box_it1(&boxes_); 00895 BLOBNBOX_C_IT box_it2(&partner->boxes_); 00896 // Count how many boxes are also in the other list. 00897 // At the same time, gather the mean width and median vertical gap. 00898 if (textord_debug_tabfind > 1) { 00899 Print("Testing for vertical text"); 00900 partner->Print(" partner"); 00901 } 00902 int num_matched = 0; 00903 int num_unmatched = 0; 00904 int total_widths = 0; 00905 int width = startpt().x() - partner->startpt().x(); 00906 if (width < 0) 00907 width = -width; 00908 STATS gaps(0, width * 2); 00909 BLOBNBOX* prev_bbox = NULL; 00910 box_it2.mark_cycle_pt(); 00911 for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) { 00912 BLOBNBOX* bbox = box_it1.data(); 00913 TBOX box = bbox->bounding_box(); 00914 if (prev_bbox != NULL) { 00915 gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1); 00916 } 00917 while (!box_it2.cycled_list() && box_it2.data() != bbox && 00918 box_it2.data()->bounding_box().bottom() < box.bottom()) { 00919 box_it2.forward(); 00920 } 00921 if (!box_it2.cycled_list() && box_it2.data() == bbox && 00922 bbox->region_type() >= BRT_UNKNOWN && 00923 (prev_bbox == NULL || prev_bbox->region_type() >= BRT_UNKNOWN)) 00924 ++num_matched; 00925 else 00926 ++num_unmatched; 00927 total_widths += box.width(); 00928 prev_bbox = bbox; 00929 } 00930 double avg_width = total_widths * 1.0 / (num_unmatched + num_matched); 00931 double max_gap = textord_tabvector_vertical_gap_fraction * avg_width; 00932 int min_box_match = static_cast<int>((num_matched + num_unmatched) * 00933 textord_tabvector_vertical_box_ratio); 00934 bool is_vertical = (gaps.get_total() > 0 && 00935 num_matched >= min_box_match && 00936 gaps.median() <= max_gap); 00937 if (textord_debug_tabfind > 1) { 00938 tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d " 00939 "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n", 00940 gaps.get_total(), num_matched, num_unmatched, min_box_match, 00941 gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No"); 00942 } 00943 return (is_vertical) ? partner : NULL; 00944 } 00945 00946 // The constructor is private. 00947 TabVector::TabVector(int extended_ymin, int extended_ymax, 00948 TabAlignment alignment, BLOBNBOX_CLIST* boxes) 00949 : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax), 00950 sort_key_(0), percent_score_(0), mean_width_(0), 00951 needs_refit_(true), needs_evaluation_(true), alignment_(alignment), 00952 top_constraints_(NULL), bottom_constraints_(NULL) { 00953 BLOBNBOX_C_IT it(&boxes_); 00954 it.add_list_after(boxes); 00955 } 00956 00957 // Delete this, but first, repoint all the partners to point to 00958 // replacement. If replacement is NULL, then partner relationships 00959 // are removed. 00960 void TabVector::Delete(TabVector* replacement) { 00961 TabVector_C_IT it(&partners_); 00962 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00963 TabVector* partner = it.data(); 00964 TabVector_C_IT p_it(&partner->partners_); 00965 // If partner already has replacement in its list, then make 00966 // replacement null, and just remove this TabVector when we find it. 00967 TabVector* partner_replacement = replacement; 00968 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) { 00969 TabVector* p_partner = p_it.data(); 00970 if (p_partner == partner_replacement) { 00971 partner_replacement = NULL; 00972 break; 00973 } 00974 } 00975 // Remove all references to this, and replace with replacement if not NULL. 00976 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) { 00977 TabVector* p_partner = p_it.data(); 00978 if (p_partner == this) { 00979 p_it.extract(); 00980 if (partner_replacement != NULL) 00981 p_it.add_before_stay_put(partner_replacement); 00982 } 00983 } 00984 if (partner_replacement != NULL) { 00985 partner_replacement->AddPartner(partner); 00986 } 00987 } 00988 delete this; 00989 } 00990 00991 00992 } // namespace tesseract.