Tesseract  3.02
tesseract-ocr/textord/colpartition.cpp
Go to the documentation of this file.
00001 
00002 // File:        colpartition.cpp
00003 // Description: Class to hold partitions of the page that correspond
00004 //              roughly to text lines.
00005 // Author:      Ray Smith
00006 // Created:     Thu Aug 14 10:54:01 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #include "colpartition.h"
00022 #include "colpartitiongrid.h"
00023 #include "colpartitionset.h"
00024 #include "detlinefit.h"
00025 #include "dppoint.h"
00026 #include "imagefind.h"
00027 #include "workingpartset.h"
00028 
00029 #ifdef _MSC_VER
00030 #pragma warning(disable:4244)  // Conversion warnings
00031 #endif
00032 
00033 namespace tesseract {
00034 
00035 ELIST2IZE(ColPartition)
00036 CLISTIZE(ColPartition)
00037 
00039 
00040 // If multiple partners survive the partner depth test beyond this level,
00041 // then arbitrarily pick one.
00042 const int kMaxPartnerDepth = 4;
00043 // Maximum change in spacing (in inches) to ignore.
00044 const double kMaxSpacingDrift = 1.0 / 72;  // 1/72 is one point.
00045 // Maximum fraction of line height used as an additional allowance
00046 // for top spacing.
00047 const double kMaxTopSpacingFraction = 0.25;
00048 // What multiple of the largest line height should be used as an upper bound
00049 // for whether lines are in the same text block?
00050 const double kMaxSameBlockLineSpacing = 3;
00051 // Maximum ratio of sizes for lines to be considered the same size.
00052 const double kMaxSizeRatio = 1.5;
00053 // Fraction of max of leader width and gap for max IQR of gaps.
00054 const double kMaxLeaderGapFractionOfMax = 0.25;
00055 // Fraction of min of leader width and gap for max IQR of gaps.
00056 const double kMaxLeaderGapFractionOfMin = 0.5;
00057 // Minimum number of blobs to be considered a leader.
00058 const int kMinLeaderCount = 5;
00059 // Cost of a cut through a leader.
00060 const int kLeaderCutCost = 8;
00061 // Minimum score for a STRONG_CHAIN textline.
00062 const int kMinStrongTextValue = 6;
00063 // Minimum score for a CHAIN textline.
00064 const int kMinChainTextValue = 3;
00065 // Minimum number of blobs for strong horizontal text lines.
00066 const int kHorzStrongTextlineCount = 8;
00067 // Minimum height (in image pixels) for strong horizontal text lines.
00068 const int kHorzStrongTextlineHeight = 10;
00069 // Minimum aspect ratio for strong horizontal text lines.
00070 const int kHorzStrongTextlineAspect = 5;
00071 // Maximum upper quartile error allowed on a baseline fit as a fraction
00072 // of height.
00073 const double kMaxBaselineError = 0.4375;
00074 // Min coverage for a good baseline between vectors
00075 const double kMinBaselineCoverage = 0.5;
00076 // Max RMS color noise to compare colors.
00077 const int kMaxRMSColorNoise = 128;
00078 // Maximum distance to allow a partition color to be to use that partition
00079 // in smoothing neighbouring types. This is a squared distance.
00080 const int kMaxColorDistance = 900;
00081 
00082 // blob_type is the blob_region_type_ of the blobs in this partition.
00083 // Vertical is the direction of logical vertical on the possibly skewed image.
00084 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
00085   : left_margin_(-MAX_INT32), right_margin_(MAX_INT32),
00086     median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
00087     median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
00088     blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
00089     good_width_(false), good_column_(false),
00090     left_key_tab_(false), right_key_tab_(false),
00091     left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
00092     working_set_(NULL), last_add_was_vertical_(false), block_owned_(false),
00093     desperately_merged_(false),
00094     first_column_(-1), last_column_(-1), column_set_(NULL),
00095     side_step_(0), top_spacing_(0), bottom_spacing_(0),
00096     type_before_table_(PT_UNKNOWN), inside_table_column_(false),
00097     nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
00098     space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
00099     owns_blobs_(true) {
00100   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00101 }
00102 
00103 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
00104 // from a single TBOX.
00105 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
00106 // the ColPartition owns the BLOBNBOX!!!
00107 // Call DeleteBoxes before deleting the ColPartition.
00108 ColPartition* ColPartition::FakePartition(const TBOX& box,
00109                                           PolyBlockType block_type,
00110                                           BlobRegionType blob_type,
00111                                           BlobTextFlowType flow) {
00112   ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
00113   part->set_type(block_type);
00114   part->set_flow(flow);
00115   part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
00116   part->set_left_margin(box.left());
00117   part->set_right_margin(box.right());
00118   part->SetBlobTypes();
00119   part->ComputeLimits();
00120   part->ClaimBoxes();
00121   return part;
00122 }
00123 
00124 // Constructs and returns a ColPartition with the given real BLOBNBOX,
00125 // and sets it up to be a "big" partition (single-blob partition bigger
00126 // than the surrounding text that may be a dropcap, two or more vertically
00127 // touching characters, or some graphic element.
00128 // If the given list is not NULL, the partition is also added to the list.
00129 ColPartition* ColPartition::MakeBigPartition(BLOBNBOX* box,
00130                                              ColPartition_LIST* big_part_list) {
00131   box->set_owner(NULL);
00132   ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
00133   single->set_flow(BTFT_NONE);
00134   single->AddBox(box);
00135   single->ComputeLimits();
00136   single->ClaimBoxes();
00137   single->SetBlobTypes();
00138   single->set_block_owned(true);
00139   if (big_part_list != NULL) {
00140     ColPartition_IT part_it(big_part_list);
00141     part_it.add_to_end(single);
00142   }
00143   return single;
00144 }
00145 
00146 ColPartition::~ColPartition() {
00147   // Remove this as a partner of all partners, as we don't want them
00148   // referring to a deleted object.
00149   ColPartition_C_IT it(&upper_partners_);
00150   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00151     it.data()->RemovePartner(false, this);
00152   }
00153   it.set_to_list(&lower_partners_);
00154   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00155     it.data()->RemovePartner(true, this);
00156   }
00157 }
00158 
00159 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
00160 // horizontal or vertical line, given a type and a bounding box.
00161 ColPartition* ColPartition::MakeLinePartition(BlobRegionType blob_type,
00162                                               const ICOORD& vertical,
00163                                               int left, int bottom,
00164                                               int right, int top) {
00165   ColPartition* part = new ColPartition(blob_type, vertical);
00166   part->bounding_box_ = TBOX(left, bottom, right, top);
00167   part->median_bottom_ = bottom;
00168   part->median_top_ = top;
00169   part->median_size_ = top - bottom;
00170   part->median_width_ = right - left;
00171   part->left_key_ = part->BoxLeftKey();
00172   part->right_key_ = part->BoxRightKey();
00173   return part;
00174 }
00175 
00176 
00177 // Adds the given box to the partition, updating the partition bounds.
00178 // The list of boxes in the partition is updated, ensuring that no box is
00179 // recorded twice, and the boxes are kept in increasing left position.
00180 void ColPartition::AddBox(BLOBNBOX* bbox) {
00181   TBOX box = bbox->bounding_box();
00182   // Update the partition limits.
00183   if (boxes_.length() == 0) {
00184     bounding_box_ = box;
00185   } else {
00186     bounding_box_ += box;
00187   }
00188 
00189   if (IsVerticalType()) {
00190     if (!last_add_was_vertical_) {
00191       boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00192       last_add_was_vertical_ = true;
00193     }
00194     boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
00195   } else {
00196     if (last_add_was_vertical_) {
00197       boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00198       last_add_was_vertical_ = false;
00199     }
00200     boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
00201   }
00202   if (!left_key_tab_)
00203     left_key_ = BoxLeftKey();
00204   if (!right_key_tab_)
00205     right_key_ = BoxRightKey();
00206   if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
00207     tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
00208             box.left(), box.bottom(), box.right(), box.top(),
00209             bounding_box_.left(), bounding_box_.right());
00210 }
00211 
00212 // Removes the given box from the partition, updating the bounds.
00213 void ColPartition::RemoveBox(BLOBNBOX* box) {
00214   BLOBNBOX_C_IT bb_it(&boxes_);
00215   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00216     if (box == bb_it.data()) {
00217       bb_it.extract();
00218       ComputeLimits();
00219       return;
00220     }
00221   }
00222 }
00223 
00224 // Returns the tallest box in the partition, as measured perpendicular to the
00225 // presumed flow of text.
00226 BLOBNBOX* ColPartition::BiggestBox() {
00227   BLOBNBOX* biggest = NULL;
00228   BLOBNBOX_C_IT bb_it(&boxes_);
00229   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00230     BLOBNBOX* bbox = bb_it.data();
00231     if (IsVerticalType()) {
00232       if (biggest == NULL ||
00233           bbox->bounding_box().width() > biggest->bounding_box().width())
00234         biggest = bbox;
00235     } else {
00236       if (biggest == NULL ||
00237           bbox->bounding_box().height() > biggest->bounding_box().height())
00238         biggest = bbox;
00239     }
00240   }
00241   return biggest;
00242 }
00243 
00244 // Returns the bounding box excluding the given box.
00245 TBOX ColPartition::BoundsWithoutBox(BLOBNBOX* box) {
00246   TBOX result;
00247   BLOBNBOX_C_IT bb_it(&boxes_);
00248   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00249     if (box != bb_it.data()) {
00250       result += bb_it.data()->bounding_box();
00251     }
00252   }
00253   return result;
00254 }
00255 
00256 // Claims the boxes in the boxes_list by marking them with a this owner
00257 // pointer. If a box is already owned, then it must be owned by this.
00258 void ColPartition::ClaimBoxes() {
00259   BLOBNBOX_C_IT bb_it(&boxes_);
00260   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00261     BLOBNBOX* bblob = bb_it.data();
00262     ColPartition* other = bblob->owner();
00263     if (other == NULL) {
00264       // Normal case: ownership is available.
00265       bblob->set_owner(this);
00266     } else {
00267       ASSERT_HOST(other == this);
00268     }
00269   }
00270 }
00271 
00272 // NULL the owner of the blobs in this partition, so they can be deleted
00273 // independently of the ColPartition.
00274 void ColPartition::DisownBoxes() {
00275   BLOBNBOX_C_IT bb_it(&boxes_);
00276   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00277     BLOBNBOX* bblob = bb_it.data();
00278     ASSERT_HOST(bblob->owner() == this || bblob->owner() == NULL);
00279     bblob->set_owner(NULL);
00280   }
00281 }
00282 
00283 // Delete the boxes that this partition owns.
00284 void ColPartition::DeleteBoxes() {
00285   // Although the boxes_ list is a C_LIST, in some cases it owns the
00286   // BLOBNBOXes, as the ColPartition takes ownership from the grid,
00287   // and the BLOBNBOXes own the underlying C_BLOBs.
00288   for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
00289     BLOBNBOX* bblob = bb_it.extract();
00290     delete bblob->cblob();
00291     delete bblob;
00292   }
00293 }
00294 
00295 // Reflects the partition in the y-axis, assuming that its blobs have
00296 // already been done. Corrects only a limited part of the members, since
00297 // this function is assumed to be used shortly after initial creation, which
00298 // is before a lot of the members are used.
00299 void ColPartition::ReflectInYAxis() {
00300   ColPartition_CLIST reversed_boxes;
00301   ColPartition_C_IT reversed_it(&reversed_boxes);
00302   // Reverse the order of the boxes_.
00303   BLOBNBOX_C_IT bb_it(&boxes_);
00304   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00305     reversed_it.add_before_then_move(bb_it.extract());
00306   }
00307   bb_it.add_list_after(&reversed_boxes);
00308   ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
00309   int tmp = left_margin_;
00310   left_margin_ = -right_margin_;
00311   right_margin_ = -tmp;
00312   ComputeLimits();
00313 }
00314 
00315 // Returns true if this is a legal partition - meaning that the conditions
00316 // left_margin <= bounding_box left
00317 // left_key <= bounding box left key
00318 // bounding box left <= bounding box right
00319 // and likewise for right margin and key
00320 // are all met.
00321 bool ColPartition::IsLegal() {
00322   if (bounding_box_.left() > bounding_box_.right()) {
00323     if (textord_debug_bugs) {
00324       tprintf("Bounding box invalid\n");
00325       Print();
00326     }
00327     return false;  // Bounding box invalid.
00328   }
00329   if (left_margin_ > bounding_box_.left() ||
00330       right_margin_ < bounding_box_.right()) {
00331     if (textord_debug_bugs) {
00332       tprintf("Margins invalid\n");
00333       Print();
00334     }
00335     return false;  // Margins invalid.
00336   }
00337   if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
00338     if (textord_debug_bugs) {
00339       tprintf("Key inside box: %d v %d or %d v %d\n",
00340               left_key_, BoxLeftKey(), right_key_, BoxRightKey());
00341       Print();
00342     }
00343     return false;  // Keys inside the box.
00344   }
00345   return true;
00346 }
00347 
00348 // Returns true if the left and right edges are approximately equal.
00349 bool ColPartition::MatchingColumns(const ColPartition& other) const {
00350   int y = (MidY() + other.MidY()) / 2;
00351   if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
00352                    LeftAtY(y) / kColumnWidthFactor, 1))
00353     return false;
00354   if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
00355                    RightAtY(y) / kColumnWidthFactor, 1))
00356     return false;
00357   return true;
00358 }
00359 
00360 // Returns true if the colors match for two text partitions.
00361 bool ColPartition::MatchingTextColor(const ColPartition& other) const {
00362   if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
00363       other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
00364     return false;  // Too noisy.
00365 
00366   // Colors must match for other to count.
00367   double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
00368                                                       other.color2_,
00369                                                       color1_);
00370   double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
00371                                                       other.color2_,
00372                                                       color2_);
00373   double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00374                                                       other.color1_);
00375   double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00376                                                       other.color2_);
00377 // All 4 distances must be small enough.
00378   return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
00379          d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
00380 }
00381 
00382 // Returns true if the sizes match for two text partitions,
00383 // taking orientation into account. See also SizesSimilar.
00384 bool ColPartition::MatchingSizes(const ColPartition& other) const {
00385   if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
00386     return !TabFind::DifferentSizes(median_width_, other.median_width_);
00387   else
00388     return !TabFind::DifferentSizes(median_size_, other.median_size_);
00389 }
00390 
00391 // Returns true if there is no tabstop violation in merging this and other.
00392 bool ColPartition::ConfirmNoTabViolation(const ColPartition& other) const {
00393   if (bounding_box_.right() < other.bounding_box_.left() &&
00394       bounding_box_.right() < other.LeftBlobRule())
00395     return false;
00396   if (other.bounding_box_.right() < bounding_box_.left() &&
00397       other.bounding_box_.right() < LeftBlobRule())
00398     return false;
00399   if (bounding_box_.left() > other.bounding_box_.right() &&
00400       bounding_box_.left() > other.RightBlobRule())
00401     return false;
00402   if (other.bounding_box_.left() > bounding_box_.right() &&
00403       other.bounding_box_.left() > RightBlobRule())
00404     return false;
00405   return true;
00406 }
00407 
00408 // Returns true if other has a similar stroke width to this.
00409 bool ColPartition::MatchingStrokeWidth(const ColPartition& other,
00410                                        double fractional_tolerance,
00411                                        double constant_tolerance) const {
00412   int match_count = 0;
00413   int nonmatch_count = 0;
00414   BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00415   BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
00416   box_it.mark_cycle_pt();
00417   other_it.mark_cycle_pt();
00418   while (!box_it.cycled_list() && !other_it.cycled_list()) {
00419     if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
00420                                            fractional_tolerance,
00421                                            constant_tolerance))
00422       ++match_count;
00423     else
00424       ++nonmatch_count;
00425     box_it.forward();
00426     other_it.forward();
00427   }
00428   return match_count > nonmatch_count;
00429 }
00430 
00431 // Returns true if base is an acceptable diacritic base char merge
00432 // with this as the diacritic.
00433 // Returns true if:
00434 // (1) this is a ColPartition containing only diacritics, and
00435 // (2) the base characters indicated on the diacritics all believably lie
00436 // within the text line of the candidate ColPartition.
00437 bool ColPartition::OKDiacriticMerge(const ColPartition& candidate,
00438                                     bool debug) const {
00439   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00440   int min_top = MAX_INT32;
00441   int max_bottom = -MAX_INT32;
00442   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00443     BLOBNBOX* blob = it.data();
00444     if (!blob->IsDiacritic()) {
00445       if (debug) {
00446         tprintf("Blob is not a diacritic:");
00447         blob->bounding_box().print();
00448       }
00449       return false;  // All blobs must have diacritic bases.
00450     }
00451     if (blob->base_char_top() < min_top)
00452       min_top = blob->base_char_top();
00453     if (blob->base_char_bottom() > max_bottom)
00454       max_bottom = blob->base_char_bottom();
00455   }
00456   // If the intersection of all vertical ranges of all base characters
00457   // overlaps the median range of this, then it is OK.
00458   bool result = min_top > candidate.median_bottom_ &&
00459                 max_bottom < candidate.median_top_;
00460   if (debug) {
00461     if (result)
00462       tprintf("OKDiacritic!\n");
00463     else
00464       tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
00465               max_bottom, min_top, median_bottom_, median_top_);
00466   }
00467   return result;
00468 }
00469 
00470 // Sets the sort key using either the tab vector, or the bounding box if
00471 // the tab vector is NULL. If the tab_vector lies inside the bounding_box,
00472 // use the edge of the box as a key any way.
00473 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
00474   if (tab_vector != NULL) {
00475     left_key_ = tab_vector->sort_key();
00476     left_key_tab_ = left_key_ <= BoxLeftKey();
00477   } else {
00478     left_key_tab_ = false;
00479   }
00480   if (!left_key_tab_)
00481     left_key_ = BoxLeftKey();
00482 }
00483 
00484 // As SetLeftTab, but with the right.
00485 void ColPartition::SetRightTab(const TabVector* tab_vector) {
00486   if (tab_vector != NULL) {
00487     right_key_ = tab_vector->sort_key();
00488     right_key_tab_ = right_key_ >= BoxRightKey();
00489   } else {
00490     right_key_tab_ = false;
00491   }
00492   if (!right_key_tab_)
00493     right_key_ = BoxRightKey();
00494 }
00495 
00496 // Copies the left/right tab from the src partition, but if take_box is
00497 // true, copies the box instead and uses that as a key.
00498 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
00499   left_key_tab_ = take_box ? false : src.left_key_tab_;
00500   if (left_key_tab_) {
00501     left_key_ = src.left_key_;
00502   } else {
00503     bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
00504     left_key_ = BoxLeftKey();
00505   }
00506   if (left_margin_ > bounding_box_.left())
00507     left_margin_ = src.left_margin_;
00508 }
00509 
00510 // As CopyLeftTab, but with the right.
00511 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
00512   right_key_tab_ = take_box ? false : src.right_key_tab_;
00513   if (right_key_tab_) {
00514     right_key_ = src.right_key_;
00515   } else {
00516     bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
00517     right_key_ = BoxRightKey();
00518   }
00519   if (right_margin_ < bounding_box_.right())
00520     right_margin_ = src.right_margin_;
00521 }
00522 
00523 // Returns the left rule line x coord of the leftmost blob.
00524 int ColPartition::LeftBlobRule() const {
00525   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00526   return it.data()->left_rule();
00527 }
00528 // Returns the right rule line x coord of the rightmost blob.
00529 int ColPartition::RightBlobRule() const {
00530   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00531   it.move_to_last();
00532   return it.data()->right_rule();
00533 }
00534 
00535 float ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {
00536   ASSERT_HOST(type < BSTT_COUNT);
00537   return special_blobs_densities_[type];
00538 }
00539 
00540 int ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {
00541   ASSERT_HOST(type < BSTT_COUNT);
00542   BLOBNBOX_C_IT blob_it(&boxes_);
00543   int count = 0;
00544   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00545     BLOBNBOX* blob = blob_it.data();
00546     BlobSpecialTextType blob_type = blob->special_text_type();
00547     if (blob_type == type) {
00548       count++;
00549     }
00550   }
00551 
00552   return count;
00553 }
00554 
00555 void ColPartition::SetSpecialBlobsDensity(
00556     const BlobSpecialTextType type, const float density) {
00557   ASSERT_HOST(type < BSTT_COUNT);
00558   special_blobs_densities_[type] = density;
00559 }
00560 
00561 void ColPartition::ComputeSpecialBlobsDensity() {
00562   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00563   if (boxes_.empty()) {
00564     return;
00565   }
00566 
00567   BLOBNBOX_C_IT blob_it(&boxes_);
00568   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00569     BLOBNBOX* blob = blob_it.data();
00570     BlobSpecialTextType type = blob->special_text_type();
00571     special_blobs_densities_[type]++;
00572   }
00573 
00574   for (int type = 0; type < BSTT_COUNT; ++type) {
00575     special_blobs_densities_[type] /= boxes_.length();
00576   }
00577 }
00578 
00579 // Add a partner above if upper, otherwise below.
00580 // Add them uniquely and keep the list sorted by box left.
00581 // Partnerships are added symmetrically to partner and this.
00582 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
00583   if (upper) {
00584     partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00585                                         true, this);
00586     upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00587   } else {
00588     partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00589                                         true, this);
00590     lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00591   }
00592 }
00593 
00594 // Removes the partner from this, but does not remove this from partner.
00595 // This asymmetric removal is so as not to mess up the iterator that is
00596 // working on partner's partner list.
00597 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
00598   ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
00599   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00600     if (it.data() == partner) {
00601       it.extract();
00602       break;
00603     }
00604   }
00605 }
00606 
00607 // Returns the partner if the given partner is a singleton, otherwise NULL.
00608 ColPartition* ColPartition::SingletonPartner(bool upper) {
00609   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
00610   if (!partners->singleton())
00611     return NULL;
00612   ColPartition_C_IT it(partners);
00613   return it.data();
00614 }
00615 
00616 // Merge with the other partition and delete it.
00617 void ColPartition::Absorb(ColPartition* other, WidthCallback* cb) {
00618   // The result has to either own all of the blobs or none of them.
00619   // Verify the flag is consisent.
00620   ASSERT_HOST(owns_blobs() == other->owns_blobs());
00621   // TODO(nbeato): check owns_blobs better. Right now owns_blobs
00622   // should always be true when this is called. So there is no issues.
00623   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00624                                 bounding_box_.bottom()) ||
00625       TabFind::WithinTestRegion(2, other->bounding_box_.left(),
00626                                 other->bounding_box_.bottom())) {
00627     tprintf("Merging:");
00628     Print();
00629     other->Print();
00630   }
00631 
00632   // Update the special_blobs_densities_.
00633   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00634   for (int type = 0; type < BSTT_COUNT; ++type) {
00635     int w1 = boxes_.length(), w2 = other->boxes_.length();
00636     float new_val = special_blobs_densities_[type] * w1 +
00637         other->special_blobs_densities_[type] * w2;
00638     if (!w1 || !w2) {
00639       special_blobs_densities_[type] = new_val / (w1 + w2);
00640     }
00641   }
00642 
00643   // Merge the two sorted lists.
00644   BLOBNBOX_C_IT it(&boxes_);
00645   BLOBNBOX_C_IT it2(&other->boxes_);
00646   for (; !it2.empty(); it2.forward()) {
00647     BLOBNBOX* bbox2 = it2.extract();
00648     ColPartition* prev_owner = bbox2->owner();
00649     if (prev_owner != other && prev_owner != NULL) {
00650       // A blob on other's list is owned by someone else; let them have it.
00651       continue;
00652     }
00653     ASSERT_HOST(prev_owner == other || prev_owner == NULL);
00654     if (prev_owner == other)
00655       bbox2->set_owner(this);
00656     it.add_to_end(bbox2);
00657   }
00658   left_margin_ = MIN(left_margin_, other->left_margin_);
00659   right_margin_ = MAX(right_margin_, other->right_margin_);
00660   if (other->left_key_ < left_key_) {
00661     left_key_ = other->left_key_;
00662     left_key_tab_ = other->left_key_tab_;
00663   }
00664   if (other->right_key_ > right_key_) {
00665     right_key_ = other->right_key_;
00666     right_key_tab_ = other->right_key_tab_;
00667   }
00668   // Combine the flow and blob_type in a sensible way.
00669   // Dominant flows stay.
00670   if (!DominatesInMerge(flow_, other->flow_)) {
00671     flow_ = other->flow_;
00672     blob_type_ = other->blob_type_;
00673   }
00674   SetBlobTypes();
00675   if (IsVerticalType()) {
00676     boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00677     last_add_was_vertical_ = true;
00678   } else {
00679     boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00680     last_add_was_vertical_ = false;
00681   }
00682   ComputeLimits();
00683   // Fix partner lists. other is going away, so remove it as a
00684   // partner of all its partners and add this in its place.
00685   for (int upper = 0; upper < 2; ++upper) {
00686     ColPartition_CLIST partners;
00687     ColPartition_C_IT part_it(&partners);
00688     part_it.add_list_after(upper ? &other->upper_partners_
00689                                  : &other->lower_partners_);
00690     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00691       ColPartition* partner = part_it.extract();
00692       partner->RemovePartner(!upper, other);
00693       partner->RemovePartner(!upper, this);
00694       partner->AddPartner(!upper, this);
00695     }
00696   }
00697   delete other;
00698   if (cb != NULL) {
00699     SetColumnGoodness(cb);
00700   }
00701 }
00702 
00703 // Merge1 and merge2 are candidates to be merged, yet their combined box
00704 // overlaps this. Is that allowed?
00705 // Returns true if the overlap between this and the merged pair of
00706 // merge candidates is sufficiently trivial to be allowed.
00707 // The merged box can graze the edge of this by the ok_box_overlap
00708 // if that exceeds the margin to the median top and bottom.
00709 // ok_box_overlap should be set by the caller appropriate to the sizes of
00710 // the text involved, and is usually a fraction of the median size of merge1
00711 // and/or merge2, or this.
00712 // TODO(rays) Determine whether vertical text needs to be considered.
00713 bool ColPartition::OKMergeOverlap(const ColPartition& merge1,
00714                                   const ColPartition& merge2,
00715                                   int ok_box_overlap, bool debug) {
00716   // Vertical partitions are not allowed to be involved.
00717   if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
00718     if (debug)
00719       tprintf("Vertical partition\n");
00720     return false;
00721   }
00722   // The merging partitions must strongly overlap each other.
00723   if (!merge1.VSignificantCoreOverlap(merge2)) {
00724     if (debug)
00725       tprintf("Voverlap %d (%d)\n",
00726               merge1.VCoreOverlap(merge2),
00727               merge1.VSignificantCoreOverlap(merge2));
00728     return false;
00729   }
00730   // The merged box must not overlap the median bounds of this.
00731   TBOX merged_box(merge1.bounding_box());
00732   merged_box += merge2.bounding_box();
00733   if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
00734       merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
00735       merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
00736     if (debug)
00737       tprintf("Excessive box overlap\n");
00738     return false;
00739   }
00740   // Looks OK!
00741   return true;
00742 }
00743 
00744 // Find the blob at which to split this to minimize the overlap with the
00745 // given box. Returns the first blob to go in the second partition.
00746 BLOBNBOX* ColPartition::OverlapSplitBlob(const TBOX& box) {
00747   if (boxes_.empty() || boxes_.singleton())
00748     return NULL;
00749   BLOBNBOX_C_IT it(&boxes_);
00750   TBOX left_box(it.data()->bounding_box());
00751   for (it.forward(); !it.at_first(); it.forward()) {
00752     BLOBNBOX* bbox = it.data();
00753     left_box += bbox->bounding_box();
00754     if (left_box.overlap(box))
00755       return bbox;
00756   }
00757   return NULL;
00758 }
00759 
00760 // Split this partition keeping the first half in this and returning
00761 // the second half.
00762 // Splits by putting the split_blob and the blobs that follow
00763 // in the second half, and the rest in the first half.
00764 ColPartition* ColPartition::SplitAtBlob(BLOBNBOX* split_blob) {
00765   ColPartition* split_part = ShallowCopy();
00766   split_part->set_owns_blobs(owns_blobs());
00767   BLOBNBOX_C_IT it(&boxes_);
00768   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00769     BLOBNBOX* bbox = it.data();
00770     ColPartition* prev_owner = bbox->owner();
00771     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00772     if (bbox == split_blob || !split_part->boxes_.empty()) {
00773       split_part->AddBox(it.extract());
00774       if (owns_blobs() && prev_owner != NULL)
00775         bbox->set_owner(split_part);
00776     }
00777   }
00778   ASSERT_HOST(!it.empty());
00779   if (split_part->IsEmpty()) {
00780     // Split part ended up with nothing. Possible if split_blob is not
00781     // in the list of blobs.
00782     delete split_part;
00783     return NULL;
00784   }
00785   right_key_tab_ = false;
00786   split_part->left_key_tab_ = false;
00787   ComputeLimits();
00788   // TODO(nbeato) Merge Ray's CL like this:
00789   // if (owns_blobs())
00790   //  SetBlobTextlineGoodness();
00791   split_part->ComputeLimits();
00792   // TODO(nbeato) Merge Ray's CL like this:
00793   // if (split_part->owns_blobs())
00794   //   split_part->SetBlobTextlineGoodness();
00795   return split_part;
00796 }
00797 
00798 // Split this partition at the given x coordinate, returning the right
00799 // half and keeping the left half in this.
00800 ColPartition* ColPartition::SplitAt(int split_x) {
00801   if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
00802     return NULL;  // There will be no change.
00803   ColPartition* split_part = ShallowCopy();
00804   split_part->set_owns_blobs(owns_blobs());
00805   BLOBNBOX_C_IT it(&boxes_);
00806   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00807     BLOBNBOX* bbox = it.data();
00808     ColPartition* prev_owner = bbox->owner();
00809     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00810     const TBOX& box = bbox->bounding_box();
00811     if (box.left() >= split_x) {
00812       split_part->AddBox(it.extract());
00813       if (owns_blobs() && prev_owner != NULL)
00814         bbox->set_owner(split_part);
00815     }
00816   }
00817   ASSERT_HOST(!it.empty());
00818   if (split_part->IsEmpty()) {
00819     // Split part ended up with nothing. Possible if split_x passes
00820     // through the last blob.
00821     delete split_part;
00822     return NULL;
00823   }
00824   right_key_tab_ = false;
00825   split_part->left_key_tab_ = false;
00826   right_margin_ = split_x;
00827   split_part->left_margin_ = split_x;
00828   ComputeLimits();
00829   split_part->ComputeLimits();
00830   return split_part;
00831 }
00832 
00833 // Recalculates all the coordinate limits of the partition.
00834 void ColPartition::ComputeLimits() {
00835   bounding_box_ = TBOX();  // Clear it
00836   BLOBNBOX_C_IT it(&boxes_);
00837   BLOBNBOX* bbox = NULL;
00838   int non_leader_count = 0;
00839   if (it.empty()) {
00840     bounding_box_.set_left(left_margin_);
00841     bounding_box_.set_right(right_margin_);
00842     bounding_box_.set_bottom(0);
00843     bounding_box_.set_top(0);
00844   } else {
00845     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00846       bbox = it.data();
00847       bounding_box_ += bbox->bounding_box();
00848       if (bbox->flow() != BTFT_LEADER)
00849         ++non_leader_count;
00850     }
00851   }
00852   if (!left_key_tab_)
00853     left_key_ = BoxLeftKey();
00854   if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
00855     // TODO(rays) investigate the causes of these error messages, to find
00856     // out if they are genuinely harmful, or just indicative of junk input.
00857     tprintf("Computed left-illegal partition\n");
00858     Print();
00859   }
00860   if (!right_key_tab_)
00861     right_key_ = BoxRightKey();
00862   if (right_key_ < BoxRightKey() && textord_debug_bugs) {
00863     tprintf("Computed right-illegal partition\n");
00864     Print();
00865   }
00866   if (it.empty())
00867     return;
00868   if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
00869       blob_type() == BRT_POLYIMAGE) {
00870     median_top_ = bounding_box_.top();
00871     median_bottom_ = bounding_box_.bottom();
00872     median_size_ = bounding_box_.height();
00873     median_left_ = bounding_box_.left();
00874     median_right_ = bounding_box_.right();
00875     median_width_ = bounding_box_.width();
00876   } else {
00877     STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00878     STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00879     STATS size_stats(0, bounding_box_.height() + 1);
00880     STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
00881     STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
00882     STATS width_stats(0, bounding_box_.width() + 1);
00883     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00884       bbox = it.data();
00885       if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
00886         TBOX box = bbox->bounding_box();
00887         int area = box.area();
00888         top_stats.add(box.top(), area);
00889         bottom_stats.add(box.bottom(), area);
00890         size_stats.add(box.height(), area);
00891         left_stats.add(box.left(), area);
00892         right_stats.add(box.right(), area);
00893         width_stats.add(box.width(), area);
00894       }
00895     }
00896     median_top_ = static_cast<int>(top_stats.median() + 0.5);
00897     median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
00898     median_size_ = static_cast<int>(size_stats.median() + 0.5);
00899     median_left_ = static_cast<int>(left_stats.median() + 0.5);
00900     median_right_ = static_cast<int>(right_stats.median() + 0.5);
00901     median_width_ = static_cast<int>(width_stats.median() + 0.5);
00902   }
00903 
00904   if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
00905     tprintf("Made partition with bad right coords");
00906     Print();
00907   }
00908   if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
00909     tprintf("Made partition with bad left coords");
00910     Print();
00911   }
00912   // Fix partner lists. The bounding box has changed and partners are stored
00913   // in bounding box order, so remove and reinsert this as a partner
00914   // of all its partners.
00915   for (int upper = 0; upper < 2; ++upper) {
00916     ColPartition_CLIST partners;
00917     ColPartition_C_IT part_it(&partners);
00918     part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
00919     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00920       ColPartition* partner = part_it.extract();
00921       partner->RemovePartner(!upper, this);
00922       partner->AddPartner(!upper, this);
00923     }
00924   }
00925   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00926                                 bounding_box_.bottom())) {
00927     tprintf("Recomputed box for partition %p\n", this);
00928     Print();
00929   }
00930 }
00931 
00932 // Returns the number of boxes that overlap the given box.
00933 int ColPartition::CountOverlappingBoxes(const TBOX& box) {
00934   BLOBNBOX_C_IT it(&boxes_);
00935   int overlap_count = 0;
00936   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00937     BLOBNBOX* bbox = it.data();
00938     if (box.overlap(bbox->bounding_box()))
00939       ++overlap_count;
00940   }
00941   return overlap_count;
00942 }
00943 
00944 // Computes and sets the type_ and first_colum_, last_column_ and column_set_.
00945 // resolution refers to the ppi resolution of the image.
00946 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
00947   int first_spanned_col = -1;
00948   ColumnSpanningType span_type =
00949       columns->SpanningType(resolution,
00950                             bounding_box_.left(), bounding_box_.right(),
00951                             MidY(), left_margin_, right_margin_,
00952                             &first_column_, &last_column_,
00953                             &first_spanned_col);
00954   column_set_ = columns;
00955   if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
00956       !IsLineType()) {
00957     // Unequal columns may indicate that the pullout spans one of the columns
00958     // it lies in, so force it to be allocated to just that column.
00959     if (first_spanned_col >= 0) {
00960       first_column_ = first_spanned_col;
00961       last_column_ = first_spanned_col;
00962     } else {
00963       if ((first_column_ & 1) == 0)
00964         last_column_ = first_column_;
00965       else if ((last_column_ & 1) == 0)
00966         first_column_ = last_column_;
00967       else
00968         first_column_ = last_column_ = (first_column_ + last_column_) / 2;
00969     }
00970   }
00971   type_ = PartitionType(span_type);
00972 }
00973 
00974 // Returns the PartitionType from the current BlobRegionType and a column
00975 // flow spanning type ColumnSpanningType, generated by
00976 // ColPartitionSet::SpanningType, that indicates how the partition sits
00977 // in the columns.
00978 PolyBlockType ColPartition::PartitionType(ColumnSpanningType flow) const {
00979   if (flow == CST_NOISE) {
00980     if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
00981         blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT)
00982       return PT_NOISE;
00983     flow = CST_FLOWING;
00984   }
00985 
00986   switch (blob_type_) {
00987     case BRT_NOISE:
00988       return PT_NOISE;
00989     case BRT_HLINE:
00990       return PT_HORZ_LINE;
00991     case BRT_VLINE:
00992       return PT_VERT_LINE;
00993     case BRT_RECTIMAGE:
00994     case BRT_POLYIMAGE:
00995       switch (flow) {
00996         case CST_FLOWING:
00997           return PT_FLOWING_IMAGE;
00998         case CST_HEADING:
00999           return PT_HEADING_IMAGE;
01000         case CST_PULLOUT:
01001           return PT_PULLOUT_IMAGE;
01002         default:
01003           ASSERT_HOST(!"Undefined flow type for image!");
01004       }
01005       break;
01006     case BRT_VERT_TEXT:
01007       return PT_VERTICAL_TEXT;
01008     case BRT_TEXT:
01009     case BRT_UNKNOWN:
01010     default:
01011       switch (flow) {
01012         case CST_FLOWING:
01013           return PT_FLOWING_TEXT;
01014         case CST_HEADING:
01015           return PT_HEADING_TEXT;
01016         case CST_PULLOUT:
01017           return PT_PULLOUT_TEXT;
01018         default:
01019           ASSERT_HOST(!"Undefined flow type for text!");
01020       }
01021   }
01022   ASSERT_HOST(!"Should never get here!");
01023   return PT_NOISE;
01024 }
01025 
01026 // Returns the first and last column touched by this partition.
01027 // resolution refers to the ppi resolution of the image.
01028 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
01029                                int* first_col, int* last_col) {
01030   int first_spanned_col = -1;
01031   ColumnSpanningType span_type =
01032       columns->SpanningType(resolution,
01033                             bounding_box_.left(), bounding_box_.right(),
01034                             MidY(), left_margin_, right_margin_,
01035                             first_col, last_col,
01036                             &first_spanned_col);
01037   type_ = PartitionType(span_type);
01038 }
01039 
01040 // Sets the internal flags good_width_ and good_column_.
01041 void ColPartition::SetColumnGoodness(WidthCallback* cb) {
01042   int y = MidY();
01043   int width = RightAtY(y) - LeftAtY(y);
01044   good_width_ = cb->Run(width);
01045   good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
01046 }
01047 
01048 // Determines whether the blobs in this partition mostly represent
01049 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
01050 // Note that height is assumed to have been tested elsewhere, and that this
01051 // function will find most fixed-pitch text as leader without a height filter.
01052 // Leader detection is limited to sequences of identical width objects,
01053 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
01054 bool ColPartition::MarkAsLeaderIfMonospaced() {
01055   bool result = false;
01056   // Gather statistics on the gaps between blobs and the widths of the blobs.
01057   int part_width = bounding_box_.width();
01058   STATS gap_stats(0, part_width);
01059   STATS width_stats(0, part_width);
01060   BLOBNBOX_C_IT it(&boxes_);
01061   BLOBNBOX* prev_blob = it.data();
01062   prev_blob->set_flow(BTFT_NEIGHBOURS);
01063   width_stats.add(prev_blob->bounding_box().width(), 1);
01064   int blob_count = 1;
01065   for (it.forward(); !it.at_first(); it.forward()) {
01066     BLOBNBOX* blob = it.data();
01067     int left = blob->bounding_box().left();
01068     int right = blob->bounding_box().right();
01069     gap_stats.add(left - prev_blob->bounding_box().right(), 1);
01070     width_stats.add(right - left, 1);
01071     blob->set_flow(BTFT_NEIGHBOURS);
01072     prev_blob = blob;
01073     ++blob_count;
01074   }
01075   double median_gap = gap_stats.median();
01076   double median_width = width_stats.median();
01077   double max_width = MAX(median_gap, median_width);
01078   double min_width = MIN(median_gap, median_width);
01079   double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
01080   if (textord_debug_tabfind >= 4) {
01081     tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
01082             gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
01083             min_width * kMaxLeaderGapFractionOfMin);
01084   }
01085   if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
01086       gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
01087       blob_count >= kMinLeaderCount) {
01088     // This is stable enough to be called a leader, so check the widths.
01089     // Since leader dashes can join, run a dp cutting algorithm and go
01090     // on the cost.
01091     int offset = static_cast<int>(ceil(gap_iqr * 2));
01092     int min_step = static_cast<int>(median_gap + median_width + 0.5);
01093     int max_step = min_step + offset;
01094     min_step -= offset;
01095     // Pad the buffer with min_step/2 on each end.
01096     int part_left = bounding_box_.left() - min_step / 2;
01097     part_width += min_step;
01098     DPPoint* projection = new DPPoint[part_width];
01099     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01100       BLOBNBOX* blob = it.data();
01101       int left = blob->bounding_box().left();
01102       int right = blob->bounding_box().right();
01103       int height = blob->bounding_box().height();
01104       for (int x = left; x < right; ++x) {
01105         projection[left - part_left].AddLocalCost(height);
01106       }
01107     }
01108     DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
01109                                        &DPPoint::CostWithVariance,
01110                                        part_width, projection);
01111     if (best_end != NULL && best_end->total_cost() < blob_count) {
01112       // Good enough. Call it a leader.
01113       result = true;
01114       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01115         BLOBNBOX* blob = it.data();
01116         TBOX box = blob->bounding_box();
01117         // If the first or last blob is spaced too much, don't mark it.
01118         if (it.at_first()) {
01119           int gap = it.data_relative(1)->bounding_box().left() -
01120                      blob->bounding_box().right();
01121           if (blob->bounding_box().width() + gap > max_step) {
01122             it.extract();
01123             continue;
01124           }
01125         }
01126         if (it.at_last()) {
01127           int gap = blob->bounding_box().left() -
01128                      it.data_relative(-1)->bounding_box().right();
01129           if (blob->bounding_box().width() + gap > max_step) {
01130             it.extract();
01131             break;
01132           }
01133         }
01134         blob->set_region_type(BRT_TEXT);
01135         blob->set_flow(BTFT_LEADER);
01136       }
01137       blob_type_ = BRT_TEXT;
01138       flow_ = BTFT_LEADER;
01139     } else if (textord_debug_tabfind) {
01140       if (best_end == NULL) {
01141         tprintf("No path\n");
01142       } else {
01143         tprintf("Total cost = %d vs allowed %d\n",
01144                 best_end->total_cost() < blob_count);
01145       }
01146     }
01147     delete [] projection;
01148   }
01149   return result;
01150 }
01151 
01152 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
01153 // horizontal text, negative for vertical text, and near zero for non-text),
01154 // sets the blob_type_ and flow_ for this partition to indicate whether it
01155 // is strongly or weakly vertical or horizontal text, or non-text.
01156 // The function assumes that the blob neighbours are valid (from
01157 // StrokeWidth::SetNeighbours) and that those neighbours have their
01158 // region_type() set.
01159 void ColPartition::SetRegionAndFlowTypesFromProjectionValue(int value) {
01160   int blob_count = 0;        // Total # blobs.
01161   int good_blob_score_ = 0;  // Total # good strokewidth neighbours.
01162   int noisy_count = 0;       // Total # neighbours marked as noise.
01163   int hline_count = 0;
01164   int vline_count = 0;
01165   BLOBNBOX_C_IT it(&boxes_);
01166   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01167     BLOBNBOX* blob = it.data();
01168     ++blob_count;
01169     noisy_count += blob->NoisyNeighbours();
01170     good_blob_score_ += blob->GoodTextBlob();
01171     if (blob->region_type() == BRT_HLINE) ++hline_count;
01172     if (blob->region_type() == BRT_VLINE) ++vline_count;
01173   }
01174   flow_ = BTFT_NEIGHBOURS;
01175   blob_type_ = BRT_UNKNOWN;
01176   if (hline_count > vline_count) {
01177     flow_ = BTFT_NONE;
01178     blob_type_ = BRT_HLINE;
01179   } else if (vline_count > hline_count) {
01180     flow_ = BTFT_NONE;
01181     blob_type_ = BRT_VLINE;
01182   } else if (value < -1 || 1 < value) {
01183     int long_side;
01184     int short_side;
01185     if (value > 0) {
01186       long_side = bounding_box_.width();
01187       short_side = bounding_box_.height();
01188       blob_type_ = BRT_TEXT;
01189     } else {
01190       long_side = bounding_box_.height();
01191       short_side = bounding_box_.width();
01192       blob_type_ = BRT_VERT_TEXT;
01193     }
01194     // We will combine the old metrics using aspect ratio and blob counts
01195     // with the input value by allowing a strong indication to flip the
01196     // STRONG_CHAIN/CHAIN flow values.
01197     int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
01198     if (short_side > kHorzStrongTextlineHeight) ++strong_score;
01199     if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
01200     if (abs(value) >= kMinStrongTextValue)
01201       flow_ = BTFT_STRONG_CHAIN;
01202     else if (abs(value) >= kMinChainTextValue)
01203       flow_ = BTFT_CHAIN;
01204     else
01205       flow_ = BTFT_NEIGHBOURS;
01206     // Upgrade chain to strong chain if the other indicators are good
01207     if (flow_ == BTFT_CHAIN && strong_score == 3)
01208       flow_ = BTFT_STRONG_CHAIN;
01209     // Downgrade strong vertical text to chain if the indicators are bad.
01210     if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
01211       flow_ = BTFT_CHAIN;
01212   }
01213   if (flow_ == BTFT_NEIGHBOURS) {
01214     // Check for noisy neighbours.
01215     if (noisy_count >= blob_count) {
01216       flow_ = BTFT_NONTEXT;
01217       blob_type_= BRT_NOISE;
01218     }
01219   }
01220   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
01221                                 bounding_box_.bottom())) {
01222     tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
01223             blob_count, noisy_count, good_blob_score_);
01224     tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
01225             value, flow_, blob_type_);
01226     Print();
01227   }
01228   SetBlobTypes();
01229 }
01230 
01231 // Sets all blobs with the partition blob type and flow, but never overwrite
01232 // leader blobs, as we need to be able to identify them later.
01233 void ColPartition::SetBlobTypes() {
01234   if (!owns_blobs())
01235     return;
01236   BLOBNBOX_C_IT it(&boxes_);
01237   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01238     BLOBNBOX* blob = it.data();
01239     if (blob->flow() != BTFT_LEADER)
01240       blob->set_flow(flow_);
01241     blob->set_region_type(blob_type_);
01242     ASSERT_HOST(blob->owner() == NULL || blob->owner() == this);
01243   }
01244 }
01245 
01246 // Returns true if a decent baseline can be fitted through the blobs.
01247 // Works for both horizontal and vertical text.
01248 bool ColPartition::HasGoodBaseline() {
01249   // Approximation of the baseline.
01250   DetLineFit linepoints;
01251   // Calculation of the mean height on this line segment. Note that these
01252   // variable names apply to the context of a horizontal line, and work
01253   // analogously, rather than literally in the case of a vertical line.
01254   int total_height = 0;
01255   int coverage = 0;
01256   int height_count = 0;
01257   int width = 0;
01258   BLOBNBOX_C_IT it(&boxes_);
01259   TBOX box(it.data()->bounding_box());
01260   // Accumulate points representing the baseline at the middle of each blob,
01261   // but add an additional point for each end of the line. This makes it
01262   // harder to fit a severe skew angle, as it is most likely not right.
01263   if (IsVerticalType()) {
01264     // For a vertical line, use the right side as the baseline.
01265     ICOORD first_pt(box.right(), box.bottom());
01266     // Use the bottom-right of the first (bottom) box, the top-right of the
01267     // last, and the middle-right of all others.
01268     linepoints.Add(first_pt);
01269     for (it.forward(); !it.at_last(); it.forward()) {
01270       BLOBNBOX* blob = it.data();
01271       box = blob->bounding_box();
01272       ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
01273       linepoints.Add(box_pt);
01274       total_height += box.width();
01275       coverage += box.height();
01276       ++height_count;
01277     }
01278     box = it.data()->bounding_box();
01279     ICOORD last_pt(box.right(), box.top());
01280     linepoints.Add(last_pt);
01281     width = last_pt.y() - first_pt.y();
01282 
01283   } else {
01284     // Horizontal lines use the bottom as the baseline.
01285     TBOX box(it.data()->bounding_box());
01286     // Use the bottom-left of the first box, the the bottom-right of the last,
01287     // and the middle of all others.
01288     ICOORD first_pt(box.left(), box.bottom());
01289     linepoints.Add(first_pt);
01290     for (it.forward(); !it.at_last(); it.forward()) {
01291       BLOBNBOX* blob = it.data();
01292       box = blob->bounding_box();
01293       ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
01294       linepoints.Add(box_pt);
01295       total_height += box.height();
01296       coverage += box.width();
01297       ++height_count;
01298     }
01299     box = it.data()->bounding_box();
01300     ICOORD last_pt(box.right(), box.bottom());
01301     linepoints.Add(last_pt);
01302     width = last_pt.x() - first_pt.x();
01303   }
01304   // Maximum median error allowed to be a good text line.
01305   double max_error = kMaxBaselineError * total_height / height_count;
01306   ICOORD start_pt, end_pt;
01307   double error = linepoints.Fit(&start_pt, &end_pt);
01308   return error < max_error && coverage >= kMinBaselineCoverage * width;
01309 }
01310 
01311 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
01312 // otherwise starts a new one in the appropriate column, ending the previous.
01313 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
01314                                    int resolution,
01315                                    ColPartition_LIST* used_parts,
01316                                    WorkingPartSet_LIST* working_sets) {
01317   if (block_owned_)
01318     return;  // Done it already.
01319   block_owned_ = true;
01320   WorkingPartSet_IT it(working_sets);
01321   // If there is an upper partner use its working_set_ directly.
01322   ColPartition* partner = SingletonPartner(true);
01323   if (partner != NULL && partner->working_set_ != NULL) {
01324     working_set_ = partner->working_set_;
01325     working_set_->AddPartition(this);
01326     return;
01327   }
01328   if (partner != NULL && textord_debug_bugs) {
01329     tprintf("Partition with partner has no working set!:");
01330     Print();
01331     partner->Print();
01332   }
01333   // Search for the column that the left edge fits in.
01334   WorkingPartSet* work_set = NULL;
01335   it.move_to_first();
01336   int col_index = 0;
01337   for (it.mark_cycle_pt(); !it.cycled_list() &&
01338        col_index != first_column_;
01339         it.forward(), ++col_index);
01340   if (textord_debug_tabfind >= 2) {
01341     tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
01342     Print();
01343   }
01344   if (it.cycled_list() && textord_debug_bugs) {
01345     tprintf("Target column=%d, only had %d\n", first_column_, col_index);
01346   }
01347   ASSERT_HOST(!it.cycled_list());
01348   work_set = it.data();
01349   // If last_column_ != first_column, then we need to scoop up all blocks
01350   // between here and the last_column_ and put back in work_set.
01351   if (!it.cycled_list() && last_column_ != first_column_) {
01352     // Find the column that the right edge falls in.
01353     BLOCK_LIST completed_blocks;
01354     TO_BLOCK_LIST to_blocks;
01355     for (; !it.cycled_list() && col_index <= last_column_;
01356          it.forward(), ++col_index) {
01357       WorkingPartSet* end_set = it.data();
01358       end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
01359                                       &completed_blocks, &to_blocks);
01360     }
01361     work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
01362   }
01363   working_set_ = work_set;
01364   work_set->AddPartition(this);
01365 }
01366 
01367 // From the given block_parts list, builds one or more BLOCKs and
01368 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
01369 // Created blocks are appended to the end of completed_blocks and to_blocks.
01370 // The used partitions are put onto used_parts, as they may still be referred
01371 // to in the partition grid. bleft, tright and resolution are the bounds
01372 // and resolution of the original image.
01373 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
01374                                      int resolution,
01375                                      ColPartition_LIST* block_parts,
01376                                      ColPartition_LIST* used_parts,
01377                                      BLOCK_LIST* completed_blocks,
01378                                      TO_BLOCK_LIST* to_blocks) {
01379   int page_height = tright.y() - bleft.y();
01380   // Compute the initial spacing stats.
01381   ColPartition_IT it(block_parts);
01382   int part_count = 0;
01383   int max_line_height = 0;
01384 
01385   // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
01386   // because their line spacing with their neighbors maybe smaller and their
01387   // height may be slightly larger.
01388 
01389   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01390     ColPartition* part = it.data();
01391     ASSERT_HOST(!part->boxes()->empty());
01392     STATS side_steps(0, part->bounding_box().height());
01393     if (part->bounding_box().height() > max_line_height)
01394       max_line_height = part->bounding_box().height();
01395     BLOBNBOX_C_IT blob_it(part->boxes());
01396     int prev_bottom = blob_it.data()->bounding_box().bottom();
01397     for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01398       BLOBNBOX* blob = blob_it.data();
01399       int bottom = blob->bounding_box().bottom();
01400       int step = bottom - prev_bottom;
01401       if (step < 0)
01402         step = -step;
01403       side_steps.add(step, 1);
01404       prev_bottom = bottom;
01405     }
01406     part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
01407     if (!it.at_last()) {
01408       ColPartition* next_part = it.data_relative(1);
01409       part->set_bottom_spacing(part->median_bottom() -
01410                                next_part->median_bottom());
01411       part->set_top_spacing(part->median_top() - next_part->median_top());
01412     } else {
01413       part->set_bottom_spacing(page_height);
01414       part->set_top_spacing(page_height);
01415     }
01416     if (textord_debug_tabfind) {
01417       part->Print();
01418       tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
01419               side_steps.median(), part->top_spacing(), part->bottom_spacing());
01420     }
01421     ++part_count;
01422   }
01423   if (part_count == 0)
01424     return;
01425 
01426   SmoothSpacings(resolution, page_height, block_parts);
01427 
01428   // Move the partitions into individual block lists and make the blocks.
01429   BLOCK_IT block_it(completed_blocks);
01430   TO_BLOCK_IT to_block_it(to_blocks);
01431   ColPartition_LIST spacing_parts;
01432   ColPartition_IT sp_block_it(&spacing_parts);
01433   int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
01434   for (it.mark_cycle_pt(); !it.empty();) {
01435     ColPartition* part = it.extract();
01436     sp_block_it.add_to_end(part);
01437     it.forward();
01438     if (it.empty() || part->bottom_spacing() > same_block_threshold ||
01439         !part->SpacingsEqual(*it.data(), resolution)) {
01440       // There is a spacing boundary. Check to see if it.data() belongs
01441       // better in the current block or the next one.
01442       if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
01443         ColPartition* next_part = it.data();
01444         // If there is a size match one-way, then the middle line goes with
01445         // its matched size, otherwise it goes with the smallest spacing.
01446         ColPartition* third_part = it.at_last() ? NULL : it.data_relative(1);
01447         if (textord_debug_tabfind) {
01448           tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
01449                   " sizes %d %d %d\n",
01450                   part->top_spacing(), part->bottom_spacing(),
01451                   next_part->top_spacing(), next_part->bottom_spacing(),
01452                   part->median_size(), next_part->median_size(),
01453                   third_part != NULL ? third_part->median_size() : 0);
01454         }
01455         // We can only consider adding the next line to the block if the sizes
01456         // match and the lines are close enough for their size.
01457         if (part->SizesSimilar(*next_part) &&
01458             next_part->median_size() * kMaxSameBlockLineSpacing >
01459                 part->bottom_spacing() &&
01460             part->median_size() * kMaxSameBlockLineSpacing >
01461                 part->top_spacing()) {
01462           // Even now, we can only add it as long as the third line doesn't
01463           // match in the same way and have a smaller bottom spacing.
01464           if (third_part == NULL ||
01465               !next_part->SizesSimilar(*third_part) ||
01466               third_part->median_size() * kMaxSameBlockLineSpacing <=
01467                   next_part->bottom_spacing() ||
01468               next_part->median_size() * kMaxSameBlockLineSpacing <=
01469                   next_part->top_spacing() ||
01470                   next_part->bottom_spacing() > part->bottom_spacing()) {
01471             // Add to the current block.
01472             sp_block_it.add_to_end(it.extract());
01473             it.forward();
01474             if (textord_debug_tabfind) {
01475               tprintf("Added line to current block.\n");
01476             }
01477           }
01478         }
01479       }
01480       TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
01481       if (to_block != NULL) {
01482         to_block_it.add_to_end(to_block);
01483         block_it.add_to_end(to_block->block);
01484       }
01485       sp_block_it.set_to_list(&spacing_parts);
01486     } else {
01487       if (textord_debug_tabfind && !it.empty()) {
01488         ColPartition* next_part = it.data();
01489         tprintf("Spacings equal: upper:%d/%d, lower:%d/%d\n",
01490                 part->top_spacing(), part->bottom_spacing(),
01491                 next_part->top_spacing(), next_part->bottom_spacing(),
01492                 part->median_size(), next_part->median_size());
01493       }
01494     }
01495   }
01496 }
01497 
01498 // Helper function to clip the input pos to the given bleft, tright bounds.
01499 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
01500   if (pos->x() < bleft.x())
01501     pos->set_x(bleft.x());
01502   if (pos->x() > tright.x())
01503     pos->set_x(tright.x());
01504   if (pos->y() < bleft.y())
01505     pos->set_y(bleft.y());
01506   if (pos->y() > tright.y())
01507     pos->set_y(tright.y());
01508 }
01509 
01510 // Helper moves the blobs from the given list of block_parts into the block
01511 // itself. Sets up the block for (old) textline formation correctly for
01512 // vertical and horizontal text. The partitions are moved to used_parts
01513 // afterwards, as they cannot be deleted yet.
01514 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
01515                                   BLOCK* block,
01516                                   ColPartition_LIST* block_parts,
01517                                   ColPartition_LIST* used_parts) {
01518   // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
01519   // Move all the parts to a done list as they are no longer needed, except
01520   // that have have to continue to exist until the part grid is deleted.
01521   // Compute the median blob size as we go, as the block needs to know.
01522   TBOX block_box(block->bounding_box());
01523   STATS sizes(0, MAX(block_box.width(), block_box.height()));
01524   bool text_type = block->poly_block()->IsText();
01525   ColPartition_IT it(block_parts);
01526   TO_BLOCK* to_block = new TO_BLOCK(block);
01527   BLOBNBOX_IT blob_it(&to_block->blobs);
01528   ColPartition_IT used_it(used_parts);
01529   for (it.move_to_first(); !it.empty(); it.forward()) {
01530     ColPartition* part = it.extract();
01531     // Transfer blobs from all regions to the output blocks.
01532     // Blobs for non-text regions will be used to define the polygonal
01533     // bounds of the region.
01534     for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
01535          bb_it.forward()) {
01536       BLOBNBOX* bblob = bb_it.extract();
01537       if (bblob->owner() != part) {
01538         tprintf("Ownership incorrect for blob:");
01539         bblob->bounding_box().print();
01540         tprintf("Part=");
01541         part->Print();
01542         if (bblob->owner() == NULL) {
01543           tprintf("Not owned\n");
01544         } else {
01545           tprintf("Owner part:");
01546           bblob->owner()->Print();
01547         }
01548       }
01549       ASSERT_HOST(bblob->owner() == part);
01550       // Assert failure here is caused by arbitrarily changing the partition
01551       // type without also changing the blob type, such as in
01552       // InsertSmallBlobsAsUnknowns.
01553       ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
01554       C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
01555       C_OUTLINE_IT ol_it(outlines);
01556       if (outlines->singleton()) {
01557         ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
01558         if (vertical_text)
01559           sizes.add(bblob->bounding_box().width(), 1);
01560         else
01561           sizes.add(bblob->bounding_box().height(), 1);
01562         blob_it.add_after_then_move(bblob);
01563       } else {
01564         // This blob has multiple outlines from CJK repair.
01565         // Explode the blob back into individual outlines.
01566         for (;!ol_it.empty(); ol_it.forward()) {
01567           C_OUTLINE* outline = ol_it.extract();
01568           BLOBNBOX* blob = BLOBNBOX::RealBlob(outline);
01569           if (vertical_text)
01570             sizes.add(blob->bounding_box().width(), 1);
01571           else
01572             sizes.add(blob->bounding_box().height(), 1);
01573           blob_it.add_after_then_move(blob);
01574         }
01575         delete bblob->cblob();
01576         delete bblob;
01577       }
01578     }
01579     used_it.add_to_end(part);
01580   }
01581   if (text_type && blob_it.empty()) {
01582     delete block;
01583     delete to_block;
01584     return NULL;
01585   }
01586   to_block->line_size = sizes.median();
01587   if (vertical_text) {
01588     int block_width = block->bounding_box().width();
01589     if (block_width < line_spacing)
01590       line_spacing = block_width;
01591     to_block->line_spacing = static_cast<float>(line_spacing);
01592     to_block->max_blob_size = static_cast<float>(block_width + 1);
01593   } else {
01594     int block_height = block->bounding_box().height();
01595     if (block_height < line_spacing)
01596       line_spacing = block_height;
01597     to_block->line_spacing = static_cast<float>(line_spacing);
01598     to_block->max_blob_size = static_cast<float>(block_height + 1);
01599   }
01600   return to_block;
01601 }
01602 
01603 // Constructs a block from the given list of partitions.
01604 // Arguments are as LineSpacingBlocks above.
01605 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
01606                                   ColPartition_LIST* block_parts,
01607                                   ColPartition_LIST* used_parts) {
01608   if (block_parts->empty())
01609     return NULL;  // Nothing to do.
01610   ColPartition_IT it(block_parts);
01611   ColPartition* part = it.data();
01612   PolyBlockType type = part->type();
01613   if (type == PT_VERTICAL_TEXT)
01614     return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
01615   // LineSpacingBlocks has handed us a collection of evenly spaced lines and
01616   // put the average spacing in each partition, so we can just take the
01617   // linespacing from the first partition.
01618   int line_spacing = part->bottom_spacing();
01619   if (line_spacing < part->median_size())
01620     line_spacing = part->bounding_box().height();
01621   ICOORDELT_LIST vertices;
01622   ICOORDELT_IT vert_it(&vertices);
01623   ICOORD start, end;
01624   int min_x = MAX_INT32;
01625   int max_x = -MAX_INT32;
01626   int min_y = MAX_INT32;
01627   int max_y = -MAX_INT32;
01628   int iteration = 0;
01629   do {
01630     if (iteration == 0)
01631       ColPartition::LeftEdgeRun(&it, &start, &end);
01632     else
01633       ColPartition::RightEdgeRun(&it, &start, &end);
01634     ClipCoord(bleft, tright, &start);
01635     ClipCoord(bleft, tright, &end);
01636     vert_it.add_after_then_move(new ICOORDELT(start));
01637     vert_it.add_after_then_move(new ICOORDELT(end));
01638     UpdateRange(start.x(), &min_x, &max_x);
01639     UpdateRange(end.x(), &min_x, &max_x);
01640     UpdateRange(start.y(), &min_y, &max_y);
01641     UpdateRange(end.y(), &min_y, &max_y);
01642     if ((iteration == 0 && it.at_first()) ||
01643         (iteration == 1 && it.at_last())) {
01644       ++iteration;
01645       it.move_to_last();
01646     }
01647   } while (iteration < 2);
01648   if (textord_debug_tabfind)
01649     tprintf("Making block at (%d,%d)->(%d,%d)\n",
01650             min_x, min_y, max_x, max_y);
01651   BLOCK* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
01652   block->set_poly_block(new POLY_BLOCK(&vertices, type));
01653   return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
01654 }
01655 
01656 // Constructs a block from the given list of vertical text partitions.
01657 // Currently only creates rectangular blocks.
01658 TO_BLOCK* ColPartition::MakeVerticalTextBlock(const ICOORD& bleft,
01659                                               const ICOORD& tright,
01660                                               ColPartition_LIST* block_parts,
01661                                               ColPartition_LIST* used_parts) {
01662   if (block_parts->empty())
01663     return NULL;  // Nothing to do.
01664   ColPartition_IT it(block_parts);
01665   ColPartition* part = it.data();
01666   TBOX block_box = part->bounding_box();
01667   int line_spacing = block_box.width();
01668   PolyBlockType type = it.data()->type();
01669   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01670     block_box += it.data()->bounding_box();
01671   }
01672   if (textord_debug_tabfind) {
01673     tprintf("Making block at:");
01674     block_box.print();
01675   }
01676   BLOCK* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
01677                            block_box.right(), block_box.top());
01678   block->set_poly_block(new POLY_BLOCK(block_box, type));
01679   return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
01680 }
01681 
01682 // Returns a copy of everything except the list of boxes. The resulting
01683 // ColPartition is only suitable for keeping in a column candidate list.
01684 ColPartition* ColPartition::ShallowCopy() const {
01685   ColPartition* part = new ColPartition(blob_type_, vertical_);
01686   part->left_margin_ = left_margin_;
01687   part->right_margin_ = right_margin_;
01688   part->bounding_box_ = bounding_box_;
01689   memcpy(part->special_blobs_densities_, special_blobs_densities_,
01690          sizeof(special_blobs_densities_));
01691   part->median_bottom_ = median_bottom_;
01692   part->median_top_ = median_top_;
01693   part->median_size_ = median_size_;
01694   part->median_left_ = median_left_;
01695   part->median_right_ = median_right_;
01696   part->median_width_ = median_width_;
01697   part->good_width_ = good_width_;
01698   part->good_column_ = good_column_;
01699   part->left_key_tab_ = left_key_tab_;
01700   part->right_key_tab_ = right_key_tab_;
01701   part->type_ = type_;
01702   part->flow_ = flow_;
01703   part->left_key_ = left_key_;
01704   part->right_key_ = right_key_;
01705   part->first_column_ = first_column_;
01706   part->last_column_ = last_column_;
01707   part->owns_blobs_ = false;
01708   return part;
01709 }
01710 
01711 ColPartition* ColPartition::CopyButDontOwnBlobs() {
01712   ColPartition* copy = ShallowCopy();
01713   copy->set_owns_blobs(false);
01714   BLOBNBOX_C_IT inserter(copy->boxes());
01715   BLOBNBOX_C_IT traverser(boxes());
01716   for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
01717     inserter.add_after_then_move(traverser.data());
01718   return copy;
01719 }
01720 
01721 #ifndef GRAPHICS_DISABLED
01722 // Provides a color for BBGrid to draw the rectangle.
01723 // Must be kept in sync with PolyBlockType.
01724 ScrollView::Color  ColPartition::BoxColor() const {
01725   if (type_ == PT_UNKNOWN)
01726     return BLOBNBOX::TextlineColor(blob_type_, flow_);
01727   return POLY_BLOCK::ColorForPolyBlockType(type_);
01728 }
01729 #endif  // GRAPHICS_DISABLED
01730 
01731 // Keep in sync with BlobRegionType.
01732 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
01733 
01734 // Prints debug information on this.
01735 void ColPartition::Print() const {
01736   int y = MidY();
01737   tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
01738           " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
01739           " ts=%d bs=%d ls=%d rs=%d\n",
01740           boxes_.empty() ? 'E' : ' ',
01741           left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
01742           bounding_box_.left(), median_left_,
01743           bounding_box_.bottom(), median_bottom_,
01744           bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
01745           right_margin_, median_right_, bounding_box_.top(), median_top_,
01746           good_width_, good_column_, type_,
01747           kBlobTypes[blob_type_], flow_,
01748           first_column_, last_column_, boxes_.length(),
01749           space_above_, space_below_, space_to_left_, space_to_right_);
01750 }
01751 
01752 // Prints debug information on the colors.
01753 void ColPartition::PrintColors() {
01754   tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
01755           color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
01756           color1_[L_ALPHA_CHANNEL],
01757           color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
01758 }
01759 
01760 // Sets the types of all partitions in the run to be the max of the types.
01761 void ColPartition::SmoothPartnerRun(int working_set_count) {
01762   STATS left_stats(0, working_set_count);
01763   STATS right_stats(0, working_set_count);
01764   PolyBlockType max_type = type_;
01765   ColPartition* partner;
01766   for (partner = SingletonPartner(false); partner != NULL;
01767        partner = partner->SingletonPartner(false)) {
01768     if (partner->type_ > max_type)
01769       max_type = partner->type_;
01770     if (column_set_ == partner->column_set_) {
01771       left_stats.add(partner->first_column_, 1);
01772       right_stats.add(partner->last_column_, 1);
01773     }
01774   }
01775   type_ = max_type;
01776   // TODO(rays) Either establish that it isn't necessary to set the columns,
01777   // or find a way to do it that does not cause an assert failure in
01778   // AddToWorkingSet.
01779 #if 0
01780   first_column_ = left_stats.mode();
01781   last_column_ = right_stats.mode();
01782   if (last_column_ < first_column_)
01783     last_column_ = first_column_;
01784 #endif
01785 
01786   for (partner = SingletonPartner(false); partner != NULL;
01787        partner = partner->SingletonPartner(false)) {
01788     partner->type_ = max_type;
01789 #if 0  // See TODO above
01790     if (column_set_ == partner->column_set_) {
01791       partner->first_column_ = first_column_;
01792       partner->last_column_ = last_column_;
01793     }
01794 #endif
01795   }
01796 }
01797 
01798 // ======= Scenario common to all Refine*Partners* functions =======
01799 // ColPartitions are aiming to represent textlines, or horizontal slices
01800 // of images, and we are trying to form bi-directional (upper/lower) chains
01801 // of UNIQUE partner ColPartitions that can be made into blocks.
01802 // The ColPartitions have previously been typed (see SetPartitionType)
01803 // according to a combination of the content type and
01804 // how they lie on the columns. We want to chain text into
01805 // groups of a single type, but image ColPartitions may have been typed
01806 // differently in different parts of the image, due to being non-rectangular.
01807 //
01808 // We previously ran a search for upper and lower partners, but there may
01809 // be more than one, and they may be of mixed types, so now we wish to
01810 // refine the partners down to at most one.
01811 // A heading may have multiple partners:
01812 // ===============================
01813 // ========  ==========  =========
01814 // ========  ==========  =========
01815 // but it should be a different type.
01816 // A regular flowing text line may have multiple partners:
01817 // ==================   ===================
01818 // =======   =================  ===========
01819 // This could be the start of a pull-out, or it might all be in a single
01820 // column and might be caused by tightly spaced text, bold words, bullets,
01821 // funny punctuation etc, all of which can cause textlines to be split into
01822 // multiple ColPartitions. Pullouts and figure captions should now be different
01823 // types so we can more aggressively merge groups of partners that all sit
01824 // in a single column.
01825 //
01826 // Cleans up the partners of the given type so that there is at most
01827 // one partner. This makes block creation simpler.
01828 // If get_desperate is true, goes to more desperate merge methods
01829 // to merge flowing text before breaking partnerships.
01830 void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
01831                                   ColPartitionGrid* grid) {
01832   if (TypesSimilar(type_, type)) {
01833     RefinePartnersInternal(true, get_desperate, grid);
01834     RefinePartnersInternal(false, get_desperate, grid);
01835   } else if (type == PT_COUNT) {
01836     // This is the final pass. Make sure only the correctly typed
01837     // partners surivive, however many there are.
01838     RefinePartnersByType(true, &upper_partners_);
01839     RefinePartnersByType(false, &lower_partners_);
01840     // It is possible for a merge to have given a partition multiple
01841     // partners again, so the last resort is to use overlap which is
01842     // guaranteed to leave at most one partner left.
01843     if (!upper_partners_.empty() && !upper_partners_.singleton())
01844       RefinePartnersByOverlap(true, &upper_partners_);
01845     if (!lower_partners_.empty() && !lower_partners_.singleton())
01846       RefinePartnersByOverlap(false, &lower_partners_);
01847   }
01848 }
01849 
01851 
01852 // Cleans up the partners above if upper is true, else below.
01853 // If get_desperate is true, goes to more desperate merge methods
01854 // to merge flowing text before breaking partnerships.
01855 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
01856                                           ColPartitionGrid* grid) {
01857   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
01858   if (!partners->empty() && !partners->singleton()) {
01859     RefinePartnersByType(upper, partners);
01860     if (!partners->empty() && !partners->singleton()) {
01861       // Check for transitive partnerships and break the cycle.
01862       RefinePartnerShortcuts(upper, partners);
01863       if (!partners->empty() && !partners->singleton()) {
01864         // Types didn't fix it. Flowing text keeps the one with the longest
01865         // sequence of singleton matching partners. All others max overlap.
01866         if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
01867           RefineTextPartnersByMerge(upper, false, partners, grid);
01868           if (!partners->empty() && !partners->singleton())
01869             RefineTextPartnersByMerge(upper, true, partners, grid);
01870         }
01871         // The last resort is to use overlap.
01872         if (!partners->empty() && !partners->singleton())
01873           RefinePartnersByOverlap(upper, partners);
01874       }
01875     }
01876   }
01877 }
01878 
01879 // Cleans up the partners above if upper is true, else below.
01880 // Restricts the partners to only desirable types. For text and BRT_HLINE this
01881 // means the same type_ , and for image types it means any image type.
01882 void ColPartition::RefinePartnersByType(bool upper,
01883                                         ColPartition_CLIST* partners) {
01884   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
01885                                          bounding_box_.bottom());
01886   if (debug) {
01887     tprintf("Refining %d %s partners by type for:\n",
01888             partners->length(), upper ? "Upper" : "Lower");
01889     Print();
01890   }
01891   ColPartition_C_IT it(partners);
01892   // Purify text by type.
01893   if (!IsImageType()) {
01894     // Keep only partners matching type_.
01895     // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
01896     // text types if it is the only partner.
01897     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01898       ColPartition* partner = it.data();
01899       if (!TypesSimilar(type_, partner->type_)) {
01900         if (debug) {
01901           tprintf("Removing partner:");
01902           partner->Print();
01903         }
01904         partner->RemovePartner(!upper, this);
01905         it.extract();
01906       } else if (debug) {
01907         tprintf("Keeping partner:");
01908         partner->Print();
01909       }
01910     }
01911   } else {
01912     // Only polyimages are allowed to have partners of any kind!
01913     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01914       ColPartition* partner = it.data();
01915       if (partner->blob_type() != BRT_POLYIMAGE ||
01916           blob_type() != BRT_POLYIMAGE) {
01917         if (debug) {
01918           tprintf("Removing partner:");
01919           partner->Print();
01920         }
01921         partner->RemovePartner(!upper, this);
01922         it.extract();
01923       } else if (debug) {
01924         tprintf("Keeping partner:");
01925         partner->Print();
01926       }
01927     }
01928   }
01929 }
01930 
01931 // Cleans up the partners above if upper is true, else below.
01932 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
01933 // Gets rid of this<->b, leaving a clean chain.
01934 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
01935 // this has multiple partners.
01936 void ColPartition::RefinePartnerShortcuts(bool upper,
01937                                           ColPartition_CLIST* partners) {
01938   bool done_any = false;
01939   do {
01940     done_any = false;
01941     ColPartition_C_IT it(partners);
01942     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01943       ColPartition* a = it.data();
01944       // Check for a match between all of a's partners (it1/b1) and all
01945       // of this's partners (it2/b2).
01946       ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
01947       for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
01948         ColPartition* b1 = it1.data();
01949         if (b1 == this) {
01950           done_any = true;
01951           it.extract();
01952           a->RemovePartner(!upper, this);
01953           break;
01954         }
01955         ColPartition_C_IT it2(partners);
01956         for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
01957           ColPartition* b2 = it2.data();
01958           if (b1 == b2) {
01959             // Jackpot! b2 should not be a partner of this.
01960             it2.extract();
01961             b2->RemovePartner(!upper, this);
01962             done_any = true;
01963             // That potentially invalidated all the iterators, so break out
01964             // and start again.
01965             break;
01966           }
01967         }
01968         if (done_any)
01969           break;
01970       }
01971       if (done_any)
01972         break;
01973     }
01974   } while (done_any && !partners->empty() && !partners->singleton());
01975 }
01976 
01977 // Cleans up the partners above if upper is true, else below.
01978 // If multiple text partners can be merged, (with each other, NOT with this),
01979 // then do so.
01980 // If desperate is true, then an increase in overlap with the merge is
01981 // allowed. If the overlap increases, then the desperately_merged_ flag
01982 // is set, indicating that the textlines probably need to be regenerated
01983 // by aggressive line fitting/splitting, as there are probably vertically
01984 // joined blobs that cross textlines.
01985 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
01986                                              ColPartition_CLIST* partners,
01987                                              ColPartitionGrid* grid) {
01988   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
01989                                          bounding_box_.bottom());
01990   if (debug) {
01991     tprintf("Refining %d %s partners by merge for:\n",
01992             partners->length(), upper ? "Upper" : "Lower");
01993     Print();
01994   }
01995   while (!partners->empty() && !partners->singleton()) {
01996     // Absorb will mess up the iterators, so we have to merge one partition
01997     // at a time and rebuild the iterators each time.
01998     ColPartition_C_IT it(partners);
01999     ColPartition* part = it.data();
02000     // Gather a list of merge candidates, from the list of partners, that
02001     // are all in the same single column. See general scenario comment above.
02002     ColPartition_CLIST candidates;
02003     ColPartition_C_IT cand_it(&candidates);
02004     for (it.forward(); !it.at_first(); it.forward()) {
02005       ColPartition* candidate = it.data();
02006       if (part->first_column_ == candidate->last_column_ &&
02007           part->last_column_ == candidate->first_column_)
02008         cand_it.add_after_then_move(it.data());
02009     }
02010     int overlap_increase;
02011     ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
02012                                                        NULL, &overlap_increase);
02013     if (candidate != NULL && (overlap_increase <= 0 || desperate)) {
02014       if (debug) {
02015         tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
02016                 part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
02017                 overlap_increase);
02018       }
02019       // Remove before merge and re-insert to keep the integrity of the grid.
02020       grid->RemoveBBox(candidate);
02021       grid->RemoveBBox(part);
02022       part->Absorb(candidate, NULL);
02023       // We modified the box of part, so re-insert it into the grid.
02024       grid->InsertBBox(true, true, part);
02025       if (overlap_increase > 0)
02026         part->desperately_merged_ = true;
02027     } else {
02028       break;  // Can't merge.
02029     }
02030   }
02031 }
02032 
02033 // Cleans up the partners above if upper is true, else below.
02034 // Keep the partner with the biggest overlap.
02035 void ColPartition::RefinePartnersByOverlap(bool upper,
02036                                            ColPartition_CLIST* partners) {
02037   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
02038                                          bounding_box_.bottom());
02039   if (debug) {
02040     tprintf("Refining %d %s partners by overlap for:\n",
02041             partners->length(), upper ? "Upper" : "Lower");
02042     Print();
02043   }
02044   ColPartition_C_IT it(partners);
02045   ColPartition* best_partner = it.data();
02046   // Find the partner with the best overlap.
02047   int best_overlap = 0;
02048   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02049     ColPartition* partner = it.data();
02050     int overlap = MIN(bounding_box_.right(), partner->bounding_box_.right())
02051                 - MAX(bounding_box_.left(), partner->bounding_box_.left());
02052     if (overlap > best_overlap) {
02053       best_overlap = overlap;
02054       best_partner = partner;
02055     }
02056   }
02057   // Keep only the best partner.
02058   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02059     ColPartition* partner = it.data();
02060     if (partner != best_partner) {
02061       if (debug) {
02062         tprintf("Removing partner:");
02063         partner->Print();
02064       }
02065       partner->RemovePartner(!upper, this);
02066       it.extract();
02067     }
02068   }
02069 }
02070 
02071 // Return true if bbox belongs better in this than other.
02072 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
02073                                        const ColPartition& other) {
02074   TBOX box = bbox->bounding_box();
02075   // Margins take priority.
02076   int left = box.left();
02077   int right = box.right();
02078   if (left < left_margin_ || right > right_margin_)
02079     return false;
02080   if (left < other.left_margin_ || right > other.right_margin_)
02081     return true;
02082   int top = box.top();
02083   int bottom = box.bottom();
02084   int this_overlap = MIN(top, median_top_) - MAX(bottom, median_bottom_);
02085   int other_overlap = MIN(top, other.median_top_) -
02086                       MAX(bottom, other.median_bottom_);
02087   int this_miss = median_top_ - median_bottom_ - this_overlap;
02088   int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
02089   if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
02090     tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
02091             box.left(), box.bottom(), box.right(), box.top(),
02092             this_overlap, other_overlap, this_miss, other_miss,
02093             median_top_, other.median_top_);
02094   }
02095   if (this_miss < other_miss)
02096     return true;
02097   if (this_miss > other_miss)
02098     return false;
02099   if (this_overlap > other_overlap)
02100     return true;
02101   if (this_overlap < other_overlap)
02102     return false;
02103   return median_top_ >= other.median_top_;
02104 }
02105 
02106 // Returns the median line-spacing between the current position and the end
02107 // of the list.
02108 // The iterator is passed by value so the iteration does not modify the
02109 // caller's iterator.
02110 static int MedianSpacing(int page_height, ColPartition_IT it) {
02111   STATS stats(0, page_height);
02112   while (!it.cycled_list()) {
02113     ColPartition* part = it.data();
02114     it.forward();
02115     stats.add(part->bottom_spacing(), 1);
02116     stats.add(part->top_spacing(), 1);
02117   }
02118   return static_cast<int>(stats.median() + 0.5);
02119 }
02120 
02121 // Returns true if this column partition is in the same column as
02122 // part. This function will only work after the SetPartitionType function
02123 // has been called on both column partitions. This is useful for
02124 // doing a SideSearch when you want things in the same page column.
02125 //
02126 // Currently called by the table detection code to identify if potential table
02127 // partitions exist in the same column.
02128 bool ColPartition::IsInSameColumnAs(const ColPartition& part) const {
02129   // Overlap does not occur when last < part.first or first > part.last.
02130   // In other words, one is completely to the side of the other.
02131   // This is just DeMorgan's law applied to that so the function returns true.
02132   return (last_column_ >= part.first_column_) &&
02133          (first_column_ <= part.last_column_);
02134 }
02135 
02136 // Smoothes the spacings in the list into groups of equal linespacing.
02137 // resolution is the resolution of the original image, used as a basis
02138 // for thresholds in change of spacing. page_height is in pixels.
02139 void ColPartition::SmoothSpacings(int resolution, int page_height,
02140                                   ColPartition_LIST* parts) {
02141   // The task would be trivial if we didn't have to allow for blips -
02142   // occasional offsets in spacing caused by anomolous text, such as all
02143   // caps, groups of descenders, joined words, Arabic etc.
02144   // The neighbourhood stores a consecutive group of partitions so that
02145   // blips can be detected correctly, yet conservatively enough to not
02146   // mistake genuine spacing changes for blips. See example below.
02147   ColPartition* neighbourhood[PN_COUNT];
02148   ColPartition_IT it(parts);
02149   it.mark_cycle_pt();
02150   // Although we know nothing about the spacings is this list, the median is
02151   // used as an approximation to allow blips.
02152   // If parts of this block aren't spaced to the median, then we can't
02153   // accept blips in those parts, but we'll recalculate it each time we
02154   // split the block, so the median becomes more likely to match all the text.
02155   int median_space = MedianSpacing(page_height, it);
02156   ColPartition_IT start_it(it);
02157   ColPartition_IT end_it(it);
02158   for (int i = 0; i < PN_COUNT; ++i) {
02159     if (i < PN_UPPER || it.cycled_list()) {
02160       neighbourhood[i] = NULL;
02161     } else {
02162       if (i == PN_LOWER)
02163         end_it = it;
02164       neighbourhood[i] = it.data();
02165       it.forward();
02166     }
02167   }
02168   while (neighbourhood[PN_UPPER] != NULL) {
02169     // Test for end of a group. Normally SpacingsEqual is true within a group,
02170     // but in the case of a blip, it will be false. Here is an example:
02171     // Line enum   Spacing below (spacing between tops of lines)
02172     //  1   ABOVE2    20
02173     //  2   ABOVE1    20
02174     //  3   UPPER     15
02175     //  4   LOWER     25
02176     //  5   BELOW1    20
02177     //  6   BELOW2    20
02178     // Line 4 is all in caps (regular caps), so the spacing between line 3
02179     // and line 4 (looking at the tops) is smaller than normal, and the
02180     // spacing between line 4 and line 5 is larger than normal, but the
02181     // two of them add to twice the normal spacing.
02182     // The following if has to accept unequal spacings 3 times to pass the
02183     // blip (20/15, 15/25 and 25/20)
02184     // When the blip is in the middle, OKSpacingBlip tests that one of
02185     // ABOVE1 and BELOW1 matches the median.
02186     // The first time, everything is shifted down 1, so we present
02187     // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
02188     // The last time, everything is shifted up 1, so we present OKSpacingBlip
02189     // with neighbourhood-1 and check that PN_LOWER matches the median.
02190     if (neighbourhood[PN_LOWER] == NULL ||
02191         (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
02192                                                  resolution) &&
02193          !OKSpacingBlip(resolution, median_space, neighbourhood) &&
02194          (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
02195           !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
02196          (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
02197           !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
02198       // The group has ended. PN_UPPER is the last member.
02199       // Compute the mean spacing over the group.
02200       ColPartition_IT sum_it(start_it);
02201       ColPartition* last_part = neighbourhood[PN_UPPER];
02202       double total_bottom = 0.0;
02203       double total_top = 0.0;
02204       int total_count = 0;
02205       ColPartition* upper = sum_it.data();
02206       // We do not process last_part, as its spacing is different.
02207       while (upper != last_part) {
02208         total_bottom += upper->bottom_spacing();
02209         total_top += upper->top_spacing();
02210         ++total_count;
02211         sum_it.forward();
02212         upper = sum_it.data();
02213       }
02214       if (total_count > 0) {
02215         // There were at least 2 lines, so set them all to the mean.
02216         int top_spacing = static_cast<int>(total_top / total_count + 0.5);
02217         int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
02218         if (textord_debug_tabfind) {
02219           tprintf("Spacing run ended. Cause:");
02220           if (neighbourhood[PN_LOWER] == NULL) {
02221             tprintf("No more lines\n");
02222           } else {
02223             tprintf("Spacing change. Spacings:\n");
02224             for (int i = 0; i < PN_COUNT; ++i) {
02225               if (neighbourhood[i] == NULL) {
02226                 tprintf("NULL");
02227                 if (i > 0 && neighbourhood[i - 1] != NULL) {
02228                   if (neighbourhood[i - 1]->SingletonPartner(false) != NULL) {
02229                     tprintf(" Lower partner:");
02230                     neighbourhood[i - 1]->SingletonPartner(false)->Print();
02231                   } else {
02232                     tprintf(" NULL lower partner:\n");
02233                   }
02234                 } else {
02235                   tprintf("\n");
02236                 }
02237               } else {
02238                 tprintf("Top = %d, bottom = %d\n",
02239                         neighbourhood[i]->top_spacing(),
02240                         neighbourhood[i]->bottom_spacing());
02241               }
02242             }
02243           }
02244           tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
02245         }
02246         sum_it = start_it;
02247         upper = sum_it.data();
02248         while (upper != last_part) {
02249           upper->set_top_spacing(top_spacing);
02250           upper->set_bottom_spacing(bottom_spacing);
02251           if (textord_debug_tabfind) {
02252             tprintf("Setting mean on:");
02253             upper->Print();
02254           }
02255           sum_it.forward();
02256           upper = sum_it.data();
02257         }
02258       }
02259       // PN_LOWER starts the next group and end_it is the next start_it.
02260       start_it = end_it;
02261       // Recalculate the median spacing to maximize the chances of detecting
02262       // spacing blips.
02263       median_space = MedianSpacing(page_height, end_it);
02264     }
02265     // Shuffle pointers.
02266     for (int j = 1; j < PN_COUNT; ++j) {
02267       neighbourhood[j - 1] = neighbourhood[j];
02268     }
02269     if (it.cycled_list()) {
02270       neighbourhood[PN_COUNT - 1] = NULL;
02271     } else {
02272       neighbourhood[PN_COUNT - 1] = it.data();
02273       it.forward();
02274     }
02275     end_it.forward();
02276   }
02277 }
02278 
02279 // Returns true if the parts array of pointers to partitions matches the
02280 // condition for a spacing blip. See SmoothSpacings for what this means
02281 // and how it is used.
02282 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
02283                                  ColPartition** parts) {
02284   if (parts[PN_UPPER] == NULL || parts[PN_LOWER] == NULL)
02285     return false;
02286   // The blip is OK if upper and lower sum to an OK value and at least
02287   // one of above1 and below1 is equal to the median.
02288   return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
02289                                           median_spacing, resolution) &&
02290          ((parts[PN_ABOVE1] != NULL &&
02291            parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
02292           (parts[PN_BELOW1] != NULL &&
02293            parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
02294 }
02295 
02296 // Returns true if both the top and bottom spacings of this match the given
02297 // spacing to within suitable margins dictated by the image resolution.
02298 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
02299   int bottom_error = BottomSpacingMargin(resolution);
02300   int top_error = TopSpacingMargin(resolution);
02301   return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
02302          NearlyEqual(top_spacing_, spacing, top_error);
02303 }
02304 
02305 // Returns true if both the top and bottom spacings of this and other
02306 // match to within suitable margins dictated by the image resolution.
02307 bool ColPartition::SpacingsEqual(const ColPartition& other,
02308                                  int resolution) const {
02309   int bottom_error = MAX(BottomSpacingMargin(resolution),
02310                          other.BottomSpacingMargin(resolution));
02311   int top_error = MAX(TopSpacingMargin(resolution),
02312                       other.TopSpacingMargin(resolution));
02313   return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
02314          (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
02315           NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
02316                       bottom_error));
02317 }
02318 
02319 // Returns true if the sum spacing of this and other match the given
02320 // spacing (or twice the given spacing) to within a suitable margin dictated
02321 // by the image resolution.
02322 bool ColPartition::SummedSpacingOK(const ColPartition& other,
02323                                    int spacing, int resolution) const {
02324   int bottom_error = MAX(BottomSpacingMargin(resolution),
02325                          other.BottomSpacingMargin(resolution));
02326   int top_error = MAX(TopSpacingMargin(resolution),
02327                       other.TopSpacingMargin(resolution));
02328   int bottom_total = bottom_spacing_ + other.bottom_spacing_;
02329   int top_total = top_spacing_ + other.top_spacing_;
02330   return (NearlyEqual(spacing, bottom_total, bottom_error) &&
02331           NearlyEqual(spacing, top_total, top_error)) ||
02332          (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
02333           NearlyEqual(spacing * 2, top_total, top_error));
02334 }
02335 
02336 // Returns a suitable spacing margin that can be applied to bottoms of
02337 // text lines, based on the resolution and the stored side_step_.
02338 int ColPartition::BottomSpacingMargin(int resolution) const {
02339   return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
02340 }
02341 
02342 // Returns a suitable spacing margin that can be applied to tops of
02343 // text lines, based on the resolution and the stored side_step_.
02344 int ColPartition::TopSpacingMargin(int resolution) const {
02345   return static_cast<int>(kMaxTopSpacingFraction * median_size_ + 0.5) +
02346          BottomSpacingMargin(resolution);
02347 }
02348 
02349 // Returns true if the median text sizes of this and other agree to within
02350 // a reasonable multiplicative factor.
02351 bool ColPartition::SizesSimilar(const ColPartition& other) const {
02352   return median_size_ <= other.median_size_ * kMaxSizeRatio &&
02353          other.median_size_ <= median_size_ * kMaxSizeRatio;
02354 }
02355 
02356 // Helper updates margin_left and margin_right, being the bounds of the left
02357 // margin of part of a block. Returns false and does not update the bounds if
02358 // this partition has a disjoint margin with the established margin.
02359 static bool UpdateLeftMargin(const ColPartition& part,
02360                              int* margin_left, int* margin_right) {
02361   const TBOX& part_box = part.bounding_box();
02362   int top = part_box.top();
02363   int bottom = part_box.bottom();
02364   int tl_key = part.SortKey(part.left_margin(), top);
02365   int tr_key = part.SortKey(part_box.left(), top);
02366   int bl_key = part.SortKey(part.left_margin(), bottom);
02367   int br_key = part.SortKey(part_box.left(), bottom);
02368   int left_key = MAX(tl_key, bl_key);
02369   int right_key = MIN(tr_key, br_key);
02370   if (left_key <= *margin_right && right_key >= *margin_left) {
02371     // This part is good - let's keep it.
02372     *margin_right = MIN(*margin_right, right_key);
02373     *margin_left = MAX(*margin_left, left_key);
02374     return true;
02375   }
02376   return false;
02377 }
02378 
02379 // Computes and returns in start, end a line segment formed from a
02380 // forwards-iterated group of left edges of partitions that satisfy the
02381 // condition that the intersection of the left margins is non-empty, ie the
02382 // rightmost left margin is to the left of the leftmost left bounding box edge.
02383 // On return the iterator is set to the start of the next run.
02384 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
02385                                ICOORD* start, ICOORD* end) {
02386   ColPartition* part = part_it->data();
02387   ColPartition* start_part = part;
02388   int start_y = part->bounding_box_.top();
02389   if (!part_it->at_first()) {
02390     int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
02391     if (prev_bottom < start_y)
02392       start_y = prev_bottom;
02393     else if (prev_bottom > start_y)
02394       start_y = (start_y + prev_bottom) / 2;
02395   }
02396   int end_y = part->bounding_box_.bottom();
02397   int margin_right = MAX_INT32;
02398   int margin_left = -MAX_INT32;
02399   UpdateLeftMargin(*part, &margin_left, &margin_right);
02400   do {
02401     part_it->forward();
02402     part = part_it->data();
02403   } while (!part_it->at_first() &&
02404            UpdateLeftMargin(*part, &margin_left, &margin_right));
02405   // The run ended. If we were pushed inwards, compute the next run and
02406   // extend it backwards into the run we just calculated to find the end of
02407   // this run that provides a tight box.
02408   int next_margin_right = MAX_INT32;
02409   int next_margin_left = -MAX_INT32;
02410   UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
02411   if (next_margin_left > margin_right) {
02412     ColPartition_IT next_it(*part_it);
02413     do {
02414       next_it.forward();
02415       part = next_it.data();
02416     } while (!next_it.at_first() &&
02417              UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
02418     // Now extend the next run backwards into the original run to get the
02419     // tightest fit.
02420     do {
02421       part_it->backward();
02422       part = part_it->data();
02423     } while (part != start_part &&
02424              UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
02425     part_it->forward();
02426   }
02427   // Now calculate the end_y.
02428   part = part_it->data_relative(-1);
02429   end_y = part->bounding_box_.bottom();
02430   if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
02431     end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
02432   start->set_y(start_y);
02433   start->set_x(part->XAtY(margin_right, start_y));
02434   end->set_y(end_y);
02435   end->set_x(part->XAtY(margin_right, end_y));
02436   if (textord_debug_tabfind && !part_it->at_first())
02437     tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02438             start_y, end_y, part->XAtY(margin_left, end_y),
02439             end->x(), part->left_margin_, part->bounding_box_.left());
02440 }
02441 
02442 // Helper updates margin_left and margin_right, being the bounds of the right
02443 // margin of part of a block. Returns false and does not update the bounds if
02444 // this partition has a disjoint margin with the established margin.
02445 static bool UpdateRightMargin(const ColPartition& part,
02446                               int* margin_left, int* margin_right) {
02447   const TBOX& part_box = part.bounding_box();
02448   int top = part_box.top();
02449   int bottom = part_box.bottom();
02450   int tl_key = part.SortKey(part_box.right(), top);
02451   int tr_key = part.SortKey(part.right_margin(), top);
02452   int bl_key = part.SortKey(part_box.right(), bottom);
02453   int br_key = part.SortKey(part.right_margin(), bottom);
02454   int left_key = MAX(tl_key, bl_key);
02455   int right_key = MIN(tr_key, br_key);
02456   if (left_key <= *margin_right && right_key >= *margin_left) {
02457     // This part is good - let's keep it.
02458     *margin_right = MIN(*margin_right, right_key);
02459     *margin_left = MAX(*margin_left, left_key);
02460     return true;
02461   }
02462   return false;
02463 }
02464 
02465 // Computes and returns in start, end a line segment formed from a
02466 // backwards-iterated group of right edges of partitions that satisfy the
02467 // condition that the intersection of the right margins is non-empty, ie the
02468 // leftmost right margin is to the right of the rightmost right bounding box
02469 // edge.
02470 // On return the iterator is set to the start of the next run.
02471 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
02472                                 ICOORD* start, ICOORD* end) {
02473   ColPartition* part = part_it->data();
02474   ColPartition* start_part = part;
02475   int start_y = part->bounding_box_.bottom();
02476   if (!part_it->at_last()) {
02477     int next_y = part_it->data_relative(1)->bounding_box_.top();
02478     if (next_y > start_y)
02479       start_y = next_y;
02480     else if (next_y < start_y)
02481       start_y = (start_y + next_y) / 2;
02482   }
02483   int end_y = part->bounding_box_.top();
02484   int margin_right = MAX_INT32;
02485   int margin_left = -MAX_INT32;
02486   UpdateRightMargin(*part, &margin_left, &margin_right);
02487   do {
02488     part_it->backward();
02489     part = part_it->data();
02490   } while (!part_it->at_last() &&
02491            UpdateRightMargin(*part, &margin_left, &margin_right));
02492   // The run ended. If we were pushed inwards, compute the next run and
02493   // extend it backwards to find the end of this run for a tight box.
02494   int next_margin_right = MAX_INT32;
02495   int next_margin_left = -MAX_INT32;
02496   UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
02497   if (next_margin_right < margin_left) {
02498     ColPartition_IT next_it(*part_it);
02499     do {
02500       next_it.backward();
02501       part = next_it.data();
02502     } while (!next_it.at_last() &&
02503              UpdateRightMargin(*part, &next_margin_left,
02504                                &next_margin_right));
02505     // Now extend the next run forwards into the original run to get the
02506     // tightest fit.
02507     do {
02508       part_it->forward();
02509       part = part_it->data();
02510     } while (part != start_part &&
02511              UpdateRightMargin(*part, &next_margin_left,
02512                                &next_margin_right));
02513     part_it->backward();
02514   }
02515   // Now calculate the end_y.
02516   part = part_it->data_relative(1);
02517   end_y = part->bounding_box().top();
02518   if (!part_it->at_last() &&
02519       part_it->data()->bounding_box_.bottom() > end_y)
02520     end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
02521   start->set_y(start_y);
02522   start->set_x(part->XAtY(margin_left, start_y));
02523   end->set_y(end_y);
02524   end->set_x(part->XAtY(margin_left, end_y));
02525   if (textord_debug_tabfind && !part_it->at_last())
02526     tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02527             start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
02528             part->bounding_box_.right(), part->right_margin_);
02529 }
02530 
02531 }  // namespace tesseract.