Tesseract  3.02
tesseract-ocr/textord/colpartitiongrid.cpp
Go to the documentation of this file.
00001 
00002 // File:        colpartitionrid.h
00003 // Description: Class collecting code that acts on a BBGrid of ColPartitions.
00004 // Author:      Ray Smith
00005 // Created:     Mon Oct 05 08:42:01 PDT 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "colpartitiongrid.h"
00021 #include "colpartitionset.h"
00022 #include "imagefind.h"
00023 
00024 namespace tesseract {
00025 
00026 BOOL_VAR(textord_tabfind_show_color_fit, false, "Show stroke widths");
00027 
00028 // Max pad factor used to search the neighbourhood of a partition to smooth
00029 // partition types.
00030 const int kMaxPadFactor = 6;
00031 // Max multiple of size (min(height, width)) for the distance of the nearest
00032 // neighbour for the change of type to be used.
00033 const int kMaxNeighbourDistFactor = 4;
00034 // Max RMS color noise to compare colors.
00035 const int kMaxRMSColorNoise = 128;
00036 // Minimum number of blobs in text to make a strong text partition.
00037 const int kHorzStrongTextlineCount = 10;
00038 // Maximum number of lines in a credible figure caption.
00039 const int kMaxCaptionLines = 7;
00040 // Min ratio between biggest and smallest gap to bound a caption.
00041 const double kMinCaptionGapRatio = 2.0;
00042 // Min ratio between biggest gap and mean line height to bound a caption.
00043 const double kMinCaptionGapHeightRatio = 0.5;
00044 // Min fraction of ColPartition height to be overlapping for margin purposes.
00045 const double kMarginOverlapFraction = 0.25;
00046 // Size ratio required to consider an unmerged overlapping partition to be big.
00047 const double kBigPartSizeRatio = 1.75;
00048 // Allowed proportional change in stroke width to match for smoothing.
00049 const double kStrokeWidthFractionTolerance = 0.25;
00050 // Allowed constant change in stroke width to match for smoothing.
00051 const double kStrokeWidthConstantTolerance = 2.0;
00052 // Fraction of gridsize to allow arbitrary overlap between partitions.
00053 const double kTinyEnoughTextlineOverlapFraction = 0.25;
00054 // Max vertical distance of neighbouring ColPartition as a multiple of
00055 // partition height for it to be a partner.
00056 // TODO(rays) fix the problem that causes a larger number to not work well.
00057 // The value needs to be larger as sparse text blocks in a page that gets
00058 // marked as single column will not find adjacent lines as partners, and
00059 // will merge horizontally distant, but aligned lines. See rep.4B3 p5.
00060 // The value needs to be small because double-spaced legal docs written
00061 // in a single column, but justified courier have widely spaced lines
00062 // that need to get merged before they partner-up with the lines above
00063 // and below. See legal.3B5 p13/17. Neither of these should depend on
00064 // the value of kMaxPartitionSpacing to be successful, and ColPartition
00065 // merging needs attention to fix this problem.
00066 const double kMaxPartitionSpacing = 1.75;
00067 // Margin by which text has to beat image or vice-versa to make a firm
00068 // decision in GridSmoothNeighbour.
00069 const int kSmoothDecisionMargin = 4;
00070 
00071 ColPartitionGrid::ColPartitionGrid() {
00072 }
00073 ColPartitionGrid::ColPartitionGrid(int gridsize,
00074                                    const ICOORD& bleft, const ICOORD& tright)
00075   : BBGrid<ColPartition, ColPartition_CLIST, ColPartition_C_IT>(gridsize,
00076                                                                 bleft, tright) {
00077 }
00078 
00079 ColPartitionGrid::~ColPartitionGrid() {
00080 }
00081 
00082 // Handles a click event in a display window.
00083 void ColPartitionGrid::HandleClick(int x, int y) {
00084   BBGrid<ColPartition,
00085          ColPartition_CLIST, ColPartition_C_IT>::HandleClick(x, y);
00086   // Run a radial search for partitions that overlap.
00087   ColPartitionGridSearch radsearch(this);
00088   radsearch.SetUniqueMode(true);
00089   radsearch.StartRadSearch(x, y, 1);
00090   ColPartition* neighbour;
00091   FCOORD click(x, y);
00092   while ((neighbour = radsearch.NextRadSearch()) != NULL) {
00093     TBOX nbox = neighbour->bounding_box();
00094     if (nbox.contains(click)) {
00095       tprintf("Block box:");
00096       neighbour->bounding_box().print();
00097       neighbour->Print();
00098     }
00099   }
00100 }
00101 
00102 // Merges ColPartitions in the grid that look like they belong in the same
00103 // textline.
00104 // For all partitions in the grid, calls the box_cb permanent callback
00105 // to compute the search box, seaches the box, and if a candidate is found,
00106 // calls the confirm_cb to check any more rules. If the confirm_cb returns
00107 // true, then the partitions are merged.
00108 // Both callbacks are deleted before returning.
00109 void ColPartitionGrid::Merges(
00110     TessResultCallback2<bool, ColPartition*, TBOX*>* box_cb,
00111     TessResultCallback2<bool, const ColPartition*,
00112                         const ColPartition*>* confirm_cb) {
00113   // Iterate the ColPartitions in the grid.
00114   ColPartitionGridSearch gsearch(this);
00115   gsearch.StartFullSearch();
00116   ColPartition* part;
00117   while ((part = gsearch.NextFullSearch()) != NULL) {
00118     if (MergePart(box_cb, confirm_cb, part))
00119       gsearch.RepositionIterator();
00120   }
00121   delete box_cb;
00122   delete confirm_cb;
00123 }
00124 
00125 // For the given partition, calls the box_cb permanent callback
00126 // to compute the search box, searches the box, and if a candidate is found,
00127 // calls the confirm_cb to check any more rules. If the confirm_cb returns
00128 // true, then the partitions are merged.
00129 // Returns true if the partition is consumed by one or more merges.
00130 bool ColPartitionGrid::MergePart(
00131     TessResultCallback2<bool, ColPartition*, TBOX*>* box_cb,
00132     TessResultCallback2<bool, const ColPartition*,
00133                         const ColPartition*>* confirm_cb,
00134     ColPartition* part) {
00135   if (part->IsUnMergeableType())
00136     return false;
00137   bool any_done = false;
00138   // Repeatedly merge part while we find a best merge candidate that works.
00139   bool merge_done = false;
00140   do {
00141     merge_done = false;
00142     TBOX box = part->bounding_box();
00143     bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
00144     if (debug) {
00145       tprintf("Merge candidate:");
00146       box.print();
00147     }
00148     // Set up a rectangle search bounded by the part.
00149     if (!box_cb->Run(part, &box))
00150       continue;
00151     // Create a list of merge candidates.
00152     ColPartition_CLIST merge_candidates;
00153     FindMergeCandidates(part, box, debug, &merge_candidates);
00154     // Find the best merge candidate based on minimal overlap increase.
00155     int overlap_increase;
00156     ColPartition* neighbour = BestMergeCandidate(part, &merge_candidates, debug,
00157                                                  confirm_cb,
00158                                                  &overlap_increase);
00159     if (neighbour != NULL && overlap_increase <= 0) {
00160       if (debug) {
00161         tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
00162                 part->HCoreOverlap(*neighbour), part->VCoreOverlap(*neighbour),
00163                 overlap_increase);
00164       }
00165       // Looks like a good candidate so merge it.
00166       RemoveBBox(neighbour);
00167       // We will modify the box of part, so remove it from the grid, merge
00168       // it and then re-insert it into the grid.
00169       RemoveBBox(part);
00170       part->Absorb(neighbour, NULL);
00171       InsertBBox(true, true, part);
00172       merge_done = true;
00173       any_done = true;
00174     } else if (neighbour != NULL) {
00175       if (debug) {
00176         tprintf("Overlapped when merged with increase %d: ", overlap_increase);
00177         neighbour->bounding_box().print();
00178       }
00179     } else if (debug) {
00180       tprintf("No candidate neighbour returned\n");
00181     }
00182   } while (merge_done);
00183   return any_done;
00184 }
00185 
00186 // Returns true if the given part and merge candidate might believably
00187 // be part of a single text line according to the default rules.
00188 // In general we only want to merge partitions that look like they
00189 // are on the same text line, ie their median limits overlap, but we have
00190 // to make exceptions for diacritics and stray punctuation.
00191 static bool OKMergeCandidate(const ColPartition* part,
00192                              const ColPartition* candidate,
00193                              bool debug) {
00194   const TBOX& part_box = part->bounding_box();
00195   if (candidate == part)
00196     return false;  // Ignore itself.
00197   if (!part->TypesMatch(*candidate) || candidate->IsUnMergeableType())
00198     return false;  // Don't mix inappropriate types.
00199 
00200   const TBOX& c_box = candidate->bounding_box();
00201   if (debug) {
00202     tprintf("Examining merge candidate:");
00203     c_box.print();
00204   }
00205   // Candidates must be within a reasonable distance.
00206   if (candidate->IsVerticalType() || part->IsVerticalType()) {
00207     int h_dist = -part->HCoreOverlap(*candidate);
00208     if (h_dist >= MAX(part_box.width(), c_box.width()) / 2) {
00209       if (debug)
00210         tprintf("Too far away: h_dist = %d\n", h_dist);
00211       return false;
00212     }
00213   } else {
00214     // Coarse filter by vertical distance between partitions.
00215     int v_dist = -part->VCoreOverlap(*candidate);
00216     if (v_dist >= MAX(part_box.height(), c_box.height()) / 2) {
00217       if (debug)
00218         tprintf("Too far away: v_dist = %d\n", v_dist);
00219       return false;
00220     }
00221     // Candidates must either overlap in median y,
00222     // or part or candidate must be an acceptable diacritic.
00223     if (!part->VSignificantCoreOverlap(*candidate) &&
00224         !part->OKDiacriticMerge(*candidate, debug) &&
00225         !candidate->OKDiacriticMerge(*part, debug)) {
00226       if (debug)
00227         tprintf("Candidate fails overlap and diacritic tests!\n");
00228       return false;
00229     }
00230   }
00231   return true;
00232 }
00233 
00234 // Helper function to compute the increase in overlap of the parts list of
00235 // Colpartitions with the combination of merge1 and merge2, compared to
00236 // the overlap with them uncombined.
00237 // An overlap is not counted if passes the OKMergeOverlap test with ok_overlap
00238 // as the pixel overlap limit. merge1 and merge2 must both be non-NULL.
00239 static int IncreaseInOverlap(const ColPartition* merge1,
00240                              const ColPartition* merge2,
00241                              int ok_overlap,
00242                              ColPartition_CLIST* parts) {
00243   ASSERT_HOST(merge1 != NULL && merge2 != NULL);
00244   int total_area = 0;
00245   ColPartition_C_IT it(parts);
00246   TBOX merged_box(merge1->bounding_box());
00247   merged_box += merge2->bounding_box();
00248   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00249     ColPartition* part = it.data();
00250     if (part == merge1 || part == merge2)
00251       continue;
00252     TBOX part_box = part->bounding_box();
00253     // Compute the overlap of the merged box with part.
00254     int overlap_area = part_box.intersection(merged_box).area();
00255     if (overlap_area > 0 && !part->OKMergeOverlap(*merge1, *merge2,
00256                                                   ok_overlap, false)) {
00257       total_area += overlap_area;
00258       // Subtract the overlap of merge1 and merge2 individually.
00259       overlap_area = part_box.intersection(merge1->bounding_box()).area();
00260       if (overlap_area > 0)
00261         total_area -= overlap_area;
00262       TBOX intersection_box = part_box.intersection(merge2->bounding_box());
00263       overlap_area = intersection_box.area();
00264       if (overlap_area > 0) {
00265         total_area -= overlap_area;
00266         // Add back the 3-way area.
00267         intersection_box &= merge1->bounding_box();  // In-place intersection.
00268         overlap_area = intersection_box.area();
00269         if (overlap_area > 0)
00270           total_area += overlap_area;
00271       }
00272     }
00273   }
00274   return total_area;
00275 }
00276 
00277 // Helper function to test that each partition in candidates is either a
00278 // good diacritic merge with part or an OK merge candidate with all others
00279 // in the candidates list.
00280 // ASCII Art Scenario:
00281 // We sometimes get text such as "join-this" where the - is actually a long
00282 // dash culled from a standard set of extra characters that don't match the
00283 // font of the text. This makes its strokewidth not match and forms a broken
00284 // set of 3 partitions for "join", "-" and "this" and the dash may slightly
00285 // overlap BOTH words.
00286 // -------  -------
00287 // |     ====     |
00288 // -------  -------
00289 // The standard merge rule: "you can merge 2 partitions as long as there is
00290 // no increase in overlap elsewhere" fails miserably here. Merge any pair
00291 // of partitions and the combined box overlaps more with the third than
00292 // before. To allow the merge, we need to consider whether it is safe to
00293 // merge everything, without merging separate text lines. For that we need
00294 // everything to be an OKMergeCandidate (which is supposed to prevent
00295 // separate text lines merging), but this is hard for diacritics to satisfy,
00296 // so an alternative to being OKMergeCandidate with everything is to be an
00297 // OKDiacriticMerge with part as the base character.
00298 static bool TestCompatibleCandidates(const ColPartition& part, bool debug,
00299                                      ColPartition_CLIST* candidates) {
00300   ColPartition_C_IT it(candidates);
00301   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00302     ColPartition* candidate = it.data();
00303     if (!candidate->OKDiacriticMerge(part, false)) {
00304       ColPartition_C_IT it2(it);
00305       for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
00306         ColPartition* candidate2 = it2.data();
00307         if (candidate2 != candidate &&
00308             !OKMergeCandidate(candidate, candidate2, false)) {
00309           if (debug) {
00310             tprintf("NC overlap failed:Candidate:");
00311             candidate2->bounding_box().print();
00312             tprintf("fails to be a good merge with:");
00313             candidate->bounding_box().print();
00314           }
00315           return false;
00316         }
00317       }
00318     }
00319   }
00320   return true;
00321 }
00322 
00323 // Finds all the ColPartitions in the grid that overlap with the given
00324 // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
00325 // Any partition equal to not_this (may be NULL) is excluded.
00326 void ColPartitionGrid::FindOverlappingPartitions(const TBOX& box,
00327                                                  const ColPartition* not_this,
00328                                                  ColPartition_CLIST* parts) {
00329   ColPartitionGridSearch rsearch(this);
00330   rsearch.StartRectSearch(box);
00331   ColPartition* part;
00332   while ((part = rsearch.NextRectSearch()) != NULL) {
00333     if (part != not_this)
00334       parts->add_sorted(SortByBoxLeft<ColPartition>, true, part);
00335   }
00336 }
00337 
00338 // Finds and returns the best candidate ColPartition to merge with part,
00339 // selected from the candidates list, based on the minimum increase in
00340 // pairwise overlap among all the partitions overlapped by the combined box.
00341 // If overlap_increase is not NULL then it returns the increase in overlap
00342 // that would result from the merge.
00343 // confirm_cb is a permanent callback that (if non-null) will be used to
00344 // confirm the validity of a proposed merge candidate before selecting it.
00345 //
00346 // ======HOW MERGING WORKS======
00347 // The problem:
00348 // We want to merge all the parts of a textline together, but avoid merging
00349 // separate textlines. Diacritics, i dots, punctuation, and broken characters
00350 // are examples of small bits that need merging with the main textline.
00351 // Drop-caps and descenders in one line that touch ascenders in the one below
00352 // are examples of cases where we don't want to merge.
00353 //
00354 // The solution:
00355 // Merges that increase overlap among other partitions are generally bad.
00356 // Those that don't increase overlap (much) and minimize the total area
00357 // seem to be good.
00358 //
00359 // Ascii art example:
00360 // The text:
00361 // groggy descenders
00362 // minimum ascenders
00363 // The boxes: The === represents a small box near or overlapping the lower box.
00364 // -----------------
00365 // |               |
00366 // -----------------
00367 // -===-------------
00368 // |               |
00369 // -----------------
00370 // In considering what to do with the small === box, we find the 2 larger
00371 // boxes as neighbours and possible merge candidates, but merging with the
00372 // upper box increases overlap with the lower box, whereas merging with the
00373 // lower box does not increase overlap.
00374 // If the small === box didn't overlap either to start with, total area
00375 // would be minimized by merging with the nearer (lower) box.
00376 //
00377 // This is a simple example. In reality, we have to allow some increase
00378 // in overlap, or tightly spaced text would end up in bits.
00379 ColPartition* ColPartitionGrid::BestMergeCandidate(
00380     const ColPartition* part, ColPartition_CLIST* candidates, bool debug,
00381     TessResultCallback2<bool, const ColPartition*, const ColPartition*>* confirm_cb,
00382     int* overlap_increase) {
00383   if (overlap_increase != NULL)
00384     *overlap_increase = 0;
00385   if (candidates->empty())
00386     return NULL;
00387   int ok_overlap =
00388       static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 0.5);
00389   // The best neighbour to merge with is the one that causes least
00390   // total pairwise overlap among all the neighbours.
00391   // If more than one offers the same total overlap, choose the one
00392   // with the least total area.
00393   const TBOX& part_box = part->bounding_box();
00394   ColPartition_C_IT it(candidates);
00395   ColPartition* best_candidate = NULL;
00396   // Find the total combined box of all candidates and the original.
00397   TBOX full_box(part_box);
00398   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00399     ColPartition* candidate = it.data();
00400     full_box += candidate->bounding_box();
00401   }
00402   // Keep valid neighbours in a list.
00403   ColPartition_CLIST neighbours;
00404   // Now run a rect search of the merged box for overlapping neighbours, as
00405   // we need anything that might be overlapped by the merged box.
00406   FindOverlappingPartitions(full_box, part, &neighbours);
00407   if (debug) {
00408     tprintf("Finding best merge candidate from %d, %d neighbours for box:",
00409             candidates->length(), neighbours.length());
00410     part_box.print();
00411   }
00412   // If the best increase in overlap is positive, then we also check the
00413   // worst non-candidate overlap. This catches the case of multiple good
00414   // candidates that overlap each other when merged. If the worst
00415   // non-candidate overlap is better than the best overlap, then return
00416   // the worst non-candidate overlap instead.
00417   ColPartition_CLIST non_candidate_neighbours;
00418   non_candidate_neighbours.set_subtract(SortByBoxLeft<ColPartition>, true,
00419                                         &neighbours, candidates);
00420   int worst_nc_increase = 0;
00421   int best_increase = MAX_INT32;
00422   int best_area = 0;
00423   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00424     ColPartition* candidate = it.data();
00425     if (confirm_cb != NULL && !confirm_cb->Run(part, candidate)) {
00426       if (debug) {
00427         tprintf("Candidate not confirmed:");
00428         candidate->bounding_box().print();
00429       }
00430       continue;
00431     }
00432     int increase = IncreaseInOverlap(part, candidate, ok_overlap, &neighbours);
00433     const TBOX& cand_box = candidate->bounding_box();
00434     if (best_candidate == NULL || increase < best_increase) {
00435       best_candidate = candidate;
00436       best_increase = increase;
00437       best_area = cand_box.bounding_union(part_box).area() - cand_box.area();
00438       if (debug) {
00439         tprintf("New best merge candidate has increase %d, area %d, over box:",
00440                 increase, best_area);
00441         full_box.print();
00442         candidate->Print();
00443       }
00444     } else if (increase == best_increase) {
00445       int area = cand_box.bounding_union(part_box).area() - cand_box.area();
00446       if (area < best_area) {
00447         best_area = area;
00448         best_candidate = candidate;
00449       }
00450     }
00451     increase = IncreaseInOverlap(part, candidate, ok_overlap,
00452                                  &non_candidate_neighbours);
00453     if (increase > worst_nc_increase)
00454       worst_nc_increase = increase;
00455   }
00456   if (best_increase > 0) {
00457     // If the worst non-candidate increase is less than the best increase
00458     // including the candidates, then all the candidates can merge together
00459     // and the increase in outside overlap would be less, so use that result,
00460     // but only if each candidate is either a good diacritic merge with part,
00461     // or an ok merge candidate with all the others.
00462     // See TestCompatibleCandidates for more explanation and a picture.
00463     if (worst_nc_increase < best_increase &&
00464         TestCompatibleCandidates(*part, debug, candidates)) {
00465       best_increase = worst_nc_increase;
00466     }
00467   }
00468   if (overlap_increase != NULL)
00469     *overlap_increase = best_increase;
00470   return best_candidate;
00471 }
00472 
00473 // Helper to remove the given box from the given partition, put it in its
00474 // own partition, and add to the partition list.
00475 static void RemoveBadBox(BLOBNBOX* box, ColPartition* part,
00476                          ColPartition_LIST* part_list) {
00477   part->RemoveBox(box);
00478   ColPartition::MakeBigPartition(box, part_list);
00479 }
00480 
00481 
00482 // Split partitions where it reduces overlap between their bounding boxes.
00483 // ColPartitions are after all supposed to be a partitioning of the blobs
00484 // AND of the space on the page!
00485 // Blobs that cause overlaps get removed, put in individual partitions
00486 // and added to the big_parts list. They are most likely characters on
00487 // 2 textlines that touch, or something big like a dropcap.
00488 void ColPartitionGrid::SplitOverlappingPartitions(
00489     ColPartition_LIST* big_parts) {
00490   int ok_overlap =
00491       static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 0.5);
00492   // Iterate the ColPartitions in the grid.
00493   ColPartitionGridSearch gsearch(this);
00494   gsearch.StartFullSearch();
00495   ColPartition* part;
00496   while ((part = gsearch.NextFullSearch()) != NULL) {
00497     // Set up a rectangle search bounded by the part.
00498     const TBOX& box = part->bounding_box();
00499     ColPartitionGridSearch rsearch(this);
00500     rsearch.SetUniqueMode(true);
00501     rsearch.StartRectSearch(box);
00502     int unresolved_overlaps = 0;
00503 
00504     ColPartition* neighbour;
00505     while ((neighbour = rsearch.NextRectSearch()) != NULL) {
00506       if (neighbour == part)
00507         continue;
00508       const TBOX& neighbour_box = neighbour->bounding_box();
00509       if (neighbour->OKMergeOverlap(*part, *part, ok_overlap, false) &&
00510           part->OKMergeOverlap(*neighbour, *neighbour, ok_overlap, false))
00511         continue;  // The overlap is OK both ways.
00512 
00513       // If removal of the biggest box from either partition eliminates the
00514       // overlap, and it is much bigger than the box left behind, then
00515       // it is either a drop-cap, an inter-line join, or some junk that
00516       // we don't want anyway, so put it in the big_parts list.
00517       if (!part->IsSingleton()) {
00518         BLOBNBOX* excluded = part->BiggestBox();
00519         TBOX shrunken = part->BoundsWithoutBox(excluded);
00520         if (!shrunken.overlap(neighbour_box) &&
00521             excluded->bounding_box().height() >
00522               kBigPartSizeRatio * shrunken.height()) {
00523           // Removing the biggest box fixes the overlap, so do it!
00524           gsearch.RemoveBBox();
00525           RemoveBadBox(excluded, part, big_parts);
00526           InsertBBox(true, true, part);
00527           gsearch.RepositionIterator();
00528           break;
00529         }
00530       } else if (box.contains(neighbour_box)) {
00531         ++unresolved_overlaps;
00532         continue;  // No amount of splitting will fix it.
00533       }
00534       if (!neighbour->IsSingleton()) {
00535         BLOBNBOX* excluded = neighbour->BiggestBox();
00536         TBOX shrunken = neighbour->BoundsWithoutBox(excluded);
00537         if (!shrunken.overlap(box) &&
00538             excluded->bounding_box().height() >
00539               kBigPartSizeRatio * shrunken.height()) {
00540           // Removing the biggest box fixes the overlap, so do it!
00541           rsearch.RemoveBBox();
00542           RemoveBadBox(excluded, neighbour, big_parts);
00543           InsertBBox(true, true, neighbour);
00544           gsearch.RepositionIterator();
00545           break;
00546         }
00547       }
00548       int part_overlap_count = part->CountOverlappingBoxes(neighbour_box);
00549       int neighbour_overlap_count = neighbour->CountOverlappingBoxes(box);
00550       ColPartition* right_part = NULL;
00551       if (neighbour_overlap_count <= part_overlap_count ||
00552           part->IsSingleton()) {
00553         // Try to split the neighbour to reduce overlap.
00554         BLOBNBOX* split_blob = neighbour->OverlapSplitBlob(box);
00555         if (split_blob != NULL) {
00556           rsearch.RemoveBBox();
00557           right_part = neighbour->SplitAtBlob(split_blob);
00558           InsertBBox(true, true, neighbour);
00559           ASSERT_HOST(right_part != NULL);
00560         }
00561       } else {
00562         // Try to split part to reduce overlap.
00563         BLOBNBOX* split_blob = part->OverlapSplitBlob(neighbour_box);
00564         if (split_blob != NULL) {
00565           gsearch.RemoveBBox();
00566           right_part = part->SplitAtBlob(split_blob);
00567           InsertBBox(true, true, part);
00568           ASSERT_HOST(right_part != NULL);
00569         }
00570       }
00571       if (right_part != NULL) {
00572         InsertBBox(true, true, right_part);
00573         gsearch.RepositionIterator();
00574         rsearch.RepositionIterator();
00575         break;
00576       }
00577     }
00578     if (unresolved_overlaps > 2 && part->IsSingleton()) {
00579       // This part is no good so just add to big_parts.
00580       RemoveBBox(part);
00581       ColPartition_IT big_it(big_parts);
00582       part->set_block_owned(true);
00583       big_it.add_to_end(part);
00584       gsearch.RepositionIterator();
00585     }
00586   }
00587 }
00588 
00589 // Filters partitions of source_type by looking at local neighbours.
00590 // Where a majority of neighbours have a text type, the partitions are
00591 // changed to text, where the neighbours have image type, they are changed
00592 // to image, and partitions that have no definite neighbourhood type are
00593 // left unchanged.
00594 // im_box and rerotation are used to map blob coordinates onto the
00595 // nontext_map, which is used to prevent the spread of text neighbourhoods
00596 // into images.
00597 // Returns true if anything was changed.
00598 bool ColPartitionGrid::GridSmoothNeighbours(BlobTextFlowType source_type,
00599                                             Pix* nontext_map,
00600                                             const TBOX& im_box,
00601                                             const FCOORD& rotation) {
00602   // Iterate the ColPartitions in the grid.
00603   ColPartitionGridSearch gsearch(this);
00604   gsearch.StartFullSearch();
00605   ColPartition* part;
00606   bool any_changed = false;
00607   while ((part = gsearch.NextFullSearch()) != NULL) {
00608     if (part->flow() != source_type || BLOBNBOX::IsLineType(part->blob_type()))
00609       continue;
00610     const TBOX& box = part->bounding_box();
00611     bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
00612     if (SmoothRegionType(nontext_map, im_box, rotation, debug, part))
00613       any_changed = true;
00614   }
00615   return any_changed;
00616 }
00617 
00618 // Compute the mean RGB of the light and dark pixels in each ColPartition
00619 // and also the rms error in the linearity of color.
00620 void ColPartitionGrid::ComputePartitionColors(Pix* scaled_color,
00621                                               int scaled_factor,
00622                                               const FCOORD& rerotation) {
00623   if (scaled_color == NULL)
00624     return;
00625   Pix* color_map1 = NULL;
00626   Pix* color_map2 = NULL;
00627   Pix* rms_map = NULL;
00628   if (textord_tabfind_show_color_fit) {
00629     int width = pixGetWidth(scaled_color);
00630     int height = pixGetHeight(scaled_color);
00631     color_map1 = pixCreate(width, height, 32);
00632     color_map2 = pixCreate(width, height, 32);
00633     rms_map = pixCreate(width, height, 8);
00634   }
00635   // Iterate the ColPartitions in the grid.
00636   ColPartitionGridSearch gsearch(this);
00637   gsearch.StartFullSearch();
00638   ColPartition* part;
00639   while ((part = gsearch.NextFullSearch()) != NULL) {
00640     TBOX part_box = part->bounding_box();
00641     part_box.rotate_large(rerotation);
00642     ImageFind::ComputeRectangleColors(part_box, scaled_color,
00643                                       scaled_factor,
00644                                       color_map1, color_map2, rms_map,
00645                                       part->color1(), part->color2());
00646   }
00647   if (color_map1 != NULL) {
00648     pixWrite("swcolorinput.png", scaled_color, IFF_PNG);
00649     pixWrite("swcolor1.png", color_map1, IFF_PNG);
00650     pixWrite("swcolor2.png", color_map2, IFF_PNG);
00651     pixWrite("swrms.png", rms_map, IFF_PNG);
00652     pixDestroy(&color_map1);
00653     pixDestroy(&color_map2);
00654     pixDestroy(&rms_map);
00655   }
00656 }
00657 
00658 // Reflects the grid and its colpartitions in the y-axis, assuming that
00659 // all blob boxes have already been done.
00660 void ColPartitionGrid::ReflectInYAxis() {
00661   ColPartition_LIST parts;
00662   ColPartition_IT part_it(&parts);
00663   // Iterate the ColPartitions in the grid to extract them.
00664   ColPartitionGridSearch gsearch(this);
00665   gsearch.StartFullSearch();
00666   ColPartition* part;
00667   while ((part = gsearch.NextFullSearch()) != NULL) {
00668     part_it.add_after_then_move(part);
00669   }
00670   ICOORD bot_left(-tright().x(), bleft().y());
00671   ICOORD top_right(-bleft().x(), tright().y());
00672   // Reinitializing the grid with reflected coords also clears all the
00673   // pointers, so parts will now own the ColPartitions. (Briefly).
00674   Init(gridsize(), bot_left, top_right);
00675   for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00676     part = part_it.extract();
00677     part->ReflectInYAxis();
00678     InsertBBox(true, true, part);
00679   }
00680 }
00681 
00682 // Rotates the grid and its colpartitions by the given angle, assuming that
00683 // all blob boxes have already been done.
00684 void ColPartitionGrid::Deskew(const FCOORD& deskew) {
00685   ColPartition_LIST parts;
00686   ColPartition_IT part_it(&parts);
00687   // Iterate the ColPartitions in the grid to extract them.
00688   ColPartitionGridSearch gsearch(this);
00689   gsearch.StartFullSearch();
00690   ColPartition* part;
00691   while ((part = gsearch.NextFullSearch()) != NULL) {
00692     part_it.add_after_then_move(part);
00693   }
00694   // Rebuild the grid to the new size.
00695   TBOX grid_box(bleft_, tright_);
00696   grid_box.rotate_large(deskew);
00697   Init(gridsize(), grid_box.botleft(), grid_box.topright());
00698   // Reinitializing the grid with rotated coords also clears all the
00699   // pointers, so parts will now own the ColPartitions. (Briefly).
00700   for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00701     part = part_it.extract();
00702     part->ComputeLimits();
00703     InsertBBox(true, true, part);
00704   }
00705 }
00706 
00707 // Sets the left and right tabs of the partitions in the grid.
00708 void ColPartitionGrid::SetTabStops(TabFind* tabgrid) {
00709   // Iterate the ColPartitions in the grid.
00710   ColPartitionGridSearch gsearch(this);
00711   gsearch.StartFullSearch();
00712   ColPartition* part;
00713   while ((part = gsearch.NextFullSearch()) != NULL) {
00714     const TBOX& part_box = part->bounding_box();
00715     TabVector* left_line = tabgrid->LeftTabForBox(part_box, true, false);
00716     // If the overlapping line is not a left tab, try for non-overlapping.
00717     if (left_line != NULL && !left_line->IsLeftTab())
00718       left_line = tabgrid->LeftTabForBox(part_box, false, false);
00719     if (left_line != NULL && left_line->IsLeftTab())
00720       part->SetLeftTab(left_line);
00721     TabVector* right_line = tabgrid->RightTabForBox(part_box, true, false);
00722     if (right_line != NULL && !right_line->IsRightTab())
00723       right_line = tabgrid->RightTabForBox(part_box, false, false);
00724     if (right_line != NULL && right_line->IsRightTab())
00725       part->SetRightTab(right_line);
00726     part->SetColumnGoodness(tabgrid->WidthCB());
00727   }
00728 }
00729 
00730 // Makes the ColPartSets and puts them in the PartSetVector ready
00731 // for finding column bounds. Returns false if no partitions were found.
00732 bool ColPartitionGrid::MakeColPartSets(PartSetVector* part_sets) {
00733   ColPartition_LIST* part_lists = new ColPartition_LIST[gridheight()];
00734   part_sets->reserve(gridheight());
00735   // Iterate the ColPartitions in the grid to get parts onto lists for the
00736   // y bottom of each.
00737   ColPartitionGridSearch gsearch(this);
00738   gsearch.StartFullSearch();
00739   ColPartition* part;
00740   bool any_parts_found = false;
00741   while ((part = gsearch.NextFullSearch()) != NULL) {
00742     BlobRegionType blob_type = part->blob_type();
00743     if (blob_type != BRT_NOISE &&
00744         (blob_type != BRT_UNKNOWN || !part->boxes()->singleton())) {
00745       int grid_x, grid_y;
00746       const TBOX& part_box = part->bounding_box();
00747       GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);
00748       ColPartition_IT part_it(&part_lists[grid_y]);
00749       part_it.add_to_end(part);
00750       any_parts_found = true;
00751     }
00752   }
00753   if (any_parts_found) {
00754     for (int grid_y = 0; grid_y < gridheight(); ++grid_y) {
00755       ColPartitionSet* line_set = NULL;
00756       if (!part_lists[grid_y].empty()) {
00757         line_set = new ColPartitionSet(&part_lists[grid_y]);
00758       }
00759       part_sets->push_back(line_set);
00760     }
00761   }
00762   delete [] part_lists;
00763   return any_parts_found;
00764 }
00765 
00766 // Makes a single ColPartitionSet consisting of a single ColPartition that
00767 // represents the total horizontal extent of the significant content on the
00768 // page. Used for the single column setting in place of automatic detection.
00769 // Returns NULL if the page is empty of significant content.
00770 ColPartitionSet* ColPartitionGrid::MakeSingleColumnSet(WidthCallback* cb) {
00771   ColPartition* single_column_part = NULL;
00772   // Iterate the ColPartitions in the grid to get parts onto lists for the
00773   // y bottom of each.
00774   ColPartitionGridSearch gsearch(this);
00775   gsearch.StartFullSearch();
00776   ColPartition* part;
00777   while ((part = gsearch.NextFullSearch()) != NULL) {
00778     BlobRegionType blob_type = part->blob_type();
00779     if (blob_type != BRT_NOISE &&
00780         (blob_type != BRT_UNKNOWN || !part->boxes()->singleton())) {
00781       // Consider for single column.
00782       BlobTextFlowType flow = part->flow();
00783       if ((blob_type == BRT_TEXT &&
00784           (flow == BTFT_STRONG_CHAIN || flow == BTFT_CHAIN ||
00785            flow == BTFT_LEADER || flow == BTFT_TEXT_ON_IMAGE)) ||
00786           blob_type == BRT_RECTIMAGE || blob_type == BRT_POLYIMAGE) {
00787         if (single_column_part == NULL) {
00788           single_column_part = part->ShallowCopy();
00789           single_column_part->set_blob_type(BRT_TEXT);
00790           // Copy the tabs from itself to properly setup the margins.
00791           single_column_part->CopyLeftTab(*single_column_part, false);
00792           single_column_part->CopyRightTab(*single_column_part, false);
00793         } else {
00794           if (part->left_key() < single_column_part->left_key())
00795             single_column_part->CopyLeftTab(*part, false);
00796           if (part->right_key() > single_column_part->right_key())
00797             single_column_part->CopyRightTab(*part, false);
00798         }
00799       }
00800     }
00801   }
00802   if (single_column_part != NULL) {
00803     // Make a ColPartitionSet out of the single_column_part as a candidate
00804     // for the single column case.
00805     single_column_part->SetColumnGoodness(cb);
00806     return new ColPartitionSet(single_column_part);
00807   }
00808   return NULL;
00809 }
00810 
00811 // Mark the BLOBNBOXes in each partition as being owned by that partition.
00812 void ColPartitionGrid::ClaimBoxes() {
00813   // Iterate the ColPartitions in the grid.
00814   ColPartitionGridSearch gsearch(this);
00815   gsearch.StartFullSearch();
00816   ColPartition* part;
00817   while ((part = gsearch.NextFullSearch()) != NULL) {
00818     part->ClaimBoxes();
00819   }
00820 }
00821 
00822 // Retypes all the blobs referenced by the partitions in the grid.
00823 // Image blobs are found and returned in the im_blobs list, as they are not
00824 // owned by the block.
00825 void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
00826   BLOBNBOX_IT im_blob_it(im_blobs);
00827   ColPartition_LIST dead_parts;
00828   ColPartition_IT dead_part_it(&dead_parts);
00829   // Iterate the ColPartitions in the grid.
00830   ColPartitionGridSearch gsearch(this);
00831   gsearch.StartFullSearch();
00832   ColPartition* part;
00833   while ((part = gsearch.NextFullSearch()) != NULL) {
00834     BlobRegionType blob_type = part->blob_type();
00835     BlobTextFlowType flow = part->flow();
00836     if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) {
00837       BLOBNBOX_C_IT blob_it(part->boxes());
00838       for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00839         BLOBNBOX* blob = blob_it.data();
00840         im_blob_it.add_after_then_move(blob);
00841       }
00842     } else if (blob_type != BRT_NOISE) {
00843       // Make sure the blobs are marked with the correct type and flow.
00844       BLOBNBOX_C_IT blob_it(part->boxes());
00845       for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00846         BLOBNBOX* blob = blob_it.data();
00847         if (blob->region_type() == BRT_NOISE) {
00848           // TODO(rays) Deprecated. Change this section to an assert to verify
00849           // and then delete.
00850           ASSERT_HOST(blob->cblob()->area() != 0);
00851           blob->set_owner(NULL);
00852           blob_it.extract();
00853         } else {
00854           blob->set_region_type(blob_type);
00855           if (blob->flow() != BTFT_LEADER)
00856             blob->set_flow(flow);
00857         }
00858       }
00859     }
00860     if (blob_type == BRT_NOISE || part->boxes()->empty()) {
00861       BLOBNBOX_C_IT blob_it(part->boxes());
00862       part->DisownBoxes();
00863       dead_part_it.add_to_end(part);
00864       gsearch.RemoveBBox();
00865       for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00866         BLOBNBOX* blob = blob_it.data();
00867         if (blob->cblob()->area() == 0) {
00868           // Any blob with zero area is a fake image blob and should be deleted.
00869           delete blob->cblob();
00870           delete blob;
00871         }
00872       }
00873     }
00874   }
00875 }
00876 
00877 // The boxes within the partitions have changed (by deskew) so recompute
00878 // the bounds of all the partitions and reinsert them into the grid.
00879 void ColPartitionGrid::RecomputeBounds(int gridsize,
00880                                        const ICOORD& bleft,
00881                                        const ICOORD& tright,
00882                                        const ICOORD& vertical) {
00883   ColPartition_LIST saved_parts;
00884   ColPartition_IT part_it(&saved_parts);
00885   // Iterate the ColPartitions in the grid to get parts onto a list.
00886   ColPartitionGridSearch gsearch(this);
00887   gsearch.StartFullSearch();
00888   ColPartition* part;
00889   while ((part = gsearch.NextFullSearch()) != NULL) {
00890     part_it.add_to_end(part);
00891   }
00892   // Reinitialize grid to the new size.
00893   Init(gridsize, bleft, tright);
00894   // Recompute the bounds of the parts and put them back in the new grid.
00895   for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00896     part = part_it.extract();
00897     part->set_vertical(vertical);
00898     part->ComputeLimits();
00899     InsertBBox(true, true, part);
00900   }
00901 }
00902 
00903 // Improves the margins of the ColPartitions in the grid by calling
00904 // FindPartitionMargins on each.
00905 // best_columns, which may be NULL, is an array of pointers indicating the
00906 // column set at each y-coordinate in the grid.
00907 // best_columns is usually the best_columns_ member of ColumnFinder.
00908 void ColPartitionGrid::GridFindMargins(ColPartitionSet** best_columns) {
00909   // Iterate the ColPartitions in the grid.
00910   ColPartitionGridSearch gsearch(this);
00911   gsearch.StartFullSearch();
00912   ColPartition* part;
00913   while ((part = gsearch.NextFullSearch()) != NULL) {
00914     // Set up a rectangle search x-bounded by the column and y by the part.
00915     ColPartitionSet* columns = best_columns != NULL
00916                              ? best_columns[gsearch.GridY()]
00917                              : NULL;
00918     FindPartitionMargins(columns, part);
00919     const TBOX& box = part->bounding_box();
00920     if (AlignedBlob::WithinTestRegion(2, box.left(), box.bottom())) {
00921       tprintf("Computed margins for part:");
00922       part->Print();
00923     }
00924   }
00925 }
00926 
00927 // Improves the margins of the ColPartitions in the list by calling
00928 // FindPartitionMargins on each.
00929 // best_columns, which may be NULL, is an array of pointers indicating the
00930 // column set at each y-coordinate in the grid.
00931 // best_columns is usually the best_columns_ member of ColumnFinder.
00932 void ColPartitionGrid::ListFindMargins(ColPartitionSet** best_columns,
00933                                        ColPartition_LIST* parts) {
00934   ColPartition_IT part_it(parts);
00935   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
00936     ColPartition* part = part_it.data();
00937     ColPartitionSet* columns = NULL;
00938     if (best_columns != NULL) {
00939       TBOX part_box = part->bounding_box();
00940       // Get the columns from the y grid coord.
00941       int grid_x, grid_y;
00942       GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y);
00943       columns = best_columns[grid_y];
00944     }
00945     FindPartitionMargins(columns, part);
00946   }
00947 }
00948 
00949 // Deletes all the partitions in the grid after disowning all the blobs.
00950 void ColPartitionGrid::DeleteParts() {
00951   ColPartition_LIST dead_parts;
00952   ColPartition_IT dead_it(&dead_parts);
00953   ColPartitionGridSearch gsearch(this);
00954   gsearch.StartFullSearch();
00955   ColPartition* part;
00956   while ((part = gsearch.NextFullSearch()) != NULL) {
00957     part->DisownBoxes();
00958     dead_it.add_to_end(part);  // Parts will be deleted on return.
00959   }
00960   Clear();
00961 }
00962 
00963 // Deletes all the partitions in the grid that are of type BRT_UNKNOWN and
00964 // all the blobs in them.
00965 void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) {
00966   ColPartitionGridSearch gsearch(this);
00967   gsearch.StartFullSearch();
00968   ColPartition* part;
00969   while ((part = gsearch.NextFullSearch()) != NULL) {
00970     if (part->blob_type() == BRT_UNKNOWN) {
00971       gsearch.RemoveBBox();
00972       // Once marked, the blobs will be swept up by DeleteUnownedNoise.
00973       part->set_flow(BTFT_NONTEXT);
00974       part->set_blob_type(BRT_NOISE);
00975       part->SetBlobTypes();
00976       part->DisownBoxes();
00977       delete part;
00978     }
00979   }
00980   block->DeleteUnownedNoise();
00981 }
00982 
00983 // Finds and marks text partitions that represent figure captions.
00984 void ColPartitionGrid::FindFigureCaptions() {
00985   // For each image region find its best candidate text caption region,
00986   // if any and mark it as such.
00987   ColPartitionGridSearch gsearch(this);
00988   gsearch.StartFullSearch();
00989   ColPartition* part;
00990   while ((part = gsearch.NextFullSearch()) != NULL) {
00991     if (part->IsImageType()) {
00992       const TBOX& part_box = part->bounding_box();
00993       bool debug = AlignedBlob::WithinTestRegion(2, part_box.left(),
00994                                                  part_box.bottom());
00995       ColPartition* best_caption = NULL;
00996       int best_dist = 0;   // Distance to best_caption.
00997       int best_upper = 0;  // Direction of best_caption.
00998       // Handle both lower and upper directions.
00999       for (int upper = 0; upper < 2; ++upper) {
01000         ColPartition_C_IT partner_it(upper ? part->upper_partners()
01001                                            : part->lower_partners());
01002         // If there are no image partners, then this direction is ok.
01003         for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
01004              partner_it.forward()) {
01005           ColPartition* partner = partner_it.data();
01006           if (partner->IsImageType()) {
01007             break;
01008           }
01009         }
01010         if (!partner_it.cycled_list()) continue;
01011         // Find the nearest totally overlapping text partner.
01012         for (partner_it.mark_cycle_pt(); !partner_it.cycled_list();
01013              partner_it.forward()) {
01014           ColPartition* partner = partner_it.data();
01015           if (!partner->IsTextType()) continue;
01016           const TBOX& partner_box = partner->bounding_box();
01017           if (debug) {
01018             tprintf("Finding figure captions for image part:");
01019             part_box.print();
01020             tprintf("Considering partner:");
01021             partner_box.print();
01022           }
01023           if (partner_box.left() >= part_box.left() &&
01024               partner_box.right() <= part_box.right()) {
01025             int dist = partner_box.y_gap(part_box);
01026             if (best_caption == NULL || dist < best_dist) {
01027               best_dist = dist;
01028               best_caption = partner;
01029               best_upper = upper;
01030             }
01031           }
01032         }
01033       }
01034       if (best_caption != NULL) {
01035         if (debug) {
01036           tprintf("Best caption candidate:");
01037           best_caption->bounding_box().print();
01038         }
01039         // We have a candidate caption. Qualify it as being separable from
01040         // any body text. We are looking for either a small number of lines
01041         // or a big gap that indicates a separation from the body text.
01042         int line_count = 0;
01043         int biggest_gap = 0;
01044         int smallest_gap = MAX_INT16;
01045         int total_height = 0;
01046         int mean_height = 0;
01047         ColPartition* end_partner = NULL;
01048         ColPartition* next_partner = NULL;
01049         for (ColPartition* partner = best_caption; partner != NULL &&
01050              line_count <= kMaxCaptionLines;
01051              partner = next_partner) {
01052           if (!partner->IsTextType()) {
01053             end_partner = partner;
01054             break;
01055           }
01056           ++line_count;
01057           total_height += partner->bounding_box().height();
01058           next_partner = partner->SingletonPartner(best_upper);
01059           if (next_partner != NULL) {
01060             int gap = partner->bounding_box().y_gap(
01061                 next_partner->bounding_box());
01062             if (gap > biggest_gap) {
01063               biggest_gap = gap;
01064               end_partner = next_partner;
01065               mean_height = total_height / line_count;
01066             } else if (gap < smallest_gap) {
01067               smallest_gap = gap;
01068             }
01069             // If the gap looks big compared to the text size and the smallest
01070             // gap seen so far, then we can stop.
01071             if (biggest_gap > mean_height * kMinCaptionGapHeightRatio &&
01072                 biggest_gap > smallest_gap * kMinCaptionGapRatio)
01073               break;
01074           }
01075         }
01076         if (debug) {
01077           tprintf("Line count=%d, biggest gap %d, smallest%d, mean height %d\n",
01078                   line_count, biggest_gap, smallest_gap, mean_height);
01079           if (end_partner != NULL) {
01080             tprintf("End partner:");
01081             end_partner->bounding_box().print();
01082           }
01083         }
01084         if (next_partner == NULL && line_count <= kMaxCaptionLines)
01085           end_partner = NULL;  // No gap, but line count is small.
01086         if (line_count <= kMaxCaptionLines) {
01087           // This is a qualified caption. Mark the text as caption.
01088           for (ColPartition* partner = best_caption; partner != NULL &&
01089                partner != end_partner;
01090                partner = next_partner) {
01091             partner->set_type(PT_CAPTION_TEXT);
01092             partner->SetBlobTypes();
01093             if (debug) {
01094               tprintf("Set caption type for partition:");
01095               partner->bounding_box().print();
01096             }
01097             next_partner = partner->SingletonPartner(best_upper);
01098           }
01099         }
01100       }
01101     }
01102   }
01103 }
01104 
01107 
01108 // For every ColPartition in the grid, finds its upper and lower neighbours.
01109 void ColPartitionGrid::FindPartitionPartners() {
01110   ColPartitionGridSearch gsearch(this);
01111   gsearch.StartFullSearch();
01112   ColPartition* part;
01113   while ((part = gsearch.NextFullSearch()) != NULL) {
01114     if (part->IsVerticalType()) {
01115       FindVPartitionPartners(true, part);
01116       FindVPartitionPartners(false, part);
01117     } else {
01118       FindPartitionPartners(true, part);
01119       FindPartitionPartners(false, part);
01120     }
01121   }
01122 }
01123 
01124 // Finds the best partner in the given direction for the given partition.
01125 // Stores the result with AddPartner.
01126 void ColPartitionGrid::FindPartitionPartners(bool upper, ColPartition* part) {
01127   if (part->type() == PT_NOISE)
01128     return;  // Noise is not allowed to partner anything.
01129   const TBOX& box = part->bounding_box();
01130   int top = part->median_top();
01131   int bottom = part->median_bottom();
01132   int height = top - bottom;
01133   int mid_y = (bottom + top) / 2;
01134   ColPartitionGridSearch vsearch(this);
01135   // Search down for neighbour below
01136   vsearch.StartVerticalSearch(box.left(), box.right(), part->MidY());
01137   ColPartition* neighbour;
01138   ColPartition* best_neighbour = NULL;
01139   int best_dist = MAX_INT32;
01140   while ((neighbour = vsearch.NextVerticalSearch(!upper)) != NULL) {
01141     if (neighbour == part || neighbour->type() == PT_NOISE)
01142       continue;  // Noise is not allowed to partner anything.
01143     int neighbour_bottom = neighbour->median_bottom();
01144     int neighbour_top = neighbour->median_top();
01145     int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
01146     if (upper != (neighbour_y > mid_y))
01147       continue;
01148     if (!part->HOverlaps(*neighbour) && !part->WithinSameMargins(*neighbour))
01149       continue;
01150     if (!part->TypesMatch(*neighbour)) {
01151       if (best_neighbour == NULL)
01152         best_neighbour = neighbour;
01153       continue;
01154     }
01155     int dist = upper ? neighbour_bottom - top : bottom - neighbour_top;
01156     if (dist <= kMaxPartitionSpacing * height) {
01157       if (dist < best_dist) {
01158         best_dist = dist;
01159         best_neighbour = neighbour;
01160       }
01161     } else {
01162       break;
01163     }
01164   }
01165   if (best_neighbour != NULL)
01166     part->AddPartner(upper, best_neighbour);
01167 }
01168 
01169 // Finds the best partner in the given direction for the given partition.
01170 // Stores the result with AddPartner.
01171 void ColPartitionGrid::FindVPartitionPartners(bool to_the_left,
01172                                               ColPartition* part) {
01173   if (part->type() == PT_NOISE)
01174     return;  // Noise is not allowed to partner anything.
01175   const TBOX& box = part->bounding_box();
01176   int left = part->median_left();
01177   int right = part->median_right();
01178   int width = right - left;
01179   int mid_x = (left + right) / 2;
01180   ColPartitionGridSearch hsearch(this);
01181   // Search left for neighbour to_the_left
01182   hsearch.StartSideSearch(mid_x, box.bottom(), box.top());
01183   ColPartition* neighbour;
01184   ColPartition* best_neighbour = NULL;
01185   int best_dist = MAX_INT32;
01186   while ((neighbour = hsearch.NextSideSearch(to_the_left)) != NULL) {
01187     if (neighbour == part || neighbour->type() == PT_NOISE)
01188       continue;  // Noise is not allowed to partner anything.
01189     int neighbour_left = neighbour->median_left();
01190     int neighbour_right = neighbour->median_right();
01191     int neighbour_x = (neighbour_left + neighbour_right) / 2;
01192     if (to_the_left != (neighbour_x < mid_x))
01193       continue;
01194     if (!part->VOverlaps(*neighbour))
01195       continue;
01196     if (!part->TypesMatch(*neighbour))
01197       continue;  // Only match to other vertical text.
01198     int dist = to_the_left ? left - neighbour_right : neighbour_left - right;
01199     if (dist <= kMaxPartitionSpacing * width) {
01200       if (dist < best_dist || best_neighbour == NULL) {
01201         best_dist = dist;
01202         best_neighbour = neighbour;
01203       }
01204     } else {
01205       break;
01206     }
01207   }
01208   // For vertical partitions, the upper partner is to the left, and lower is
01209   // to the right.
01210   if (best_neighbour != NULL)
01211     part->AddPartner(to_the_left, best_neighbour);
01212 }
01213 
01214 // For every ColPartition with multiple partners in the grid, reduces the
01215 // number of partners to 0 or 1. If get_desperate is true, goes to more
01216 // desperate merge methods to merge flowing text before breaking partnerships.
01217 void ColPartitionGrid::RefinePartitionPartners(bool get_desperate) {
01218   ColPartitionGridSearch gsearch(this);
01219   // Refine in type order so that chasing multiple partners can be done
01220   // before eliminating type mis-matching partners.
01221   for (int type = PT_UNKNOWN + 1; type <= PT_COUNT; type++) {
01222     // Iterate the ColPartitions in the grid.
01223     gsearch.StartFullSearch();
01224     ColPartition* part;
01225     while ((part = gsearch.NextFullSearch()) != NULL) {
01226       part->RefinePartners(static_cast<PolyBlockType>(type),
01227                            get_desperate, this);
01228       // Iterator may have been messed up by a merge.
01229       gsearch.RepositionIterator();
01230     }
01231   }
01232 }
01233 
01234 
01235 // ========================== PRIVATE CODE ========================
01236 
01237 // Finds and returns a list of candidate ColPartitions to merge with part.
01238 // The candidates must overlap search_box, and when merged must not
01239 // overlap any other partitions that are not overlapped by each individually.
01240 void ColPartitionGrid::FindMergeCandidates(const ColPartition* part,
01241                                            const TBOX& search_box, bool debug,
01242                                            ColPartition_CLIST* candidates) {
01243   int ok_overlap =
01244       static_cast<int>(kTinyEnoughTextlineOverlapFraction * gridsize() + 0.5);
01245   const TBOX& part_box = part->bounding_box();
01246   // Now run the rect search.
01247   ColPartitionGridSearch rsearch(this);
01248   rsearch.SetUniqueMode(true);
01249   rsearch.StartRectSearch(search_box);
01250   ColPartition* candidate;
01251   while ((candidate = rsearch.NextRectSearch()) != NULL) {
01252     if (!OKMergeCandidate(part, candidate, debug))
01253       continue;
01254     const TBOX& c_box = candidate->bounding_box();
01255     // Candidate seems to be a potential merge with part. If one contains
01256     // the other, then the merge is a no-brainer. Otherwise, search the
01257     // combined box to see if anything else is inappropriately overlapped.
01258     if (!part_box.contains(c_box) && !c_box.contains(part_box)) {
01259       // Search the combined rectangle to see if anything new is overlapped.
01260       // This is a preliminary test designed to quickly weed-out stupid
01261       // merge candidates that would create a big list of overlapped objects
01262       // for the squared-order overlap analysis. Eg. vertical and horizontal
01263       // line-like objects that overlap real text when merged:
01264       // || ==========================
01265       // ||
01266       // ||  r e a l  t e x t
01267       // ||
01268       // ||
01269       TBOX merged_box(part_box);
01270       merged_box += c_box;
01271       ColPartitionGridSearch msearch(this);
01272       msearch.SetUniqueMode(true);
01273       msearch.StartRectSearch(merged_box);
01274       ColPartition* neighbour;
01275       while ((neighbour = msearch.NextRectSearch()) != NULL) {
01276         if (neighbour == part || neighbour == candidate)
01277           continue;  // Ignore itself.
01278         if (neighbour->OKMergeOverlap(*part, *candidate, ok_overlap, false))
01279           continue;  // This kind of merge overlap is OK.
01280         TBOX n_box = neighbour->bounding_box();
01281         // The overlap is OK if:
01282         // * the n_box already overlapped the part or the candidate OR
01283         // * the n_box is a suitable merge with either part or candidate
01284         if (!n_box.overlap(part_box) && !n_box.overlap(c_box) &&
01285             !OKMergeCandidate(part, neighbour, false) &&
01286             !OKMergeCandidate(candidate, neighbour, false))
01287           break;
01288       }
01289       if (neighbour != NULL) {
01290         if (debug) {
01291           tprintf("Combined box overlaps another that is not OK despite"
01292                   " allowance of %d:", ok_overlap);
01293           neighbour->bounding_box().print();
01294           tprintf("Reason:");
01295           OKMergeCandidate(part, neighbour, true);
01296           tprintf("...and:");
01297           OKMergeCandidate(candidate, neighbour, true);
01298           tprintf("Overlap:");
01299           neighbour->OKMergeOverlap(*part, *candidate, ok_overlap, true);
01300         }
01301         continue;
01302       }
01303     }
01304     if (debug) {
01305       tprintf("Adding candidate:");
01306       candidate->bounding_box().print();
01307     }
01308     // Unique elements as they arrive.
01309     candidates->add_sorted(SortByBoxLeft<ColPartition>, true, candidate);
01310   }
01311 }
01312 
01313 // Smoothes the region type/flow type of the given part by looking at local
01314 // neigbours and the given image mask. Searches a padded rectangle with the
01315 // padding truncated on one size of the part's box in turn for each side,
01316 // using the result (if any) that has the least distance to all neighbours
01317 // that contribute to the decision. This biases in favor of rectangular
01318 // regions without completely enforcing them.
01319 // If a good decision cannot be reached, the part is left unchanged.
01320 // im_box and rerotation are used to map blob coordinates onto the
01321 // nontext_map, which is used to prevent the spread of text neighbourhoods
01322 // into images.
01323 // Returns true if the partition was changed.
01324 bool ColPartitionGrid::SmoothRegionType(Pix* nontext_map,
01325                                         const TBOX& im_box,
01326                                         const FCOORD& rerotation,
01327                                         bool debug,
01328                                         ColPartition* part) {
01329   const TBOX& part_box = part->bounding_box();
01330   if (debug) {
01331     tprintf("Smooothing part at:");
01332     part_box.print();
01333   }
01334   BlobRegionType best_type = BRT_UNKNOWN;
01335   int best_dist = MAX_INT32;
01336   int max_dist = MIN(part_box.width(), part_box.height());
01337   max_dist = MAX(max_dist * kMaxNeighbourDistFactor, gridsize() * 2);
01338   // Search with the pad truncated on each side of the box in turn.
01339   bool any_image = false;
01340   bool all_image = true;
01341   for (int d = 0; d < BND_COUNT; ++d) {
01342     int dist;
01343     BlobNeighbourDir dir = static_cast<BlobNeighbourDir>(d);
01344     BlobRegionType type = SmoothInOneDirection(dir, nontext_map, im_box,
01345                                                rerotation, debug, *part,
01346                                                &dist);
01347     if (debug) {
01348       tprintf("Result in dir %d = %d at dist %d\n", dir, type, dist);
01349     }
01350     if (type != BRT_UNKNOWN && dist < best_dist) {
01351       best_dist = dist;
01352       best_type = type;
01353     }
01354     if (type == BRT_POLYIMAGE)
01355       any_image = true;
01356     else
01357       all_image = false;
01358   }
01359   if (best_dist > max_dist)
01360     return false;  // Too far away to set the type with it.
01361   if (part->flow() == BTFT_STRONG_CHAIN && !all_image) {
01362       return false;  // We are not modifying it.
01363   }
01364   BlobRegionType new_type = part->blob_type();
01365   BlobTextFlowType new_flow = part->flow();
01366   if (best_type == BRT_TEXT && !any_image) {
01367     new_flow = BTFT_STRONG_CHAIN;
01368     new_type = BRT_TEXT;
01369   } else if (best_type == BRT_VERT_TEXT && !any_image) {
01370     new_flow = BTFT_STRONG_CHAIN;
01371     new_type = BRT_VERT_TEXT;
01372   } else if (best_type == BRT_POLYIMAGE) {
01373     new_flow = BTFT_NONTEXT;
01374     new_type = BRT_UNKNOWN;
01375   }
01376   if (new_type != part->blob_type() || new_flow != part->flow()) {
01377     part->set_flow(new_flow);
01378     part->set_blob_type(new_type);
01379     part->SetBlobTypes();
01380     if (debug) {
01381       tprintf("Modified part:");
01382       part->Print();
01383     }
01384     return true;
01385   } else {
01386     return false;
01387   }
01388 }
01389 
01390 // Sets up a search box based on the part_box, padded in all directions
01391 // except direction. Also setup dist_scaling to weight x,y distances according
01392 // to the given direction.
01393 static void ComputeSearchBoxAndScaling(BlobNeighbourDir direction,
01394                                        const TBOX& part_box,
01395                                        int min_padding,
01396                                        TBOX* search_box,
01397                                        ICOORD* dist_scaling) {
01398   *search_box = part_box;
01399   // Generate a pad value based on the min dimension of part_box, but at least
01400   // min_padding and then scaled by kMaxPadFactor.
01401   int padding = MIN(part_box.height(), part_box.width());
01402   padding = MAX(padding, min_padding);
01403   padding *= kMaxPadFactor;
01404   search_box->pad(padding, padding);
01405   // Truncate the box in the appropriate direction and make the distance
01406   // metric slightly biased in the truncated direction.
01407   switch (direction) {
01408     case BND_LEFT:
01409       search_box->set_left(part_box.left());
01410       *dist_scaling = ICOORD(2, 1);
01411       break;
01412     case BND_BELOW:
01413       search_box->set_bottom(part_box.bottom());
01414       *dist_scaling = ICOORD(1, 2);
01415       break;
01416     case BND_RIGHT:
01417       search_box->set_right(part_box.right());
01418       *dist_scaling = ICOORD(2, 1);
01419       break;
01420     case BND_ABOVE:
01421       search_box->set_top(part_box.top());
01422       *dist_scaling = ICOORD(1, 2);
01423       break;
01424     default:
01425       ASSERT_HOST(false);
01426   }
01427 }
01428 
01429 // Local enum used by SmoothInOneDirection and AccumulatePartDistances
01430 // for the different types of partition neighbour.
01431 enum NeighbourPartitionType {
01432   NPT_HTEXT,       // Definite horizontal text.
01433   NPT_VTEXT,       // Definite vertical text.
01434   NPT_WEAK_HTEXT,  // Weakly horizontal text. Counts as HTEXT for HTEXT, but
01435                    // image for image and VTEXT.
01436   NPT_WEAK_VTEXT,  // Weakly vertical text. Counts as VTEXT for VTEXT, but
01437                    // image for image and HTEXT.
01438   NPT_IMAGE,       // Defininte non-text.
01439   NPT_COUNT        // Number of array elements.
01440 };
01441 
01442 // Executes the search for SmoothRegionType in a single direction.
01443 // Creates a bounding box that is padded in all directions except direction,
01444 // and searches it for other partitions. Finds the nearest collection of
01445 // partitions that makes a decisive result (if any) and returns the type
01446 // and the distance of the collection. If there are any pixels in the
01447 // nontext_map, then the decision is biased towards image.
01448 BlobRegionType ColPartitionGrid::SmoothInOneDirection(
01449     BlobNeighbourDir direction, Pix* nontext_map,
01450     const TBOX& im_box, const FCOORD& rerotation,
01451     bool debug, const ColPartition& part, int* best_distance) {
01452   // Set up a rectangle search bounded by the part.
01453   TBOX part_box = part.bounding_box();
01454   TBOX search_box;
01455   ICOORD dist_scaling;
01456   ComputeSearchBoxAndScaling(direction, part_box, gridsize(),
01457                              &search_box, &dist_scaling);
01458   bool image_region = ImageFind::CountPixelsInRotatedBox(search_box, im_box,
01459                                                          rerotation,
01460                                                          nontext_map) > 0;
01461   GenericVector<int> dists[NPT_COUNT];
01462   AccumulatePartDistances(part, dist_scaling, search_box,
01463                           nontext_map, im_box, rerotation, debug, dists);
01464   // By iteratively including the next smallest distance across the vectors,
01465   // (as in a merge sort) we can use the vector indices as counts of each type
01466   // and find the nearest set of objects that give us a definite decision.
01467   int counts[NPT_COUNT];
01468   memset(counts, 0, sizeof(counts[0]) * NPT_COUNT);
01469   // If there is image in the search box, tip the balance in image's favor.
01470   int image_bias = image_region ? kSmoothDecisionMargin / 2 : 0;
01471   BlobRegionType text_dir = part.blob_type();
01472   BlobTextFlowType flow_type = part.flow();
01473   int min_dist = 0;
01474   do {
01475     // Find the minimum new entry across the vectors
01476     min_dist = MAX_INT32;
01477     for (int i = 0; i < NPT_COUNT; ++i) {
01478       if (counts[i] < dists[i].size() && dists[i][counts[i]] < min_dist)
01479         min_dist = dists[i][counts[i]];
01480     }
01481     // Step all the indices/counts forward to include min_dist.
01482     for (int i = 0; i < NPT_COUNT; ++i) {
01483       while (counts[i] < dists[i].size() && dists[i][counts[i]] <= min_dist)
01484         ++counts[i];
01485     }
01486     *best_distance = min_dist;
01487     if (debug) {
01488       tprintf("Totals: htext=%d+%d, vtext=%d+%d, image=%d+%d, at dist=%d\n",
01489               counts[NPT_HTEXT], counts[NPT_WEAK_HTEXT],
01490               counts[NPT_VTEXT], counts[NPT_WEAK_VTEXT],
01491               counts[NPT_IMAGE], image_bias, min_dist);
01492     }
01493     // See if we have a decision yet.
01494     int image_count = counts[NPT_IMAGE];
01495     int htext_score = counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] -
01496         (image_count + counts[NPT_WEAK_VTEXT]);
01497     int vtext_score = counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] -
01498         (image_count + counts[NPT_WEAK_HTEXT]);
01499     if (image_count > 0 &&
01500         image_bias - htext_score >= kSmoothDecisionMargin &&
01501         image_bias - vtext_score >= kSmoothDecisionMargin) {
01502       *best_distance = dists[NPT_IMAGE][0];
01503       if (dists[NPT_WEAK_VTEXT].size() > 0 &&
01504           *best_distance > dists[NPT_WEAK_VTEXT][0])
01505         *best_distance = dists[NPT_WEAK_VTEXT][0];
01506       if (dists[NPT_WEAK_HTEXT].size() > 0 &&
01507           *best_distance > dists[NPT_WEAK_HTEXT][0])
01508         *best_distance = dists[NPT_WEAK_HTEXT][0];
01509       return BRT_POLYIMAGE;
01510     }
01511     if ((text_dir != BRT_VERT_TEXT || flow_type != BTFT_CHAIN) &&
01512         counts[NPT_HTEXT] > 0 && htext_score >= kSmoothDecisionMargin) {
01513       *best_distance = dists[NPT_HTEXT][0];
01514       return BRT_TEXT;
01515     } else if ((text_dir != BRT_TEXT || flow_type != BTFT_CHAIN) &&
01516         counts[NPT_VTEXT] > 0 && vtext_score >= kSmoothDecisionMargin) {
01517       *best_distance = dists[NPT_VTEXT][0];
01518       return BRT_VERT_TEXT;
01519     }
01520   } while (min_dist < MAX_INT32);
01521   return BRT_UNKNOWN;
01522 }
01523 
01524 // Counts the partitions in the given search_box by appending the gap
01525 // distance (scaled by dist_scaling) of the part from the base_part to the
01526 // vector of the appropriate type for the partition. Prior to return, the
01527 // vectors in the dists array are sorted in increasing order.
01528 // The nontext_map (+im_box, rerotation) is used to make text invisible if
01529 // there is non-text in between.
01530 // dists must be an array of GenericVectors of size NPT_COUNT.
01531 void ColPartitionGrid::AccumulatePartDistances(const ColPartition& base_part,
01532                                                const ICOORD& dist_scaling,
01533                                                const TBOX& search_box,
01534                                                Pix* nontext_map,
01535                                                const TBOX& im_box,
01536                                                const FCOORD& rerotation,
01537                                                bool debug,
01538                                                GenericVector<int>* dists) {
01539   const TBOX& part_box = base_part.bounding_box();
01540   ColPartitionGridSearch rsearch(this);
01541   rsearch.SetUniqueMode(true);
01542   rsearch.StartRectSearch(search_box);
01543   ColPartition* neighbour;
01544   // Search for compatible neighbours with a similar strokewidth, but not
01545   // on the other side of a tab vector.
01546   while ((neighbour = rsearch.NextRectSearch()) != NULL) {
01547     if (neighbour->IsUnMergeableType() ||
01548         !base_part.ConfirmNoTabViolation(*neighbour) ||
01549         neighbour == &base_part)
01550       continue;
01551     TBOX nbox = neighbour->bounding_box();
01552     BlobRegionType n_type = neighbour->blob_type();
01553     if ((n_type == BRT_TEXT || n_type == BRT_VERT_TEXT) &&
01554         !ImageFind::BlankImageInBetween(part_box, nbox, im_box, rerotation,
01555                                         nontext_map))
01556       continue;  // Text not visible the other side of image.
01557     if (BLOBNBOX::IsLineType(n_type))
01558       continue;  // Don't use horizontal lines as neighbours.
01559     int x_gap = MAX(part_box.x_gap(nbox), 0);
01560     int y_gap = MAX(part_box.y_gap(nbox), 0);
01561     int n_dist = x_gap * dist_scaling.x() + y_gap* dist_scaling.y();
01562     if (debug) {
01563       tprintf("Part has x-gap=%d, y=%d, dist=%d at:",
01564               x_gap, y_gap, n_dist);
01565       nbox.print();
01566     }
01567     // Truncate the number of boxes, so text doesn't get too much advantage.
01568     int n_boxes = MIN(neighbour->boxes_count(), kSmoothDecisionMargin);
01569     BlobTextFlowType n_flow = neighbour->flow();
01570     GenericVector<int>* count_vector = NULL;
01571     if (n_flow == BTFT_STRONG_CHAIN) {
01572       if (n_type == BRT_TEXT)
01573         count_vector = &dists[NPT_HTEXT];
01574       else
01575         count_vector = &dists[NPT_VTEXT];
01576       if (debug) {
01577         tprintf("%s %d\n", n_type == BRT_TEXT ? "Htext" : "Vtext", n_boxes);
01578       }
01579     } else if ((n_type == BRT_TEXT || n_type == BRT_VERT_TEXT) &&
01580                (n_flow == BTFT_CHAIN || n_flow == BTFT_NEIGHBOURS)) {
01581       // Medium text counts as weak, and all else counts as image.
01582       if (n_type == BRT_TEXT)
01583         count_vector = &dists[NPT_WEAK_HTEXT];
01584       else
01585         count_vector = &dists[NPT_WEAK_VTEXT];
01586       if (debug) tprintf("Weak %d\n", n_boxes);
01587     } else {
01588       count_vector = &dists[NPT_IMAGE];
01589       if (debug) tprintf("Image %d\n", n_boxes);
01590     }
01591     if (count_vector != NULL) {
01592       for (int i = 0; i < n_boxes; ++i)
01593         count_vector->push_back(n_dist);
01594     }
01595     if (debug) {
01596       neighbour->Print();
01597     }
01598   }
01599   for (int i = 0; i < NPT_COUNT; ++i)
01600     dists[i].sort();
01601 }
01602 
01603 // Improves the margins of the part ColPartition by searching for
01604 // neighbours that vertically overlap significantly.
01605 // columns may be NULL, and indicates the assigned column structure this
01606 // is applicable to part.
01607 void ColPartitionGrid::FindPartitionMargins(ColPartitionSet* columns,
01608                                             ColPartition* part) {
01609   // Set up a rectangle search x-bounded by the column and y by the part.
01610   TBOX box = part->bounding_box();
01611   int y = part->MidY();
01612   // Initial left margin is based on the column, if there is one.
01613   int left_margin = bleft().x();
01614   int right_margin = tright().x();
01615   if (columns != NULL) {
01616     ColPartition* column = columns->ColumnContaining(box.left(), y);
01617     if (column != NULL)
01618       left_margin = column->LeftAtY(y);
01619     column = columns->ColumnContaining(box.right(), y);
01620     if (column != NULL)
01621       right_margin = column->RightAtY(y);
01622   }
01623   left_margin -= kColumnWidthFactor;
01624   right_margin += kColumnWidthFactor;
01625   // Search for ColPartitions that reduce the margin.
01626   left_margin = FindMargin(box.left() + box.height(), true, left_margin,
01627                            box.bottom(), box.top(), part);
01628   part->set_left_margin(left_margin);
01629   // Search for ColPartitions that reduce the margin.
01630   right_margin = FindMargin(box.right() - box.height(), false, right_margin,
01631                             box.bottom(), box.top(), part);
01632   part->set_right_margin(right_margin);
01633 }
01634 
01635 // Starting at x, and going in the specified direction, upto x_limit, finds
01636 // the margin for the given y range by searching sideways,
01637 // and ignoring not_this.
01638 int ColPartitionGrid::FindMargin(int x, bool right_to_left, int x_limit,
01639                                  int y_bottom, int y_top,
01640                                  const ColPartition* not_this) {
01641   int height = y_top - y_bottom;
01642   // Iterate the ColPartitions in the grid.
01643   ColPartitionGridSearch side_search(this);
01644   side_search.SetUniqueMode(true);
01645   side_search.StartSideSearch(x, y_bottom, y_top);
01646   ColPartition* part;
01647   while ((part = side_search.NextSideSearch(right_to_left)) != NULL) {
01648     // Ignore itself.
01649     if (part == not_this)  // || part->IsLineType())
01650       continue;
01651     // Must overlap by enough, based on the min of the heights, so
01652     // large partitions can't smash through small ones.
01653     TBOX box = part->bounding_box();
01654     int min_overlap = MIN(height, box.height());
01655     min_overlap = static_cast<int>(min_overlap * kMarginOverlapFraction + 0.5);
01656     int y_overlap = MIN(y_top, box.top()) - MAX(y_bottom, box.bottom());
01657     if (y_overlap < min_overlap)
01658       continue;
01659     // Must be going the right way.
01660     int x_edge = right_to_left ? box.right() : box.left();
01661     if ((x_edge < x) != right_to_left)
01662       continue;
01663     // If we have gone past x_limit, then x_limit will do.
01664     if ((x_edge < x_limit) == right_to_left)
01665       break;
01666     // It reduces x limit, so save the new one.
01667     x_limit = x_edge;
01668   }
01669   return x_limit;
01670 }
01671 
01672 
01673 }  // namespace tesseract.