Tesseract  3.02
tesseract-ocr/textord/strokewidth.cpp
Go to the documentation of this file.
00001 
00002 // File:        strokewidth.cpp
00003 // Description: Subclass of BBGrid to find uniformity of strokewidth.
00004 // Author:      Ray Smith
00005 // Created:     Mon Mar 31 16:17:01 PST 2008
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include "strokewidth.h"
00025 
00026 #include <math.h>
00027 
00028 #include "blobbox.h"
00029 #include "colpartition.h"
00030 #include "colpartitiongrid.h"
00031 #include "imagefind.h"
00032 #include "linlsq.h"
00033 #include "statistc.h"
00034 #include "tabfind.h"
00035 #include "textlineprojection.h"
00036 #include "tordmain.h"  // For SetBlobStrokeWidth.
00037 
00038 // Include automatically generated configuration file if running autoconf.
00039 #ifdef HAVE_CONFIG_H
00040 #include "config_auto.h"
00041 #endif
00042 
00043 namespace tesseract {
00044 
00045 INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
00046 BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
00047 BOOL_VAR(textord_tabfind_vertical_text, true, "Enable vertical detection");
00048 BOOL_VAR(textord_tabfind_force_vertical_text, false,
00049          "Force using vertical text page mode");
00050 BOOL_VAR(textord_tabfind_vertical_horizontal_mix, true,
00051          "find horizontal lines such as headers in vertical page mode");
00052 double_VAR(textord_tabfind_vertical_text_ratio, 0.5,
00053            "Fraction of textlines deemed vertical to use vertical page mode");
00054 
00056 const double kStrokeWidthFractionTolerance = 0.125;
00061 const double kStrokeWidthTolerance = 1.5;
00062 // Same but for CJK we are a bit more generous.
00063 const double kStrokeWidthFractionCJK = 0.25;
00064 const double kStrokeWidthCJK = 2.0;
00065 // Radius in grid cells of search for broken CJK. Doesn't need to be very
00066 // large as the grid size should be about the size of a character anyway.
00067 const int kCJKRadius = 2;
00068 // Max distance fraction of size to join close but broken CJK characters.
00069 const double kCJKBrokenDistanceFraction = 0.25;
00070 // Max number of components in a broken CJK character.
00071 const int kCJKMaxComponents = 8;
00072 // Max aspect ratio of CJK broken characters when put back together.
00073 const double kCJKAspectRatio = 1.25;
00074 // Max increase in aspect ratio of CJK broken characters when merged.
00075 const double kCJKAspectRatioIncrease = 1.0625;
00076 // Max multiple of the grid size that will be used in computing median CJKsize.
00077 const int kMaxCJKSizeRatio = 5;
00078 // Min fraction of blobs broken CJK to iterate and run it again.
00079 const double kBrokenCJKIterationFraction = 0.125;
00080 // Multiple of gridsize as x-padding for a search box for diacritic base
00081 // characters.
00082 const double kDiacriticXPadRatio = 7.0;
00083 // Multiple of gridsize as y-padding for a search box for diacritic base
00084 // characters.
00085 const double kDiacriticYPadRatio = 1.75;
00086 // Min multiple of diacritic height that a neighbour must be to be a
00087 // convincing base character.
00088 const double kMinDiacriticSizeRatio = 1.0625;
00089 // Max multiple of a textline's median height as a threshold for the sum of
00090 // a diacritic's farthest x and y distances (gap + size).
00091 const double kMaxDiacriticDistanceRatio = 1.25;
00092 // Max x-gap between a diacritic and its base char as a fraction of the height
00093 // of the base char (allowing other blobs to fill the gap.)
00094 const double kMaxDiacriticGapToBaseCharHeight = 1.0;
00095 // Radius of a search for diacritics in grid units.
00096 const int kSearchRadius = 2;
00097 // Ratio between longest side of a line and longest side of a character.
00098 // (neighbor_min > blob_min * kLineTrapShortest &&
00099 //  neighbor_max < blob_max / kLineTrapLongest)
00100 // => neighbor is a grapheme and blob is a line.
00101 const int kLineTrapLongest = 4;
00102 // Ratio between shortest side of a line and shortest side of a character.
00103 const int kLineTrapShortest = 2;
00104 // Max aspect ratio of the total box before CountNeighbourGaps
00105 // decides immediately based on the aspect ratio.
00106 const int kMostlyOneDirRatio = 3;
00107 // Aspect ratio for a blob to be considered as line residue.
00108 const double kLineResidueAspectRatio = 8.0;
00109 // Padding ratio for line residue search box.
00110 const int kLineResiduePadRatio = 3;
00111 // Min multiple of neighbour size for a line residue to be genuine.
00112 const double kLineResidueSizeRatio = 1.75;
00113 // Aspect ratio filter for OSD.
00114 const float kSizeRatioToReject = 2.0;
00115 // Max number of normal blobs a large blob may overlap before it is rejected
00116 // and determined to be image
00117 const int kMaxLargeOverlaps = 3;
00118 // Expansion factor for search box for good neighbours.
00119 const double kNeighbourSearchFactor = 2.5;
00120 
00121 StrokeWidth::StrokeWidth(int gridsize,
00122                          const ICOORD& bleft, const ICOORD& tright)
00123   : BlobGrid(gridsize, bleft, tright), nontext_map_(NULL), projection_(NULL),
00124     denorm_(NULL), grid_box_(bleft, tright), rerotation_(1.0f, 0.0f) {
00125   leaders_win_ = NULL;
00126   widths_win_ = NULL;
00127   initial_widths_win_ = NULL;
00128   chains_win_ = NULL;
00129   diacritics_win_ = NULL;
00130   textlines_win_ = NULL;
00131   smoothed_win_ = NULL;
00132 }
00133 
00134 StrokeWidth::~StrokeWidth() {
00135   if (widths_win_ != NULL) {
00136     #ifndef GRAPHICS_DISABLED
00137     delete widths_win_->AwaitEvent(SVET_DESTROY);
00138     #endif  // GRAPHICS_DISABLED
00139     if (textord_tabfind_only_strokewidths)
00140       exit(0);
00141     delete widths_win_;
00142   }
00143   delete leaders_win_;
00144   delete initial_widths_win_;
00145   delete chains_win_;
00146   delete textlines_win_;
00147   delete smoothed_win_;
00148   delete diacritics_win_;
00149 }
00150 
00151 // Sets the neighbours member of the medium-sized blobs in the block.
00152 // Searches on 4 sides of each blob for similar-sized, similar-strokewidth
00153 // blobs and sets pointers to the good neighbours.
00154 void StrokeWidth::SetNeighboursOnMediumBlobs(TO_BLOCK* block) {
00155   // Run a preliminary strokewidth neighbour detection on the medium blobs.
00156   InsertBlobList(&block->blobs);
00157   BLOBNBOX_IT blob_it(&block->blobs);
00158   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00159     SetNeighbours(false, false, blob_it.data());
00160   }
00161   Clear();
00162 }
00163 
00164 // Sets the neighbour/textline writing direction members of the medium
00165 // and large blobs with optional repair of broken CJK characters first.
00166 // Repair of broken CJK is needed here because broken CJK characters
00167 // can fool the textline direction detection algorithm.
00168 void StrokeWidth::FindTextlineDirectionAndFixBrokenCJK(bool cjk_merge,
00169                                                        TO_BLOCK* input_block) {
00170   // Setup the grid with the remaining (non-noise) blobs.
00171   InsertBlobs(input_block);
00172   // Repair broken CJK characters if needed.
00173   while (cjk_merge && FixBrokenCJK(input_block));
00174   // Grade blobs by inspection of neighbours.
00175   FindTextlineFlowDirection(false);
00176   // Clear the grid ready for rotation or leader finding.
00177   Clear();
00178 }
00179 
00180 // Helper to collect and count horizontal and vertical blobs from a list.
00181 static void CollectHorizVertBlobs(BLOBNBOX_LIST* input_blobs,
00182                                   int* num_vertical_blobs,
00183                                   int* num_horizontal_blobs,
00184                                   BLOBNBOX_CLIST* vertical_blobs,
00185                                   BLOBNBOX_CLIST* horizontal_blobs,
00186                                   BLOBNBOX_CLIST* nondescript_blobs) {
00187   BLOBNBOX_C_IT v_it(vertical_blobs);
00188   BLOBNBOX_C_IT h_it(horizontal_blobs);
00189   BLOBNBOX_C_IT n_it(nondescript_blobs);
00190   BLOBNBOX_IT blob_it(input_blobs);
00191   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00192     BLOBNBOX* blob = blob_it.data();
00193     const TBOX& box = blob->bounding_box();
00194     float y_x = static_cast<float>(box.height()) / box.width();
00195     float x_y = 1.0f / y_x;
00196     // Select a >= 1.0 ratio
00197     float ratio = x_y > y_x ? x_y : y_x;
00198     // If the aspect ratio is small and we want them for osd, save the blob.
00199     bool ok_blob = ratio <= kSizeRatioToReject;
00200     if (blob->UniquelyVertical()) {
00201       ++*num_vertical_blobs;
00202       if (ok_blob) v_it.add_after_then_move(blob);
00203     } else if (blob->UniquelyHorizontal()) {
00204       ++*num_horizontal_blobs;
00205       if (ok_blob) h_it.add_after_then_move(blob);
00206     } else if (ok_blob) {
00207       n_it.add_after_then_move(blob);
00208     }
00209   }
00210 }
00211 
00212 
00213 // Types all the blobs as vertical or horizontal text or unknown and
00214 // returns true if the majority are vertical.
00215 // If the blobs are rotated, it is necessary to call CorrectForRotation
00216 // after rotating everything, otherwise the work done here will be enough.
00217 // If osd_blobs is not null, a list of blobs from the dominant textline
00218 // direction are returned for use in orientation and script detection.
00219 bool StrokeWidth::TestVerticalTextDirection(TO_BLOCK* block,
00220                                             BLOBNBOX_CLIST* osd_blobs) {
00221   if (textord_tabfind_force_vertical_text) return true;
00222   if (!textord_tabfind_vertical_text) return false;
00223 
00224   int vertical_boxes = 0;
00225   int horizontal_boxes = 0;
00226   // Count vertical normal and large blobs.
00227   BLOBNBOX_CLIST vertical_blobs;
00228   BLOBNBOX_CLIST horizontal_blobs;
00229   BLOBNBOX_CLIST nondescript_blobs;
00230   CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes,
00231                         &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
00232   CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes,
00233                         &vertical_blobs, &horizontal_blobs, &nondescript_blobs);
00234   if (textord_debug_tabfind)
00235     tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n",
00236             horizontal_boxes, vertical_boxes,
00237             horizontal_blobs.length(), vertical_blobs.length(),
00238             nondescript_blobs.length());
00239   if (osd_blobs != NULL && vertical_boxes == 0 && horizontal_boxes == 0) {
00240     // Only nondescript blobs available, so return those.
00241     BLOBNBOX_C_IT osd_it(osd_blobs);
00242     osd_it.add_list_after(&nondescript_blobs);
00243     return false;
00244   }
00245   int min_vert_boxes = static_cast<int>((vertical_boxes + horizontal_boxes) *
00246                                         textord_tabfind_vertical_text_ratio);
00247   if (vertical_boxes >= min_vert_boxes) {
00248     if (osd_blobs != NULL) {
00249       BLOBNBOX_C_IT osd_it(osd_blobs);
00250       osd_it.add_list_after(&vertical_blobs);
00251     }
00252     return true;
00253   } else {
00254     if (osd_blobs != NULL) {
00255       BLOBNBOX_C_IT osd_it(osd_blobs);
00256       osd_it.add_list_after(&horizontal_blobs);
00257     }
00258     return false;
00259   }
00260 }
00261 
00262 // Corrects the data structures for the given rotation.
00263 void StrokeWidth::CorrectForRotation(const FCOORD& rotation,
00264                                      ColPartitionGrid* part_grid) {
00265   Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
00266   grid_box_ = TBOX(bleft(), tright());
00267   rerotation_.set_x(rotation.x());
00268   rerotation_.set_y(-rotation.y());
00269 }
00270 
00271 // Finds leader partitions and inserts them into the given part_grid.
00272 void StrokeWidth::FindLeaderPartitions(TO_BLOCK* block,
00273                                        ColPartitionGrid* part_grid) {
00274   Clear();
00275   // Find and isolate leaders in the noise list.
00276   ColPartition_LIST leader_parts;
00277   FindLeadersAndMarkNoise(block, &leader_parts);
00278   // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
00279   InsertBlobList(&block->blobs);
00280   // Mark blobs that have leader neighbours.
00281   for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
00282     ColPartition* part = it.extract();
00283     part->ClaimBoxes();
00284     MarkLeaderNeighbours(part, LR_LEFT);
00285     MarkLeaderNeighbours(part, LR_RIGHT);
00286     part_grid->InsertBBox(true, true, part);
00287   }
00288 }
00289 
00290 // Finds and marks noise those blobs that look like bits of vertical lines
00291 // that would otherwise screw up layout analysis.
00292 void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
00293   BlobGridSearch gsearch(this);
00294   BLOBNBOX* bbox;
00295   // For every vertical line-like bbox in the grid, search its neighbours
00296   // to find the tallest, and if the original box is taller by sufficient
00297   // margin, then call it line residue and delete it.
00298   gsearch.StartFullSearch();
00299   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00300     TBOX box = bbox->bounding_box();
00301     if (box.height() < box.width() * kLineResidueAspectRatio)
00302       continue;
00303     // Set up a rectangle search around the blob to find the size of its
00304     // neighbours.
00305     int padding = box.height() * kLineResiduePadRatio;
00306     TBOX search_box = box;
00307     search_box.pad(padding, padding);
00308     bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
00309                                                box.bottom());
00310     // Find the largest object in the search box not equal to bbox.
00311     BlobGridSearch rsearch(this);
00312     int max_size = 0;
00313     BLOBNBOX* n;
00314     rsearch.StartRectSearch(search_box);
00315     while ((n = rsearch.NextRectSearch()) != NULL) {
00316       if (n == bbox) continue;
00317       TBOX nbox = n->bounding_box();
00318       if (nbox.height() > max_size) {
00319         max_size = nbox.height();
00320       }
00321     }
00322     if (debug) {
00323       tprintf("Max neighbour size=%d for candidate line box at:", max_size);
00324       box.print();
00325     }
00326     if (max_size * kLineResidueSizeRatio < box.height()) {
00327       #ifndef GRAPHICS_DISABLED
00328       if (leaders_win_ != NULL) {
00329         // We are debugging, so display deleted in pink blobs in the same
00330         // window that we use to display leader detection.
00331         leaders_win_->Pen(ScrollView::PINK);
00332         leaders_win_->Rectangle(box.left(), box.bottom(),
00333                                 box.right(), box.top());
00334       }
00335       #endif  // GRAPHICS_DISABLED
00336       ColPartition::MakeBigPartition(bbox, big_part_list);
00337     }
00338   }
00339 }
00340 
00341 // Types all the blobs as vertical text or horizontal text or unknown and
00342 // puts them into initial ColPartitions in the supplied part_grid.
00343 // rerotation determines how to get back to the image coordinates from the
00344 // blob coordinates (since they may have been rotated for vertical text).
00345 // block is the single block for the whole page or rectangle to be OCRed.
00346 // nontext_pix (full-size), is a binary mask used to prevent merges across
00347 // photo/text boundaries. It is not kept beyond this function.
00348 // denorm provides a mapping back to the image from the current blob
00349 // coordinate space.
00350 // projection provides a measure of textline density over the image and
00351 // provides functions to assist with diacritic detection. It should be a
00352 // pointer to a new TextlineProjection, and will be setup here.
00353 // part_grid is the output grid of textline partitions.
00354 // Large blobs that cause overlap are put in separate partitions and added
00355 // to the big_parts list.
00356 void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
00357                                            TO_BLOCK* block,
00358                                            Pix* nontext_pix,
00359                                            const DENORM* denorm,
00360                                            TextlineProjection* projection,
00361                                            ColPartitionGrid* part_grid,
00362                                            ColPartition_LIST* big_parts) {
00363   nontext_map_ = nontext_pix;
00364   projection_ = projection;
00365   denorm_ = denorm;
00366   // Clear and re Insert to take advantage of the tab stops in the blobs.
00367   Clear();
00368   // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
00369   InsertBlobs(block);
00370 
00371   // Run FixBrokenCJK() again if the page is rotated and the blobs
00372   // lists are reset and re-flitered, because we may have some new
00373   // blobs in the medium blob list.
00374   if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
00375     FixBrokenCJK(block);
00376   }
00377   FindTextlineFlowDirection(true);
00378   projection_->ConstructProjection(block, rerotation, nontext_map_);
00379   if (textord_tabfind_show_strokewidths) {
00380     ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
00381     projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
00382     projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
00383   }
00384   projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
00385   projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
00386   // Clear and re Insert to take advantage of the removed diacritics.
00387   Clear();
00388   InsertBlobs(block);
00389   FindInitialPartitions(rerotation, block, part_grid, big_parts);
00390   nontext_map_ = NULL;
00391   projection_ = NULL;
00392   denorm_ = NULL;
00393 }
00394 
00395 static void PrintBoxWidths(BLOBNBOX* neighbour) {
00396   TBOX nbox = neighbour->bounding_box();
00397   tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n",
00398           nbox.left(), nbox.bottom(), nbox.right(), nbox.top(),
00399           neighbour->horz_stroke_width(), neighbour->vert_stroke_width(),
00400           2.0 * neighbour->cblob()->area()/neighbour->cblob()->perimeter());
00401 }
00402 
00404 void StrokeWidth::HandleClick(int x, int y) {
00405   BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT>::HandleClick(x, y);
00406   // Run a radial search for blobs that overlap.
00407   BlobGridSearch radsearch(this);
00408   radsearch.StartRadSearch(x, y, 1);
00409   BLOBNBOX* neighbour;
00410   FCOORD click(static_cast<float>(x), static_cast<float>(y));
00411   while ((neighbour = radsearch.NextRadSearch()) != NULL) {
00412     TBOX nbox = neighbour->bounding_box();
00413     if (nbox.contains(click) && neighbour->cblob() != NULL) {
00414       PrintBoxWidths(neighbour);
00415       if (neighbour->neighbour(BND_LEFT) != NULL)
00416         PrintBoxWidths(neighbour->neighbour(BND_LEFT));
00417       if (neighbour->neighbour(BND_RIGHT) != NULL)
00418         PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
00419       if (neighbour->neighbour(BND_ABOVE) != NULL)
00420         PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
00421       if (neighbour->neighbour(BND_BELOW) != NULL)
00422         PrintBoxWidths(neighbour->neighbour(BND_BELOW));
00423       int gaps[BND_COUNT];
00424       neighbour->NeighbourGaps(gaps);
00425       tprintf("Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
00426               "Good=    %d        %d        %d        %d\n",
00427               gaps[BND_LEFT], gaps[BND_RIGHT],
00428               gaps[BND_ABOVE], gaps[BND_BELOW],
00429               neighbour->horz_possible(),
00430               neighbour->vert_possible(),
00431               neighbour->good_stroke_neighbour(BND_LEFT),
00432               neighbour->good_stroke_neighbour(BND_RIGHT),
00433               neighbour->good_stroke_neighbour(BND_ABOVE),
00434               neighbour->good_stroke_neighbour(BND_BELOW));
00435       break;
00436     }
00437   }
00438 }
00439 
00440 // Detects and marks leader dots/dashes.
00441 //    Leaders are horizontal chains of small or noise blobs that look
00442 //    monospace according to ColPartition::MarkAsLeaderIfMonospaced().
00443 // Detected leaders become the only occupants of the block->small_blobs list.
00444 // Non-leader small blobs get moved to the blobs list.
00445 // Non-leader noise blobs remain singletons in the noise list.
00446 // All small and noise blobs in high density regions are marked BTFT_NONTEXT.
00447 // block is the single block for the whole page or rectangle to be OCRed.
00448 // leader_parts is the output.
00449 void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK* block,
00450                                           ColPartition_LIST* leader_parts) {
00451   InsertBlobList(&block->small_blobs);
00452   InsertBlobList(&block->noise_blobs);
00453   BlobGridSearch gsearch(this);
00454   BLOBNBOX* bbox;
00455   // For every bbox in the grid, set its neighbours.
00456   gsearch.StartFullSearch();
00457   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00458     SetNeighbours(true, false, bbox);
00459   }
00460   ColPartition_IT part_it(leader_parts);
00461   gsearch.StartFullSearch();
00462   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00463     if (bbox->flow() == BTFT_NONE) {
00464       if (bbox->neighbour(BND_RIGHT) == NULL &&
00465           bbox->neighbour(BND_LEFT) == NULL)
00466         continue;
00467       // Put all the linked blobs into a ColPartition.
00468       ColPartition* part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
00469       BLOBNBOX* blob;
00470       for (blob = bbox; blob != NULL && blob->flow() == BTFT_NONE;
00471            blob = blob->neighbour(BND_RIGHT))
00472         part->AddBox(blob);
00473       for (blob = bbox->neighbour(BND_LEFT); blob != NULL &&
00474            blob->flow() == BTFT_NONE;
00475            blob = blob->neighbour(BND_LEFT))
00476         part->AddBox(blob);
00477       if (part->MarkAsLeaderIfMonospaced())
00478         part_it.add_after_then_move(part);
00479       else
00480         delete part;
00481     }
00482   }
00483   if (textord_tabfind_show_strokewidths) {
00484     leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
00485   }
00486   // Move any non-leaders from the small to the blobs list, as they are
00487   // most likely dashes or broken characters.
00488   BLOBNBOX_IT blob_it(&block->blobs);
00489   BLOBNBOX_IT small_it(&block->small_blobs);
00490   for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
00491     BLOBNBOX* blob = small_it.data();
00492     if (blob->flow() != BTFT_LEADER) {
00493       if (blob->flow() == BTFT_NEIGHBOURS)
00494         blob->set_flow(BTFT_NONE);
00495       blob->ClearNeighbours();
00496       blob_it.add_to_end(small_it.extract());
00497     }
00498   }
00499   // Move leaders from the noise list to the small list, leaving the small
00500   // list exclusively leaders, so they don't get processed further,
00501   // and the remaining small blobs all in the noise list.
00502   BLOBNBOX_IT noise_it(&block->noise_blobs);
00503   for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
00504     BLOBNBOX* blob = noise_it.data();
00505     if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
00506       small_it.add_to_end(noise_it.extract());
00507     } else if (blob->flow() == BTFT_NEIGHBOURS) {
00508       blob->set_flow(BTFT_NONE);
00509       blob->ClearNeighbours();
00510     }
00511   }
00512   // Clear the grid as we don't want the small stuff hanging around in it.
00513   Clear();
00514 }
00515 
00518 void StrokeWidth::InsertBlobs(TO_BLOCK* block) {
00519   InsertBlobList(&block->blobs);
00520   InsertBlobList(&block->large_blobs);
00521 }
00522 
00523 // Checks the left or right side of the given leader partition and sets the
00524 // (opposite) leader_on_right or leader_on_left flags for blobs
00525 // that are next to the given side of the given leader partition.
00526 void StrokeWidth::MarkLeaderNeighbours(const ColPartition* part,
00527                                        LeftOrRight side) {
00528   const TBOX& part_box = part->bounding_box();
00529   BlobGridSearch blobsearch(this);
00530   // Search to the side of the leader for the nearest neighbour.
00531   BLOBNBOX* best_blob = NULL;
00532   int best_gap = 0;
00533   blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left()
00534                                              : part_box.right(),
00535                              part_box.bottom(), part_box.top());
00536   BLOBNBOX* blob;
00537   while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != NULL) {
00538     const TBOX& blob_box = blob->bounding_box();
00539     if (!blob_box.y_overlap(part_box))
00540       continue;
00541     int x_gap = blob_box.x_gap(part_box);
00542     if (x_gap > 2 * gridsize()) {
00543       break;
00544     } else if (best_blob == NULL || x_gap < best_gap) {
00545       best_blob = blob;
00546       best_gap = x_gap;
00547     }
00548   }
00549   if (best_blob != NULL) {
00550     if (side == LR_LEFT)
00551       best_blob->set_leader_on_right(true);
00552     else
00553       best_blob->set_leader_on_left(true);
00554     #ifndef GRAPHICS_DISABLED
00555     if (leaders_win_ != NULL) {
00556       leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
00557       const TBOX& blob_box = best_blob->bounding_box();
00558       leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(),
00559                               blob_box.right(), blob_box.top());
00560     }
00561     #endif  // GRAPHICS_DISABLED
00562   }
00563 }
00564 
00565 // Helper to compute the UQ of the square-ish CJK charcters.
00566 static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST* blobs) {
00567   STATS sizes(0, gridsize * kMaxCJKSizeRatio);
00568   BLOBNBOX_IT it(blobs);
00569   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00570     BLOBNBOX* blob = it.data();
00571     int width = blob->bounding_box().width();
00572     int height = blob->bounding_box().height();
00573     if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio)
00574       sizes.add(height, 1);
00575   }
00576   return static_cast<int>(sizes.ile(0.75f) + 0.5);
00577 }
00578 
00579 // Fix broken CJK characters, using the fake joined blobs mechanism.
00580 // Blobs are really merged, ie the master takes all the outlines and the
00581 // others are deleted.
00582 // Returns true if sufficient blobs are merged that it may be worth running
00583 // again, due to a better estimate of character size.
00584 bool StrokeWidth::FixBrokenCJK(TO_BLOCK* block) {
00585   BLOBNBOX_LIST* blobs = &block->blobs;
00586   int median_height = UpperQuartileCJKSize(gridsize(), blobs);
00587   int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
00588   int max_size = static_cast<int>(median_height * kCJKAspectRatio);
00589   int num_fixed = 0;
00590   BLOBNBOX_IT blob_it(blobs);
00591 
00592   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00593     BLOBNBOX* blob = blob_it.data();
00594     if (blob->cblob() == NULL || blob->cblob()->out_list()->empty())
00595       continue;
00596     TBOX bbox = blob->bounding_box();
00597     bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(),
00598                                                bbox.bottom());
00599     if (debug) {
00600       tprintf("Checking for Broken CJK (max size=%d):", max_size);
00601       bbox.print();
00602     }
00603     // Generate a list of blobs that overlap or are near enough to merge.
00604     BLOBNBOX_CLIST overlapped_blobs;
00605     AccumulateOverlaps(blob, debug, max_size, max_dist,
00606                        &bbox, &overlapped_blobs);
00607     if (!overlapped_blobs.empty()) {
00608       // There are overlapping blobs, so qualify them as being satisfactory
00609       // before removing them from the grid and replacing them with the union.
00610       // The final box must be roughly square.
00611       if (bbox.width() > bbox.height() * kCJKAspectRatio ||
00612           bbox.height() > bbox.width() * kCJKAspectRatio) {
00613         if (debug) {
00614           tprintf("Bad final aspectratio:");
00615           bbox.print();
00616         }
00617         continue;
00618       }
00619       // There can't be too many blobs to merge.
00620       if (overlapped_blobs.length() >= kCJKMaxComponents) {
00621         if (debug)
00622           tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
00623         continue;
00624       }
00625       // The strokewidths must match amongst the join candidates.
00626       BLOBNBOX_C_IT n_it(&overlapped_blobs);
00627       for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
00628         BLOBNBOX* neighbour = NULL;
00629         neighbour = n_it.data();
00630         if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK,
00631                                        kStrokeWidthCJK))
00632           break;
00633       }
00634       if (!n_it.cycled_list()) {
00635         if (debug) {
00636           tprintf("Bad stroke widths:");
00637           PrintBoxWidths(blob);
00638         }
00639         continue;  // Not good enough.
00640       }
00641 
00642       // Merge all the candidates into blob.
00643       // We must remove blob from the grid and reinsert it after merging
00644       // to maintain the integrity of the grid.
00645       RemoveBBox(blob);
00646       // Everything else will be calculated later.
00647       for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
00648         BLOBNBOX* neighbour = n_it.data();
00649         RemoveBBox(neighbour);
00650         blob->really_merge(neighbour);
00651         if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
00652           blob->rotate_box(rerotation_);
00653         }
00654       }
00655       InsertBBox(true, true, blob);
00656       ++num_fixed;
00657       if (debug) {
00658         tprintf("Done! Final box:");
00659         bbox.print();
00660       }
00661     }
00662   }
00663   // Mark for deletion all the empty shell blobs that contain no outlines.
00664   int num_remaining = 0;
00665   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00666     BLOBNBOX* blob = blob_it.data();
00667     if (blob->cblob() == NULL || blob->cblob()->out_list()->empty()) {
00668       // Mark blob for deletion.
00669       blob->set_region_type(BRT_NOISE);
00670     } else {
00671       ++num_remaining;
00672     }
00673   }
00674   // Permanently delete all the marked blobs after first removing all
00675   // references in the neighbour members.
00676   block->DeleteUnownedNoise();
00677   return num_fixed > num_remaining * kBrokenCJKIterationFraction;
00678 }
00679 
00680 // Helper function to determine whether it is reasonable to merge the
00681 // bbox and the nbox for repairing broken CJK.
00682 // The distance apart must not exceed max_dist, the combined size must
00683 // not exceed max_size, and the aspect ratio must either improve or at
00684 // least not get worse by much.
00685 static bool AcceptableCJKMerge(const TBOX& bbox, const TBOX& nbox,
00686                                bool debug, int max_size, int max_dist,
00687                                int* x_gap, int* y_gap) {
00688   *x_gap = bbox.x_gap(nbox);
00689   *y_gap = bbox.y_gap(nbox);
00690   TBOX merged(nbox);
00691   merged += bbox;
00692   if (debug) {
00693     tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
00694     merged.print();
00695   }
00696   if (*x_gap <= max_dist && *y_gap <= max_dist &&
00697       merged.width() <= max_size && merged.height() <= max_size) {
00698     // Close enough to call overlapping. Check aspect ratios.
00699     double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
00700     if (old_ratio < 1.0) old_ratio = 1.0 / old_ratio;
00701     double new_ratio = static_cast<double>(merged.width()) / merged.height();
00702     if (new_ratio < 1.0) new_ratio = 1.0 / new_ratio;
00703     if (new_ratio <= old_ratio * kCJKAspectRatioIncrease)
00704       return true;
00705   }
00706   return false;
00707 }
00708 
00709 // Collect blobs that overlap or are within max_dist of the input bbox.
00710 // Return them in the list of blobs and expand the bbox to be the union
00711 // of all the boxes. not_this is excluded from the search, as are blobs
00712 // that cause the merged box to exceed max_size in either dimension.
00713 void StrokeWidth::AccumulateOverlaps(const BLOBNBOX* not_this, bool debug,
00714                                      int max_size, int max_dist,
00715                                      TBOX* bbox, BLOBNBOX_CLIST* blobs) {
00716   // While searching, nearests holds the nearest failed blob in each
00717   // direction. When we have a nearest in each of the 4 directions, then
00718   // the search is over, and at this point the final bbox must not overlap
00719   // any of the nearests.
00720   BLOBNBOX* nearests[BND_COUNT];
00721   for (int i = 0; i < BND_COUNT; ++i) {
00722     nearests[i] = NULL;
00723   }
00724   int x = (bbox->left() + bbox->right()) / 2;
00725   int y = (bbox->bottom() + bbox->top()) / 2;
00726   // Run a radial search for blobs that overlap or are sufficiently close.
00727   BlobGridSearch radsearch(this);
00728   radsearch.StartRadSearch(x, y, kCJKRadius);
00729   BLOBNBOX* neighbour;
00730   while ((neighbour = radsearch.NextRadSearch()) != NULL) {
00731     if (neighbour == not_this) continue;
00732     TBOX nbox = neighbour->bounding_box();
00733     int x_gap, y_gap;
00734     if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist,
00735                            &x_gap, &y_gap)) {
00736       // Close enough to call overlapping. Merge boxes.
00737       *bbox += nbox;
00738       blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
00739       if (debug) {
00740         tprintf("Added:");
00741         nbox.print();
00742       }
00743       // Since we merged, search the nearests, as some might now me mergeable.
00744       for (int dir = 0; dir < BND_COUNT; ++dir) {
00745         if (nearests[dir] == NULL) continue;
00746         nbox = nearests[dir]->bounding_box();
00747         if (AcceptableCJKMerge(*bbox, nbox, debug, max_size,
00748                                max_dist, &x_gap, &y_gap)) {
00749           // Close enough to call overlapping. Merge boxes.
00750           *bbox += nbox;
00751           blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
00752           if (debug) {
00753             tprintf("Added:");
00754             nbox.print();
00755           }
00756           nearests[dir] = NULL;
00757           dir = -1;  // Restart the search.
00758         }
00759       }
00760     } else if (x_gap < 0 && x_gap <= y_gap) {
00761       // A vertical neighbour. Record the nearest.
00762       BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
00763       if (nearests[dir] == NULL ||
00764           y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
00765         nearests[dir] = neighbour;
00766       }
00767     } else if (y_gap < 0 && y_gap <= x_gap) {
00768       // A horizontal neighbour. Record the nearest.
00769       BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
00770       if (nearests[dir] == NULL ||
00771           x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
00772         nearests[dir] = neighbour;
00773       }
00774     }
00775     // If all nearests are non-null, then we have finished.
00776     if (nearests[BND_LEFT] && nearests[BND_RIGHT] &&
00777         nearests[BND_ABOVE] && nearests[BND_BELOW])
00778       break;
00779   }
00780   // Final overlap with a nearest is not allowed.
00781   for (int dir = 0; dir < BND_COUNT; ++dir) {
00782     if (nearests[dir] == NULL) continue;
00783     const TBOX& nbox = nearests[dir]->bounding_box();
00784     if (debug) {
00785       tprintf("Testing for overlap with:");
00786       nbox.print();
00787     }
00788     if (bbox->overlap(nbox)) {
00789       blobs->shallow_clear();
00790       if (debug)
00791         tprintf("Final box overlaps nearest\n");
00792       return;
00793     }
00794   }
00795 }
00796 
00797 // For each blob in this grid, Finds the textline direction to be horizontal
00798 // or vertical according to distance to neighbours and 1st and 2nd order
00799 // neighbours. Non-text tends to end up without a definite direction.
00800 // Result is setting of the neighbours and vert_possible/horz_possible
00801 // flags in the BLOBNBOXes currently in this grid.
00802 // This function is called more than once if page orientation is uncertain,
00803 // so display_if_debugging is true on the final call to display the results.
00804 void StrokeWidth::FindTextlineFlowDirection(bool display_if_debugging) {
00805   BlobGridSearch gsearch(this);
00806   BLOBNBOX* bbox;
00807   // For every bbox in the grid, set its neighbours.
00808   gsearch.StartFullSearch();
00809   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00810     SetNeighbours(false, display_if_debugging, bbox);
00811   }
00812   // Where vertical or horizontal wins by a big margin, clarify it.
00813   gsearch.StartFullSearch();
00814   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00815     SimplifyObviousNeighbours(bbox);
00816   }
00817   // Now try to make the blobs only vertical or horizontal using neighbours.
00818   gsearch.StartFullSearch();
00819   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00820     SetNeighbourFlows(bbox);
00821   }
00822   if ((textord_tabfind_show_strokewidths  && display_if_debugging) ||
00823       textord_tabfind_show_strokewidths > 1) {
00824     initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
00825   }
00826   // Improve flow direction with neighbours.
00827   gsearch.StartFullSearch();
00828   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00829     SmoothNeighbourTypes(bbox, false);
00830   }
00831   // Now allow reset of firm values to fix renegades.
00832   gsearch.StartFullSearch();
00833   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00834     SmoothNeighbourTypes(bbox, true);
00835   }
00836   // Repeat.
00837   gsearch.StartFullSearch();
00838   while ((bbox = gsearch.NextFullSearch()) != NULL) {
00839     SmoothNeighbourTypes(bbox, true);
00840   }
00841   if ((textord_tabfind_show_strokewidths  && display_if_debugging) ||
00842       textord_tabfind_show_strokewidths > 1) {
00843     widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
00844   }
00845 }
00846 
00847 // Sets the neighbours and good_stroke_neighbours members of the blob by
00848 // searching close on all 4 sides.
00849 // When finding leader dots/dashes, there is a slightly different rule for
00850 // what makes a good neighbour.
00851 void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap,
00852                                 BLOBNBOX* blob) {
00853   int line_trap_count = 0;
00854   for (int dir = 0; dir < BND_COUNT; ++dir) {
00855     BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
00856     line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
00857   }
00858   if (line_trap_count > 0 && activate_line_trap) {
00859     // It looks like a line so isolate it by clearing its neighbours.
00860     blob->ClearNeighbours();
00861     const TBOX& box = blob->bounding_box();
00862     blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
00863   }
00864 }
00865 
00866 
00867 // Sets the good_stroke_neighbours member of the blob if it has a
00868 // GoodNeighbour on the given side.
00869 // Also sets the neighbour in the blob, whether or not a good one is found.
00870 // Returns the number of blobs in the nearby search area that would lead us to
00871 // believe that this blob is a line separator.
00872 // Leaders get extra special lenient treatment.
00873 int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders,
00874                                    BLOBNBOX* blob) {
00875   // Search for neighbours that overlap vertically.
00876   TBOX blob_box = blob->bounding_box();
00877   bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(),
00878                                              blob_box.bottom());
00879   if (debug) {
00880     tprintf("FGN in dir %d for blob:", dir);
00881     blob_box.print();
00882   }
00883   int top = blob_box.top();
00884   int bottom = blob_box.bottom();
00885   int left = blob_box.left();
00886   int right = blob_box.right();
00887   int width = right - left;
00888   int height = top - bottom;
00889 
00890   // A trap to detect lines tests for the min dimension of neighbours
00891   // being larger than a multiple of the min dimension of the line
00892   // and the larger dimension being smaller than a fraction of the max
00893   // dimension of the line.
00894   int line_trap_max = MAX(width, height) / kLineTrapLongest;
00895   int line_trap_min = MIN(width, height) * kLineTrapShortest;
00896   int line_trap_count = 0;
00897 
00898   int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
00899                        ? height / 2 : width / 2;
00900   int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT)
00901                        ? height / 3 : width / 3;
00902   if (leaders)
00903     min_good_overlap = min_decent_overlap = 1;
00904 
00905   int search_pad = static_cast<int>(
00906       sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
00907   if (gridsize() > search_pad)
00908     search_pad = gridsize();
00909   TBOX search_box = blob_box;
00910   // Pad the search in the appropriate direction.
00911   switch (dir) {
00912   case BND_LEFT:
00913     search_box.set_left(search_box.left() - search_pad);
00914     break;
00915   case BND_RIGHT:
00916     search_box.set_right(search_box.right() + search_pad);
00917     break;
00918   case BND_BELOW:
00919     search_box.set_bottom(search_box.bottom() - search_pad);
00920     break;
00921   case BND_ABOVE:
00922     search_box.set_top(search_box.top() + search_pad);
00923     break;
00924   case BND_COUNT:
00925     return 0;
00926   }
00927 
00928   BlobGridSearch rectsearch(this);
00929   rectsearch.StartRectSearch(search_box);
00930   BLOBNBOX* best_neighbour = NULL;
00931   double best_goodness = 0.0;
00932   bool best_is_good = false;
00933   BLOBNBOX* neighbour;
00934   while ((neighbour = rectsearch.NextRectSearch()) != NULL) {
00935     TBOX nbox = neighbour->bounding_box();
00936     if (neighbour == blob)
00937       continue;
00938     int mid_x = (nbox.left() + nbox.right()) / 2;
00939     if (mid_x < blob->left_rule() || mid_x > blob->right_rule())
00940       continue;  // In a different column.
00941     if (debug) {
00942       tprintf("Neighbour at:");
00943       nbox.print();
00944     }
00945 
00946     // Last-minute line detector. There is a small upper limit to the line
00947     // width accepted by the morphological line detector.
00948     int n_width = nbox.width();
00949     int n_height = nbox.height();
00950     if (MIN(n_width, n_height) > line_trap_min &&
00951         MAX(n_width, n_height) < line_trap_max)
00952       ++line_trap_count;
00953     // Heavily joined text, such as Arabic may have very different sizes when
00954     // looking at the maxes, but the heights may be almost identical, so check
00955     // for a difference in height if looking sideways or width vertically.
00956     if (TabFind::VeryDifferentSizes(MAX(n_width, n_height),
00957                                     MAX(width, height)) &&
00958         (((dir == BND_LEFT || dir ==BND_RIGHT) &&
00959             TabFind::DifferentSizes(n_height, height)) ||
00960          ((dir == BND_BELOW || dir ==BND_ABOVE) &&
00961              TabFind::DifferentSizes(n_width, width)))) {
00962       if (debug) tprintf("Bad size\n");
00963       continue;  // Could be a different font size or non-text.
00964     }
00965     // Amount of vertical overlap between the blobs.
00966     int overlap;
00967     // If the overlap is along the short side of the neighbour, and it
00968     // is fully overlapped, then perp_overlap holds the length of the long
00969     // side of the neighbour. A measure to include hyphens and dashes as
00970     // legitimate neighbours.
00971     int perp_overlap;
00972     int gap;
00973     if (dir == BND_LEFT || dir == BND_RIGHT) {
00974       overlap = MIN(nbox.top(), top) - MAX(nbox.bottom(), bottom);
00975       if (overlap == nbox.height() && nbox.width() > nbox.height())
00976         perp_overlap = nbox.width();
00977       else
00978         perp_overlap = overlap;
00979       gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
00980       if (gap <= 0) {
00981         if (debug) tprintf("On wrong side\n");
00982         continue;  // On the wrong side.
00983       }
00984       gap -= n_width;
00985     } else {
00986       overlap = MIN(nbox.right(), right) - MAX(nbox.left(), left);
00987       if (overlap == nbox.width() && nbox.height() > nbox.width())
00988         perp_overlap = nbox.height();
00989       else
00990         perp_overlap = overlap;
00991       gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
00992       if (gap <= 0) {
00993         if (debug) tprintf("On wrong side\n");
00994         continue;  // On the wrong side.
00995       }
00996       gap -= n_height;
00997     }
00998     if (-gap > overlap) {
00999       if (debug) tprintf("Overlaps wrong way\n");
01000       continue;  // Overlaps the wrong way.
01001     }
01002     if (perp_overlap < min_decent_overlap) {
01003       if (debug) tprintf("Doesn't overlap enough\n");
01004       continue;  // Doesn't overlap enough.
01005     }
01006     bool bad_sizes = TabFind::DifferentSizes(height, n_height) &&
01007                      TabFind::DifferentSizes(width, n_width);
01008     bool is_good = overlap >= min_good_overlap && !bad_sizes &&
01009                    blob->MatchingStrokeWidth(*neighbour,
01010                                              kStrokeWidthFractionTolerance,
01011                                              kStrokeWidthTolerance);
01012     // Best is a fuzzy combination of gap, overlap and is good.
01013     // Basically if you make one thing twice as good without making
01014     // anything else twice as bad, then it is better.
01015     if (gap < 1) gap = 1;
01016     double goodness = (1.0 + is_good) * overlap / gap;
01017     if (debug) {
01018       tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n",
01019               goodness, best_goodness, is_good, overlap, gap);
01020     }
01021     if (goodness > best_goodness) {
01022       best_neighbour = neighbour;
01023       best_goodness = goodness;
01024       best_is_good = is_good;
01025     }
01026   }
01027   blob->set_neighbour(dir, best_neighbour, best_is_good);
01028   return line_trap_count;
01029 }
01030 
01031 // Helper to get a list of 1st-order neighbours.
01032 static void ListNeighbours(const BLOBNBOX* blob,
01033                            BLOBNBOX_CLIST* neighbours) {
01034   for (int dir = 0; dir < BND_COUNT; ++dir) {
01035     BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
01036     BLOBNBOX* neighbour = blob->neighbour(bnd);
01037     if (neighbour != NULL) {
01038       neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
01039     }
01040   }
01041 }
01042 
01043 // Helper to get a list of 1st and 2nd order neighbours.
01044 static void List2ndNeighbours(const BLOBNBOX* blob,
01045                               BLOBNBOX_CLIST* neighbours) {
01046   ListNeighbours(blob, neighbours);
01047   for (int dir = 0; dir < BND_COUNT; ++dir) {
01048     BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
01049     BLOBNBOX* neighbour = blob->neighbour(bnd);
01050     if (neighbour != NULL) {
01051       ListNeighbours(neighbour, neighbours);
01052     }
01053   }
01054 }
01055 
01056 // Helper to get a list of 1st, 2nd and 3rd order neighbours.
01057 static void List3rdNeighbours(const BLOBNBOX* blob,
01058                               BLOBNBOX_CLIST* neighbours) {
01059   List2ndNeighbours(blob, neighbours);
01060   for (int dir = 0; dir < BND_COUNT; ++dir) {
01061     BlobNeighbourDir bnd = static_cast<BlobNeighbourDir>(dir);
01062     BLOBNBOX* neighbour = blob->neighbour(bnd);
01063     if (neighbour != NULL) {
01064       List2ndNeighbours(neighbour, neighbours);
01065     }
01066   }
01067 }
01068 
01069 // Helper to count the evidence for verticalness or horizontalness
01070 // in a list of neighbours.
01071 static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST* neighbours,
01072                                int* pure_h_count, int* pure_v_count) {
01073   if (neighbours->length() <= kMostlyOneDirRatio)
01074     return;
01075   BLOBNBOX_C_IT it(neighbours);
01076   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01077     BLOBNBOX* blob = it.data();
01078     int h_min, h_max, v_min, v_max;
01079     blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
01080     if (debug)
01081       tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
01082     if (h_max < v_min ||
01083         blob->leader_on_left() || blob->leader_on_right()) {
01084       // Horizontal gaps are clear winners. Count a pure horizontal.
01085       ++*pure_h_count;
01086       if (debug) tprintf("Horz at:");
01087     } else if (v_max < h_min) {
01088       // Vertical gaps are clear winners. Clear a pure vertical.
01089       ++*pure_v_count;
01090       if (debug) tprintf("Vert at:");
01091     } else {
01092       if (debug) tprintf("Neither at:");
01093     }
01094     if (debug)
01095       blob->bounding_box().print();
01096   }
01097 }
01098 
01099 // Makes the blob to be only horizontal or vertical where evidence
01100 // is clear based on gaps of 2nd order neighbours, or definite individual
01101 // blobs.
01102 void StrokeWidth::SetNeighbourFlows(BLOBNBOX* blob) {
01103   if (blob->DefiniteIndividualFlow())
01104     return;
01105   bool debug = AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
01106                                              blob->bounding_box().bottom());
01107   if (debug) {
01108     tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:",
01109             blob->flow(), blob->region_type());
01110     blob->bounding_box().print();
01111   }
01112   BLOBNBOX_CLIST neighbours;
01113   List3rdNeighbours(blob, &neighbours);
01114   // The number of pure horizontal and vertical neighbours.
01115   int pure_h_count = 0;
01116   int pure_v_count = 0;
01117   CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
01118   if (debug) {
01119     HandleClick(blob->bounding_box().left() + 1,
01120                 blob->bounding_box().bottom() + 1);
01121     tprintf("SetFlows: h_count=%d, v_count=%d\n",
01122             pure_h_count, pure_v_count);
01123   }
01124   if (!neighbours.empty()) {
01125     blob->set_vert_possible(true);
01126     blob->set_horz_possible(true);
01127     if (pure_h_count > 2 * pure_v_count) {
01128       // Horizontal gaps are clear winners. Clear vertical neighbours.
01129       blob->set_vert_possible(false);
01130     } else if (pure_v_count > 2 * pure_h_count) {
01131       // Vertical gaps are clear winners. Clear horizontal neighbours.
01132       blob->set_horz_possible(false);
01133     }
01134   } else {
01135     // Lonely blob. Can't tell its flow direction.
01136     blob->set_vert_possible(false);
01137     blob->set_horz_possible(false);
01138   }
01139 }
01140 
01141 
01142 // Helper to count the number of horizontal and vertical blobs in a list.
01143 static void CountNeighbourTypes(BLOBNBOX_CLIST* neighbours,
01144                                 int* pure_h_count, int* pure_v_count) {
01145   BLOBNBOX_C_IT it(neighbours);
01146   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01147     BLOBNBOX* blob = it.data();
01148     if (blob->UniquelyHorizontal())
01149       ++*pure_h_count;
01150     if (blob->UniquelyVertical())
01151       ++*pure_v_count;
01152   }
01153 }
01154 
01155 // Nullify the neighbours in the wrong directions where the direction
01156 // is clear-cut based on a distance margin. Good for isolating vertical
01157 // text from neighbouring horizontal text.
01158 void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX* blob) {
01159   // Case 1: We have text that is likely several characters, blurry and joined
01160   //         together.
01161   if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
01162        blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
01163     // The blob is complex (not stick-like).
01164     if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
01165       // Horizontal conjoined text.
01166       blob->set_neighbour(BND_ABOVE, NULL, false);
01167       blob->set_neighbour(BND_BELOW, NULL, false);
01168       return;
01169     }
01170     if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
01171       // Vertical conjoined text.
01172       blob->set_neighbour(BND_LEFT, NULL, false);
01173       blob->set_neighbour(BND_RIGHT, NULL, false);
01174       return;
01175     }
01176   }
01177 
01178   // Case 2: This blob is likely a single character.
01179   int margin = gridsize() / 2;
01180   int h_min, h_max, v_min, v_max;
01181   blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
01182   if ((h_max + margin < v_min && h_max < margin / 2) ||
01183       blob->leader_on_left() || blob->leader_on_right()) {
01184     // Horizontal gaps are clear winners. Clear vertical neighbours.
01185     blob->set_neighbour(BND_ABOVE, NULL, false);
01186     blob->set_neighbour(BND_BELOW, NULL, false);
01187   } else if (v_max + margin < h_min && v_max < margin / 2) {
01188     // Vertical gaps are clear winners. Clear horizontal neighbours.
01189     blob->set_neighbour(BND_LEFT, NULL, false);
01190     blob->set_neighbour(BND_RIGHT, NULL, false);
01191   }
01192 }
01193 
01194 // Smoothes the vertical/horizontal type of the blob based on the
01195 // 2nd-order neighbours. If reset_all is true, then all blobs are
01196 // changed. Otherwise, only ambiguous blobs are processed.
01197 void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) {
01198   if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
01199     // There are both horizontal and vertical so try to fix it.
01200     BLOBNBOX_CLIST neighbours;
01201     List2ndNeighbours(blob, &neighbours);
01202     // The number of pure horizontal and vertical neighbours.
01203     int pure_h_count = 0;
01204     int pure_v_count = 0;
01205     CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
01206     if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
01207                                       blob->bounding_box().bottom())) {
01208       HandleClick(blob->bounding_box().left() + 1,
01209                   blob->bounding_box().bottom() + 1);
01210       tprintf("pure_h=%d, pure_v=%d\n",
01211               pure_h_count, pure_v_count);
01212     }
01213     if (pure_h_count > pure_v_count) {
01214       // Horizontal gaps are clear winners. Clear vertical neighbours.
01215       blob->set_vert_possible(false);
01216       blob->set_horz_possible(true);
01217     } else if (pure_v_count > pure_h_count) {
01218       // Vertical gaps are clear winners. Clear horizontal neighbours.
01219       blob->set_horz_possible(false);
01220       blob->set_vert_possible(true);
01221     }
01222   } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
01223                                     blob->bounding_box().bottom())) {
01224     HandleClick(blob->bounding_box().left() + 1,
01225                 blob->bounding_box().bottom() + 1);
01226     tprintf("Clean on pass 3!\n");
01227   }
01228 }
01229 
01230 // Partition creation. Accumulates vertical and horizontal text chains,
01231 // puts the remaining blobs in as unknowns, and then merges/splits to
01232 // minimize overlap and smoothes the types with neighbours and the color
01233 // image if provided. rerotation is used to rotate the coordinate space
01234 // back to the nontext_map_ image.
01235 void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
01236                                         TO_BLOCK* block,
01237                                         ColPartitionGrid* part_grid,
01238                                         ColPartition_LIST* big_parts) {
01239   FindVerticalTextChains(part_grid);
01240   FindHorizontalTextChains(part_grid);
01241   if (textord_tabfind_show_strokewidths) {
01242     chains_win_ = MakeWindow(0, 400, "Initial text chains");
01243     part_grid->DisplayBoxes(chains_win_);
01244     projection_->DisplayProjection();
01245   }
01246   part_grid->SplitOverlappingPartitions(big_parts);
01247   EasyMerges(part_grid);
01248   RemoveLargeUnusedBlobs(block, part_grid, big_parts);
01249   TBOX grid_box(bleft(), tright());
01250   while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
01251                                          rerotation));
01252   while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
01253                                          grid_box, rerotation));
01254   TestDiacritics(part_grid, block);
01255   MergeDiacritics(block, part_grid);
01256   if (textord_tabfind_show_strokewidths) {
01257     textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
01258     part_grid->DisplayBoxes(textlines_win_);
01259     diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
01260   }
01261   PartitionRemainingBlobs(part_grid);
01262   part_grid->SplitOverlappingPartitions(big_parts);
01263   EasyMerges(part_grid);
01264   while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box,
01265                                          rerotation));
01266   while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
01267                                          grid_box, rerotation));
01268   // Now eliminate strong stuff in a sea of the opposite.
01269   while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_,
01270                                          grid_box, rerotation));
01271   if (textord_tabfind_show_strokewidths) {
01272     smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
01273     part_grid->DisplayBoxes(smoothed_win_);
01274   }
01275 }
01276 
01277 // Helper verifies that blob's neighbour in direction dir is good to add to a
01278 // vertical text chain by returning the neighbour if it is not null, not owned,
01279 // and not uniquely horizontal, as well as its neighbour in the opposite
01280 // direction is blob.
01281 static BLOBNBOX* MutualUnusedVNeighbour(const BLOBNBOX* blob,
01282                                         BlobNeighbourDir dir) {
01283   BLOBNBOX* next_blob = blob->neighbour(dir);
01284   if (next_blob == NULL || next_blob->owner() != NULL ||
01285       next_blob->UniquelyHorizontal())
01286     return NULL;
01287   if (next_blob->neighbour(DirOtherWay(dir)) == blob)
01288     return next_blob;
01289   return NULL;
01290 }
01291 
01292 // Finds vertical chains of text-like blobs and puts them in ColPartitions.
01293 void StrokeWidth::FindVerticalTextChains(ColPartitionGrid* part_grid) {
01294   BlobGridSearch gsearch(this);
01295   BLOBNBOX* bbox;
01296   gsearch.StartFullSearch();
01297   while ((bbox = gsearch.NextFullSearch()) != NULL) {
01298     // Only process boxes that have no horizontal hope and have not yet
01299     // been included in a chain.
01300     BLOBNBOX* blob;
01301     if (bbox->owner() == NULL && bbox->UniquelyVertical() &&
01302         (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != NULL) {
01303       // Put all the linked blobs into a ColPartition.
01304       ColPartition* part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
01305       part->AddBox(bbox);
01306       while (blob != NULL) {
01307         part->AddBox(blob);
01308         blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
01309       }
01310       blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
01311       while (blob != NULL) {
01312         part->AddBox(blob);
01313         blob = MutualUnusedVNeighbour(blob, BND_BELOW);
01314       }
01315       CompletePartition(part, part_grid);
01316     }
01317   }
01318 }
01319 
01320 // Helper verifies that blob's neighbour in direction dir is good to add to a
01321 // horizontal text chain by returning the neighbour if it is not null, not
01322 // owned, and not uniquely vertical, as well as its neighbour in the opposite
01323 // direction is blob.
01324 static BLOBNBOX* MutualUnusedHNeighbour(const BLOBNBOX* blob,
01325                                         BlobNeighbourDir dir) {
01326   BLOBNBOX* next_blob = blob->neighbour(dir);
01327   if (next_blob == NULL || next_blob->owner() != NULL ||
01328       next_blob->UniquelyVertical())
01329     return NULL;
01330   if (next_blob->neighbour(DirOtherWay(dir)) == blob)
01331     return next_blob;
01332   return NULL;
01333 }
01334 
01335 // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
01336 void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid* part_grid) {
01337   BlobGridSearch gsearch(this);
01338   BLOBNBOX* bbox;
01339   gsearch.StartFullSearch();
01340   while ((bbox = gsearch.NextFullSearch()) != NULL) {
01341     BLOBNBOX* blob;
01342     if (bbox->owner() == NULL && bbox->UniquelyHorizontal() &&
01343         (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != NULL) {
01344       // Put all the linked blobs into a ColPartition.
01345       ColPartition* part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
01346       part->AddBox(bbox);
01347       while (blob != NULL) {
01348         part->AddBox(blob);
01349         blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
01350       }
01351       blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
01352       while (blob != NULL) {
01353         part->AddBox(blob);
01354         blob = MutualUnusedVNeighbour(blob, BND_LEFT);
01355       }
01356       CompletePartition(part, part_grid);
01357     }
01358   }
01359 }
01360 
01361 // Finds diacritics and saves their base character in the blob.
01362 // The objective is to move all diacritics to the noise_blobs list, so
01363 // they don't mess up early textline finding/merging, or force splits
01364 // on textlines that overlap a bit. Blobs that become diacritics must be
01365 // either part of no ColPartition (NULL owner) or in a small partition in
01366 // which ALL the blobs are diacritics, in which case the partition is
01367 // exploded (deleted) back to its blobs.
01368 void StrokeWidth::TestDiacritics(ColPartitionGrid* part_grid, TO_BLOCK* block) {
01369   BlobGrid small_grid(gridsize(), bleft(), tright());
01370   small_grid.InsertBlobList(&block->noise_blobs);
01371   small_grid.InsertBlobList(&block->blobs);
01372   int medium_diacritics = 0;
01373   int small_diacritics = 0;
01374   BLOBNBOX_IT small_it(&block->noise_blobs);
01375   for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
01376     BLOBNBOX* blob = small_it.data();
01377     if (blob->owner() == NULL && !blob->IsDiacritic() &&
01378         DiacriticBlob(&small_grid, blob)) {
01379       ++small_diacritics;
01380     }
01381   }
01382   BLOBNBOX_IT blob_it(&block->blobs);
01383   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01384     BLOBNBOX* blob = blob_it.data();
01385     if (blob->IsDiacritic()) {
01386       small_it.add_to_end(blob_it.extract());
01387       continue;  // Already a diacritic.
01388     }
01389     ColPartition* part = blob->owner();
01390     if (part == NULL && DiacriticBlob(&small_grid, blob)) {
01391       ++medium_diacritics;
01392       RemoveBBox(blob);
01393       small_it.add_to_end(blob_it.extract());
01394     } else if (part != NULL && !part->block_owned() &&
01395         part->boxes_count() < 3) {
01396       // We allow blobs in small partitions to become diacritics if ALL the
01397       // blobs in the partition qualify as we can then cleanly delete the
01398       // partition, turn all the blobs in it to diacritics and they can be
01399       // merged into the base character partition more easily than merging
01400       // the partitions.
01401       BLOBNBOX_C_IT box_it(part->boxes());
01402       for (box_it.mark_cycle_pt(); !box_it.cycled_list() &&
01403            DiacriticBlob(&small_grid, box_it.data());
01404            box_it.forward());
01405       if (box_it.cycled_list()) {
01406         // They are all good.
01407         while (!box_it.empty()) {
01408           // Liberate the blob from its partition so it can be treated
01409           // as a diacritic and merged explicitly with the base part.
01410           // The blob is really owned by the block. The partition "owner"
01411           // is NULLed to allow the blob to get merged with its base character
01412           // partition.
01413           BLOBNBOX* box = box_it.extract();
01414           box->set_owner(NULL);
01415           box_it.forward();
01416           ++medium_diacritics;
01417           // We remove the blob from the grid so it isn't found by subsequent
01418           // searches where we might not want to include diacritics.
01419           RemoveBBox(box);
01420         }
01421         // We only move the one blob to the small list here, but the others
01422         // all get moved by the test at the top of the loop.
01423         small_it.add_to_end(blob_it.extract());
01424         part_grid->RemoveBBox(part);
01425         delete part;
01426       }
01427     } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
01428                                              blob->bounding_box().bottom())) {
01429       tprintf("Blob not available to be a diacritic at:");
01430       blob->bounding_box().print();
01431     }
01432   }
01433   if (textord_tabfind_show_strokewidths) {
01434     tprintf("Found %d small diacritics, %d medium\n",
01435             small_diacritics, medium_diacritics);
01436   }
01437 }
01438 
01439 // Searches this grid for an appropriately close and sized neighbour of the
01440 // given [small] blob. If such a blob is found, the diacritic base is saved
01441 // in the blob and true is returned.
01442 // The small_grid is a secondary grid that contains the small/noise objects
01443 // that are not in this grid, but may be useful for determining a connection
01444 // between blob and its potential base character. (See DiacriticXGapFilled.)
01445 bool StrokeWidth::DiacriticBlob(BlobGrid* small_grid, BLOBNBOX* blob) {
01446   if (BLOBNBOX::UnMergeableType(blob->region_type()) ||
01447       blob->region_type() == BRT_VERT_TEXT)
01448     return false;
01449   TBOX small_box(blob->bounding_box());
01450   bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(),
01451                                              small_box.bottom());
01452   if (debug) {
01453     tprintf("Testing blob for diacriticness at:");
01454     small_box.print();
01455   }
01456   int x = (small_box.left() + small_box.right()) / 2;
01457   int y = (small_box.bottom() + small_box.top()) / 2;
01458   int grid_x, grid_y;
01459   GridCoords(x, y, &grid_x, &grid_y);
01460   int height = small_box.height();
01461   // Setup a rectangle search to find its nearest base-character neighbour.
01462   // We keep 2 different best candidates:
01463   // best_x_overlap is a category of base characters that have an overlap in x
01464   // (like a acute) in which we look for the least y-gap, computed using the
01465   // projection to favor base characters in the same textline.
01466   // best_y_overlap is a category of base characters that have no x overlap,
01467   // (nominally a y-overlap is preferrecd but not essential) in which we
01468   // look for the least weighted sum of x-gap and y-gap, with x-gap getting
01469   // a lower weight to catch quotes at the end of a textline.
01470   // NOTE that x-gap and y-gap are measured from the nearest side of the base
01471   // character to the FARTHEST side of the diacritic to allow small diacritics
01472   // to be a reasonable distance away, but not big diacritics.
01473   BLOBNBOX* best_x_overlap = NULL;
01474   BLOBNBOX* best_y_overlap = NULL;
01475   int best_total_dist = 0;
01476   int best_y_gap = 0;
01477   TBOX best_xbox;
01478   // TODO(rays) the search box could be setup using the projection as a guide.
01479   TBOX search_box(small_box);
01480   int x_pad = IntCastRounded(gridsize() * kDiacriticXPadRatio);
01481   int y_pad = IntCastRounded(gridsize() * kDiacriticYPadRatio);
01482   search_box.pad(x_pad, y_pad);
01483   BlobGridSearch rsearch(this);
01484   rsearch.SetUniqueMode(true);
01485   int min_height = height * kMinDiacriticSizeRatio;
01486   rsearch.StartRectSearch(search_box);
01487   BLOBNBOX* neighbour;
01488   while ((neighbour = rsearch.NextRectSearch()) != NULL) {
01489     if (BLOBNBOX::UnMergeableType(neighbour->region_type()) ||
01490         neighbour == blob || neighbour->owner() == blob->owner())
01491       continue;
01492     TBOX nbox = neighbour->bounding_box();
01493     if (neighbour->owner() == NULL || neighbour->owner()->IsVerticalType() ||
01494         (neighbour->flow() != BTFT_CHAIN &&
01495             neighbour->flow() != BTFT_STRONG_CHAIN)) {
01496       if (debug) {
01497         tprintf("Neighbour not strong enough:");
01498         nbox.print();
01499       }
01500       continue;  // Diacritics must be attached to strong text.
01501     }
01502     if (nbox.height() < min_height) {
01503       if (debug) {
01504         tprintf("Neighbour not big enough:");
01505         nbox.print();
01506       }
01507       continue;  // Too small to be the base character.
01508     }
01509     int x_gap = small_box.x_gap(nbox);
01510     int y_gap = small_box.y_gap(nbox);
01511     int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox,
01512                                                            true, denorm_,
01513                                                            debug);
01514     if (debug) tprintf("xgap=%d, y=%d, total dist=%d\n",
01515                        x_gap, y_gap, total_distance);
01516     if (total_distance >
01517         neighbour->owner()->median_size() * kMaxDiacriticDistanceRatio) {
01518       if (debug) {
01519         tprintf("Neighbour with median size %d too far away:",
01520                 neighbour->owner()->median_size());
01521         neighbour->bounding_box().print();
01522       }
01523       continue;  // Diacritics must not be too distant.
01524     }
01525     if (x_gap <= 0) {
01526       if (debug) {
01527         tprintf("Computing reduced box for :");
01528         nbox.print();
01529       }
01530       int left = small_box.left() - small_box.width();
01531       int right = small_box.right() + small_box.width();
01532       nbox = neighbour->BoundsWithinLimits(left, right);
01533       y_gap = small_box.y_gap(nbox);
01534       if (best_x_overlap == NULL || y_gap < best_y_gap) {
01535         best_x_overlap = neighbour;
01536         best_xbox = nbox;
01537         best_y_gap = y_gap;
01538         if (debug) {
01539           tprintf("New best:");
01540           nbox.print();
01541         }
01542       } else if (debug) {
01543         tprintf("Shrunken box doesn't win:");
01544         nbox.print();
01545       }
01546     } else if (blob->ConfirmNoTabViolation(*neighbour)) {
01547       if (best_y_overlap == NULL || total_distance < best_total_dist) {
01548         if (debug) {
01549           tprintf("New best y overlap:");
01550           nbox.print();
01551         }
01552         best_y_overlap = neighbour;
01553         best_total_dist = total_distance;
01554       } else if (debug) {
01555         tprintf("New y overlap box doesn't win:");
01556         nbox.print();
01557       }
01558     } else if (debug) {
01559       tprintf("Neighbour wrong side of a tab:");
01560       nbox.print();
01561     }
01562   }
01563   if (best_x_overlap != NULL &&
01564       (best_y_overlap == NULL ||
01565        best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
01566     blob->set_diacritic_box(best_xbox);
01567     blob->set_base_char_blob(best_x_overlap);
01568     if (debug) {
01569       tprintf("DiacriticBlob OK! (x-overlap:");
01570       small_box.print();
01571       best_xbox.print();
01572     }
01573     return true;
01574   }
01575   if (best_y_overlap != NULL &&
01576       DiacriticXGapFilled(small_grid, small_box,
01577                           best_y_overlap->bounding_box()) &&
01578       NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
01579     blob->set_diacritic_box(best_y_overlap->bounding_box());
01580     blob->set_base_char_blob(best_y_overlap);
01581     if (debug) {
01582       tprintf("DiacriticBlob OK! (y-overlap:");
01583       small_box.print();
01584       best_y_overlap->bounding_box().print();
01585     }
01586     return true;
01587   }
01588   if (debug) {
01589     tprintf("DiacriticBlob fails:");
01590     small_box.print();
01591     tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
01592     if (best_y_overlap != NULL) {
01593       tprintf("XGapFilled=%d, NoiseBetween=%d\n",
01594               DiacriticXGapFilled(small_grid, small_box,
01595                                   best_y_overlap->bounding_box()),
01596               NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
01597     }
01598   }
01599   return false;
01600 }
01601 
01602 // Returns true if there is no gap between the base char and the diacritic
01603 // bigger than a fraction of the height of the base char:
01604 // Eg: line end.....'
01605 // The quote is a long way from the end of the line, yet it needs to be a
01606 // diacritic. To determine that the quote is not part of an image, or
01607 // a different text block, we check for other marks in the gap between
01608 // the base char and the diacritic.
01609 //                          '<--Diacritic
01610 // |---------|
01611 // |         |<-toobig-gap->
01612 // | Base    |<ok gap>
01613 // |---------|        x<-----Dot occupying gap
01614 // The grid is const really.
01615 bool StrokeWidth::DiacriticXGapFilled(BlobGrid* grid,
01616                                       const TBOX& diacritic_box,
01617                                       const TBOX& base_box) {
01618   // Since most gaps are small, use an iterative algorithm to search the gap.
01619   int max_gap = IntCastRounded(base_box.height() *
01620                                kMaxDiacriticGapToBaseCharHeight);
01621   TBOX occupied_box(base_box);
01622   int diacritic_gap;
01623   while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
01624     TBOX search_box(occupied_box);
01625     if (diacritic_box.left() > search_box.right()) {
01626       // We are looking right.
01627       search_box.set_left(search_box.right());
01628       search_box.set_right(search_box.left() + max_gap);
01629     } else {
01630       // We are looking left.
01631       search_box.set_right(search_box.left());
01632       search_box.set_left(search_box.left() - max_gap);
01633     }
01634     BlobGridSearch rsearch(grid);
01635     rsearch.StartRectSearch(search_box);
01636     BLOBNBOX* neighbour;
01637     while ((neighbour = rsearch.NextRectSearch()) != NULL) {
01638       const TBOX& nbox = neighbour->bounding_box();
01639       if (nbox.x_gap(diacritic_box) < diacritic_gap) {
01640         if (nbox.left() < occupied_box.left())
01641           occupied_box.set_left(nbox.left());
01642         if (nbox.right() > occupied_box.right())
01643           occupied_box.set_right(nbox.right());
01644         break;
01645       }
01646     }
01647     if (neighbour == NULL)
01648       return false;  // Found a big gap.
01649   }
01650   return true;  // The gap was filled.
01651 }
01652 
01653 // Merges diacritics with the ColPartition of the base character blob.
01654 void StrokeWidth::MergeDiacritics(TO_BLOCK* block,
01655                                   ColPartitionGrid* part_grid) {
01656   BLOBNBOX_IT small_it(&block->noise_blobs);
01657   for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
01658     BLOBNBOX* blob = small_it.data();
01659     if (blob->base_char_blob() != NULL) {
01660       ColPartition* part = blob->base_char_blob()->owner();
01661       // The base character must be owned by a partition and that partition
01662       // must not be on the big_parts list (not block owned).
01663       if (part != NULL && !part->block_owned() && blob->owner() == NULL &&
01664           blob->IsDiacritic()) {
01665         // The partition has to be removed from the grid and reinserted
01666         // because its bounding box may change.
01667         part_grid->RemoveBBox(part);
01668         part->AddBox(blob);
01669         blob->set_region_type(part->blob_type());
01670         blob->set_flow(part->flow());
01671         blob->set_owner(part);
01672         part_grid->InsertBBox(true, true, part);
01673       }
01674       // Set all base chars to NULL before any blobs get deleted.
01675       blob->set_base_char_blob(NULL);
01676     }
01677   }
01678 }
01679 
01680 // Any blobs on the large_blobs list of block that are still unowned by a
01681 // ColPartition, are probably drop-cap or vertically touching so the blobs
01682 // are removed to the big_parts list and treated separately.
01683 void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK* block,
01684                                          ColPartitionGrid* part_grid,
01685                                          ColPartition_LIST* big_parts) {
01686   BLOBNBOX_IT large_it(&block->large_blobs);
01687   for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
01688     BLOBNBOX* blob = large_it.data();
01689     ColPartition* big_part = blob->owner();
01690     if (big_part == NULL) {
01691       // Large blobs should have gone into partitions by now if they are
01692       // genuine characters, so move any unowned ones out to the big parts
01693       // list. This will include drop caps and vertically touching characters.
01694       ColPartition::MakeBigPartition(blob, big_parts);
01695     }
01696   }
01697 }
01698 
01699 // All remaining unused blobs are put in individual ColPartitions.
01700 void StrokeWidth::PartitionRemainingBlobs(ColPartitionGrid* part_grid) {
01701   BlobGridSearch gsearch(this);
01702   BLOBNBOX* bbox;
01703   int prev_grid_x = -1;
01704   int prev_grid_y = -1;
01705   BLOBNBOX_CLIST cell_list;
01706   BLOBNBOX_C_IT cell_it(&cell_list);
01707   bool cell_all_noise = true;
01708   gsearch.StartFullSearch();
01709   while ((bbox = gsearch.NextFullSearch()) != NULL) {
01710     int grid_x = gsearch.GridX();
01711     int grid_y = gsearch.GridY();
01712     if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
01713       // New cell. Process old cell.
01714       MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list);
01715       cell_it.set_to_list(&cell_list);
01716       prev_grid_x = grid_x;
01717       prev_grid_y = grid_y;
01718       cell_all_noise = true;
01719     }
01720     if (bbox->owner() == NULL) {
01721       cell_it.add_to_end(bbox);
01722       if (bbox->flow() != BTFT_NONTEXT)
01723         cell_all_noise = false;
01724     } else {
01725       cell_all_noise = false;
01726     }
01727   }
01728   MakePartitionsFromCellList(cell_all_noise, part_grid, &cell_list);
01729 }
01730 
01731 // If combine, put all blobs in the cell_list into a single partition, otherwise
01732 // put each one into its own partition.
01733 void StrokeWidth::MakePartitionsFromCellList(bool combine,
01734                                              ColPartitionGrid* part_grid,
01735                                              BLOBNBOX_CLIST* cell_list) {
01736   if (cell_list->empty())
01737     return;
01738   BLOBNBOX_C_IT cell_it(cell_list);
01739   if (combine) {
01740     BLOBNBOX* bbox = cell_it.extract();
01741     ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
01742     part->AddBox(bbox);
01743     part->set_flow(bbox->flow());
01744     for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
01745       part->AddBox(cell_it.extract());
01746     }
01747     CompletePartition(part, part_grid);
01748   } else {
01749     for (; !cell_it.empty(); cell_it.forward()) {
01750       BLOBNBOX* bbox = cell_it.extract();
01751       ColPartition* part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
01752       part->set_flow(bbox->flow());
01753       part->AddBox(bbox);
01754       CompletePartition(part, part_grid);
01755     }
01756   }
01757 }
01758 
01759 // Helper function to finish setting up a ColPartition and insert into
01760 // part_grid.
01761 void StrokeWidth::CompletePartition(ColPartition* part,
01762                                     ColPartitionGrid* part_grid) {
01763   part->ComputeLimits();
01764   TBOX box = part->bounding_box();
01765   bool debug = AlignedBlob::WithinTestRegion(2, box.left(),
01766                                              box.bottom());
01767   int value = projection_->EvaluateColPartition(*part, denorm_, debug);
01768   part->SetRegionAndFlowTypesFromProjectionValue(value);
01769   part->ClaimBoxes();
01770   part_grid->InsertBBox(true, true, part);
01771 }
01772 
01773 // Merge partitions where the merge appears harmless.
01774 // As this
01775 void StrokeWidth::EasyMerges(ColPartitionGrid* part_grid) {
01776   part_grid->Merges(
01777       NewPermanentTessCallback(this, &StrokeWidth::OrientationSearchBox),
01778       NewPermanentTessCallback(this, &StrokeWidth::ConfirmEasyMerge));
01779 }
01780 
01781 // Compute a search box based on the orientation of the partition.
01782 // Returns true if a suitable box can be calculated.
01783 // Callback for EasyMerges.
01784 bool StrokeWidth::OrientationSearchBox(ColPartition* part, TBOX* box) {
01785   if (part->IsVerticalType()) {
01786     box->set_top(box->top() + box->width());
01787     box->set_bottom(box->bottom() - box->width());
01788   } else {
01789     box->set_left(box->left() - box->height());
01790     box->set_right(box->right() + box->height());
01791   }
01792   return true;
01793 }
01794 
01795 // Merge confirmation callback for EasyMerges.
01796 bool StrokeWidth::ConfirmEasyMerge(const ColPartition* p1,
01797                                    const ColPartition* p2) {
01798   ASSERT_HOST(p1 != NULL && p2 != NULL);
01799   ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
01800   if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
01801       (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT))
01802     return false;  // Don't merge confirmed image with text.
01803   if ((p1->IsVerticalType() || p2->IsVerticalType()) &&
01804        p1->HCoreOverlap(*p2) <= 0 &&
01805        ((!p1->IsSingleton() &&
01806          !p2->IsSingleton()) ||
01807         !p1->bounding_box().major_overlap(p2->bounding_box())))
01808     return false;  // Overlap must be in the text line.
01809   if ((p1->IsHorizontalType() || p2->IsHorizontalType()) &&
01810       p1->VCoreOverlap(*p2) <= 0 &&
01811       ((!p1->IsSingleton() &&
01812         !p2->IsSingleton()) ||
01813        (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
01814         !p1->OKDiacriticMerge(*p2, false) &&
01815         !p2->OKDiacriticMerge(*p1, false))))
01816     return false;  // Overlap must be in the text line.
01817   if (!p1->ConfirmNoTabViolation(*p2))
01818     return false;
01819   if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT)
01820     return true;
01821   return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
01822 }
01823 
01824 // Returns true if there is no significant noise in between the boxes.
01825 bool StrokeWidth::NoNoiseInBetween(const TBOX& box1, const TBOX& box2) const {
01826   return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_,
01827                                         nontext_map_);
01828 }
01829 
01833 ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name,
01834                                           int x, int y) {
01835   ScrollView* window = NULL;
01836 #ifndef GRAPHICS_DISABLED
01837   window = MakeWindow(x, y, window_name);
01838   // For every blob in the grid, display it.
01839   window->Brush(ScrollView::NONE);
01840 
01841   // For every bbox in the grid, display it.
01842   BlobGridSearch gsearch(this);
01843   gsearch.StartFullSearch();
01844   BLOBNBOX* bbox;
01845   while ((bbox = gsearch.NextFullSearch()) != NULL) {
01846     TBOX box = bbox->bounding_box();
01847     int left_x = box.left();
01848     int right_x = box.right();
01849     int top_y = box.top();
01850     int bottom_y = box.bottom();
01851     int goodness = bbox->GoodTextBlob();
01852     BlobRegionType blob_type = bbox->region_type();
01853     if (bbox->UniquelyVertical())
01854       blob_type = BRT_VERT_TEXT;
01855     if (bbox->UniquelyHorizontal())
01856       blob_type = BRT_TEXT;
01857     BlobTextFlowType flow = bbox->flow();
01858     if (flow == BTFT_NONE) {
01859       if (goodness == 0)
01860         flow = BTFT_NEIGHBOURS;
01861       else if (goodness == 1)
01862         flow = BTFT_CHAIN;
01863       else
01864         flow = BTFT_STRONG_CHAIN;
01865     }
01866     window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
01867     window->Rectangle(left_x, bottom_y, right_x, top_y);
01868   }
01869   window->Update();
01870 #endif
01871   return window;
01872 }
01873 
01874 static void DrawDiacriticJoiner(const BLOBNBOX* blob, ScrollView* window) {
01875   const TBOX& blob_box(blob->bounding_box());
01876   int top = MAX(blob_box.top(), blob->base_char_top());
01877   int bottom = MIN(blob_box.bottom(), blob->base_char_bottom());
01878   int x = (blob_box.left() + blob_box.right()) / 2;
01879   #ifndef GRAPHICS_DISABLED
01880   window->Line(x, top, x, bottom);
01881   #endif  // GRAPHICS_DISABLED
01882 }
01883 
01884 // Displays blobs colored according to whether or not they are diacritics.
01885 ScrollView* StrokeWidth::DisplayDiacritics(const char* window_name,
01886                                            int x, int y, TO_BLOCK* block) {
01887   ScrollView* window = NULL;
01888 #ifndef GRAPHICS_DISABLED
01889   window = MakeWindow(x, y, window_name);
01890   // For every blob in the grid, display it.
01891   window->Brush(ScrollView::NONE);
01892 
01893   BLOBNBOX_IT it(&block->blobs);
01894   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01895     BLOBNBOX* blob = it.data();
01896     if (blob->IsDiacritic()) {
01897       window->Pen(ScrollView::GREEN);
01898       DrawDiacriticJoiner(blob, window);
01899     } else {
01900       window->Pen(blob->BoxColor());
01901     }
01902     const TBOX& box = blob->bounding_box();
01903     window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
01904   }
01905   it.set_to_list(&block->noise_blobs);
01906   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01907     BLOBNBOX* blob = it.data();
01908     if (blob->IsDiacritic()) {
01909       window->Pen(ScrollView::GREEN);
01910       DrawDiacriticJoiner(blob, window);
01911     } else {
01912       window->Pen(ScrollView::WHITE);
01913     }
01914     const TBOX& box = blob->bounding_box();
01915     window->Rectangle(box.left(), box. bottom(), box.right(), box.top());
01916   }
01917   window->Update();
01918 #endif
01919   return window;
01920 }
01921 
01922 }  // namespace tesseract.