Tesseract  3.02
tesseract-ocr/textord/tablefind.cpp
Go to the documentation of this file.
00001 
00002 // File:        tablefind.cpp
00003 // Description: Helper classes to find tables from ColPartitions.
00004 // Author:      Faisal Shafait (faisal.shafait@dfki.de)
00005 // Created:     Tue Jan 06 11:13:01 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include "tablefind.h"
00025 #include <math.h>
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #include "allheaders.h"
00031 
00032 #include "colpartitionset.h"
00033 #include "tablerecog.h"
00034 
00035 namespace tesseract {
00036 
00037 // These numbers are used to calculate the global median stats.
00038 // They just set an upper bound on the stats objects.
00039 // Maximum vertical spacing between neighbor partitions.
00040 const int kMaxVerticalSpacing = 500;
00041 // Maximum width of a blob in a partition.
00042 const int kMaxBlobWidth = 500;
00043 
00044 // Minimum whitespace size to split a partition (measured as a multiple
00045 // of a partition's median width).
00046 const double kSplitPartitionSize = 2.0;
00047 // To insert text, the partition must satisfy these size constraints
00048 // in AllowTextPartition(). The idea is to filter noise partitions
00049 // determined by the size compared to the global medians.
00050 // TODO(nbeato): Need to find good numbers again.
00051 const double kAllowTextHeight = 0.5;
00052 const double kAllowTextWidth = 0.6;
00053 const double kAllowTextArea = 0.8;
00054 // The same thing applies to blobs (to filter noise).
00055 // TODO(nbeato): These numbers are a shot in the dark...
00056 // height and width are 0.5 * gridsize() in colfind.cpp
00057 // area is a rough guess for the size of a period.
00058 const double kAllowBlobHeight = 0.3;
00059 const double kAllowBlobWidth = 0.4;
00060 const double kAllowBlobArea = 0.05;
00061 
00062 // Minimum number of components in a text partition. A partition having fewer
00063 // components than that is more likely a data partition and is a candidate
00064 // table cell.
00065 const int kMinBoxesInTextPartition = 10;
00066 
00067 // Maximum number of components that a data partition can have
00068 const int kMaxBoxesInDataPartition = 20;
00069 
00070 // Maximum allowed gap in a text partitions as a multiple of its median size.
00071 const double kMaxGapInTextPartition = 4.0;
00072 
00073 // Minimum value that the maximum gap in a text partition should have as a
00074 // factor of its median size.
00075 const double kMinMaxGapInTextPartition = 0.5;
00076 
00077 // The amount of overlap that is "normal" for adjacent blobs in a text
00078 // partition. This is used to calculate gap between overlapping blobs.
00079 const double kMaxBlobOverlapFactor = 4.0;
00080 
00081 // Maximum x-height a table partition can have as a multiple of global
00082 // median x-height
00083 const double kMaxTableCellXheight = 2.0;
00084 
00085 // Maximum line spacing between a table column header and column contents
00086 // for merging the two (as a multiple of the partition's median_size).
00087 const int kMaxColumnHeaderDistance = 4;
00088 
00089 // Minimum ratio of num_table_partitions to num_text_partitions in a column
00090 // block to be called it a table column
00091 const double kTableColumnThreshold = 3.0;
00092 
00093 // Search for horizontal ruling lines within the vertical margin as a
00094 // multiple of grid size
00095 const int kRulingVerticalMargin = 3;
00096 
00097 // Minimum overlap that a colpartition must have with a table region
00098 // to become part of that table
00099 const double kMinOverlapWithTable = 0.6;
00100 
00101 // Maximum side space (distance from column boundary) that a typical
00102 // text-line in flowing text should have as a multiple of its x-height
00103 // (Median size).
00104 const int kSideSpaceMargin = 10;
00105 
00106 // Fraction of the peak of x-projection of a table region to set the
00107 // threshold for the x-projection histogram
00108 const double kSmallTableProjectionThreshold = 0.35;
00109 const double kLargeTableProjectionThreshold = 0.45;
00110 // Minimum number of rows required to look for more rows in the projection.
00111 const int kLargeTableRowCount = 6;
00112 
00113 // Minimum number of rows in a table
00114 const int kMinRowsInTable = 3;
00115 
00116 // The number of "whitespace blobs" that should appear between the
00117 // ColPartition's bounding box and the column tab stops to the left/right
00118 // when looking for center justified tab stops.
00119 const double kRequiredFullJustifiedSpacing = 4.0;
00120 
00121 // The amount of padding (multiplied by global_median_xheight_ during use)
00122 // that is vertically added to the search adjacent leader search during
00123 // ColPartition marking.
00124 const int kAdjacentLeaderSearchPadding = 2;
00125 
00126 // Used when filtering false positives. When finding the last line
00127 // of a paragraph (typically left-aligned), the previous line should have
00128 // its center to the right of the last line by this scaled amount.
00129 const double kParagraphEndingPreviousLineRatio = 1.3;
00130 
00131 // The maximum amount of whitespace allowed left of a paragraph ending.
00132 // Do not filter a ColPartition with more than this space left of it.
00133 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
00134 
00135 // Used when filtering false positives. The last line of a paragraph
00136 // should be preceded by a line that is predominantly text. This is the
00137 // ratio of text to whitespace (to the right of the text) that is required
00138 // for the previous line to be a text.
00139 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
00140 
00141 // When counting table columns, this is the required gap between two columns
00142 // (it is multiplied by global_median_xheight_).
00143 const double kMaxXProjectionGapFactor = 2.0;
00144 
00145 // Used for similarity in partitions using stroke width. Values copied
00146 // from ColFind.cpp in Ray's CL.
00147 const double kStrokeWidthFractionalTolerance = 0.25;
00148 const double kStrokeWidthConstantTolerance = 2.0;
00149 
00150 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output");
00151 BOOL_VAR(textord_show_tables, false, "Show table regions");
00152 BOOL_VAR(textord_tablefind_show_mark, false,
00153          "Debug table marking steps in detail");
00154 BOOL_VAR(textord_tablefind_show_stats, false,
00155          "Show page stats used in table finding");
00156 BOOL_VAR(textord_tablefind_recognize_tables, false,
00157          "Enables the table recognizer for table layout and filtering.");
00158 
00159 ELISTIZE(ColSegment)
00160 CLISTIZE(ColSegment)
00161 
00162 // Templated helper function used to create destructor callbacks for the
00163 // BBGrid::ClearGridData() method.
00164 template <typename T> void DeleteObject(T *object) {
00165   delete object;
00166 }
00167 
00168 TableFinder::TableFinder()
00169     : resolution_(0),
00170       global_median_xheight_(0),
00171       global_median_blob_width_(0),
00172       global_median_ledding_(0),
00173       left_to_right_language_(true) {
00174 }
00175 
00176 TableFinder::~TableFinder() {
00177   // ColPartitions and ColSegments created by this class for storage in grids
00178   // need to be deleted explicitly.
00179   clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
00180   leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
00181   fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
00182   col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
00183   table_grid_.ClearGridData(&DeleteObject<ColSegment>);
00184 }
00185 
00186 void TableFinder::set_left_to_right_language(bool order) {
00187   left_to_right_language_ = order;
00188 }
00189 
00190 void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
00191                        const ICOORD& top_right) {
00192   // Initialize clean partitions list and grid
00193   clean_part_grid_.Init(grid_size, bottom_left, top_right);
00194   leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
00195   fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
00196   col_seg_grid_.Init(grid_size, bottom_left, top_right);
00197   table_grid_.Init(grid_size, bottom_left, top_right);
00198 }
00199 
00200 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
00201 // insert leaders and rulers into the leader_and_ruling_grid_
00202 void TableFinder::InsertCleanPartitions(ColPartitionGrid* grid,
00203                                         TO_BLOCK* block) {
00204   // Calculate stats. This lets us filter partitions in AllowTextPartition()
00205   // and filter blobs in AllowBlob().
00206   SetGlobalSpacings(grid);
00207 
00208   // Iterate the ColPartitions in the grid.
00209   ColPartitionGridSearch gsearch(grid);
00210   gsearch.SetUniqueMode(true);
00211   gsearch.StartFullSearch();
00212   ColPartition* part = NULL;
00213   while ((part = gsearch.NextFullSearch()) != NULL) {
00214     // Reject partitions with nothing useful inside of them.
00215     if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
00216       continue;
00217     ColPartition* clean_part = part->ShallowCopy();
00218     ColPartition* leader_part = NULL;
00219     if (part->IsLineType()) {
00220       InsertRulingPartition(clean_part);
00221       continue;
00222     }
00223     // Insert all non-text partitions to clean_parts
00224     if (!part->IsTextType()) {
00225       InsertImagePartition(clean_part);
00226       continue;
00227     }
00228     // Insert text colpartitions after removing noisy components from them
00229     // The leaders are split into a separate grid.
00230     BLOBNBOX_CLIST* part_boxes = part->boxes();
00231     BLOBNBOX_C_IT pit(part_boxes);
00232     for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
00233       BLOBNBOX *pblob = pit.data();
00234       // Bad blobs... happens in UNLV set.
00235       // news.3G1, page 17 (around x=6)
00236       if (!AllowBlob(*pblob))
00237         continue;
00238       if (pblob->flow() == BTFT_LEADER) {
00239         if (leader_part == NULL) {
00240           leader_part = part->ShallowCopy();
00241           leader_part->set_flow(BTFT_LEADER);
00242         }
00243         leader_part->AddBox(pblob);
00244       } else if (pblob->region_type() != BRT_NOISE) {
00245         clean_part->AddBox(pblob);
00246       }
00247     }
00248     clean_part->ComputeLimits();
00249     ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
00250     InsertTextPartition(clean_part);
00251     SplitAndInsertFragmentedTextPartition(fragmented);
00252     if (leader_part != NULL) {
00253       // TODO(nbeato): Note that ComputeLimits does not update the column
00254       // information. So the leader may appear to span more columns than it
00255       // really does later on when IsInSameColumnAs gets called to test
00256       // for adjacent leaders.
00257       leader_part->ComputeLimits();
00258       InsertLeaderPartition(leader_part);
00259     }
00260   }
00261 
00262   // Make the partition partners better for upper and lower neighbors.
00263   clean_part_grid_.FindPartitionPartners();
00264   clean_part_grid_.RefinePartitionPartners(false);
00265 }
00266 
00267 // High level function to perform table detection
00268 void TableFinder::LocateTables(ColPartitionGrid* grid,
00269                                ColPartitionSet** all_columns,
00270                                WidthCallback* width_cb,
00271                                const FCOORD& reskew) {
00272   // initialize spacing, neighbors, and columns
00273   InitializePartitions(all_columns);
00274   if (textord_show_tables) {
00275     ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
00276     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00277     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00278                          ScrollView::AQUAMARINE);
00279     DisplayColPartitionConnections(table_win, &clean_part_grid_,
00280                                    ScrollView::ORANGE);
00281 
00282     table_win = MakeWindow(100, 300, "Fragmented Text");
00283     DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
00284   }
00285   // mark, filter, and smooth candidate table partitions
00286   MarkTablePartitions();
00287 
00288   // Make single-column blocks from good_columns_ partitions. col_segments are
00289   // moved to a grid later which takes the ownership
00290   ColSegment_LIST column_blocks;
00291   GetColumnBlocks(all_columns, &column_blocks);
00292   // Set the ratio of candidate table partitions in each column
00293   SetColumnsType(&column_blocks);
00294 
00295   // Move column segments to col_seg_grid_
00296   MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
00297 
00298   // Detect split in column layout that might have occurred due to the
00299   // presence of a table. In such a case, merge the corresponding columns.
00300   GridMergeColumnBlocks();
00301 
00302   // Group horizontally overlapping table partitions into table columns.
00303   // table_columns created here get deleted at the end of this method.
00304   ColSegment_LIST table_columns;
00305   GetTableColumns(&table_columns);
00306 
00307   // Within each column, mark the range table regions occupy based on the
00308   // table columns detected. table_regions are moved to a grid later which
00309   // takes the ownership
00310   ColSegment_LIST table_regions;
00311   GetTableRegions(&table_columns, &table_regions);
00312 
00313   if (textord_tablefind_show_mark) {
00314     ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
00315     DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
00316     DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
00317   }
00318 
00319   // Merge table regions across columns for tables spanning multiple
00320   // columns
00321   MoveColSegmentsToGrid(&table_regions, &table_grid_);
00322   GridMergeTableRegions();
00323 
00324   // Adjust table boundaries by including nearby horizontal lines and left
00325   // out column headers
00326   AdjustTableBoundaries();
00327   GridMergeTableRegions();
00328 
00329   if (textord_tablefind_recognize_tables) {
00330     // Remove false alarms consiting of a single column
00331     DeleteSingleColumnTables();
00332 
00333     if (textord_show_tables) {
00334       ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
00335       DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00336       DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
00337       table_grid_.DisplayBoxes(table_win);
00338     }
00339 
00340     // Find table grid structure and reject tables that are malformed.
00341     RecognizeTables();
00342     GridMergeTableRegions();
00343     RecognizeTables();
00344 
00345     if (textord_show_tables) {
00346       ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
00347       DisplayColPartitions(table_win, &clean_part_grid_,
00348                            ScrollView::BLUE, ScrollView::BLUE);
00349       table_grid_.DisplayBoxes(table_win);
00350     }
00351   } else {
00352     // Remove false alarms consiting of a single column
00353     // TODO(nbeato): verify this is a NOP after structured table rejection.
00354     // Right now it isn't. If the recognize function is doing what it is
00355     // supposed to do, this function is obsolete.
00356     DeleteSingleColumnTables();
00357 
00358     if (textord_show_tables) {
00359       ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
00360       DisplayColPartitions(table_win, &clean_part_grid_,
00361                            ScrollView::BLUE, ScrollView::BLUE);
00362       table_grid_.DisplayBoxes(table_win);
00363     }
00364   }
00365 
00366   if (textord_dump_table_images)
00367     WriteToPix(reskew);
00368 
00369   // Merge all colpartitions in table regions to make them a single
00370   // colpartition and revert types of isolated table cells not
00371   // assigned to any table to their original types.
00372   MakeTableBlocks(grid, all_columns, width_cb);
00373 }
00374 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
00375 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
00376 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
00377 // dimensions instead of duplicated memory.
00378 int TableFinder::gridsize() const {
00379   return clean_part_grid_.gridsize();
00380 }
00381 int TableFinder::gridwidth() const {
00382   return clean_part_grid_.gridwidth();
00383 }
00384 int TableFinder::gridheight() const {
00385   return clean_part_grid_.gridheight();
00386 }
00387 const ICOORD& TableFinder::bleft() const {
00388   return clean_part_grid_.bleft();
00389 }
00390 const ICOORD& TableFinder::tright() const {
00391   return clean_part_grid_.tright();
00392 }
00393 
00394 void TableFinder::InsertTextPartition(ColPartition* part) {
00395   ASSERT_HOST(part != NULL);
00396   if (AllowTextPartition(*part)) {
00397     clean_part_grid_.InsertBBox(true, true, part);
00398   } else {
00399     delete part;
00400   }
00401 }
00402 void TableFinder::InsertFragmentedTextPartition(ColPartition* part) {
00403   ASSERT_HOST(part != NULL);
00404   if (AllowTextPartition(*part)) {
00405     fragmented_text_grid_.InsertBBox(true, true, part);
00406   } else {
00407     delete part;
00408   }
00409 }
00410 void TableFinder::InsertLeaderPartition(ColPartition* part) {
00411   ASSERT_HOST(part != NULL);
00412   if (!part->IsEmpty() && part->bounding_box().area() > 0) {
00413     leader_and_ruling_grid_.InsertBBox(true, true, part);
00414   } else {
00415     delete part;
00416   }
00417 }
00418 void TableFinder::InsertRulingPartition(ColPartition* part) {
00419   leader_and_ruling_grid_.InsertBBox(true, true, part);
00420 }
00421 void TableFinder::InsertImagePartition(ColPartition* part) {
00422   // NOTE: If images are placed into a different grid in the future,
00423   // the function SetPartitionSpacings needs to be updated. It should
00424   // be the only thing that cares about image partitions.
00425   clean_part_grid_.InsertBBox(true, true, part);
00426 }
00427 
00428 // Splits a partition into its "words". The splits happen
00429 // at locations with wide inter-blob spacing. This is useful
00430 // because it allows the table recognize to "cut through" the
00431 // text lines on the page. The assumption is that a table
00432 // will have several lines with similar overlapping whitespace
00433 // whereas text will not have this type of property.
00434 // Note: The code Assumes that blobs are sorted by the left side x!
00435 // This will not work (as well) if the blobs are sorted by center/right.
00436 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition* part) {
00437   ASSERT_HOST(part != NULL);
00438   // Bye bye empty partitions!
00439   if (part->boxes()->empty()) {
00440     delete part;
00441     return;
00442   }
00443 
00444   // The AllowBlob function prevents this.
00445   ASSERT_HOST(part->median_width() > 0);
00446   const double kThreshold = part->median_width() * kSplitPartitionSize;
00447 
00448   ColPartition* right_part = part;
00449   bool found_split = true;
00450   while (found_split) {
00451     found_split = false;
00452     BLOBNBOX_C_IT box_it(right_part->boxes());
00453     // Blobs are sorted left side first. If blobs overlap,
00454     // the previous blob may have a "more right" right side.
00455     // Account for this by always keeping the largest "right"
00456     // so far.
00457     int previous_right = MIN_INT32;
00458 
00459     // Look for the next split in the partition.
00460     for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
00461       const TBOX& box = box_it.data()->bounding_box();
00462       if (previous_right != MIN_INT32 &&
00463           box.left() - previous_right > kThreshold) {
00464         // We have a split position. Split the partition in two pieces.
00465         // Insert the left piece in the grid and keep processing the right.
00466         int mid_x = (box.left() + previous_right) / 2;
00467         ColPartition* left_part = right_part;
00468         right_part = left_part->SplitAt(mid_x);
00469 
00470         InsertFragmentedTextPartition(left_part);
00471         found_split = true;
00472         break;
00473       }
00474 
00475       // The right side of the previous blobs.
00476       previous_right = MAX(previous_right, box.right());
00477     }
00478   }
00479   // When a split is not found, the right part is minimized
00480   // as much as possible, so process it.
00481   InsertFragmentedTextPartition(right_part);
00482 }
00483 
00484 // Some simple criteria to filter out now. We want to make sure the
00485 // average blob size in the partition is consistent with the
00486 // global page stats.
00487 // The area metric will almost always pass for multi-blob partitions.
00488 // It is useful when filtering out noise caused by an isolated blob.
00489 bool TableFinder::AllowTextPartition(const ColPartition& part) const {
00490   const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
00491   const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
00492   const int median_area = global_median_xheight_ * global_median_blob_width_;
00493   const double kAreaPerBlobRequired = median_area * kAllowTextArea;
00494   // Keep comparisons strictly greater to disallow 0!
00495   return part.median_size() > kHeightRequired &&
00496          part.median_width() > kWidthRequired &&
00497          part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
00498 }
00499 
00500 // Same as above, applied to blobs. Keep in mind that
00501 // leaders, commas, and periods are important in tables.
00502 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
00503   const TBOX& box = blob.bounding_box();
00504   const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
00505   const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
00506   const int median_area = global_median_xheight_ * global_median_blob_width_;
00507   const double kAreaRequired = median_area * kAllowBlobArea;
00508   // Keep comparisons strictly greater to disallow 0!
00509   return box.height() > kHeightRequired &&
00510          box.width() > kWidthRequired &&
00511          box.area() > kAreaRequired;
00512 }
00513 
00514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
00515 // The only downside is that window messages will be caught by
00516 // clean_part_grid_ instead of a useful object. This is a temporary solution
00517 // for the debug windows created by the TableFinder.
00518 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
00519   return clean_part_grid_.MakeWindow(x, y, window_name);
00520 }
00521 
00522 // Make single-column blocks from good_columns_ partitions.
00523 void TableFinder::GetColumnBlocks(ColPartitionSet** all_columns,
00524                                   ColSegment_LIST* column_blocks) {
00525   for (int i = 0; i < gridheight(); ++i) {
00526     ColPartitionSet* columns = all_columns[i];
00527     if (columns != NULL) {
00528       ColSegment_LIST new_blocks;
00529       // Get boxes from the current vertical position on the grid
00530       columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
00531       // Merge the new_blocks boxes into column_blocks if they are well-aligned
00532       GroupColumnBlocks(&new_blocks, column_blocks);
00533     }
00534   }
00535 }
00536 
00537 // Merge column segments into the current list if they are well aligned.
00538 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
00539                                     ColSegment_LIST* column_blocks) {
00540   ColSegment_IT src_it(new_blocks);
00541   ColSegment_IT dest_it(column_blocks);
00542   // iterate through the source list
00543   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00544     ColSegment* src_seg = src_it.data();
00545     TBOX src_box = src_seg->bounding_box();
00546     bool match_found = false;
00547     // iterate through the destination list to find a matching column block
00548     for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
00549       ColSegment* dest_seg = dest_it.data();
00550       TBOX dest_box = dest_seg->bounding_box();
00551       if (ConsecutiveBoxes(src_box, dest_box)) {
00552         // If matching block is found, insert the current block into it
00553         // and delete the soure block
00554         dest_seg->InsertBox(src_box);
00555         match_found = true;
00556         delete src_it.extract();
00557         break;
00558       }
00559     }
00560     // If no match is found, just append the source block to column_blocks
00561     if (!match_found) {
00562       dest_it.add_after_then_move(src_it.extract());
00563     }
00564   }
00565 }
00566 
00567 // are the two boxes immediate neighbors along the vertical direction
00568 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
00569   int x_margin = 20;
00570   int y_margin = 5;
00571   return (abs(b1.left() - b2.left()) < x_margin) &&
00572       (abs(b1.right() - b2.right()) < x_margin) &&
00573       (abs(b1.top()-b2.bottom()) < y_margin ||
00574        abs(b2.top()-b1.bottom()) < y_margin);
00575 }
00576 
00577 // Set up info for clean_part_grid_ partitions to be valid during detection
00578 // code.
00579 void TableFinder::InitializePartitions(ColPartitionSet** all_columns) {
00580   FindNeighbors();
00581   SetPartitionSpacings(&clean_part_grid_, all_columns);
00582   SetGlobalSpacings(&clean_part_grid_);
00583 }
00584 
00585 // Set left, right and top, bottom spacings of each colpartition.
00586 void TableFinder::SetPartitionSpacings(ColPartitionGrid* grid,
00587                                        ColPartitionSet** all_columns) {
00588   // Iterate the ColPartitions in the grid.
00589   ColPartitionGridSearch gsearch(grid);
00590   gsearch.StartFullSearch();
00591   ColPartition* part = NULL;
00592   while ((part = gsearch.NextFullSearch()) != NULL) {
00593     ColPartitionSet* columns = all_columns[gsearch.GridY()];
00594     TBOX box = part->bounding_box();
00595     int y = part->MidY();
00596     ColPartition* left_column = columns->ColumnContaining(box.left(), y);
00597     ColPartition* right_column = columns->ColumnContaining(box.right(), y);
00598     // set distance from left column as space to the left
00599     if (left_column) {
00600       int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
00601       part->set_space_to_left(left_space);
00602     }
00603     // set distance from right column as space to the right
00604     if (right_column) {
00605       int right_space = MAX(0, right_column->RightAtY(y) - box.right());
00606       part->set_space_to_right(right_space);
00607     }
00608 
00609     // Look for images that may be closer.
00610     // NOTE: used to be part_grid_, might cause issues now
00611     ColPartitionGridSearch hsearch(grid);
00612     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
00613     ColPartition* neighbor = NULL;
00614     while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
00615       if (neighbor->type() == PT_PULLOUT_IMAGE ||
00616           neighbor->type() == PT_FLOWING_IMAGE ||
00617           neighbor->type() == PT_HEADING_IMAGE) {
00618         int right = neighbor->bounding_box().right();
00619         if (right < box.left()) {
00620           int space = MIN(box.left() - right, part->space_to_left());
00621           part->set_space_to_left(space);
00622         }
00623       }
00624     }
00625     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
00626     neighbor = NULL;
00627     while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
00628       if (neighbor->type() == PT_PULLOUT_IMAGE ||
00629           neighbor->type() == PT_FLOWING_IMAGE ||
00630           neighbor->type() == PT_HEADING_IMAGE) {
00631         int left = neighbor->bounding_box().left();
00632         if (left > box.right()) {
00633           int space = MIN(left - box.right(), part->space_to_right());
00634           part->set_space_to_right(space);
00635         }
00636       }
00637     }
00638 
00639     ColPartition* upper_part = part->SingletonPartner(true);
00640     if (upper_part) {
00641       int space = MAX(0, upper_part->bounding_box().bottom() -
00642                          part->bounding_box().bottom());
00643       part->set_space_above(space);
00644     } else {
00645       // TODO(nbeato): What constitutes a good value?
00646       // 0 is the default value when not set, explicitly noting it needs to
00647       // be something else.
00648       part->set_space_above(MAX_INT32);
00649     }
00650 
00651     ColPartition* lower_part = part->SingletonPartner(false);
00652     if (lower_part) {
00653       int space = MAX(0, part->bounding_box().bottom() -
00654                          lower_part->bounding_box().bottom());
00655       part->set_space_below(space);
00656     } else {
00657       // TODO(nbeato): What constitutes a good value?
00658       // 0 is the default value when not set, explicitly noting it needs to
00659       // be something else.
00660       part->set_space_below(MAX_INT32);
00661     }
00662   }
00663 }
00664 
00665 // Set spacing and closest neighbors above and below a given colpartition.
00666 void TableFinder::SetVerticalSpacing(ColPartition* part) {
00667   TBOX box = part->bounding_box();
00668   int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
00669   int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
00670   box.set_top(top_range);
00671   box.set_bottom(bottom_range);
00672 
00673   TBOX part_box = part->bounding_box();
00674   // Start a rect search
00675   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
00676       rectsearch(&clean_part_grid_);
00677   rectsearch.StartRectSearch(box);
00678   ColPartition* neighbor;
00679   int min_space_above = kMaxVerticalSpacing;
00680   int min_space_below = kMaxVerticalSpacing;
00681   ColPartition* above_neighbor = NULL;
00682   ColPartition* below_neighbor = NULL;
00683   while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
00684     if (neighbor == part)
00685       continue;
00686     TBOX neighbor_box = neighbor->bounding_box();
00687     if (neighbor_box.major_x_overlap(part_box)) {
00688       int gap = abs(part->median_bottom() - neighbor->median_bottom());
00689       // If neighbor is below current partition
00690       if (neighbor_box.top() < part_box.bottom() &&
00691           gap < min_space_below) {
00692         min_space_below = gap;
00693         below_neighbor = neighbor;
00694       }  // If neighbor is above current partition
00695       else if (part_box.top() < neighbor_box.bottom() &&
00696                gap < min_space_above) {
00697         min_space_above = gap;
00698         above_neighbor = neighbor;
00699        }
00700     }
00701   }
00702   part->set_space_above(min_space_above);
00703   part->set_space_below(min_space_below);
00704   part->set_nearest_neighbor_above(above_neighbor);
00705   part->set_nearest_neighbor_below(below_neighbor);
00706 }
00707 
00708 // Set global spacing and x-height estimates
00709 void TableFinder::SetGlobalSpacings(ColPartitionGrid* grid) {
00710   STATS xheight_stats(0, kMaxVerticalSpacing + 1);
00711   STATS width_stats(0, kMaxBlobWidth + 1);
00712   STATS ledding_stats(0, kMaxVerticalSpacing + 1);
00713   // Iterate the ColPartitions in the grid.
00714   ColPartitionGridSearch gsearch(grid);
00715   gsearch.SetUniqueMode(true);
00716   gsearch.StartFullSearch();
00717   ColPartition* part = NULL;
00718   while ((part = gsearch.NextFullSearch()) != NULL) {
00719     // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
00720     // ComputeLimits needs to get called somewhere outside of TableFinder
00721     // to make sure the partitions are properly initialized.
00722     // When this is called, SmoothPartitionPartners dies in an assert after
00723     // table find runs. Alternative solution.
00724     // part->ComputeLimits();
00725     if (part->IsTextType()) {
00726       // xheight_stats.add(part->median_size(), part->boxes_count());
00727       // width_stats.add(part->median_width(), part->boxes_count());
00728 
00729       // This loop can be removed when above issues are fixed.
00730       // Replace it with the 2 lines commented out above.
00731       BLOBNBOX_C_IT it(part->boxes());
00732       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00733         xheight_stats.add(it.data()->bounding_box().height(), 1);
00734         width_stats.add(it.data()->bounding_box().width(), 1);
00735       }
00736 
00737       ledding_stats.add(part->space_above(), 1);
00738       ledding_stats.add(part->space_below(), 1);
00739     }
00740   }
00741   // Set estimates based on median of statistics obtained
00742   set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
00743   set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
00744   set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
00745   #ifndef GRAPHICS_DISABLED
00746   if (textord_tablefind_show_stats) {
00747     const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
00748     ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
00749     xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
00750     width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
00751     ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
00752   }
00753   #endif  // GRAPHICS_DISABLED
00754 }
00755 
00756 void TableFinder::set_global_median_xheight(int xheight) {
00757   global_median_xheight_ = xheight;
00758 }
00759 void TableFinder::set_global_median_blob_width(int width) {
00760   global_median_blob_width_ = width;
00761 }
00762 void TableFinder::set_global_median_ledding(int ledding) {
00763   global_median_ledding_ = ledding;
00764 }
00765 
00766 void TableFinder::FindNeighbors() {
00767   ColPartitionGridSearch gsearch(&clean_part_grid_);
00768   gsearch.StartFullSearch();
00769   ColPartition* part = NULL;
00770   while ((part = gsearch.NextFullSearch()) != NULL) {
00771     // TODO(nbeato): Rename this function, meaning is different now.
00772     // IT is finding nearest neighbors its own way
00773     //SetVerticalSpacing(part);
00774 
00775     ColPartition* upper = part->SingletonPartner(true);
00776     if (upper)
00777       part->set_nearest_neighbor_above(upper);
00778 
00779     ColPartition* lower = part->SingletonPartner(false);
00780     if (lower)
00781       part->set_nearest_neighbor_below(lower);
00782   }
00783 }
00784 
00785 // High level interface. Input is an unmarked ColPartitionGrid
00786 // (namely, clean_part_grid_). Partitions are identified using local
00787 // information and filter/smoothed. The function exit should contain
00788 // a good sampling of the table partitions.
00789 void TableFinder::MarkTablePartitions() {
00790   MarkPartitionsUsingLocalInformation();
00791   if (textord_tablefind_show_mark) {
00792     ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
00793     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00794     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00795                          ScrollView::AQUAMARINE);
00796   }
00797   FilterFalseAlarms();
00798   if (textord_tablefind_show_mark) {
00799     ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
00800     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00801     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00802                          ScrollView::AQUAMARINE);
00803   }
00804   SmoothTablePartitionRuns();
00805   if (textord_tablefind_show_mark) {
00806     ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
00807     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00808     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00809                          ScrollView::AQUAMARINE);
00810   }
00811   FilterFalseAlarms();
00812   if (textord_tablefind_show_mark || textord_show_tables) {
00813     ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
00814     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00815     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00816                          ScrollView::AQUAMARINE);
00817   }
00818 }
00819 
00820 // These types of partitions are marked as table partitions:
00821 //  1- Partitions that have at lease one large gap between words
00822 //  2- Partitions that consist of only one word (no significant gap
00823 //     between components)
00824 //  3- Partitions that vertically overlap with other partitions within the
00825 //     same column.
00826 //  4- Partitions with leaders before/after them.
00827 void TableFinder::MarkPartitionsUsingLocalInformation() {
00828   // Iterate the ColPartitions in the grid.
00829   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
00830     gsearch(&clean_part_grid_);
00831   gsearch.StartFullSearch();
00832   ColPartition* part = NULL;
00833   while ((part = gsearch.NextFullSearch()) != NULL) {
00834     if (!part->IsTextType())  // Only consider text partitions
00835       continue;
00836     // Only consider partitions in dominant font size or smaller
00837     if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
00838       continue;
00839     // Mark partitions with a large gap, or no significant gap as
00840     // table partitions.
00841     // Comments: It produces several false alarms at:
00842     //  - last line of a paragraph (fixed)
00843     //  - single word section headings
00844     //  - page headers and footers
00845     //  - numbered equations
00846     //  - line drawing regions
00847     // TODO(faisal): detect and fix above-mentioned cases
00848     if (HasWideOrNoInterWordGap(part) ||
00849         HasLeaderAdjacent(*part)) {
00850       part->set_table_type();
00851     }
00852   }
00853 }
00854 
00855 // Check if the partition has at least one large gap between words or no
00856 // significant gap at all
00857 bool TableFinder::HasWideOrNoInterWordGap(ColPartition* part) const {
00858   // Should only get text partitions.
00859   ASSERT_HOST(part->IsTextType());
00860   // Blob access
00861   BLOBNBOX_CLIST* part_boxes = part->boxes();
00862   BLOBNBOX_C_IT it(part_boxes);
00863   // Check if this is a relatively small partition (such as a single word)
00864   if (part->bounding_box().width() <
00865       kMinBoxesInTextPartition * part->median_size() &&
00866       part_boxes->length() < kMinBoxesInTextPartition)
00867     return true;
00868 
00869   // Variables used to compute inter-blob spacing.
00870   int current_x0 = -1;
00871   int current_x1 = -1;
00872   int previous_x1 = -1;
00873   // Stores the maximum gap detected.
00874   int largest_partition_gap_found = -1;
00875   // Text partition gap limits. If this is text (and not a table),
00876   // there should be at least one gap larger than min_gap and no gap
00877   // larger than max_gap.
00878   const double max_gap = kMaxGapInTextPartition * part->median_size();
00879   const double min_gap = kMinMaxGapInTextPartition * part->median_size();
00880 
00881   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00882     BLOBNBOX* blob = it.data();
00883     current_x0 = blob->bounding_box().left();
00884     current_x1 = blob->bounding_box().right();
00885     if (previous_x1 != -1) {
00886       int gap = current_x0 - previous_x1;
00887 
00888       // TODO(nbeato): Boxes may overlap? Huh?
00889       // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
00890       // on the top right of the page are filtered out with this line.
00891       // Note 2: Iterating over blobs in a partition, so we are looking for
00892       // spacing between the words.
00893       if (gap < 0) {
00894         // More likely case, the blobs slightly overlap. This can happen
00895         // with diacritics (accents) or broken alphabet symbols (characters).
00896         // Merge boxes together by taking max of right sides.
00897         if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
00898           previous_x1 = MAX(previous_x1, current_x1);
00899           continue;
00900         }
00901         // Extreme case, blobs overlap significantly in the same partition...
00902         // This should not happen often (if at all), but it does.
00903         // TODO(nbeato): investigate cases when this happens.
00904         else {
00905           // The behavior before was to completely ignore this case.
00906         }
00907       }
00908 
00909       // If a large enough gap is found, mark it as a table cell (return true)
00910       if (gap > max_gap)
00911         return true;
00912       if (gap > largest_partition_gap_found)
00913         largest_partition_gap_found = gap;
00914     }
00915     previous_x1 = current_x1;
00916   }
00917   // Since no large gap was found, return false if the partition is too
00918   // long to be a data cell
00919   if (part->bounding_box().width() >
00920       kMaxBoxesInDataPartition * part->median_size() ||
00921       part_boxes->length() > kMaxBoxesInDataPartition)
00922     return false;
00923 
00924   // A partition may be a single blob. In this case, it's an isolated symbol
00925   // or non-text (such as a ruling or image).
00926   // Detect these as table partitions? Shouldn't this be case by case?
00927   // The behavior before was to ignore this, making max_partition_gap < 0
00928   // and implicitly return true. Just making it explicit.
00929   if (largest_partition_gap_found == -1)
00930     return true;
00931 
00932   // return true if the maximum gap found is smaller than the minimum allowed
00933   // max_gap in a text partition. This indicates that there is no signficant
00934   // space in the partition, hence it is likely a single word.
00935   return largest_partition_gap_found < min_gap;
00936 }
00937 
00938 // A criteria for possible tables is that a table may have leaders
00939 // between data cells. An aggressive solution to find such tables is to
00940 // explicitly mark partitions that have adjacent leaders.
00941 // Note that this includes overlapping leaders. However, it does not
00942 // include leaders in different columns on the page.
00943 // Possible false-positive will include lists, such as a table of contents.
00944 // As these arise, the agressive nature of this search may need to be
00945 // trimmed down.
00946 bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
00947   if (part.flow() == BTFT_LEADER)
00948     return true;
00949   // Search range is left and right bounded by an offset of the
00950   // median xheight. This offset is to allow some tolerance to the
00951   // the leaders on the page in the event that the alignment is still
00952   // a bit off.
00953   const TBOX& box = part.bounding_box();
00954   const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
00955   const int top = box.top() + search_size;
00956   const int bottom = box.bottom() - search_size;
00957   ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
00958   for (int direction = 0; direction < 2; ++direction) {
00959     bool right_to_left = (direction == 0);
00960     int x = right_to_left ? box.right() : box.left();
00961     hsearch.StartSideSearch(x, bottom, top);
00962     ColPartition* leader = NULL;
00963     while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
00964       // This should not happen, they are in different grids.
00965       ASSERT_HOST(&part != leader);
00966       // The leader could be a horizontal ruling in the grid.
00967       // Make sure it is actually a leader.
00968       if (leader->flow() != BTFT_LEADER)
00969         continue;
00970       // Make sure the leader shares a page column with the partition,
00971       // otherwise we are spreading across columns.
00972       if (!part.IsInSameColumnAs(*leader))
00973         break;
00974       // There should be a significant vertical overlap
00975       if (!leader->VSignificantCoreOverlap(part))
00976         continue;
00977       // Leader passed all tests, so it is adjacent.
00978       return true;
00979     }
00980   }
00981   // No leaders are adjacent to the given partition.
00982   return false;
00983 }
00984 
00985 // Filter individual text partitions marked as table partitions
00986 // consisting of paragraph endings, small section headings, and
00987 // headers and footers.
00988 void TableFinder::FilterFalseAlarms() {
00989   FilterParagraphEndings();
00990   FilterHeaderAndFooter();
00991   // TODO(nbeato): Fully justified text as non-table?
00992 }
00993 
00994 void TableFinder::FilterParagraphEndings() {
00995   // Detect last line of paragraph
00996   // Iterate the ColPartitions in the grid.
00997   ColPartitionGridSearch gsearch(&clean_part_grid_);
00998   gsearch.StartFullSearch();
00999   ColPartition* part = NULL;
01000   while ((part = gsearch.NextFullSearch()) != NULL) {
01001     if (part->type() != PT_TABLE)
01002       continue;  // Consider only table partitions
01003 
01004     // Paragraph ending should have flowing text above it.
01005     ColPartition* upper_part = part->nearest_neighbor_above();
01006     if (!upper_part)
01007       continue;
01008     if (upper_part->type() != PT_FLOWING_TEXT)
01009       continue;
01010     if (upper_part->bounding_box().width() <
01011         2 * part->bounding_box().width())
01012       continue;
01013     // Check if its the last line of a paragraph.
01014     // In most cases, a paragraph ending should be left-aligned to text line
01015     // above it. Sometimes, it could be a 2 line paragraph, in which case
01016     // the line above it is indented.
01017     // To account for that, check if the partition center is to
01018     // the left of the one above it.
01019     int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
01020     int upper_mid = (upper_part->bounding_box().left() +
01021                      upper_part->bounding_box().right()) / 2;
01022     int current_spacing = 0;  // spacing of the current line to margin
01023     int upper_spacing = 0;    // spacing of the previous line to the margin
01024     if (left_to_right_language_) {
01025       // Left to right languages, use mid - left to figure out the distance
01026       // the middle is from the left margin.
01027       int left = MIN(part->bounding_box().left(),
01028                      upper_part->bounding_box().left());
01029       current_spacing = mid - left;
01030       upper_spacing = upper_mid - left;
01031     } else {
01032       // Right to left languages, use right - mid to figure out the distance
01033       // the middle is from the right margin.
01034       int right = MAX(part->bounding_box().right(),
01035                       upper_part->bounding_box().right());
01036       current_spacing = right - mid;
01037       upper_spacing = right - upper_mid;
01038     }
01039     if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
01040       continue;
01041 
01042     // Paragraphs should have similar fonts.
01043     if (!part->MatchingSizes(*upper_part) ||
01044         !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
01045                                    kStrokeWidthConstantTolerance)) {
01046       continue;
01047     }
01048 
01049     // The last line of a paragraph should be left aligned.
01050     // TODO(nbeato): This would be untrue if the text was right aligned.
01051     // How often is that?
01052     if (part->space_to_left() >
01053         kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
01054       continue;
01055     // The line above it should be right aligned (assuming justified format).
01056     // Since we can't assume justified text, we compare whitespace to text.
01057     // The above line should have majority spanning text (or the current
01058     // line could have fit on the previous line). So compare
01059     // whitespace to text.
01060     if (upper_part->bounding_box().width() <
01061         kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
01062       continue;
01063 
01064     // Ledding above the line should be less than ledding below
01065     if (part->space_above() >= part->space_below() ||
01066         part->space_above() > 2 * global_median_ledding_)
01067       continue;
01068 
01069     // If all checks failed, it is probably text.
01070     part->clear_table_type();
01071   }
01072 }
01073 
01074 void TableFinder::FilterHeaderAndFooter() {
01075   // Consider top-most text colpartition as header and bottom most as footer
01076   ColPartition* header = NULL;
01077   ColPartition* footer = NULL;
01078   int max_top = MIN_INT32;
01079   int min_bottom = MAX_INT32;
01080   ColPartitionGridSearch gsearch(&clean_part_grid_);
01081   gsearch.StartFullSearch();
01082   ColPartition* part = NULL;
01083   while ((part = gsearch.NextFullSearch()) != NULL) {
01084     if (!part->IsTextType())
01085       continue;  // Consider only text partitions
01086     int top = part->bounding_box().top();
01087     int bottom = part->bounding_box().bottom();
01088     if (top > max_top) {
01089       max_top = top;
01090       header = part;
01091     }
01092     if (bottom < min_bottom) {
01093       min_bottom = bottom;
01094       footer = part;
01095     }
01096   }
01097   if (header)
01098     header->clear_table_type();
01099   if (footer)
01100     footer->clear_table_type();
01101 }
01102 
01103 // Mark all ColPartitions as table cells that have a table cell above
01104 // and below them
01105 // TODO(faisal): This is too aggressive at the moment. The method needs to
01106 // consider spacing and alignment as well. Detection of false alarm table cells
01107 // should also be done as part of it.
01108 void TableFinder::SmoothTablePartitionRuns() {
01109   // Iterate the ColPartitions in the grid.
01110   ColPartitionGridSearch gsearch(&clean_part_grid_);
01111   gsearch.StartFullSearch();
01112   ColPartition* part = NULL;
01113   while ((part = gsearch.NextFullSearch()) != NULL) {
01114     if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
01115       continue;  // Consider only text partitions
01116     ColPartition* upper_part = part->nearest_neighbor_above();
01117     ColPartition* lower_part = part->nearest_neighbor_below();
01118     if (!upper_part || !lower_part)
01119       continue;
01120     if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
01121       part->set_table_type();
01122   }
01123 
01124   // Pass 2, do the opposite. If both the upper and lower neighbors
01125   // exist and are not tables, this probably shouldn't be a table.
01126   gsearch.StartFullSearch();
01127   part = NULL;
01128   while ((part = gsearch.NextFullSearch()) != NULL) {
01129     if (part->type() != PT_TABLE)
01130       continue;  // Consider only text partitions
01131     ColPartition* upper_part = part->nearest_neighbor_above();
01132     ColPartition* lower_part = part->nearest_neighbor_below();
01133 
01134     // table can't be by itself
01135     if ((upper_part && upper_part->type() != PT_TABLE) &&
01136         (lower_part && lower_part->type() != PT_TABLE)) {
01137       part->clear_table_type();
01138     }
01139   }
01140 }
01141 
01142 // Set the type of a column segment based on the ratio of table to text cells
01143 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
01144   ColSegment_IT it(column_blocks);
01145   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01146     ColSegment* seg = it.data();
01147     TBOX box = seg->bounding_box();
01148     int num_table_cells = 0;
01149     int num_text_cells = 0;
01150     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01151         rsearch(&clean_part_grid_);
01152     rsearch.SetUniqueMode(true);
01153     rsearch.StartRectSearch(box);
01154     ColPartition* part = NULL;
01155     while ((part = rsearch.NextRectSearch()) != NULL) {
01156       if (part->type() == PT_TABLE) {
01157         num_table_cells++;
01158       } else if (part->type() == PT_FLOWING_TEXT) {
01159         num_text_cells++;
01160       }
01161     }
01162     // If a column block has no text or table partition in it, it is not needed
01163     // for table detection.
01164     if (!num_table_cells && !num_text_cells) {
01165       delete it.extract();
01166     } else {
01167       seg->set_num_table_cells(num_table_cells);
01168       seg->set_num_text_cells(num_text_cells);
01169       // set column type based on the ratio of table to text cells
01170       seg->set_type();
01171     }
01172   }
01173 }
01174 
01175 // Move column blocks to grid
01176 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
01177                                          ColSegmentGrid *col_seg_grid) {
01178   ColSegment_IT it(segments);
01179   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01180     ColSegment* seg = it.extract();
01181     col_seg_grid->InsertBBox(true, true, seg);
01182   }
01183 }
01184 
01185 // Merge column blocks if a split is detected due to the presence of a
01186 // table. A text block is considered split if it has multiple
01187 // neighboring blocks above/below it, and at least one of the
01188 // neighboring blocks is of table type (has a high density of table
01189 // partitions). In this case neighboring blocks in the direction
01190 // (above/below) of the table block are merged with the text block.
01191 
01192 // Comment: This method does not handle split due to a full page table
01193 // since table columns in this case do not have a text column on which
01194 // split decision can be based.
01195 void TableFinder::GridMergeColumnBlocks() {
01196   int margin = gridsize();
01197 
01198   // Iterate the Column Blocks in the grid.
01199   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01200     gsearch(&col_seg_grid_);
01201   gsearch.StartFullSearch();
01202   ColSegment* seg;
01203   while ((seg = gsearch.NextFullSearch()) != NULL) {
01204     if (seg->type() != COL_TEXT)
01205       continue;  // only consider text blocks for split detection
01206     bool neighbor_found = false;
01207     bool modified = false;  // Modified at least once
01208     // keep expanding current box as long as neighboring table columns
01209     // are found above or below it.
01210     do {
01211       TBOX box = seg->bounding_box();
01212       // slightly expand the search region vertically
01213       int top_range = MIN(box.top() + margin, tright().y());
01214       int bottom_range = MAX(box.bottom() - margin, bleft().y());
01215       box.set_top(top_range);
01216       box.set_bottom(bottom_range);
01217       neighbor_found = false;
01218       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01219           rectsearch(&col_seg_grid_);
01220       rectsearch.StartRectSearch(box);
01221       ColSegment* neighbor = NULL;
01222       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
01223         if (neighbor == seg)
01224           continue;
01225         const TBOX& neighbor_box = neighbor->bounding_box();
01226         // If the neighbor box significantly overlaps with the current
01227         // box (due to the expansion of the current box in the
01228         // previous iteration of this loop), remove the neighbor box
01229         // and expand the current box to include it.
01230         if (neighbor_box.overlap_fraction(box) >= 0.9) {
01231           seg->InsertBox(neighbor_box);
01232           modified = true;
01233           rectsearch.RemoveBBox();
01234           gsearch.RepositionIterator();
01235           delete neighbor;
01236           continue;
01237         }
01238         // Only expand if the neighbor box is of table type
01239         if (neighbor->type() != COL_TABLE)
01240           continue;
01241         // Insert the neighbor box into the current column block
01242         if (neighbor_box.major_x_overlap(box) &&
01243             !box.contains(neighbor_box)) {
01244           seg->InsertBox(neighbor_box);
01245           neighbor_found = true;
01246           modified = true;
01247           rectsearch.RemoveBBox();
01248           gsearch.RepositionIterator();
01249           delete neighbor;
01250         }
01251       }
01252     } while (neighbor_found);
01253     if (modified) {
01254       // Because the box has changed, it has to be removed first.
01255       gsearch.RemoveBBox();
01256       col_seg_grid_.InsertBBox(true, true, seg);
01257       gsearch.RepositionIterator();
01258     }
01259   }
01260 }
01261 
01262 // Group horizontally overlapping table partitions into table columns.
01263 // TODO(faisal): This is too aggressive at the moment. The method should
01264 // consider more attributes to group table partitions together. Some common
01265 // errors are:
01266 //  1- page number is merged with a table column above it even
01267 //      if there is a large vertical gap between them.
01268 //  2- column headers go on to catch one of the columns arbitrarily
01269 //  3- an isolated noise blob near page top or bottom merges with the table
01270 //     column below/above it
01271 //  4- cells from two vertically adjacent tables merge together to make a
01272 //     single column resulting in merging of the two tables
01273 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
01274   ColSegment_IT it(table_columns);
01275   // Iterate the ColPartitions in the grid.
01276   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01277     gsearch(&clean_part_grid_);
01278   gsearch.StartFullSearch();
01279   ColPartition* part;
01280   while ((part = gsearch.NextFullSearch()) != NULL) {
01281     if (part->inside_table_column() || part->type() != PT_TABLE)
01282       continue;  // prevent a partition to be assigned to multiple columns
01283     const TBOX& box = part->bounding_box();
01284     ColSegment* col = new ColSegment();
01285     col->InsertBox(box);
01286     part->set_inside_table_column(true);
01287     // Start a search below the current cell to find bottom neighbours
01288     // Note: a full search will always process things above it first, so
01289     // this should be starting at the highest cell and working its way down.
01290     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01291         vsearch(&clean_part_grid_);
01292     vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
01293     ColPartition* neighbor = NULL;
01294     bool found_neighbours = false;
01295     while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
01296       // only consider neighbors not assigned to any column yet
01297       if (neighbor->inside_table_column())
01298         continue;
01299       // Horizontal lines should not break the flow
01300       if (neighbor->IsHorizontalLine())
01301         continue;
01302       // presence of a non-table neighbor marks the end of current
01303       // table column
01304       if (neighbor->type() != PT_TABLE)
01305         break;
01306       // add the neighbor partition to the table column
01307       const TBOX& neighbor_box = neighbor->bounding_box();
01308       col->InsertBox(neighbor_box);
01309       neighbor->set_inside_table_column(true);
01310       found_neighbours = true;
01311     }
01312     if (found_neighbours) {
01313       it.add_after_then_move(col);
01314     } else {
01315       part->set_inside_table_column(false);
01316       delete col;
01317     }
01318   }
01319 }
01320 
01321 // Mark regions in a column that are x-bounded by the column boundaries and
01322 // y-bounded by the table columns' projection on the y-axis as table regions
01323 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
01324                                   ColSegment_LIST* table_regions) {
01325   ColSegment_IT cit(table_columns);
01326   ColSegment_IT rit(table_regions);
01327   // Iterate through column blocks
01328   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01329       gsearch(&col_seg_grid_);
01330   gsearch.StartFullSearch();
01331   ColSegment* part;
01332   int page_height = tright().y() - bleft().y();
01333   ASSERT_HOST(page_height > 0);
01334   // create a bool array to hold projection on y-axis
01335   bool* table_region = new bool[page_height];
01336   while ((part = gsearch.NextFullSearch()) != NULL) {
01337     TBOX part_box = part->bounding_box();
01338     // reset the projection array
01339     for (int i = 0; i < page_height; i++) {
01340       table_region[i] = false;
01341     }
01342     // iterate through all table columns to find regions in the current
01343     // page column block
01344     cit.move_to_first();
01345     for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
01346       TBOX col_box = cit.data()->bounding_box();
01347       // find intersection region of table column and page column
01348       TBOX intersection_box = col_box.intersection(part_box);
01349       // project table column on the y-axis
01350       for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
01351         table_region[i - bleft().y()] = true;
01352       }
01353     }
01354     // set x-limits of table regions to page column width
01355     TBOX current_table_box;
01356     current_table_box.set_left(part_box.left());
01357     current_table_box.set_right(part_box.right());
01358     // go through the y-axis projection to find runs of table
01359     // regions. Each run makes one table region.
01360     for (int i = 1; i < page_height; i++) {
01361       // detect start of a table region
01362       if (!table_region[i - 1] && table_region[i]) {
01363         current_table_box.set_bottom(i + bleft().y());
01364       }
01365       // TODO(nbeato): Is it guaranteed that the last row is not a table region?
01366       // detect end of a table region
01367       if (table_region[i - 1] && !table_region[i]) {
01368         current_table_box.set_top(i + bleft().y());
01369         if (!current_table_box.null_box()) {
01370           ColSegment* seg = new ColSegment();
01371           seg->InsertBox(current_table_box);
01372           rit.add_after_then_move(seg);
01373         }
01374       }
01375     }
01376   }
01377   delete[] table_region;
01378 }
01379 
01380 // Merge table regions corresponding to tables spanning multiple columns if
01381 // there is a colpartition (horizontal ruling line or normal text) that
01382 // touches both regions.
01383 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
01384 // tables with aligned ruling lines. In this case, line finder returns a
01385 // single line and hence the tables get merged together
01386 void TableFinder::GridMergeTableRegions() {
01387   // Iterate the table regions in the grid.
01388   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01389       gsearch(&table_grid_);
01390   gsearch.StartFullSearch();
01391   ColSegment* seg = NULL;
01392   while ((seg = gsearch.NextFullSearch()) != NULL) {
01393     bool neighbor_found = false;
01394     bool modified = false;  // Modified at least once
01395     do {
01396       // Start a rectangle search x-bounded by the image and y by the table
01397       const TBOX& box = seg->bounding_box();
01398       TBOX search_region(box);
01399       search_region.set_left(bleft().x());
01400       search_region.set_right(tright().x());
01401       neighbor_found = false;
01402       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01403           rectsearch(&table_grid_);
01404       rectsearch.StartRectSearch(search_region);
01405       ColSegment* neighbor = NULL;
01406       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
01407         if (neighbor == seg)
01408           continue;
01409         const TBOX& neighbor_box = neighbor->bounding_box();
01410         // Check if a neighbor box has a large overlap with the table
01411         // region.  This may happen as a result of merging two table
01412         // regions in the previous iteration.
01413         if (neighbor_box.overlap_fraction(box) >= 0.9) {
01414           seg->InsertBox(neighbor_box);
01415           rectsearch.RemoveBBox();
01416           gsearch.RepositionIterator();
01417           delete neighbor;
01418           modified = true;
01419           continue;
01420         }
01421         // Check if two table regions belong together based on a common
01422         // horizontal ruling line
01423         if (BelongToOneTable(box, neighbor_box)) {
01424           seg->InsertBox(neighbor_box);
01425           neighbor_found = true;
01426           modified = true;
01427           rectsearch.RemoveBBox();
01428           gsearch.RepositionIterator();
01429           delete neighbor;
01430         }
01431       }
01432     } while (neighbor_found);
01433     if (modified) {
01434       // Because the box has changed, it has to be removed first.
01435       gsearch.RemoveBBox();
01436       table_grid_.InsertBBox(true, true, seg);
01437       gsearch.RepositionIterator();
01438     }
01439   }
01440 }
01441 
01442 // Decide if two table regions belong to one table based on a common
01443 // horizontal ruling line or another colpartition
01444 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
01445   // Check the obvious case. Most likely not true because overlapping boxes
01446   // should already be merged, but seems like a good thing to do in case things
01447   // change.
01448   if (box1.overlap(box2))
01449     return true;
01450   // Check for ColPartitions spanning both table regions
01451   TBOX bbox = box1.bounding_union(box2);
01452   // Start a rect search on bbox
01453   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01454       rectsearch(&clean_part_grid_);
01455   rectsearch.StartRectSearch(bbox);
01456   ColPartition* part = NULL;
01457   while ((part = rectsearch.NextRectSearch()) != NULL) {
01458     const TBOX& part_box = part->bounding_box();
01459     // return true if a colpartition spanning both table regions is found
01460     if (part_box.overlap(box1) && part_box.overlap(box2))
01461       return true;
01462   }
01463   return false;
01464 }
01465 
01466 // Adjust table boundaries by:
01467 //  - building a tight bounding box around all ColPartitions contained in it.
01468 //  - expanding table boundaries to include all colpartitions that overlap the
01469 //    table by more than half of their area
01470 //  - expanding table boundaries to include nearby horizontal rule lines
01471 //  - expanding table vertically to include left out column headers
01472 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
01473 //               makes following errors:
01474 //  1- horizontal lines consisting of underlines are included in the table if
01475 //     they are close enough
01476 //  2- horizontal lines originating from noise tend to get merged with a table
01477 //     near the top of the page
01478 //  3- the criteria for including horizontal lines is very generous. Many times
01479 //     horizontal lines separating headers and footers get merged with a
01480 //     single-column table in a multi-column page thereby including text
01481 //     from the neighboring column inside the table
01482 //  4- the criteria for including left out column headers also tends to
01483 //     occasionally include text-lines above the tables, typically from
01484 //     table caption
01485 void TableFinder::AdjustTableBoundaries() {
01486   // Iterate the table regions in the grid
01487   ColSegment_CLIST adjusted_tables;
01488   ColSegment_C_IT it(&adjusted_tables);
01489   ColSegmentGridSearch gsearch(&table_grid_);
01490   gsearch.StartFullSearch();
01491   ColSegment* table = NULL;
01492   while ((table = gsearch.NextFullSearch()) != NULL) {
01493     const TBOX& table_box = table->bounding_box();
01494     TBOX grown_box = table_box;
01495     GrowTableBox(table_box, &grown_box);
01496     // To prevent a table from expanding again, do not insert the
01497     // modified box back to the grid. Instead move it to a list and
01498     // and remove it from the grid. The list is moved later back to the grid.
01499     if (!grown_box.null_box()) {
01500       ColSegment* col = new ColSegment();
01501       col->InsertBox(grown_box);
01502       it.add_after_then_move(col);
01503     }
01504     gsearch.RemoveBBox();
01505     delete table;
01506   }
01507   // clear table grid to move final tables in it
01508   // TODO(nbeato): table_grid_ should already be empty. The above loop
01509   // removed everything. Maybe just assert it is empty?
01510   table_grid_.Clear();
01511   it.move_to_first();
01512   // move back final tables to table_grid_
01513   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01514     ColSegment* seg = it.extract();
01515     table_grid_.InsertBBox(true, true, seg);
01516   }
01517 }
01518 
01519 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
01520   // TODO(nbeato): The growing code is a bit excessive right now.
01521   // By removing these lines, the partitions considered need
01522   // to have some overlap or be special cases. These lines could
01523   // be added again once a check is put in place to make sure that
01524   // growing tables don't stomp on a lot of non-table partitions.
01525 
01526   // search for horizontal ruling lines within the vertical margin
01527   // int vertical_margin = kRulingVerticalMargin * gridsize();
01528   TBOX search_box = table_box;
01529   // int top = MIN(search_box.top() + vertical_margin, tright().y());
01530   // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
01531   // search_box.set_top(top);
01532   // search_box.set_bottom(bottom);
01533 
01534   GrowTableToIncludePartials(table_box, search_box, result_box);
01535   GrowTableToIncludeLines(table_box, search_box, result_box);
01536   IncludeLeftOutColumnHeaders(result_box);
01537 }
01538 
01539 // Grow a table by increasing the size of the box to include
01540 // partitions with significant overlap with the table.
01541 void TableFinder::GrowTableToIncludePartials(const TBOX& table_box,
01542                                              const TBOX& search_range,
01543                                              TBOX* result_box) {
01544   // Rulings are in a different grid, so search 2 grids for rulings, text,
01545   // and table partitions that are not entirely within the new box.
01546   for (int i = 0; i < 2; ++i) {
01547     ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
01548                                         &leader_and_ruling_grid_;
01549     ColPartitionGridSearch rectsearch(grid);
01550     rectsearch.StartRectSearch(search_range);
01551     ColPartition* part = NULL;
01552     while ((part = rectsearch.NextRectSearch()) != NULL) {
01553      // Only include text and table types.
01554       if (part->IsImageType())
01555         continue;
01556       const TBOX& part_box = part->bounding_box();
01557       // Include partition in the table if more than half of it
01558       // is covered by the table
01559       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
01560         *result_box = result_box->bounding_union(part_box);
01561         continue;
01562       }
01563     }
01564   }
01565 }
01566 
01567 // Grow a table by expanding to the extents of significantly
01568 // overlapping lines.
01569 void TableFinder::GrowTableToIncludeLines(const TBOX& table_box,
01570                                           const TBOX& search_range,
01571                                           TBOX* result_box) {
01572   ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
01573   rsearch.SetUniqueMode(true);
01574   rsearch.StartRectSearch(search_range);
01575   ColPartition* part = NULL;
01576   while ((part = rsearch.NextRectSearch()) != NULL) {
01577     // TODO(nbeato) This should also do vertical, but column
01578     // boundaries are breaking things. This function needs to be
01579     // updated to allow vertical lines as well.
01580     if (!part->IsLineType())
01581       continue;
01582     // Avoid the following function call if the result of the
01583     // function is irrelevant.
01584     const TBOX& part_box = part->bounding_box();
01585     if (result_box->contains(part_box))
01586       continue;
01587     // Include a partially overlapping horizontal line only if the
01588     // extra ColPartitions that will be included due to expansion
01589     // have large side spacing w.r.t. columns containing them.
01590     if (HLineBelongsToTable(*part, table_box))
01591       *result_box = result_box->bounding_union(part_box);
01592     // TODO(nbeato): Vertical
01593   }
01594 }
01595 
01596 // Checks whether the horizontal line belong to the table by looking at the
01597 // side spacing of extra ColParitions that will be included in the table
01598 // due to expansion
01599 bool TableFinder::HLineBelongsToTable(const ColPartition& part,
01600                                       const TBOX& table_box) {
01601   if (!part.IsHorizontalLine())
01602     return false;
01603   const TBOX& part_box = part.bounding_box();
01604   if (!part_box.major_x_overlap(table_box))
01605     return false;
01606   // Do not consider top-most horizontal line since it usually
01607   // originates from noise.
01608   // TODO(nbeato): I had to comment this out because the ruling grid doesn't
01609   // have neighbors solved.
01610   // if (!part.nearest_neighbor_above())
01611   //   return false;
01612   const TBOX bbox = part_box.bounding_union(table_box);
01613   // In the "unioned table" box (the table extents expanded by the line),
01614   // keep track of how many partitions have significant padding to the left
01615   // and right. If more than half of the partitions covered by the new table
01616   // have significant spacing, the line belongs to the table and the table
01617   // grows to include all of the partitions.
01618   int num_extra_partitions = 0;
01619   int extra_space_to_right = 0;
01620   int extra_space_to_left = 0;
01621   // Rulings are in a different grid, so search 2 grids for rulings, text,
01622   // and table partitions that are introduced by the new box.
01623   for (int i = 0; i < 2; ++i) {
01624     ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
01625                                         &leader_and_ruling_grid_;
01626     // Start a rect search on bbox
01627     ColPartitionGridSearch rectsearch(grid);
01628     rectsearch.SetUniqueMode(true);
01629     rectsearch.StartRectSearch(bbox);
01630     ColPartition* extra_part = NULL;
01631     while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
01632       // ColPartition already in table
01633       const TBOX& extra_part_box = extra_part->bounding_box();
01634       if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
01635         continue;
01636       // Non-text ColPartitions do not contribute
01637       if (extra_part->IsImageType())
01638         continue;
01639       // Consider this partition.
01640       num_extra_partitions++;
01641       // presence of a table cell is a strong hint, so just increment the scores
01642       // without looking at the spacing.
01643       if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
01644         extra_space_to_right++;
01645         extra_space_to_left++;
01646         continue;
01647       }
01648       int space_threshold = kSideSpaceMargin * part.median_size();
01649       if (extra_part->space_to_right() > space_threshold)
01650         extra_space_to_right++;
01651       if (extra_part->space_to_left() > space_threshold)
01652         extra_space_to_left++;
01653     }
01654   }
01655   // tprintf("%d %d %d\n",
01656   // num_extra_partitions,extra_space_to_right,extra_space_to_left);
01657   return (extra_space_to_right > num_extra_partitions / 2) ||
01658       (extra_space_to_left > num_extra_partitions / 2);
01659 }
01660 
01661 // Look for isolated column headers above the given table box and
01662 // include them in the table
01663 void TableFinder::IncludeLeftOutColumnHeaders(TBOX* table_box) {
01664   // Start a search above the current table to look for column headers
01665   ColPartitionGridSearch vsearch(&clean_part_grid_);
01666   vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
01667                               table_box->top());
01668   ColPartition* neighbor = NULL;
01669   ColPartition* previous_neighbor = NULL;
01670   while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
01671     // Max distance to find a table heading.
01672     const int max_distance = kMaxColumnHeaderDistance *
01673                              neighbor->median_size();
01674     int table_top = table_box->top();
01675     const TBOX& box = neighbor->bounding_box();
01676     // Do not continue if the next box is way above
01677     if (box.bottom() - table_top > max_distance)
01678       break;
01679     // Unconditionally include partitions of type TABLE or LINE
01680     // TODO(faisal): add some reasonable conditions here
01681     if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
01682       table_box->set_top(box.top());
01683       previous_neighbor = NULL;
01684       continue;
01685     }
01686     // If there are two text partitions, one above the other, without a table
01687     // cell on their left or right side, consider them a barrier and quit
01688     if (previous_neighbor == NULL) {
01689       previous_neighbor = neighbor;
01690     } else {
01691       const TBOX& previous_box = previous_neighbor->bounding_box();
01692       if (!box.major_y_overlap(previous_box))
01693         break;
01694     }
01695   }
01696 }
01697 
01698 // Remove false alarms consiting of a single column based on their
01699 // projection on the x-axis. Projection of a real table on the x-axis
01700 // should have at least one zero-valley larger than the global median
01701 // x-height of the page.
01702 void TableFinder::DeleteSingleColumnTables() {
01703   int page_width = tright().x() - bleft().x();
01704   ASSERT_HOST(page_width > 0);
01705   // create an integer array to hold projection on x-axis
01706   int* table_xprojection = new int[page_width];
01707   // Iterate through all tables in the table grid
01708   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01709       table_search(&table_grid_);
01710   table_search.StartFullSearch();
01711   ColSegment* table;
01712   while ((table = table_search.NextFullSearch()) != NULL) {
01713     TBOX table_box = table->bounding_box();
01714     // reset the projection array
01715     for (int i = 0; i < page_width; i++) {
01716       table_xprojection[i] = 0;
01717     }
01718     // Start a rect search on table_box
01719     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01720         rectsearch(&clean_part_grid_);
01721     rectsearch.SetUniqueMode(true);
01722     rectsearch.StartRectSearch(table_box);
01723     ColPartition* part;
01724     while ((part = rectsearch.NextRectSearch()) != NULL) {
01725       if (!part->IsTextType())
01726         continue;  // Do not consider non-text partitions
01727       if (part->flow() == BTFT_LEADER)
01728         continue;  // Assume leaders are in tables
01729       TBOX part_box = part->bounding_box();
01730       // Do not consider partitions partially covered by the table
01731       if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
01732         continue;
01733       BLOBNBOX_CLIST* part_boxes = part->boxes();
01734       BLOBNBOX_C_IT pit(part_boxes);
01735 
01736       // Make sure overlapping blobs don't artificially inflate the number
01737       // of rows in the table. This happens frequently with things such as
01738       // decimals and split characters. Do this by assuming the column
01739       // partition is sorted mostly left to right and just clip
01740       // bounding boxes by the previous box's extent.
01741       int next_position_to_write = 0;
01742 
01743       for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
01744         BLOBNBOX *pblob = pit.data();
01745         // ignore blob height for the purpose of projection since we
01746         // are only interested in finding valleys
01747         int xstart = pblob->bounding_box().left();
01748         int xend = pblob->bounding_box().right();
01749 
01750         xstart = MAX(xstart, next_position_to_write);
01751         for (int i = xstart; i < xend; i++)
01752           table_xprojection[i - bleft().x()]++;
01753         next_position_to_write = xend;
01754       }
01755     }
01756     // Find largest valley between two reasonable peaks in the table
01757     if (!GapInXProjection(table_xprojection, page_width)) {
01758       table_search.RemoveBBox();
01759       delete table;
01760     }
01761   }
01762   delete[] table_xprojection;
01763 }
01764 
01765 // Return true if at least one gap larger than the global x-height
01766 // exists in the horizontal projection
01767 bool TableFinder::GapInXProjection(int* xprojection, int length) {
01768   // Find peak value of the histogram
01769   int peak_value = 0;
01770   for (int i = 0; i < length; i++) {
01771     if (xprojection[i] > peak_value) {
01772       peak_value = xprojection[i];
01773     }
01774   }
01775   // Peak value represents the maximum number of horizontally
01776   // overlapping colpartitions, so this can be considered as the
01777   // number of rows in the table
01778   if (peak_value < kMinRowsInTable)
01779     return false;
01780   double projection_threshold = kSmallTableProjectionThreshold * peak_value;
01781   if (peak_value >= kLargeTableRowCount)
01782     projection_threshold = kLargeTableProjectionThreshold * peak_value;
01783   // Threshold the histogram
01784   for (int i = 0; i < length; i++) {
01785     xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
01786   }
01787   // Find the largest run of zeros between two ones
01788   int largest_gap = 0;
01789   int run_start = -1;
01790   for (int i = 1; i < length; i++) {
01791     // detect start of a run of zeros
01792     if (xprojection[i - 1] && !xprojection[i]) {
01793       run_start = i;
01794     }
01795     // detect end of a run of zeros and update the value of largest gap
01796     if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
01797       int gap = i - run_start;
01798       if (gap > largest_gap)
01799         largest_gap = gap;
01800       run_start = -1;
01801     }
01802   }
01803   return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
01804 }
01805 
01806 // Given the location of a table "guess", try to overlay a cellular
01807 // grid in the location, adjusting the boundaries.
01808 // TODO(nbeato): Falsely introduces:
01809 //   -headers/footers (not any worse, too much overlap destroys cells)
01810 //   -page numbers (not worse, included because maximize margins)
01811 //   -equations (nicely fit into a celluar grid, but more sparsely)
01812 //   -figures (random text box, also sparse)
01813 //   -small left-aligned text areas with overlapping positioned whitespace
01814 //       (rejected before)
01815 // Overall, this just needs some more work.
01816 void TableFinder::RecognizeTables() {
01817   ScrollView* table_win = NULL;
01818   if (textord_show_tables) {
01819     table_win = MakeWindow(0, 0, "Table Structure");
01820     DisplayColPartitions(table_win, &fragmented_text_grid_,
01821                          ScrollView::BLUE, ScrollView::LIGHT_BLUE);
01822     // table_grid_.DisplayBoxes(table_win);
01823   }
01824 
01825 
01826   TableRecognizer recognizer;
01827   recognizer.Init();
01828   recognizer.set_line_grid(&leader_and_ruling_grid_);
01829   recognizer.set_text_grid(&fragmented_text_grid_);
01830   recognizer.set_max_text_height(global_median_xheight_ * 2.0);
01831   recognizer.set_min_height(1.5 * gridheight());
01832   // Loop over all of the tables and try to fit them.
01833   // Store the good tables here.
01834   ColSegment_CLIST good_tables;
01835   ColSegment_C_IT good_it(&good_tables);
01836 
01837   ColSegmentGridSearch gsearch(&table_grid_);
01838   gsearch.StartFullSearch();
01839   ColSegment* found_table = NULL;
01840   while ((found_table = gsearch.NextFullSearch()) != NULL) {
01841     gsearch.RemoveBBox();
01842 
01843     // The goal is to make the tables persistent in a list.
01844     // When that happens, this will move into the search loop.
01845     const TBOX& found_box = found_table->bounding_box();
01846     StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
01847 
01848     // Process a table. Good tables are inserted into the grid again later on
01849     // We can't change boxes in the grid while it is running a search.
01850     if (table_structure != NULL) {
01851       if (textord_show_tables) {
01852         table_structure->Display(table_win, ScrollView::LIME_GREEN);
01853       }
01854       found_table->set_bounding_box(table_structure->bounding_box());
01855       delete table_structure;
01856       good_it.add_after_then_move(found_table);
01857     } else {
01858       delete found_table;
01859     }
01860   }
01861   // TODO(nbeato): MERGE!! There is awesome info now available for merging.
01862 
01863   // At this point, the grid is empty. We can safely insert the good tables
01864   // back into grid.
01865   for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
01866     table_grid_.InsertBBox(true, true, good_it.extract());
01867 }
01868 
01869 // Displays the column segments in some window.
01870 void TableFinder::DisplayColSegments(ScrollView* win,
01871                                      ColSegment_LIST *segments,
01872                                      ScrollView::Color color) {
01873 #ifndef GRAPHICS_DISABLED
01874   win->Pen(color);
01875   win->Brush(ScrollView::NONE);
01876   ColSegment_IT it(segments);
01877   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01878     ColSegment* col = it.data();
01879     const TBOX& box = col->bounding_box();
01880     int left_x = box.left();
01881     int right_x = box.right();
01882     int top_y = box.top();
01883     int bottom_y = box.bottom();
01884     win->Rectangle(left_x, bottom_y, right_x, top_y);
01885   }
01886   win->UpdateWindow();
01887 #endif
01888 }
01889 
01890 void TableFinder::DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid,
01891                                          ScrollView::Color color) {
01892 #ifndef GRAPHICS_DISABLED
01893   // Iterate the ColPartitions in the grid.
01894   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01895     gsearch(grid);
01896   gsearch.StartFullSearch();
01897   ColSegment* seg = NULL;
01898   while ((seg = gsearch.NextFullSearch()) != NULL) {
01899     const TBOX& box = seg->bounding_box();
01900     int left_x = box.left();
01901     int right_x = box.right();
01902     int top_y = box.top();
01903     int bottom_y = box.bottom();
01904     win->Brush(ScrollView::NONE);
01905     win->Pen(color);
01906     win->Rectangle(left_x, bottom_y, right_x, top_y);
01907   }
01908   win->UpdateWindow();
01909 #endif
01910 }
01911 
01912 // Displays the colpartitions using a new coloring on an existing window.
01913 // Note: This method is only for debug purpose during development and
01914 // would not be part of checked in code
01915 void TableFinder::DisplayColPartitions(ScrollView* win,
01916                                        ColPartitionGrid* grid,
01917                                        ScrollView::Color default_color,
01918                                        ScrollView::Color table_color) {
01919 #ifndef GRAPHICS_DISABLED
01920   ScrollView::Color color = default_color;
01921   // Iterate the ColPartitions in the grid.
01922   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01923     gsearch(grid);
01924   gsearch.StartFullSearch();
01925   ColPartition* part = NULL;
01926   while ((part = gsearch.NextFullSearch()) != NULL) {
01927     color = default_color;
01928     if (part->type() == PT_TABLE)
01929       color = table_color;
01930 
01931     const TBOX& box = part->bounding_box();
01932     int left_x = box.left();
01933     int right_x = box.right();
01934     int top_y = box.top();
01935     int bottom_y = box.bottom();
01936     win->Brush(ScrollView::NONE);
01937     win->Pen(color);
01938     win->Rectangle(left_x, bottom_y, right_x, top_y);
01939   }
01940   win->UpdateWindow();
01941 #endif
01942 }
01943 void TableFinder::DisplayColPartitions(ScrollView* win,
01944                                        ColPartitionGrid* grid,
01945                                        ScrollView::Color default_color) {
01946   DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
01947 }
01948 
01949 void TableFinder::DisplayColPartitionConnections(
01950                      ScrollView* win,
01951                      ColPartitionGrid* grid,
01952                      ScrollView::Color color) {
01953 #ifndef GRAPHICS_DISABLED
01954   // Iterate the ColPartitions in the grid.
01955   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01956     gsearch(grid);
01957   gsearch.StartFullSearch();
01958   ColPartition* part = NULL;
01959   while ((part = gsearch.NextFullSearch()) != NULL) {
01960     const TBOX& box = part->bounding_box();
01961     int left_x = box.left();
01962     int right_x = box.right();
01963     int top_y = box.top();
01964     int bottom_y = box.bottom();
01965 
01966     ColPartition* upper_part = part->nearest_neighbor_above();
01967     if (upper_part) {
01968       TBOX upper_box = upper_part->bounding_box();
01969       int mid_x = (left_x + right_x) / 2;
01970       int mid_y = (top_y + bottom_y) / 2;
01971       int other_x = (upper_box.left() + upper_box.right()) / 2;
01972       int other_y = (upper_box.top() + upper_box.bottom()) / 2;
01973       win->Brush(ScrollView::NONE);
01974       win->Pen(color);
01975       win->Line(mid_x, mid_y, other_x, other_y);
01976     }
01977     ColPartition* lower_part = part->nearest_neighbor_below();
01978     if (lower_part) {
01979       TBOX lower_box = lower_part->bounding_box();
01980       int mid_x = (left_x + right_x) / 2;
01981       int mid_y = (top_y + bottom_y) / 2;
01982       int other_x = (lower_box.left() + lower_box.right()) / 2;
01983       int other_y = (lower_box.top() + lower_box.bottom()) / 2;
01984       win->Brush(ScrollView::NONE);
01985       win->Pen(color);
01986       win->Line(mid_x, mid_y, other_x, other_y);
01987     }
01988   }
01989   win->UpdateWindow();
01990 #endif
01991 }
01992 
01993 
01994 // Write debug image and text file.
01995 // Note: This method is only for debug purpose during development and
01996 // would not be part of checked in code
01997 void TableFinder::WriteToPix(const FCOORD& reskew) {
01998   // Input file must be named test1.tif
01999   PIX* pix = pixRead("test1.tif");
02000   if (!pix) {
02001     tprintf("Input file test1.tif not found.\n");
02002     return;
02003   }
02004   int img_height = pixGetHeight(pix);
02005   int img_width = pixGetWidth(pix);
02006   // Maximum number of text or table partitions
02007   int num_boxes = 10;
02008   BOXA* text_box_array = boxaCreate(num_boxes);
02009   BOXA* table_box_array = boxaCreate(num_boxes);
02010   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02011     gsearch(&clean_part_grid_);
02012   gsearch.StartFullSearch();
02013   ColPartition* part;
02014   // load colpartitions into text_box_array and table_box_array
02015   while ((part = gsearch.NextFullSearch()) != NULL) {
02016     TBOX box = part->bounding_box();
02017     box.rotate_large(reskew);
02018     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
02019                               box.right() - box.left(),
02020                               box.top() - box.bottom());
02021     if (part->type() == PT_TABLE)
02022       boxaAddBox(table_box_array, lept_box, L_INSERT);
02023     else
02024       boxaAddBox(text_box_array, lept_box, L_INSERT);
02025   }
02026   // draw colpartitions on the output image
02027   PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
02028   out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
02029 
02030   BOXA* table_array = boxaCreate(num_boxes);
02031   // text file containing detected table bounding boxes
02032   FILE* fptr = fopen("tess-table.txt", "wb");
02033   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
02034       table_search(&table_grid_);
02035   table_search.StartFullSearch();
02036   ColSegment* table;
02037   // load table boxes to table_array and write them to text file as well
02038   while ((table = table_search.NextFullSearch()) != NULL) {
02039     TBOX box = table->bounding_box();
02040     box.rotate_large(reskew);
02041     // Since deskewing introduces negative coordinates, reskewing
02042     // might not completely recover from that since both steps enlarge
02043     // the actual box. Hence a box that undergoes deskewing/reskewing
02044     // may go out of image boundaries. Crop a table box if needed to
02045     // contain it inside the image dimensions.
02046     box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
02047     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
02048                               box.right() - box.left(),
02049                               box.top() - box.bottom());
02050     boxaAddBox(table_array, lept_box, L_INSERT);
02051     fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
02052             img_height - box.top(), box.right(), img_height - box.bottom());
02053   }
02054   fclose(fptr);
02055   // paint table boxes on the debug image
02056   out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
02057 
02058   pixWrite("out.png", out, IFF_PNG);
02059   // memory cleanup
02060   boxaDestroy(&text_box_array);
02061   boxaDestroy(&table_box_array);
02062   boxaDestroy(&table_array);
02063   pixDestroy(&pix);
02064   pixDestroy(&out);
02065 }
02066 
02067 // Merge all colpartitions in table regions to make them a single
02068 // colpartition and revert types of isolated table cells not
02069 // assigned to any table to their original types.
02070 void TableFinder::MakeTableBlocks(ColPartitionGrid* grid,
02071                                   ColPartitionSet** all_columns,
02072                                   WidthCallback* width_cb) {
02073   // Since we have table blocks already, remove table tags from all
02074   // colpartitions
02075   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02076     gsearch(grid);
02077   gsearch.StartFullSearch();
02078   ColPartition* part = NULL;
02079 
02080   while ((part = gsearch.NextFullSearch()) != NULL) {
02081     if (part->type() == PT_TABLE) {
02082       part->clear_table_type();
02083     }
02084   }
02085   // Now make a single colpartition out of each table block and remove
02086   // all colpartitions contained within a table
02087   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
02088       table_search(&table_grid_);
02089   table_search.StartFullSearch();
02090   ColSegment* table;
02091   while ((table = table_search.NextFullSearch()) != NULL) {
02092     TBOX table_box = table->bounding_box();
02093     // Start a rect search on table_box
02094     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02095         rectsearch(grid);
02096     rectsearch.StartRectSearch(table_box);
02097     ColPartition* part;
02098     ColPartition* table_partition = NULL;
02099     while ((part = rectsearch.NextRectSearch()) != NULL) {
02100      // Do not consider image partitions
02101       if (!part->IsTextType())
02102         continue;
02103       TBOX part_box = part->bounding_box();
02104       // Include partition in the table if more than half of it
02105       // is covered by the table
02106       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
02107         rectsearch.RemoveBBox();
02108         if (table_partition) {
02109           table_partition->Absorb(part, width_cb);
02110         } else {
02111           table_partition = part;
02112         }
02113       }
02114     }
02115     // Insert table colpartition back to part_grid_
02116     if (table_partition) {
02117       // To match the columns used when transforming to blocks, the new table
02118       // partition must have its first and last column set at the grid y that
02119       // corresponds to its bottom.
02120       const TBOX& table_box = table_partition->bounding_box();
02121       int grid_x, grid_y;
02122       grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
02123       table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
02124       table_partition->set_table_type();
02125       table_partition->set_blob_type(BRT_TEXT);
02126       table_partition->set_flow(BTFT_CHAIN);
02127       table_partition->SetBlobTypes();
02128       grid->InsertBBox(true, true, table_partition);
02129     }
02130   }
02131 }
02132 
02135 ColSegment::ColSegment()
02136     : ELIST_LINK(),
02137       num_table_cells_(0),
02138       num_text_cells_(0),
02139       type_(COL_UNKNOWN) {
02140 }
02141 ColSegment::~ColSegment() {
02142 }
02143 
02144 // Provides a color for BBGrid to draw the rectangle.
02145 ScrollView::Color  ColSegment::BoxColor() const {
02146   const ScrollView::Color kBoxColors[PT_COUNT] = {
02147     ScrollView::YELLOW,
02148     ScrollView::BLUE,
02149     ScrollView::YELLOW,
02150     ScrollView::MAGENTA,
02151   };
02152   return kBoxColors[type_];
02153 }
02154 
02155 // Insert a box into this column segment
02156 void ColSegment::InsertBox(const TBOX& other) {
02157   bounding_box_ = bounding_box_.bounding_union(other);
02158 }
02159 
02160 // Set column segment type based on the ratio of text and table partitions
02161 // in it.
02162 void ColSegment::set_type() {
02163   if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
02164     type_ = COL_TABLE;
02165   else if (num_text_cells_ > num_table_cells_)
02166     type_ = COL_TEXT;
02167   else
02168     type_ = COL_MIXED;
02169 }
02170 
02171 }  // namespace tesseract.