Tesseract  3.02
tesseract-ocr/textord/tablerecog.cpp
Go to the documentation of this file.
00001 
00002 // File:        tablerecog.cpp
00003 // Description: Helper class to help structure table areas. Given an bounding
00004 //              box from TableFinder, the TableRecognizer should give a
00005 //              StructuredTable (maybe a list in the future) of "good" tables
00006 //              in that area.
00007 // Author:      Nicholas Beato
00008 // Created:     Friday, Aug. 20, 2010
00009 //
00010 // (C) Copyright 2009, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #include "tablerecog.h"
00024 
00025 namespace tesseract {
00026 
00027 // The amount of space required between the ColPartitions in 2 columns
00028 // of a non-lined table as a multiple of the median width.
00029 const double kHorizontalSpacing = 0.30;
00030 // The amount of space required between the ColPartitions in 2 rows
00031 // of a non-lined table as multiples of the median height.
00032 const double kVerticalSpacing = -0.2;
00033 // The number of cells that the grid lines may intersect.
00034 // See FindCellSplitLocations for explanation.
00035 const int kCellSplitRowThreshold = 0;
00036 const int kCellSplitColumnThreshold = 0;
00037 // For "lined tables", the number of required lines. Currently a guess.
00038 const int kLinedTableMinVerticalLines = 3;
00039 const int kLinedTableMinHorizontalLines = 3;
00040 // Number of columns required, as a fraction of the most columns found.
00041 // None of these are tweaked at all.
00042 const double kRequiredColumns = 0.7;
00043 // The tolerance for comparing margins of potential tables.
00044 const double kMarginFactor = 1.1;
00045 // The first and last row should be consistent cell height.
00046 // This factor is the first and last row cell height max.
00047 const double kMaxRowSize = 2.5;
00048 // Number of filled columns required to form a strong table row.
00049 // For small tables, this is an absolute number.
00050 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
00051 const int kGoodRowNumberOfColumnsSmallSize = 
00052     sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1;
00053 // For large tables, it is a relative number
00054 const double kGoodRowNumberOfColumnsLarge = 0.7;
00055 // The amount of area that must be covered in a cell by ColPartitions to
00056 // be considered "filled"
00057 const double kMinFilledArea = 0.35;
00058 
00062 
00063 StructuredTable::StructuredTable()
00064     : text_grid_(NULL),
00065       line_grid_(NULL),
00066       is_lined_(false),
00067       space_above_(0),
00068       space_below_(0),
00069       space_left_(0),
00070       space_right_(0),
00071       median_cell_height_(0),
00072       median_cell_width_(0),
00073       max_text_height_(MAX_INT32) {
00074 }
00075 
00076 StructuredTable::~StructuredTable() {
00077 }
00078 
00079 void StructuredTable::Init() {
00080 }
00081 
00082 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) {
00083   text_grid_ = text_grid;
00084 }
00085 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) {
00086   line_grid_ = line_grid;
00087 }
00088 void StructuredTable::set_max_text_height(int height) {
00089   max_text_height_ = height;
00090 }
00091 bool StructuredTable::is_lined() const {
00092   return is_lined_;
00093 }
00094 int StructuredTable::row_count() const {
00095   return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
00096 }
00097 int StructuredTable::column_count() const {
00098   return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
00099 }
00100 int StructuredTable::cell_count() const {
00101   return row_count() * column_count();
00102 }
00103 void StructuredTable::set_bounding_box(const TBOX& box) {
00104   bounding_box_ = box;
00105 }
00106 const TBOX& StructuredTable::bounding_box() const {
00107   return bounding_box_;
00108 }
00109 int StructuredTable::median_cell_height() {
00110   return median_cell_height_;
00111 }
00112 int StructuredTable::median_cell_width() {
00113   return median_cell_width_;
00114 }
00115 int StructuredTable::row_height(int row) const {
00116   ASSERT_HOST(0 <= row && row < row_count());
00117   return cell_y_[row + 1] - cell_y_[row];
00118 }
00119 int StructuredTable::column_width(int column) const {
00120   ASSERT_HOST(0 <= column && column < column_count());
00121   return cell_x_[column + 1] - cell_x_[column];
00122 }
00123 int StructuredTable::space_above() const {
00124   return space_above_;
00125 }
00126 int StructuredTable::space_below() const {
00127   return space_below_;
00128 }
00129 
00130 // At this point, we know that the lines are contained
00131 // by the box (by FindLinesBoundingBox).
00132 // So try to find the cell structure and make sure it works out.
00133 // The assumption is that all lines span the table. If this
00134 // assumption fails, the VerifyLinedTable method will
00135 // abort the lined table. The TableRecognizer will fall
00136 // back on FindWhitespacedStructure.
00137 bool StructuredTable::FindLinedStructure() {
00138   ClearStructure();
00139 
00140   // Search for all of the lines in the current box.
00141   // Update the cellular structure with the exact lines.
00142   ColPartitionGridSearch box_search(line_grid_);
00143   box_search.SetUniqueMode(true);
00144   box_search.StartRectSearch(bounding_box_);
00145   ColPartition* line = NULL;
00146 
00147   while ((line = box_search.NextRectSearch()) != NULL) {
00148     if (line->IsHorizontalLine())
00149       cell_y_.push_back(line->MidY());
00150     if (line->IsVerticalLine())
00151       cell_x_.push_back(line->MidX());
00152   }
00153 
00154   // HasSignificantLines should guarantee cells.
00155   // Because that code is a different class, just gracefully
00156   // return false. This could be an assert.
00157   if (cell_x_.length() < 3 || cell_y_.length() < 3)
00158     return false;
00159 
00160   cell_x_.sort();
00161   cell_y_.sort();
00162 
00163   // Remove duplicates that may have occurred due to split lines.
00164   cell_x_.compact_sorted();
00165   cell_y_.compact_sorted();
00166 
00167   // The border should be the extents of line boxes, not middle.
00168   cell_x_[0] = bounding_box_.left();
00169   cell_x_[cell_x_.length() - 1] = bounding_box_.right();
00170   cell_y_[0] = bounding_box_.bottom();
00171   cell_y_[cell_y_.length() - 1] = bounding_box_.top();
00172 
00173   // Remove duplicates that may have occurred due to moving the borders.
00174   cell_x_.compact_sorted();
00175   cell_y_.compact_sorted();
00176 
00177   CalculateMargins();
00178   CalculateStats();
00179   is_lined_ = VerifyLinedTableCells();
00180   return is_lined_;
00181 }
00182 
00183 // Finds the cellular structure given a particular box.
00184 bool StructuredTable::FindWhitespacedStructure() {
00185   ClearStructure();
00186   FindWhitespacedColumns();
00187   FindWhitespacedRows();
00188 
00189   if (!VerifyWhitespacedTable()) {
00190     return false;
00191   } else {
00192     bounding_box_.set_left(cell_x_[0]);
00193     bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
00194     bounding_box_.set_bottom(cell_y_[0]);
00195     bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
00196     AbsorbNearbyLines();
00197     CalculateMargins();
00198     CalculateStats();
00199     return true;
00200   }
00201 }
00202 
00203 // Tests if a partition fits inside the table structure.
00204 // Partitions must fully span a grid line in order to intersect it.
00205 // This means that a partition does not intersect a line
00206 // that it "just" touches. This is mainly because the assumption
00207 // throughout the code is that "0" distance is a very very small space.
00208 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const {
00209   const TBOX& box = part.bounding_box();
00210   for (int i = 0; i < cell_x_.length(); ++i)
00211     if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
00212       return false;
00213   for (int i = 0; i < cell_y_.length(); ++i)
00214     if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
00215       return false;
00216   return true;
00217 }
00218 
00219 // Checks if a sub-table has multiple data cells filled.
00220 int StructuredTable::CountFilledCells() {
00221   return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
00222 }
00223 int StructuredTable::CountFilledCellsInRow(int row) {
00224   return CountFilledCells(row, row, 0, column_count() - 1);
00225 }
00226 int StructuredTable::CountFilledCellsInColumn(int column) {
00227   return CountFilledCells(0, row_count() - 1, column, column);
00228 }
00229 int StructuredTable::CountFilledCells(int row_start, int row_end,
00230                             int column_start, int column_end) {
00231   ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
00232   ASSERT_HOST(0 <= column_start && column_start <= column_end &&
00233               column_end < column_count());
00234   int cell_count = 0;
00235   TBOX cell_box;
00236   for (int row = row_start; row <= row_end; ++row) {
00237     cell_box.set_bottom(cell_y_[row]);
00238     cell_box.set_top(cell_y_[row + 1]);
00239     for (int col = column_start; col <= column_end; ++col) {
00240       cell_box.set_left(cell_x_[col]);
00241       cell_box.set_right(cell_x_[col + 1]);
00242       if (CountPartitions(cell_box) > 0)
00243         ++cell_count;
00244     }
00245   }
00246   return cell_count;
00247 }
00248 
00249 // Makes sure that at least one cell in a row has substantial area filled.
00250 // This can filter out large whitespace caused by growing tables too far
00251 // and page numbers.
00252 bool StructuredTable::VerifyRowFilled(int row) {
00253   for (int i = 0; i < column_count(); ++i) {
00254     double area_filled = CalculateCellFilledPercentage(row, i);
00255     if (area_filled >= kMinFilledArea)
00256       return true;
00257   }
00258   return false;
00259 }
00260 
00261 // Finds the filled area in a cell.
00262 // Assume ColPartitions do not overlap for simplicity (even though they do).
00263 double StructuredTable::CalculateCellFilledPercentage(int row, int column) {
00264   ASSERT_HOST(0 <= row && row <= row_count());
00265   ASSERT_HOST(0 <= column && column <= column_count());
00266   const TBOX kCellBox(cell_x_[column], cell_y_[row],
00267                       cell_x_[column + 1], cell_y_[row + 1]);
00268   ASSERT_HOST(!kCellBox.null_box());
00269 
00270   ColPartitionGridSearch gsearch(text_grid_);
00271   gsearch.SetUniqueMode(true);
00272   gsearch.StartRectSearch(kCellBox);
00273   double area_covered = 0;
00274   ColPartition* text = NULL;
00275   while ((text = gsearch.NextRectSearch()) != NULL) {
00276     if (text->IsTextType())
00277       area_covered += text->bounding_box().intersection(kCellBox).area();
00278   }
00279   return MIN(1.0, area_covered / kCellBox.area());
00280 }
00281 
00282 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) {
00283 #ifndef GRAPHICS_DISABLED
00284   window->Brush(ScrollView::NONE);
00285   window->Pen(color);
00286   window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
00287                     bounding_box_.right(), bounding_box_.top());
00288   for (int i = 0; i < cell_x_.length(); i++) {
00289     window->Line(cell_x_[i], bounding_box_.bottom(),
00290                  cell_x_[i], bounding_box_.top());
00291   }
00292   for (int i = 0; i < cell_y_.length(); i++) {
00293     window->Line(bounding_box_.left(), cell_y_[i],
00294                  bounding_box_.right(), cell_y_[i]);
00295   }
00296   window->UpdateWindow();
00297 #endif
00298 }
00299 
00300 // Clear structure information.
00301 void StructuredTable::ClearStructure() {
00302   cell_x_.clear();
00303   cell_y_.clear();
00304   is_lined_ = false;
00305   space_above_ = 0;
00306   space_below_ = 0;
00307   space_left_ = 0;
00308   space_right_ = 0;
00309   median_cell_height_ = 0;
00310   median_cell_width_ = 0;
00311 }
00312 
00313 // When a table has lines, the lines should not intersect any partitions.
00314 // The following function makes sure the previous assumption is met.
00315 bool StructuredTable::VerifyLinedTableCells() {
00316   // Function only called when lines exist.
00317   ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
00318   for (int i = 0; i < cell_y_.length(); ++i) {
00319     if (CountHorizontalIntersections(cell_y_[i]) > 0)
00320       return false;
00321   }
00322   for (int i = 0; i < cell_x_.length(); ++i) {
00323     if (CountVerticalIntersections(cell_x_[i]) > 0)
00324       return false;
00325   }
00326   return true;
00327 }
00328 
00329 // TODO(nbeato): Could be much better than this.
00330 // Examples:
00331 //   - Caclulate the percentage of filled cells.
00332 //   - Calculate the average number of ColPartitions per cell.
00333 //   - Calculate the number of cells per row with partitions.
00334 //   - Check if ColPartitions in adjacent cells are similar.
00335 //   - Check that all columns are at least a certain width.
00336 //   - etc.
00337 bool StructuredTable::VerifyWhitespacedTable() {
00338   // criteria for a table, must be at least 2x3 or 3x2
00339   return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
00340 }
00341 
00342 // Finds vertical splits in the ColPartitions of text_grid_ by considering
00343 // all possible "good" guesses. A good guess is just the left/right sides of
00344 // the partitions, since these locations will uniquely define where the
00345 // extremal values where the splits can occur. The split happens
00346 // in the middle of the two nearest partitions.
00347 void StructuredTable::FindWhitespacedColumns() {
00348   // Set of the extents of all partitions on the page.
00349   GenericVectorEqEq<int> left_sides;
00350   GenericVectorEqEq<int> right_sides;
00351 
00352   // Look at each text partition. We want to find the partitions
00353   // that have extremal left/right sides. These will give us a basis
00354   // for the table columns.
00355   ColPartitionGridSearch gsearch(text_grid_);
00356   gsearch.SetUniqueMode(true);
00357   gsearch.StartRectSearch(bounding_box_);
00358   ColPartition* text = NULL;
00359   while ((text = gsearch.NextRectSearch()) != NULL) {
00360     if (!text->IsTextType())
00361       continue;
00362 
00363     ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
00364     int spacing = static_cast<int>(text->median_width() *
00365                                    kHorizontalSpacing / 2.0 + 0.5);
00366     left_sides.push_back(text->bounding_box().left() - spacing);
00367     right_sides.push_back(text->bounding_box().right() + spacing);
00368   }
00369   // It causes disaster below, so avoid it!
00370   if (left_sides.length() == 0 || right_sides.length() == 0)
00371     return;
00372 
00373   // Since data may be inserted in grid order, we sort the left/right sides.
00374   left_sides.sort();
00375   right_sides.sort();
00376 
00377   // At this point, in the "merged list", we expect to have a left side,
00378   // followed by either more left sides or a right side. The last number
00379   // should be a right side. We find places where the splits occur by looking
00380   // for "valleys". If we want to force gap sizes or allow overlap, change
00381   // the spacing above. If you want to let lines "slice" partitions as long
00382   // as it is infrequent, change the following function.
00383   FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
00384                          &cell_x_);
00385 }
00386 
00387 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
00388 // all possible "good" guesses. A good guess is just the bottom/top sides of
00389 // the partitions, since these locations will uniquely define where the
00390 // extremal values where the splits can occur. The split happens
00391 // in the middle of the two nearest partitions.
00392 void StructuredTable::FindWhitespacedRows() {
00393   // Set of the extents of all partitions on the page.
00394   GenericVectorEqEq<int> bottom_sides;
00395   GenericVectorEqEq<int> top_sides;
00396   // We will be "shrinking" partitions, so keep the min/max around to
00397   // make sure the bottom/top lines do not intersect text.
00398   int min_bottom = MAX_INT32;
00399   int max_top = MIN_INT32;
00400 
00401   // Look at each text partition. We want to find the partitions
00402   // that have extremal bottom/top sides. These will give us a basis
00403   // for the table rows. Because the textlines can be skewed and close due
00404   // to warping, the height of the partitions is toned down a little bit.
00405   ColPartitionGridSearch gsearch(text_grid_);
00406   gsearch.SetUniqueMode(true);
00407   gsearch.StartRectSearch(bounding_box_);
00408   ColPartition* text = NULL;
00409   while ((text = gsearch.NextRectSearch()) != NULL) {
00410     if (!text->IsTextType())
00411       continue;
00412 
00413     ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
00414     min_bottom = MIN(min_bottom, text->bounding_box().bottom());
00415     max_top = MAX(max_top, text->bounding_box().top());
00416 
00417     // Ignore "tall" text partitions, as these are usually false positive
00418     // vertical text or multiple lines pulled together.
00419     if (text->bounding_box().height() > max_text_height_)
00420       continue;
00421 
00422     int spacing = static_cast<int>(text->bounding_box().height() *
00423                                    kVerticalSpacing / 2.0 + 0.5);
00424     int bottom = text->bounding_box().bottom() - spacing;
00425     int top = text->bounding_box().top() + spacing;
00426     // For horizontal text, the factor can be negative. This should
00427     // probably cause a warning or failure. I haven't actually checked if
00428     // it happens.
00429     if (bottom >= top)
00430       continue;
00431 
00432     bottom_sides.push_back(bottom);
00433     top_sides.push_back(top);
00434   }
00435   // It causes disaster below, so avoid it!
00436   if (bottom_sides.length() == 0 || top_sides.length() == 0)
00437     return;
00438 
00439   // Since data may be inserted in grid order, we sort the bottom/top sides.
00440   bottom_sides.sort();
00441   top_sides.sort();
00442 
00443   // At this point, in the "merged list", we expect to have a bottom side,
00444   // followed by either more bottom sides or a top side. The last number
00445   // should be a top side. We find places where the splits occur by looking
00446   // for "valleys". If we want to force gap sizes or allow overlap, change
00447   // the spacing above. If you want to let lines "slice" partitions as long
00448   // as it is infrequent, change the following function.
00449   FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
00450                          &cell_y_);
00451 
00452   // Recover the min/max correctly since it was shifted.
00453   cell_y_[0] = min_bottom;
00454   cell_y_[cell_y_.length() - 1] = max_top;
00455 }
00456 
00457 void StructuredTable::CalculateMargins() {
00458   space_above_ = MAX_INT32;
00459   space_below_ = MAX_INT32;
00460   space_right_ = MAX_INT32;
00461   space_left_ = MAX_INT32;
00462   UpdateMargins(text_grid_);
00463   UpdateMargins(line_grid_);
00464 }
00465 // Finds the nearest partition in grid to the table
00466 // boundaries and updates the margin.
00467 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) {
00468   int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
00469   space_below_ = MIN(space_below_, below);
00470   int above = FindVerticalMargin(grid, bounding_box_.top(), false);
00471   space_above_ = MIN(space_above_, above);
00472   int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
00473   space_left_ = MIN(space_left_, left);
00474   int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
00475   space_right_ = MIN(space_right_, right);
00476 }
00477 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border,
00478                                         bool decrease) const {
00479   ColPartitionGridSearch gsearch(grid);
00480   gsearch.SetUniqueMode(true);
00481   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00482                               border);
00483   ColPartition* part = NULL;
00484   while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
00485     if (!part->IsTextType() && !part->IsHorizontalLine())
00486       continue;
00487     int distance = decrease ? border - part->bounding_box().top()
00488                             : part->bounding_box().bottom() - border;
00489     if (distance >= 0)
00490       return distance;
00491   }
00492   return MAX_INT32;
00493 }
00494 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border,
00495                                           bool decrease) const {
00496   ColPartitionGridSearch gsearch(grid);
00497   gsearch.SetUniqueMode(true);
00498   gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
00499   ColPartition* part = NULL;
00500   while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
00501     if (!part->IsTextType() && !part->IsVerticalLine())
00502       continue;
00503     int distance = decrease ? border - part->bounding_box().right()
00504                             : part->bounding_box().left() - border;
00505     if (distance >= 0)
00506       return distance;
00507   }
00508   return MAX_INT32;
00509 }
00510 
00511 void StructuredTable::CalculateStats() {
00512   const int kMaxCellHeight = 1000;
00513   const int kMaxCellWidth = 1000;
00514   STATS height_stats(0, kMaxCellHeight + 1);
00515   STATS width_stats(0, kMaxCellWidth + 1);
00516 
00517   for (int i = 0; i < row_count(); ++i)
00518     height_stats.add(row_height(i), column_count());
00519   for (int i = 0; i < column_count(); ++i)
00520     width_stats.add(column_width(i), row_count());
00521 
00522   median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
00523   median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
00524 }
00525 
00526 // Looks for grid lines near the current bounding box and
00527 // grows the bounding box to include them if no intersections
00528 // will occur as a result. This is necessary because the margins
00529 // are calculated relative to the closest line/text. If the
00530 // line isn't absorbed, the margin will be the distance to the line.
00531 void StructuredTable::AbsorbNearbyLines() {
00532   ColPartitionGridSearch gsearch(line_grid_);
00533   gsearch.SetUniqueMode(true);
00534 
00535   // Is the closest line above good? Loop multiple times for tables with
00536   // multi-line (sometimes 2) borders. Limit the number of lines by
00537   // making sure they stay within a table cell or so.
00538   ColPartition* line = NULL;
00539   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00540                               bounding_box_.top());
00541   while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
00542     if (!line->IsHorizontalLine())
00543       break;
00544     TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
00545                      bounding_box_.right(), line->MidY());
00546     if (text_search.height() > median_cell_height_ * 2)
00547       break;
00548     if (CountPartitions(text_search) > 0)
00549       break;
00550     bounding_box_.set_top(line->MidY());
00551   }
00552   // As above, is the closest line below good?
00553   line = NULL;
00554   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00555                               bounding_box_.bottom());
00556   while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
00557     if (!line->IsHorizontalLine())
00558       break;
00559     TBOX text_search(bounding_box_.left(), line->MidY(),
00560                      bounding_box_.right(), bounding_box_.bottom() - 1);
00561     if (text_search.height() > median_cell_height_ * 2)
00562       break;
00563     if (CountPartitions(text_search) > 0)
00564       break;
00565     bounding_box_.set_bottom(line->MidY());
00566   }
00567   // TODO(nbeato): vertical lines
00568 }
00569 
00570 
00571 // This function will find all "0 valleys" (of any length) given two
00572 // arrays. The arrays are the mins and maxes of partitions (either
00573 // left and right or bottom and top). Since the min/max lists are generated
00574 // with pairs of increasing integers, we can make some assumptions in
00575 // the function about ordering of the overall list, which are shown in the
00576 // asserts.
00577 // The algorithm works as follows:
00578 //   While there are numbers to process, take the smallest number.
00579 //     If it is from the min_list, increment the "hill" counter.
00580 //     Otherwise, decrement the "hill" counter.
00581 //     In the process of doing this, keep track of "crossing" the
00582 //     desired height.
00583 // The first/last items are extremal values of the list and known.
00584 // NOTE: This function assumes the lists are sorted!
00585 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list,
00586                                              const GenericVector<int>& max_list,
00587                                              int max_merged,
00588                                              GenericVector<int>* locations) {
00589   locations->clear();
00590   ASSERT_HOST(min_list.length() == max_list.length());
00591   if (min_list.length() == 0)
00592     return;
00593   ASSERT_HOST(min_list.get(0) < max_list.get(0));
00594   ASSERT_HOST(min_list.get(min_list.length() - 1) <
00595               max_list.get(max_list.length() - 1));
00596 
00597   locations->push_back(min_list.get(0));
00598   int min_index = 0;
00599   int max_index = 0;
00600   int stacked_partitions = 0;
00601   int last_cross_position = MAX_INT32;
00602   // max_index will expire after min_index.
00603   // However, we can't "increase" the hill size if min_index expired.
00604   // So finish processing when min_index expires.
00605   while (min_index < min_list.length()) {
00606     // Increase the hill count.
00607     if (min_list[min_index] < max_list[max_index]) {
00608       ++stacked_partitions;
00609       if (last_cross_position != MAX_INT32 &&
00610           stacked_partitions > max_merged) {
00611         int mid = (last_cross_position + min_list[min_index]) / 2;
00612         locations->push_back(mid);
00613         last_cross_position = MAX_INT32;
00614       }
00615       ++min_index;
00616     } else {
00617       // Decrease the hill count.
00618       --stacked_partitions;
00619       if (last_cross_position == MAX_INT32 &&
00620           stacked_partitions <= max_merged) {
00621         last_cross_position = max_list[max_index];
00622       }
00623       ++max_index;
00624     }
00625   }
00626   locations->push_back(max_list.get(max_list.length() - 1));
00627 }
00628 
00629 // Counts the number of partitions in the table
00630 // box that intersection the given x value.
00631 int StructuredTable::CountVerticalIntersections(int x) {
00632   int count = 0;
00633   // Make a small box to keep the search time down.
00634   const int kGridSize = text_grid_->gridsize();
00635   TBOX vertical_box = bounding_box_;
00636   vertical_box.set_left(x - kGridSize);
00637   vertical_box.set_right(x + kGridSize);
00638 
00639   ColPartitionGridSearch gsearch(text_grid_);
00640   gsearch.SetUniqueMode(true);
00641   gsearch.StartRectSearch(vertical_box);
00642   ColPartition* text = NULL;
00643   while ((text = gsearch.NextRectSearch()) != NULL) {
00644     if (!text->IsTextType())
00645       continue;
00646     const TBOX& box = text->bounding_box();
00647     if (box.left() < x && x < box.right())
00648       ++count;
00649   }
00650   return count;
00651 }
00652 
00653 // Counts the number of partitions in the table
00654 // box that intersection the given y value.
00655 int StructuredTable::CountHorizontalIntersections(int y) {
00656   int count = 0;
00657   // Make a small box to keep the search time down.
00658   const int kGridSize = text_grid_->gridsize();
00659   TBOX horizontal_box = bounding_box_;
00660   horizontal_box.set_bottom(y - kGridSize);
00661   horizontal_box.set_top(y + kGridSize);
00662 
00663   ColPartitionGridSearch gsearch(text_grid_);
00664   gsearch.SetUniqueMode(true);
00665   gsearch.StartRectSearch(horizontal_box);
00666   ColPartition* text = NULL;
00667   while ((text = gsearch.NextRectSearch()) != NULL) {
00668     if (!text->IsTextType())
00669       continue;
00670 
00671     const TBOX& box = text->bounding_box();
00672     if (box.bottom() < y && y < box.top())
00673       ++count;
00674   }
00675   return count;
00676 }
00677 
00678 // Counts how many text partitions are in this box.
00679 // This is used to count partitons in cells, as that can indicate
00680 // how "strong" a potential table row/colum (or even full table) actually is.
00681 int StructuredTable::CountPartitions(const TBOX& box) {
00682   ColPartitionGridSearch gsearch(text_grid_);
00683   gsearch.SetUniqueMode(true);
00684   gsearch.StartRectSearch(box);
00685   int count = 0;
00686   ColPartition* text = NULL;
00687   while ((text = gsearch.NextRectSearch()) != NULL) {
00688     if (text->IsTextType())
00689       ++count;
00690   }
00691   return count;
00692 }
00693 
00697 
00698 TableRecognizer::TableRecognizer()
00699     : text_grid_(NULL),
00700       line_grid_(NULL),
00701       min_height_(0),
00702       min_width_(0),
00703       max_text_height_(MAX_INT32) {
00704 }
00705 
00706 TableRecognizer::~TableRecognizer() {
00707 }
00708 
00709 void TableRecognizer::Init() {
00710 }
00711 
00712 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) {
00713   text_grid_ = text_grid;
00714 }
00715 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) {
00716   line_grid_ = line_grid;
00717 }
00718 void TableRecognizer::set_min_height(int height) {
00719   min_height_ = height;
00720 }
00721 void TableRecognizer::set_min_width(int width) {
00722   min_width_ = width;
00723 }
00724 void TableRecognizer::set_max_text_height(int height) {
00725   max_text_height_ = height;
00726 }
00727 
00728 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) {
00729   StructuredTable* table = new StructuredTable();
00730   table->Init();
00731   table->set_text_grid(text_grid_);
00732   table->set_line_grid(line_grid_);
00733   table->set_max_text_height(max_text_height_);
00734 
00735   // Try to solve ths simple case, a table with *both*
00736   // vertical and horizontal lines.
00737   if (RecognizeLinedTable(guess, table))
00738     return table;
00739 
00740   // Fallback to whitespace if that failed.
00741   // TODO(nbeato): Break this apart to take advantage of horizontal
00742   // lines or vertical lines when present.
00743   if (RecognizeWhitespacedTable(guess, table))
00744     return table;
00745 
00746   // No table found...
00747   delete table;
00748   return NULL;
00749 }
00750 
00751 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box,
00752                                           StructuredTable* table) {
00753   if (!HasSignificantLines(guess_box))
00754     return false;
00755   TBOX line_bound = guess_box;
00756   if (!FindLinesBoundingBox(&line_bound))
00757     return false;
00758   table->set_bounding_box(line_bound);
00759   return table->FindLinedStructure();
00760 }
00761 
00762 // Quick implementation. Just count the number of lines in the box.
00763 // A better implementation would counter intersections and look for connected
00764 // components. It could even go as far as finding similar length lines.
00765 // To account for these possible issues, the VerifyLinedTableCells function
00766 // will reject lined tables that cause intersections with text on the page.
00767 // TODO(nbeato): look for "better" lines
00768 bool TableRecognizer::HasSignificantLines(const TBOX& guess) {
00769   ColPartitionGridSearch box_search(line_grid_);
00770   box_search.SetUniqueMode(true);
00771   box_search.StartRectSearch(guess);
00772   ColPartition* line = NULL;
00773   int vertical_count = 0;
00774   int horizontal_count = 0;
00775 
00776   while ((line = box_search.NextRectSearch()) != NULL) {
00777     if (line->IsHorizontalLine())
00778       ++horizontal_count;
00779     if (line->IsVerticalLine())
00780       ++vertical_count;
00781   }
00782 
00783   return vertical_count >= kLinedTableMinVerticalLines &&
00784          horizontal_count >= kLinedTableMinHorizontalLines;
00785 }
00786 
00787 // Given a bounding box with a bunch of horizontal / vertical lines,
00788 // we just find the extents of all of these lines iteratively.
00789 // The box will be at least as large as guess. This
00790 // could possibly be a bad assumption.
00791 // It is guaranteed to halt in at least O(n * gridarea) where n
00792 // is the number of lines.
00793 // The assumption is that growing the box iteratively will add lines
00794 // several times, but eventually we'll find the extents.
00795 //
00796 // For tables, the approach is a bit aggressive, a single line (which could be
00797 // noise or a column ruling) can destroy the table inside.
00798 //
00799 // TODO(nbeato): This is a quick first implementation.
00800 // A better implementation would actually look for consistency
00801 // in extents of the lines and find the extents using lines
00802 // that clearly describe the table. This would allow the
00803 // lines to "vote" for height/width. An approach like
00804 // this would solve issues with page layout rulings.
00805 // I haven't looked for these issues yet, so I can't even
00806 // say they happen confidently.
00807 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) {
00808   // The first iteration will tell us if there are lines
00809   // present and shrink the box to a minimal iterative size.
00810   if (!FindLinesBoundingBoxIteration(bounding_box))
00811     return false;
00812 
00813   // Keep growing until the area of the table stabilizes.
00814   // The box can only get bigger, increasing area.
00815   bool changed = true;
00816   while (changed) {
00817     changed = false;
00818     int old_area = bounding_box->area();
00819     bool check = FindLinesBoundingBoxIteration(bounding_box);
00820     // At this point, the function will return true.
00821     ASSERT_HOST(check);
00822     ASSERT_HOST(bounding_box->area() >= old_area);
00823     changed = (bounding_box->area() > old_area);
00824   }
00825 
00826   return true;
00827 }
00828 
00829 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) {
00830   // Search for all of the lines in the current box, keeping track of extents.
00831   ColPartitionGridSearch box_search(line_grid_);
00832   box_search.SetUniqueMode(true);
00833   box_search.StartRectSearch(*bounding_box);
00834   ColPartition* line = NULL;
00835   bool first_line = true;
00836 
00837   while ((line = box_search.NextRectSearch()) != NULL) {
00838     if (line->IsLineType()) {
00839       if (first_line) {
00840         // The first iteration can shrink the box.
00841         *bounding_box = line->bounding_box();
00842         first_line = false;
00843       } else {
00844         *bounding_box += line->bounding_box();
00845       }
00846     }
00847   }
00848   return !first_line;
00849 }
00850 
00851 // The goal of this function is to move the table boundaries around and find
00852 // a table that maximizes the whitespace around the table while maximizing
00853 // the cellular structure. As a result, it gets confused by headers, footers,
00854 // and merged columns (text that crosses columns). There is a tolerance
00855 // that allows a few partitions to count towards potential cell merges.
00856 // It's the max_merged parameter to FindPartitionLocations.
00857 // It can work, but it needs some false positive remove on boundaries.
00858 // For now, the grid structure must not intersect any partitions.
00859 // Also, small tolerance is added to the horizontal lines for tightly packed
00860 // tables. The tolerance is added by adjusting the bounding boxes of the
00861 // partitions (in FindHorizontalPartitions). The current implementation
00862 // only adjusts the vertical extents of the table.
00863 //
00864 // Also note. This was hacked at a lot. It could probably use some
00865 // more hacking at to find a good set of border conditions and then a
00866 // nice clean up.
00867 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box,
00868                                                 StructuredTable* table) {
00869   TBOX best_box = guess_box;  // Best borders known.
00870   int best_below = 0;         // Margin size above best table.
00871   int best_above = 0;         // Margin size below best table.
00872   TBOX adjusted = guess_box;  // The search box.
00873 
00874   // We assume that the guess box is somewhat accurate, so we don't allow
00875   // the adjusted border to pass half of the guessed area. This prevents
00876   // "negative" tables from forming.
00877   const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
00878   // Keeps track of the most columns in an accepted table. The resulting table
00879   // may be less than the max, but we don't want to stray too far.
00880   int best_cols = 0;
00881   // Make sure we find a good border.
00882   bool found_good_border = false;
00883 
00884   // Find the bottom of the table by trying a few different locations. For
00885   // each location, the top, left, and right are fixed. We start the search
00886   // in a smaller table to favor best_cols getting a good estimate sooner.
00887   int last_bottom = MAX_INT32;
00888   int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00889                                    kMidGuessY - min_height_ / 2, true);
00890   int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00891                                 kMidGuessY + min_height_ / 2, false);
00892   adjusted.set_top(top);
00893 
00894   // Headers/footers can be spaced far from everything.
00895   // Make sure that the space below is greater than the space above
00896   // the lowest row.
00897   int previous_below = 0;
00898   const int kMaxChances = 10;
00899   int chances = kMaxChances;
00900   while (bottom != last_bottom) {
00901     adjusted.set_bottom(bottom);
00902 
00903     if (adjusted.height() >= min_height_) {
00904       // Try to fit the grid on the current box. We give it a chance
00905       // if the number of columns didn't significantly drop.
00906       table->set_bounding_box(adjusted);
00907       if (table->FindWhitespacedStructure() &&
00908           table->column_count() >= best_cols * kRequiredColumns) {
00909         if (false && IsWeakTableRow(table, 0)) {
00910           // Currently buggy, but was looking promising so disabled.
00911           --chances;
00912         } else {
00913           // We favor 2 things,
00914           //   1- Adding rows that have partitioned data.
00915           //   2- Better margins (to find header/footer).
00916           // For better tables, we just look for multiple cells in the
00917           // bottom row with data in them.
00918           // For margins, the space below the last row should
00919           // be better than a table with the last row removed.
00920           chances = kMaxChances;
00921           double max_row_height = kMaxRowSize * table->median_cell_height();
00922           if ((table->space_below() * kMarginFactor >= best_below &&
00923                table->space_below() >= previous_below) ||
00924               (table->CountFilledCellsInRow(0) > 1 &&
00925                table->row_height(0) < max_row_height)) {
00926             best_box.set_bottom(bottom);
00927             best_below = table->space_below();
00928             best_cols = MAX(table->column_count(), best_cols);
00929             found_good_border = true;
00930           }
00931         }
00932         previous_below = table->space_below();
00933       } else {
00934        --chances;
00935       }
00936     }
00937     if (chances <= 0)
00938       break;
00939 
00940     last_bottom = bottom;
00941     bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00942                                  last_bottom, true);
00943   }
00944   if (!found_good_border)
00945     return false;
00946 
00947   // TODO(nbeato) comments: follow modified code above... put it in a function!
00948   found_good_border = false;
00949   int last_top = MIN_INT32;
00950   top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00951                             kMidGuessY + min_height_ / 2, false);
00952   int previous_above = 0;
00953   chances = kMaxChances;
00954 
00955   adjusted.set_bottom(best_box.bottom());
00956   while (last_top != top) {
00957     adjusted.set_top(top);
00958     if (adjusted.height() >= min_height_) {
00959       table->set_bounding_box(adjusted);
00960       if (table->FindWhitespacedStructure() &&
00961           table->column_count() >= best_cols * kRequiredColumns) {
00962         int last_row = table->row_count() - 1;
00963         if (false && IsWeakTableRow(table, last_row)) {
00964           // Currently buggy, but was looking promising so disabled.
00965           --chances;
00966         } else {
00967           chances = kMaxChances;
00968           double max_row_height = kMaxRowSize * table->median_cell_height();
00969           if ((table->space_above() * kMarginFactor >= best_above &&
00970                table->space_above() >= previous_above) ||
00971               (table->CountFilledCellsInRow(last_row) > 1 &&
00972                table->row_height(last_row) < max_row_height)) {
00973             best_box.set_top(top);
00974             best_above = table->space_above();
00975             best_cols = MAX(table->column_count(), best_cols);
00976             found_good_border = true;
00977           }
00978         }
00979         previous_above = table->space_above();
00980       } else {
00981        --chances;
00982       }
00983     }
00984     if (chances <= 0)
00985       break;
00986 
00987     last_top = top;
00988     top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00989                               last_top, false);
00990   }
00991 
00992   if (!found_good_border)
00993     return false;
00994 
00995   // If we get here, this shouldn't happen. It can be an assert, but
00996   // I haven't tested it enough to make it crash things.
00997   if (best_box.null_box())
00998     return false;
00999 
01000   // Given the best locations, fit the box to those locations.
01001   table->set_bounding_box(best_box);
01002   return table->FindWhitespacedStructure();
01003 }
01004 
01005 // Finds the closest value to y that can safely cause a horizontal
01006 // split in the partitions.
01007 // This function has been buggy and not as reliable as I would've
01008 // liked. I suggest finding all of the splits using the
01009 // FindPartitionLocations once and then just keeping the results
01010 // of that function cached somewhere.
01011 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
01012                                          bool top_to_bottom) {
01013   ColPartitionGridSearch gsearch(text_grid_);
01014   gsearch.SetUniqueMode(true);
01015   gsearch.StartVerticalSearch(left, right, y);
01016   ColPartition* text = NULL;
01017   int last_y = y;
01018   while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
01019     if (!text->IsTextType() || !text->IsHorizontalType())
01020       continue;
01021     if (text->bounding_box().height() > max_text_height_)
01022       continue;
01023 
01024     const TBOX& text_box = text->bounding_box();
01025     if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
01026       last_y = MIN(last_y, text_box.bottom());
01027       continue;
01028     }
01029     if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
01030       last_y = MAX(last_y, text_box.top());
01031       continue;
01032     }
01033 
01034     return last_y;
01035   }
01036   // If none is found, we at least want to preserve the min/max,
01037   // which defines the overlap of y with the last partition in the grid.
01038   return last_y;
01039 }
01040 
01041 // Code is buggy right now. It is disabled in the calling function.
01042 // It seems like sometimes the row that is passed in is not correct
01043 // sometimes (like a phantom row is introduced). There's something going
01044 // on in the cell_y_ data member before this is called... not certain.
01045 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) {
01046   if (!table->VerifyRowFilled(row))
01047     return false;
01048 
01049   double threshold = 0.0;
01050   if (table->column_count() > kGoodRowNumberOfColumnsSmallSize)
01051     threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
01052   else
01053     threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
01054 
01055   return table->CountFilledCellsInRow(row) < threshold;
01056 }
01057 
01058 }  // namespace tesseract