Tesseract
3.02
|
00001 00002 // File: tablerecog.cpp 00003 // Description: Helper class to help structure table areas. Given an bounding 00004 // box from TableFinder, the TableRecognizer should give a 00005 // StructuredTable (maybe a list in the future) of "good" tables 00006 // in that area. 00007 // Author: Nicholas Beato 00008 // Created: Friday, Aug. 20, 2010 00009 // 00010 // (C) Copyright 2009, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #include "tablerecog.h" 00024 00025 namespace tesseract { 00026 00027 // The amount of space required between the ColPartitions in 2 columns 00028 // of a non-lined table as a multiple of the median width. 00029 const double kHorizontalSpacing = 0.30; 00030 // The amount of space required between the ColPartitions in 2 rows 00031 // of a non-lined table as multiples of the median height. 00032 const double kVerticalSpacing = -0.2; 00033 // The number of cells that the grid lines may intersect. 00034 // See FindCellSplitLocations for explanation. 00035 const int kCellSplitRowThreshold = 0; 00036 const int kCellSplitColumnThreshold = 0; 00037 // For "lined tables", the number of required lines. Currently a guess. 00038 const int kLinedTableMinVerticalLines = 3; 00039 const int kLinedTableMinHorizontalLines = 3; 00040 // Number of columns required, as a fraction of the most columns found. 00041 // None of these are tweaked at all. 00042 const double kRequiredColumns = 0.7; 00043 // The tolerance for comparing margins of potential tables. 00044 const double kMarginFactor = 1.1; 00045 // The first and last row should be consistent cell height. 00046 // This factor is the first and last row cell height max. 00047 const double kMaxRowSize = 2.5; 00048 // Number of filled columns required to form a strong table row. 00049 // For small tables, this is an absolute number. 00050 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }; 00051 const int kGoodRowNumberOfColumnsSmallSize = 00052 sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1; 00053 // For large tables, it is a relative number 00054 const double kGoodRowNumberOfColumnsLarge = 0.7; 00055 // The amount of area that must be covered in a cell by ColPartitions to 00056 // be considered "filled" 00057 const double kMinFilledArea = 0.35; 00058 00062 00063 StructuredTable::StructuredTable() 00064 : text_grid_(NULL), 00065 line_grid_(NULL), 00066 is_lined_(false), 00067 space_above_(0), 00068 space_below_(0), 00069 space_left_(0), 00070 space_right_(0), 00071 median_cell_height_(0), 00072 median_cell_width_(0), 00073 max_text_height_(MAX_INT32) { 00074 } 00075 00076 StructuredTable::~StructuredTable() { 00077 } 00078 00079 void StructuredTable::Init() { 00080 } 00081 00082 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) { 00083 text_grid_ = text_grid; 00084 } 00085 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) { 00086 line_grid_ = line_grid; 00087 } 00088 void StructuredTable::set_max_text_height(int height) { 00089 max_text_height_ = height; 00090 } 00091 bool StructuredTable::is_lined() const { 00092 return is_lined_; 00093 } 00094 int StructuredTable::row_count() const { 00095 return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1; 00096 } 00097 int StructuredTable::column_count() const { 00098 return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1; 00099 } 00100 int StructuredTable::cell_count() const { 00101 return row_count() * column_count(); 00102 } 00103 void StructuredTable::set_bounding_box(const TBOX& box) { 00104 bounding_box_ = box; 00105 } 00106 const TBOX& StructuredTable::bounding_box() const { 00107 return bounding_box_; 00108 } 00109 int StructuredTable::median_cell_height() { 00110 return median_cell_height_; 00111 } 00112 int StructuredTable::median_cell_width() { 00113 return median_cell_width_; 00114 } 00115 int StructuredTable::row_height(int row) const { 00116 ASSERT_HOST(0 <= row && row < row_count()); 00117 return cell_y_[row + 1] - cell_y_[row]; 00118 } 00119 int StructuredTable::column_width(int column) const { 00120 ASSERT_HOST(0 <= column && column < column_count()); 00121 return cell_x_[column + 1] - cell_x_[column]; 00122 } 00123 int StructuredTable::space_above() const { 00124 return space_above_; 00125 } 00126 int StructuredTable::space_below() const { 00127 return space_below_; 00128 } 00129 00130 // At this point, we know that the lines are contained 00131 // by the box (by FindLinesBoundingBox). 00132 // So try to find the cell structure and make sure it works out. 00133 // The assumption is that all lines span the table. If this 00134 // assumption fails, the VerifyLinedTable method will 00135 // abort the lined table. The TableRecognizer will fall 00136 // back on FindWhitespacedStructure. 00137 bool StructuredTable::FindLinedStructure() { 00138 ClearStructure(); 00139 00140 // Search for all of the lines in the current box. 00141 // Update the cellular structure with the exact lines. 00142 ColPartitionGridSearch box_search(line_grid_); 00143 box_search.SetUniqueMode(true); 00144 box_search.StartRectSearch(bounding_box_); 00145 ColPartition* line = NULL; 00146 00147 while ((line = box_search.NextRectSearch()) != NULL) { 00148 if (line->IsHorizontalLine()) 00149 cell_y_.push_back(line->MidY()); 00150 if (line->IsVerticalLine()) 00151 cell_x_.push_back(line->MidX()); 00152 } 00153 00154 // HasSignificantLines should guarantee cells. 00155 // Because that code is a different class, just gracefully 00156 // return false. This could be an assert. 00157 if (cell_x_.length() < 3 || cell_y_.length() < 3) 00158 return false; 00159 00160 cell_x_.sort(); 00161 cell_y_.sort(); 00162 00163 // Remove duplicates that may have occurred due to split lines. 00164 cell_x_.compact_sorted(); 00165 cell_y_.compact_sorted(); 00166 00167 // The border should be the extents of line boxes, not middle. 00168 cell_x_[0] = bounding_box_.left(); 00169 cell_x_[cell_x_.length() - 1] = bounding_box_.right(); 00170 cell_y_[0] = bounding_box_.bottom(); 00171 cell_y_[cell_y_.length() - 1] = bounding_box_.top(); 00172 00173 // Remove duplicates that may have occurred due to moving the borders. 00174 cell_x_.compact_sorted(); 00175 cell_y_.compact_sorted(); 00176 00177 CalculateMargins(); 00178 CalculateStats(); 00179 is_lined_ = VerifyLinedTableCells(); 00180 return is_lined_; 00181 } 00182 00183 // Finds the cellular structure given a particular box. 00184 bool StructuredTable::FindWhitespacedStructure() { 00185 ClearStructure(); 00186 FindWhitespacedColumns(); 00187 FindWhitespacedRows(); 00188 00189 if (!VerifyWhitespacedTable()) { 00190 return false; 00191 } else { 00192 bounding_box_.set_left(cell_x_[0]); 00193 bounding_box_.set_right(cell_x_[cell_x_.length() - 1]); 00194 bounding_box_.set_bottom(cell_y_[0]); 00195 bounding_box_.set_top(cell_y_[cell_y_.length() - 1]); 00196 AbsorbNearbyLines(); 00197 CalculateMargins(); 00198 CalculateStats(); 00199 return true; 00200 } 00201 } 00202 00203 // Tests if a partition fits inside the table structure. 00204 // Partitions must fully span a grid line in order to intersect it. 00205 // This means that a partition does not intersect a line 00206 // that it "just" touches. This is mainly because the assumption 00207 // throughout the code is that "0" distance is a very very small space. 00208 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const { 00209 const TBOX& box = part.bounding_box(); 00210 for (int i = 0; i < cell_x_.length(); ++i) 00211 if (box.left() < cell_x_[i] && cell_x_[i] < box.right()) 00212 return false; 00213 for (int i = 0; i < cell_y_.length(); ++i) 00214 if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top()) 00215 return false; 00216 return true; 00217 } 00218 00219 // Checks if a sub-table has multiple data cells filled. 00220 int StructuredTable::CountFilledCells() { 00221 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1); 00222 } 00223 int StructuredTable::CountFilledCellsInRow(int row) { 00224 return CountFilledCells(row, row, 0, column_count() - 1); 00225 } 00226 int StructuredTable::CountFilledCellsInColumn(int column) { 00227 return CountFilledCells(0, row_count() - 1, column, column); 00228 } 00229 int StructuredTable::CountFilledCells(int row_start, int row_end, 00230 int column_start, int column_end) { 00231 ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count()); 00232 ASSERT_HOST(0 <= column_start && column_start <= column_end && 00233 column_end < column_count()); 00234 int cell_count = 0; 00235 TBOX cell_box; 00236 for (int row = row_start; row <= row_end; ++row) { 00237 cell_box.set_bottom(cell_y_[row]); 00238 cell_box.set_top(cell_y_[row + 1]); 00239 for (int col = column_start; col <= column_end; ++col) { 00240 cell_box.set_left(cell_x_[col]); 00241 cell_box.set_right(cell_x_[col + 1]); 00242 if (CountPartitions(cell_box) > 0) 00243 ++cell_count; 00244 } 00245 } 00246 return cell_count; 00247 } 00248 00249 // Makes sure that at least one cell in a row has substantial area filled. 00250 // This can filter out large whitespace caused by growing tables too far 00251 // and page numbers. 00252 bool StructuredTable::VerifyRowFilled(int row) { 00253 for (int i = 0; i < column_count(); ++i) { 00254 double area_filled = CalculateCellFilledPercentage(row, i); 00255 if (area_filled >= kMinFilledArea) 00256 return true; 00257 } 00258 return false; 00259 } 00260 00261 // Finds the filled area in a cell. 00262 // Assume ColPartitions do not overlap for simplicity (even though they do). 00263 double StructuredTable::CalculateCellFilledPercentage(int row, int column) { 00264 ASSERT_HOST(0 <= row && row <= row_count()); 00265 ASSERT_HOST(0 <= column && column <= column_count()); 00266 const TBOX kCellBox(cell_x_[column], cell_y_[row], 00267 cell_x_[column + 1], cell_y_[row + 1]); 00268 ASSERT_HOST(!kCellBox.null_box()); 00269 00270 ColPartitionGridSearch gsearch(text_grid_); 00271 gsearch.SetUniqueMode(true); 00272 gsearch.StartRectSearch(kCellBox); 00273 double area_covered = 0; 00274 ColPartition* text = NULL; 00275 while ((text = gsearch.NextRectSearch()) != NULL) { 00276 if (text->IsTextType()) 00277 area_covered += text->bounding_box().intersection(kCellBox).area(); 00278 } 00279 return MIN(1.0, area_covered / kCellBox.area()); 00280 } 00281 00282 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) { 00283 #ifndef GRAPHICS_DISABLED 00284 window->Brush(ScrollView::NONE); 00285 window->Pen(color); 00286 window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), 00287 bounding_box_.right(), bounding_box_.top()); 00288 for (int i = 0; i < cell_x_.length(); i++) { 00289 window->Line(cell_x_[i], bounding_box_.bottom(), 00290 cell_x_[i], bounding_box_.top()); 00291 } 00292 for (int i = 0; i < cell_y_.length(); i++) { 00293 window->Line(bounding_box_.left(), cell_y_[i], 00294 bounding_box_.right(), cell_y_[i]); 00295 } 00296 window->UpdateWindow(); 00297 #endif 00298 } 00299 00300 // Clear structure information. 00301 void StructuredTable::ClearStructure() { 00302 cell_x_.clear(); 00303 cell_y_.clear(); 00304 is_lined_ = false; 00305 space_above_ = 0; 00306 space_below_ = 0; 00307 space_left_ = 0; 00308 space_right_ = 0; 00309 median_cell_height_ = 0; 00310 median_cell_width_ = 0; 00311 } 00312 00313 // When a table has lines, the lines should not intersect any partitions. 00314 // The following function makes sure the previous assumption is met. 00315 bool StructuredTable::VerifyLinedTableCells() { 00316 // Function only called when lines exist. 00317 ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2); 00318 for (int i = 0; i < cell_y_.length(); ++i) { 00319 if (CountHorizontalIntersections(cell_y_[i]) > 0) 00320 return false; 00321 } 00322 for (int i = 0; i < cell_x_.length(); ++i) { 00323 if (CountVerticalIntersections(cell_x_[i]) > 0) 00324 return false; 00325 } 00326 return true; 00327 } 00328 00329 // TODO(nbeato): Could be much better than this. 00330 // Examples: 00331 // - Caclulate the percentage of filled cells. 00332 // - Calculate the average number of ColPartitions per cell. 00333 // - Calculate the number of cells per row with partitions. 00334 // - Check if ColPartitions in adjacent cells are similar. 00335 // - Check that all columns are at least a certain width. 00336 // - etc. 00337 bool StructuredTable::VerifyWhitespacedTable() { 00338 // criteria for a table, must be at least 2x3 or 3x2 00339 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6; 00340 } 00341 00342 // Finds vertical splits in the ColPartitions of text_grid_ by considering 00343 // all possible "good" guesses. A good guess is just the left/right sides of 00344 // the partitions, since these locations will uniquely define where the 00345 // extremal values where the splits can occur. The split happens 00346 // in the middle of the two nearest partitions. 00347 void StructuredTable::FindWhitespacedColumns() { 00348 // Set of the extents of all partitions on the page. 00349 GenericVectorEqEq<int> left_sides; 00350 GenericVectorEqEq<int> right_sides; 00351 00352 // Look at each text partition. We want to find the partitions 00353 // that have extremal left/right sides. These will give us a basis 00354 // for the table columns. 00355 ColPartitionGridSearch gsearch(text_grid_); 00356 gsearch.SetUniqueMode(true); 00357 gsearch.StartRectSearch(bounding_box_); 00358 ColPartition* text = NULL; 00359 while ((text = gsearch.NextRectSearch()) != NULL) { 00360 if (!text->IsTextType()) 00361 continue; 00362 00363 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right()); 00364 int spacing = static_cast<int>(text->median_width() * 00365 kHorizontalSpacing / 2.0 + 0.5); 00366 left_sides.push_back(text->bounding_box().left() - spacing); 00367 right_sides.push_back(text->bounding_box().right() + spacing); 00368 } 00369 // It causes disaster below, so avoid it! 00370 if (left_sides.length() == 0 || right_sides.length() == 0) 00371 return; 00372 00373 // Since data may be inserted in grid order, we sort the left/right sides. 00374 left_sides.sort(); 00375 right_sides.sort(); 00376 00377 // At this point, in the "merged list", we expect to have a left side, 00378 // followed by either more left sides or a right side. The last number 00379 // should be a right side. We find places where the splits occur by looking 00380 // for "valleys". If we want to force gap sizes or allow overlap, change 00381 // the spacing above. If you want to let lines "slice" partitions as long 00382 // as it is infrequent, change the following function. 00383 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, 00384 &cell_x_); 00385 } 00386 00387 // Finds horizontal splits in the ColPartitions of text_grid_ by considering 00388 // all possible "good" guesses. A good guess is just the bottom/top sides of 00389 // the partitions, since these locations will uniquely define where the 00390 // extremal values where the splits can occur. The split happens 00391 // in the middle of the two nearest partitions. 00392 void StructuredTable::FindWhitespacedRows() { 00393 // Set of the extents of all partitions on the page. 00394 GenericVectorEqEq<int> bottom_sides; 00395 GenericVectorEqEq<int> top_sides; 00396 // We will be "shrinking" partitions, so keep the min/max around to 00397 // make sure the bottom/top lines do not intersect text. 00398 int min_bottom = MAX_INT32; 00399 int max_top = MIN_INT32; 00400 00401 // Look at each text partition. We want to find the partitions 00402 // that have extremal bottom/top sides. These will give us a basis 00403 // for the table rows. Because the textlines can be skewed and close due 00404 // to warping, the height of the partitions is toned down a little bit. 00405 ColPartitionGridSearch gsearch(text_grid_); 00406 gsearch.SetUniqueMode(true); 00407 gsearch.StartRectSearch(bounding_box_); 00408 ColPartition* text = NULL; 00409 while ((text = gsearch.NextRectSearch()) != NULL) { 00410 if (!text->IsTextType()) 00411 continue; 00412 00413 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top()); 00414 min_bottom = MIN(min_bottom, text->bounding_box().bottom()); 00415 max_top = MAX(max_top, text->bounding_box().top()); 00416 00417 // Ignore "tall" text partitions, as these are usually false positive 00418 // vertical text or multiple lines pulled together. 00419 if (text->bounding_box().height() > max_text_height_) 00420 continue; 00421 00422 int spacing = static_cast<int>(text->bounding_box().height() * 00423 kVerticalSpacing / 2.0 + 0.5); 00424 int bottom = text->bounding_box().bottom() - spacing; 00425 int top = text->bounding_box().top() + spacing; 00426 // For horizontal text, the factor can be negative. This should 00427 // probably cause a warning or failure. I haven't actually checked if 00428 // it happens. 00429 if (bottom >= top) 00430 continue; 00431 00432 bottom_sides.push_back(bottom); 00433 top_sides.push_back(top); 00434 } 00435 // It causes disaster below, so avoid it! 00436 if (bottom_sides.length() == 0 || top_sides.length() == 0) 00437 return; 00438 00439 // Since data may be inserted in grid order, we sort the bottom/top sides. 00440 bottom_sides.sort(); 00441 top_sides.sort(); 00442 00443 // At this point, in the "merged list", we expect to have a bottom side, 00444 // followed by either more bottom sides or a top side. The last number 00445 // should be a top side. We find places where the splits occur by looking 00446 // for "valleys". If we want to force gap sizes or allow overlap, change 00447 // the spacing above. If you want to let lines "slice" partitions as long 00448 // as it is infrequent, change the following function. 00449 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, 00450 &cell_y_); 00451 00452 // Recover the min/max correctly since it was shifted. 00453 cell_y_[0] = min_bottom; 00454 cell_y_[cell_y_.length() - 1] = max_top; 00455 } 00456 00457 void StructuredTable::CalculateMargins() { 00458 space_above_ = MAX_INT32; 00459 space_below_ = MAX_INT32; 00460 space_right_ = MAX_INT32; 00461 space_left_ = MAX_INT32; 00462 UpdateMargins(text_grid_); 00463 UpdateMargins(line_grid_); 00464 } 00465 // Finds the nearest partition in grid to the table 00466 // boundaries and updates the margin. 00467 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) { 00468 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true); 00469 space_below_ = MIN(space_below_, below); 00470 int above = FindVerticalMargin(grid, bounding_box_.top(), false); 00471 space_above_ = MIN(space_above_, above); 00472 int left = FindHorizontalMargin(grid, bounding_box_.left(), true); 00473 space_left_ = MIN(space_left_, left); 00474 int right = FindHorizontalMargin(grid, bounding_box_.right(), false); 00475 space_right_ = MIN(space_right_, right); 00476 } 00477 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border, 00478 bool decrease) const { 00479 ColPartitionGridSearch gsearch(grid); 00480 gsearch.SetUniqueMode(true); 00481 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00482 border); 00483 ColPartition* part = NULL; 00484 while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) { 00485 if (!part->IsTextType() && !part->IsHorizontalLine()) 00486 continue; 00487 int distance = decrease ? border - part->bounding_box().top() 00488 : part->bounding_box().bottom() - border; 00489 if (distance >= 0) 00490 return distance; 00491 } 00492 return MAX_INT32; 00493 } 00494 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border, 00495 bool decrease) const { 00496 ColPartitionGridSearch gsearch(grid); 00497 gsearch.SetUniqueMode(true); 00498 gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top()); 00499 ColPartition* part = NULL; 00500 while ((part = gsearch.NextSideSearch(decrease)) != NULL) { 00501 if (!part->IsTextType() && !part->IsVerticalLine()) 00502 continue; 00503 int distance = decrease ? border - part->bounding_box().right() 00504 : part->bounding_box().left() - border; 00505 if (distance >= 0) 00506 return distance; 00507 } 00508 return MAX_INT32; 00509 } 00510 00511 void StructuredTable::CalculateStats() { 00512 const int kMaxCellHeight = 1000; 00513 const int kMaxCellWidth = 1000; 00514 STATS height_stats(0, kMaxCellHeight + 1); 00515 STATS width_stats(0, kMaxCellWidth + 1); 00516 00517 for (int i = 0; i < row_count(); ++i) 00518 height_stats.add(row_height(i), column_count()); 00519 for (int i = 0; i < column_count(); ++i) 00520 width_stats.add(column_width(i), row_count()); 00521 00522 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5); 00523 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5); 00524 } 00525 00526 // Looks for grid lines near the current bounding box and 00527 // grows the bounding box to include them if no intersections 00528 // will occur as a result. This is necessary because the margins 00529 // are calculated relative to the closest line/text. If the 00530 // line isn't absorbed, the margin will be the distance to the line. 00531 void StructuredTable::AbsorbNearbyLines() { 00532 ColPartitionGridSearch gsearch(line_grid_); 00533 gsearch.SetUniqueMode(true); 00534 00535 // Is the closest line above good? Loop multiple times for tables with 00536 // multi-line (sometimes 2) borders. Limit the number of lines by 00537 // making sure they stay within a table cell or so. 00538 ColPartition* line = NULL; 00539 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00540 bounding_box_.top()); 00541 while ((line = gsearch.NextVerticalSearch(false)) != NULL) { 00542 if (!line->IsHorizontalLine()) 00543 break; 00544 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, 00545 bounding_box_.right(), line->MidY()); 00546 if (text_search.height() > median_cell_height_ * 2) 00547 break; 00548 if (CountPartitions(text_search) > 0) 00549 break; 00550 bounding_box_.set_top(line->MidY()); 00551 } 00552 // As above, is the closest line below good? 00553 line = NULL; 00554 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00555 bounding_box_.bottom()); 00556 while ((line = gsearch.NextVerticalSearch(true)) != NULL) { 00557 if (!line->IsHorizontalLine()) 00558 break; 00559 TBOX text_search(bounding_box_.left(), line->MidY(), 00560 bounding_box_.right(), bounding_box_.bottom() - 1); 00561 if (text_search.height() > median_cell_height_ * 2) 00562 break; 00563 if (CountPartitions(text_search) > 0) 00564 break; 00565 bounding_box_.set_bottom(line->MidY()); 00566 } 00567 // TODO(nbeato): vertical lines 00568 } 00569 00570 00571 // This function will find all "0 valleys" (of any length) given two 00572 // arrays. The arrays are the mins and maxes of partitions (either 00573 // left and right or bottom and top). Since the min/max lists are generated 00574 // with pairs of increasing integers, we can make some assumptions in 00575 // the function about ordering of the overall list, which are shown in the 00576 // asserts. 00577 // The algorithm works as follows: 00578 // While there are numbers to process, take the smallest number. 00579 // If it is from the min_list, increment the "hill" counter. 00580 // Otherwise, decrement the "hill" counter. 00581 // In the process of doing this, keep track of "crossing" the 00582 // desired height. 00583 // The first/last items are extremal values of the list and known. 00584 // NOTE: This function assumes the lists are sorted! 00585 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list, 00586 const GenericVector<int>& max_list, 00587 int max_merged, 00588 GenericVector<int>* locations) { 00589 locations->clear(); 00590 ASSERT_HOST(min_list.length() == max_list.length()); 00591 if (min_list.length() == 0) 00592 return; 00593 ASSERT_HOST(min_list.get(0) < max_list.get(0)); 00594 ASSERT_HOST(min_list.get(min_list.length() - 1) < 00595 max_list.get(max_list.length() - 1)); 00596 00597 locations->push_back(min_list.get(0)); 00598 int min_index = 0; 00599 int max_index = 0; 00600 int stacked_partitions = 0; 00601 int last_cross_position = MAX_INT32; 00602 // max_index will expire after min_index. 00603 // However, we can't "increase" the hill size if min_index expired. 00604 // So finish processing when min_index expires. 00605 while (min_index < min_list.length()) { 00606 // Increase the hill count. 00607 if (min_list[min_index] < max_list[max_index]) { 00608 ++stacked_partitions; 00609 if (last_cross_position != MAX_INT32 && 00610 stacked_partitions > max_merged) { 00611 int mid = (last_cross_position + min_list[min_index]) / 2; 00612 locations->push_back(mid); 00613 last_cross_position = MAX_INT32; 00614 } 00615 ++min_index; 00616 } else { 00617 // Decrease the hill count. 00618 --stacked_partitions; 00619 if (last_cross_position == MAX_INT32 && 00620 stacked_partitions <= max_merged) { 00621 last_cross_position = max_list[max_index]; 00622 } 00623 ++max_index; 00624 } 00625 } 00626 locations->push_back(max_list.get(max_list.length() - 1)); 00627 } 00628 00629 // Counts the number of partitions in the table 00630 // box that intersection the given x value. 00631 int StructuredTable::CountVerticalIntersections(int x) { 00632 int count = 0; 00633 // Make a small box to keep the search time down. 00634 const int kGridSize = text_grid_->gridsize(); 00635 TBOX vertical_box = bounding_box_; 00636 vertical_box.set_left(x - kGridSize); 00637 vertical_box.set_right(x + kGridSize); 00638 00639 ColPartitionGridSearch gsearch(text_grid_); 00640 gsearch.SetUniqueMode(true); 00641 gsearch.StartRectSearch(vertical_box); 00642 ColPartition* text = NULL; 00643 while ((text = gsearch.NextRectSearch()) != NULL) { 00644 if (!text->IsTextType()) 00645 continue; 00646 const TBOX& box = text->bounding_box(); 00647 if (box.left() < x && x < box.right()) 00648 ++count; 00649 } 00650 return count; 00651 } 00652 00653 // Counts the number of partitions in the table 00654 // box that intersection the given y value. 00655 int StructuredTable::CountHorizontalIntersections(int y) { 00656 int count = 0; 00657 // Make a small box to keep the search time down. 00658 const int kGridSize = text_grid_->gridsize(); 00659 TBOX horizontal_box = bounding_box_; 00660 horizontal_box.set_bottom(y - kGridSize); 00661 horizontal_box.set_top(y + kGridSize); 00662 00663 ColPartitionGridSearch gsearch(text_grid_); 00664 gsearch.SetUniqueMode(true); 00665 gsearch.StartRectSearch(horizontal_box); 00666 ColPartition* text = NULL; 00667 while ((text = gsearch.NextRectSearch()) != NULL) { 00668 if (!text->IsTextType()) 00669 continue; 00670 00671 const TBOX& box = text->bounding_box(); 00672 if (box.bottom() < y && y < box.top()) 00673 ++count; 00674 } 00675 return count; 00676 } 00677 00678 // Counts how many text partitions are in this box. 00679 // This is used to count partitons in cells, as that can indicate 00680 // how "strong" a potential table row/colum (or even full table) actually is. 00681 int StructuredTable::CountPartitions(const TBOX& box) { 00682 ColPartitionGridSearch gsearch(text_grid_); 00683 gsearch.SetUniqueMode(true); 00684 gsearch.StartRectSearch(box); 00685 int count = 0; 00686 ColPartition* text = NULL; 00687 while ((text = gsearch.NextRectSearch()) != NULL) { 00688 if (text->IsTextType()) 00689 ++count; 00690 } 00691 return count; 00692 } 00693 00697 00698 TableRecognizer::TableRecognizer() 00699 : text_grid_(NULL), 00700 line_grid_(NULL), 00701 min_height_(0), 00702 min_width_(0), 00703 max_text_height_(MAX_INT32) { 00704 } 00705 00706 TableRecognizer::~TableRecognizer() { 00707 } 00708 00709 void TableRecognizer::Init() { 00710 } 00711 00712 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) { 00713 text_grid_ = text_grid; 00714 } 00715 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) { 00716 line_grid_ = line_grid; 00717 } 00718 void TableRecognizer::set_min_height(int height) { 00719 min_height_ = height; 00720 } 00721 void TableRecognizer::set_min_width(int width) { 00722 min_width_ = width; 00723 } 00724 void TableRecognizer::set_max_text_height(int height) { 00725 max_text_height_ = height; 00726 } 00727 00728 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) { 00729 StructuredTable* table = new StructuredTable(); 00730 table->Init(); 00731 table->set_text_grid(text_grid_); 00732 table->set_line_grid(line_grid_); 00733 table->set_max_text_height(max_text_height_); 00734 00735 // Try to solve ths simple case, a table with *both* 00736 // vertical and horizontal lines. 00737 if (RecognizeLinedTable(guess, table)) 00738 return table; 00739 00740 // Fallback to whitespace if that failed. 00741 // TODO(nbeato): Break this apart to take advantage of horizontal 00742 // lines or vertical lines when present. 00743 if (RecognizeWhitespacedTable(guess, table)) 00744 return table; 00745 00746 // No table found... 00747 delete table; 00748 return NULL; 00749 } 00750 00751 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box, 00752 StructuredTable* table) { 00753 if (!HasSignificantLines(guess_box)) 00754 return false; 00755 TBOX line_bound = guess_box; 00756 if (!FindLinesBoundingBox(&line_bound)) 00757 return false; 00758 table->set_bounding_box(line_bound); 00759 return table->FindLinedStructure(); 00760 } 00761 00762 // Quick implementation. Just count the number of lines in the box. 00763 // A better implementation would counter intersections and look for connected 00764 // components. It could even go as far as finding similar length lines. 00765 // To account for these possible issues, the VerifyLinedTableCells function 00766 // will reject lined tables that cause intersections with text on the page. 00767 // TODO(nbeato): look for "better" lines 00768 bool TableRecognizer::HasSignificantLines(const TBOX& guess) { 00769 ColPartitionGridSearch box_search(line_grid_); 00770 box_search.SetUniqueMode(true); 00771 box_search.StartRectSearch(guess); 00772 ColPartition* line = NULL; 00773 int vertical_count = 0; 00774 int horizontal_count = 0; 00775 00776 while ((line = box_search.NextRectSearch()) != NULL) { 00777 if (line->IsHorizontalLine()) 00778 ++horizontal_count; 00779 if (line->IsVerticalLine()) 00780 ++vertical_count; 00781 } 00782 00783 return vertical_count >= kLinedTableMinVerticalLines && 00784 horizontal_count >= kLinedTableMinHorizontalLines; 00785 } 00786 00787 // Given a bounding box with a bunch of horizontal / vertical lines, 00788 // we just find the extents of all of these lines iteratively. 00789 // The box will be at least as large as guess. This 00790 // could possibly be a bad assumption. 00791 // It is guaranteed to halt in at least O(n * gridarea) where n 00792 // is the number of lines. 00793 // The assumption is that growing the box iteratively will add lines 00794 // several times, but eventually we'll find the extents. 00795 // 00796 // For tables, the approach is a bit aggressive, a single line (which could be 00797 // noise or a column ruling) can destroy the table inside. 00798 // 00799 // TODO(nbeato): This is a quick first implementation. 00800 // A better implementation would actually look for consistency 00801 // in extents of the lines and find the extents using lines 00802 // that clearly describe the table. This would allow the 00803 // lines to "vote" for height/width. An approach like 00804 // this would solve issues with page layout rulings. 00805 // I haven't looked for these issues yet, so I can't even 00806 // say they happen confidently. 00807 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) { 00808 // The first iteration will tell us if there are lines 00809 // present and shrink the box to a minimal iterative size. 00810 if (!FindLinesBoundingBoxIteration(bounding_box)) 00811 return false; 00812 00813 // Keep growing until the area of the table stabilizes. 00814 // The box can only get bigger, increasing area. 00815 bool changed = true; 00816 while (changed) { 00817 changed = false; 00818 int old_area = bounding_box->area(); 00819 bool check = FindLinesBoundingBoxIteration(bounding_box); 00820 // At this point, the function will return true. 00821 ASSERT_HOST(check); 00822 ASSERT_HOST(bounding_box->area() >= old_area); 00823 changed = (bounding_box->area() > old_area); 00824 } 00825 00826 return true; 00827 } 00828 00829 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) { 00830 // Search for all of the lines in the current box, keeping track of extents. 00831 ColPartitionGridSearch box_search(line_grid_); 00832 box_search.SetUniqueMode(true); 00833 box_search.StartRectSearch(*bounding_box); 00834 ColPartition* line = NULL; 00835 bool first_line = true; 00836 00837 while ((line = box_search.NextRectSearch()) != NULL) { 00838 if (line->IsLineType()) { 00839 if (first_line) { 00840 // The first iteration can shrink the box. 00841 *bounding_box = line->bounding_box(); 00842 first_line = false; 00843 } else { 00844 *bounding_box += line->bounding_box(); 00845 } 00846 } 00847 } 00848 return !first_line; 00849 } 00850 00851 // The goal of this function is to move the table boundaries around and find 00852 // a table that maximizes the whitespace around the table while maximizing 00853 // the cellular structure. As a result, it gets confused by headers, footers, 00854 // and merged columns (text that crosses columns). There is a tolerance 00855 // that allows a few partitions to count towards potential cell merges. 00856 // It's the max_merged parameter to FindPartitionLocations. 00857 // It can work, but it needs some false positive remove on boundaries. 00858 // For now, the grid structure must not intersect any partitions. 00859 // Also, small tolerance is added to the horizontal lines for tightly packed 00860 // tables. The tolerance is added by adjusting the bounding boxes of the 00861 // partitions (in FindHorizontalPartitions). The current implementation 00862 // only adjusts the vertical extents of the table. 00863 // 00864 // Also note. This was hacked at a lot. It could probably use some 00865 // more hacking at to find a good set of border conditions and then a 00866 // nice clean up. 00867 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box, 00868 StructuredTable* table) { 00869 TBOX best_box = guess_box; // Best borders known. 00870 int best_below = 0; // Margin size above best table. 00871 int best_above = 0; // Margin size below best table. 00872 TBOX adjusted = guess_box; // The search box. 00873 00874 // We assume that the guess box is somewhat accurate, so we don't allow 00875 // the adjusted border to pass half of the guessed area. This prevents 00876 // "negative" tables from forming. 00877 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2; 00878 // Keeps track of the most columns in an accepted table. The resulting table 00879 // may be less than the max, but we don't want to stray too far. 00880 int best_cols = 0; 00881 // Make sure we find a good border. 00882 bool found_good_border = false; 00883 00884 // Find the bottom of the table by trying a few different locations. For 00885 // each location, the top, left, and right are fixed. We start the search 00886 // in a smaller table to favor best_cols getting a good estimate sooner. 00887 int last_bottom = MAX_INT32; 00888 int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00889 kMidGuessY - min_height_ / 2, true); 00890 int top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00891 kMidGuessY + min_height_ / 2, false); 00892 adjusted.set_top(top); 00893 00894 // Headers/footers can be spaced far from everything. 00895 // Make sure that the space below is greater than the space above 00896 // the lowest row. 00897 int previous_below = 0; 00898 const int kMaxChances = 10; 00899 int chances = kMaxChances; 00900 while (bottom != last_bottom) { 00901 adjusted.set_bottom(bottom); 00902 00903 if (adjusted.height() >= min_height_) { 00904 // Try to fit the grid on the current box. We give it a chance 00905 // if the number of columns didn't significantly drop. 00906 table->set_bounding_box(adjusted); 00907 if (table->FindWhitespacedStructure() && 00908 table->column_count() >= best_cols * kRequiredColumns) { 00909 if (false && IsWeakTableRow(table, 0)) { 00910 // Currently buggy, but was looking promising so disabled. 00911 --chances; 00912 } else { 00913 // We favor 2 things, 00914 // 1- Adding rows that have partitioned data. 00915 // 2- Better margins (to find header/footer). 00916 // For better tables, we just look for multiple cells in the 00917 // bottom row with data in them. 00918 // For margins, the space below the last row should 00919 // be better than a table with the last row removed. 00920 chances = kMaxChances; 00921 double max_row_height = kMaxRowSize * table->median_cell_height(); 00922 if ((table->space_below() * kMarginFactor >= best_below && 00923 table->space_below() >= previous_below) || 00924 (table->CountFilledCellsInRow(0) > 1 && 00925 table->row_height(0) < max_row_height)) { 00926 best_box.set_bottom(bottom); 00927 best_below = table->space_below(); 00928 best_cols = MAX(table->column_count(), best_cols); 00929 found_good_border = true; 00930 } 00931 } 00932 previous_below = table->space_below(); 00933 } else { 00934 --chances; 00935 } 00936 } 00937 if (chances <= 0) 00938 break; 00939 00940 last_bottom = bottom; 00941 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00942 last_bottom, true); 00943 } 00944 if (!found_good_border) 00945 return false; 00946 00947 // TODO(nbeato) comments: follow modified code above... put it in a function! 00948 found_good_border = false; 00949 int last_top = MIN_INT32; 00950 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00951 kMidGuessY + min_height_ / 2, false); 00952 int previous_above = 0; 00953 chances = kMaxChances; 00954 00955 adjusted.set_bottom(best_box.bottom()); 00956 while (last_top != top) { 00957 adjusted.set_top(top); 00958 if (adjusted.height() >= min_height_) { 00959 table->set_bounding_box(adjusted); 00960 if (table->FindWhitespacedStructure() && 00961 table->column_count() >= best_cols * kRequiredColumns) { 00962 int last_row = table->row_count() - 1; 00963 if (false && IsWeakTableRow(table, last_row)) { 00964 // Currently buggy, but was looking promising so disabled. 00965 --chances; 00966 } else { 00967 chances = kMaxChances; 00968 double max_row_height = kMaxRowSize * table->median_cell_height(); 00969 if ((table->space_above() * kMarginFactor >= best_above && 00970 table->space_above() >= previous_above) || 00971 (table->CountFilledCellsInRow(last_row) > 1 && 00972 table->row_height(last_row) < max_row_height)) { 00973 best_box.set_top(top); 00974 best_above = table->space_above(); 00975 best_cols = MAX(table->column_count(), best_cols); 00976 found_good_border = true; 00977 } 00978 } 00979 previous_above = table->space_above(); 00980 } else { 00981 --chances; 00982 } 00983 } 00984 if (chances <= 0) 00985 break; 00986 00987 last_top = top; 00988 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00989 last_top, false); 00990 } 00991 00992 if (!found_good_border) 00993 return false; 00994 00995 // If we get here, this shouldn't happen. It can be an assert, but 00996 // I haven't tested it enough to make it crash things. 00997 if (best_box.null_box()) 00998 return false; 00999 01000 // Given the best locations, fit the box to those locations. 01001 table->set_bounding_box(best_box); 01002 return table->FindWhitespacedStructure(); 01003 } 01004 01005 // Finds the closest value to y that can safely cause a horizontal 01006 // split in the partitions. 01007 // This function has been buggy and not as reliable as I would've 01008 // liked. I suggest finding all of the splits using the 01009 // FindPartitionLocations once and then just keeping the results 01010 // of that function cached somewhere. 01011 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, 01012 bool top_to_bottom) { 01013 ColPartitionGridSearch gsearch(text_grid_); 01014 gsearch.SetUniqueMode(true); 01015 gsearch.StartVerticalSearch(left, right, y); 01016 ColPartition* text = NULL; 01017 int last_y = y; 01018 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) { 01019 if (!text->IsTextType() || !text->IsHorizontalType()) 01020 continue; 01021 if (text->bounding_box().height() > max_text_height_) 01022 continue; 01023 01024 const TBOX& text_box = text->bounding_box(); 01025 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) { 01026 last_y = MIN(last_y, text_box.bottom()); 01027 continue; 01028 } 01029 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) { 01030 last_y = MAX(last_y, text_box.top()); 01031 continue; 01032 } 01033 01034 return last_y; 01035 } 01036 // If none is found, we at least want to preserve the min/max, 01037 // which defines the overlap of y with the last partition in the grid. 01038 return last_y; 01039 } 01040 01041 // Code is buggy right now. It is disabled in the calling function. 01042 // It seems like sometimes the row that is passed in is not correct 01043 // sometimes (like a phantom row is introduced). There's something going 01044 // on in the cell_y_ data member before this is called... not certain. 01045 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) { 01046 if (!table->VerifyRowFilled(row)) 01047 return false; 01048 01049 double threshold = 0.0; 01050 if (table->column_count() > kGoodRowNumberOfColumnsSmallSize) 01051 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge; 01052 else 01053 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()]; 01054 01055 return table->CountFilledCellsInRow(row) < threshold; 01056 } 01057 01058 } // namespace tesseract