Tesseract
3.02
|
00001 00002 // File: tablefind.cpp 00003 // Description: Helper classes to find tables from ColPartitions. 00004 // Author: Faisal Shafait (faisal.shafait@dfki.de) 00005 // Created: Tue Jan 06 11:13:01 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "tablefind.h" 00025 #include <math.h> 00026 #ifdef HAVE_CONFIG_H 00027 #include "config_auto.h" 00028 #endif 00029 00030 #include "allheaders.h" 00031 00032 #include "colpartitionset.h" 00033 #include "tablerecog.h" 00034 00035 namespace tesseract { 00036 00037 // These numbers are used to calculate the global median stats. 00038 // They just set an upper bound on the stats objects. 00039 // Maximum vertical spacing between neighbor partitions. 00040 const int kMaxVerticalSpacing = 500; 00041 // Maximum width of a blob in a partition. 00042 const int kMaxBlobWidth = 500; 00043 00044 // Minimum whitespace size to split a partition (measured as a multiple 00045 // of a partition's median width). 00046 const double kSplitPartitionSize = 2.0; 00047 // To insert text, the partition must satisfy these size constraints 00048 // in AllowTextPartition(). The idea is to filter noise partitions 00049 // determined by the size compared to the global medians. 00050 // TODO(nbeato): Need to find good numbers again. 00051 const double kAllowTextHeight = 0.5; 00052 const double kAllowTextWidth = 0.6; 00053 const double kAllowTextArea = 0.8; 00054 // The same thing applies to blobs (to filter noise). 00055 // TODO(nbeato): These numbers are a shot in the dark... 00056 // height and width are 0.5 * gridsize() in colfind.cpp 00057 // area is a rough guess for the size of a period. 00058 const double kAllowBlobHeight = 0.3; 00059 const double kAllowBlobWidth = 0.4; 00060 const double kAllowBlobArea = 0.05; 00061 00062 // Minimum number of components in a text partition. A partition having fewer 00063 // components than that is more likely a data partition and is a candidate 00064 // table cell. 00065 const int kMinBoxesInTextPartition = 10; 00066 00067 // Maximum number of components that a data partition can have 00068 const int kMaxBoxesInDataPartition = 20; 00069 00070 // Maximum allowed gap in a text partitions as a multiple of its median size. 00071 const double kMaxGapInTextPartition = 4.0; 00072 00073 // Minimum value that the maximum gap in a text partition should have as a 00074 // factor of its median size. 00075 const double kMinMaxGapInTextPartition = 0.5; 00076 00077 // The amount of overlap that is "normal" for adjacent blobs in a text 00078 // partition. This is used to calculate gap between overlapping blobs. 00079 const double kMaxBlobOverlapFactor = 4.0; 00080 00081 // Maximum x-height a table partition can have as a multiple of global 00082 // median x-height 00083 const double kMaxTableCellXheight = 2.0; 00084 00085 // Maximum line spacing between a table column header and column contents 00086 // for merging the two (as a multiple of the partition's median_size). 00087 const int kMaxColumnHeaderDistance = 4; 00088 00089 // Minimum ratio of num_table_partitions to num_text_partitions in a column 00090 // block to be called it a table column 00091 const double kTableColumnThreshold = 3.0; 00092 00093 // Search for horizontal ruling lines within the vertical margin as a 00094 // multiple of grid size 00095 const int kRulingVerticalMargin = 3; 00096 00097 // Minimum overlap that a colpartition must have with a table region 00098 // to become part of that table 00099 const double kMinOverlapWithTable = 0.6; 00100 00101 // Maximum side space (distance from column boundary) that a typical 00102 // text-line in flowing text should have as a multiple of its x-height 00103 // (Median size). 00104 const int kSideSpaceMargin = 10; 00105 00106 // Fraction of the peak of x-projection of a table region to set the 00107 // threshold for the x-projection histogram 00108 const double kSmallTableProjectionThreshold = 0.35; 00109 const double kLargeTableProjectionThreshold = 0.45; 00110 // Minimum number of rows required to look for more rows in the projection. 00111 const int kLargeTableRowCount = 6; 00112 00113 // Minimum number of rows in a table 00114 const int kMinRowsInTable = 3; 00115 00116 // The number of "whitespace blobs" that should appear between the 00117 // ColPartition's bounding box and the column tab stops to the left/right 00118 // when looking for center justified tab stops. 00119 const double kRequiredFullJustifiedSpacing = 4.0; 00120 00121 // The amount of padding (multiplied by global_median_xheight_ during use) 00122 // that is vertically added to the search adjacent leader search during 00123 // ColPartition marking. 00124 const int kAdjacentLeaderSearchPadding = 2; 00125 00126 // Used when filtering false positives. When finding the last line 00127 // of a paragraph (typically left-aligned), the previous line should have 00128 // its center to the right of the last line by this scaled amount. 00129 const double kParagraphEndingPreviousLineRatio = 1.3; 00130 00131 // The maximum amount of whitespace allowed left of a paragraph ending. 00132 // Do not filter a ColPartition with more than this space left of it. 00133 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0; 00134 00135 // Used when filtering false positives. The last line of a paragraph 00136 // should be preceded by a line that is predominantly text. This is the 00137 // ratio of text to whitespace (to the right of the text) that is required 00138 // for the previous line to be a text. 00139 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0; 00140 00141 // When counting table columns, this is the required gap between two columns 00142 // (it is multiplied by global_median_xheight_). 00143 const double kMaxXProjectionGapFactor = 2.0; 00144 00145 // Used for similarity in partitions using stroke width. Values copied 00146 // from ColFind.cpp in Ray's CL. 00147 const double kStrokeWidthFractionalTolerance = 0.25; 00148 const double kStrokeWidthConstantTolerance = 2.0; 00149 00150 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output"); 00151 BOOL_VAR(textord_show_tables, false, "Show table regions"); 00152 BOOL_VAR(textord_tablefind_show_mark, false, 00153 "Debug table marking steps in detail"); 00154 BOOL_VAR(textord_tablefind_show_stats, false, 00155 "Show page stats used in table finding"); 00156 BOOL_VAR(textord_tablefind_recognize_tables, false, 00157 "Enables the table recognizer for table layout and filtering."); 00158 00159 ELISTIZE(ColSegment) 00160 CLISTIZE(ColSegment) 00161 00162 // Templated helper function used to create destructor callbacks for the 00163 // BBGrid::ClearGridData() method. 00164 template <typename T> void DeleteObject(T *object) { 00165 delete object; 00166 } 00167 00168 TableFinder::TableFinder() 00169 : resolution_(0), 00170 global_median_xheight_(0), 00171 global_median_blob_width_(0), 00172 global_median_ledding_(0), 00173 left_to_right_language_(true) { 00174 } 00175 00176 TableFinder::~TableFinder() { 00177 // ColPartitions and ColSegments created by this class for storage in grids 00178 // need to be deleted explicitly. 00179 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>); 00180 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>); 00181 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>); 00182 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>); 00183 table_grid_.ClearGridData(&DeleteObject<ColSegment>); 00184 } 00185 00186 void TableFinder::set_left_to_right_language(bool order) { 00187 left_to_right_language_ = order; 00188 } 00189 00190 void TableFinder::Init(int grid_size, const ICOORD& bottom_left, 00191 const ICOORD& top_right) { 00192 // Initialize clean partitions list and grid 00193 clean_part_grid_.Init(grid_size, bottom_left, top_right); 00194 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right); 00195 fragmented_text_grid_.Init(grid_size, bottom_left, top_right); 00196 col_seg_grid_.Init(grid_size, bottom_left, top_right); 00197 table_grid_.Init(grid_size, bottom_left, top_right); 00198 } 00199 00200 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and 00201 // insert leaders and rulers into the leader_and_ruling_grid_ 00202 void TableFinder::InsertCleanPartitions(ColPartitionGrid* grid, 00203 TO_BLOCK* block) { 00204 // Calculate stats. This lets us filter partitions in AllowTextPartition() 00205 // and filter blobs in AllowBlob(). 00206 SetGlobalSpacings(grid); 00207 00208 // Iterate the ColPartitions in the grid. 00209 ColPartitionGridSearch gsearch(grid); 00210 gsearch.SetUniqueMode(true); 00211 gsearch.StartFullSearch(); 00212 ColPartition* part = NULL; 00213 while ((part = gsearch.NextFullSearch()) != NULL) { 00214 // Reject partitions with nothing useful inside of them. 00215 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) 00216 continue; 00217 ColPartition* clean_part = part->ShallowCopy(); 00218 ColPartition* leader_part = NULL; 00219 if (part->IsLineType()) { 00220 InsertRulingPartition(clean_part); 00221 continue; 00222 } 00223 // Insert all non-text partitions to clean_parts 00224 if (!part->IsTextType()) { 00225 InsertImagePartition(clean_part); 00226 continue; 00227 } 00228 // Insert text colpartitions after removing noisy components from them 00229 // The leaders are split into a separate grid. 00230 BLOBNBOX_CLIST* part_boxes = part->boxes(); 00231 BLOBNBOX_C_IT pit(part_boxes); 00232 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { 00233 BLOBNBOX *pblob = pit.data(); 00234 // Bad blobs... happens in UNLV set. 00235 // news.3G1, page 17 (around x=6) 00236 if (!AllowBlob(*pblob)) 00237 continue; 00238 if (pblob->flow() == BTFT_LEADER) { 00239 if (leader_part == NULL) { 00240 leader_part = part->ShallowCopy(); 00241 leader_part->set_flow(BTFT_LEADER); 00242 } 00243 leader_part->AddBox(pblob); 00244 } else if (pblob->region_type() != BRT_NOISE) { 00245 clean_part->AddBox(pblob); 00246 } 00247 } 00248 clean_part->ComputeLimits(); 00249 ColPartition* fragmented = clean_part->CopyButDontOwnBlobs(); 00250 InsertTextPartition(clean_part); 00251 SplitAndInsertFragmentedTextPartition(fragmented); 00252 if (leader_part != NULL) { 00253 // TODO(nbeato): Note that ComputeLimits does not update the column 00254 // information. So the leader may appear to span more columns than it 00255 // really does later on when IsInSameColumnAs gets called to test 00256 // for adjacent leaders. 00257 leader_part->ComputeLimits(); 00258 InsertLeaderPartition(leader_part); 00259 } 00260 } 00261 00262 // Make the partition partners better for upper and lower neighbors. 00263 clean_part_grid_.FindPartitionPartners(); 00264 clean_part_grid_.RefinePartitionPartners(false); 00265 } 00266 00267 // High level function to perform table detection 00268 void TableFinder::LocateTables(ColPartitionGrid* grid, 00269 ColPartitionSet** all_columns, 00270 WidthCallback* width_cb, 00271 const FCOORD& reskew) { 00272 // initialize spacing, neighbors, and columns 00273 InitializePartitions(all_columns); 00274 if (textord_show_tables) { 00275 ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors"); 00276 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00277 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00278 ScrollView::AQUAMARINE); 00279 DisplayColPartitionConnections(table_win, &clean_part_grid_, 00280 ScrollView::ORANGE); 00281 00282 table_win = MakeWindow(100, 300, "Fragmented Text"); 00283 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE); 00284 } 00285 // mark, filter, and smooth candidate table partitions 00286 MarkTablePartitions(); 00287 00288 // Make single-column blocks from good_columns_ partitions. col_segments are 00289 // moved to a grid later which takes the ownership 00290 ColSegment_LIST column_blocks; 00291 GetColumnBlocks(all_columns, &column_blocks); 00292 // Set the ratio of candidate table partitions in each column 00293 SetColumnsType(&column_blocks); 00294 00295 // Move column segments to col_seg_grid_ 00296 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_); 00297 00298 // Detect split in column layout that might have occurred due to the 00299 // presence of a table. In such a case, merge the corresponding columns. 00300 GridMergeColumnBlocks(); 00301 00302 // Group horizontally overlapping table partitions into table columns. 00303 // table_columns created here get deleted at the end of this method. 00304 ColSegment_LIST table_columns; 00305 GetTableColumns(&table_columns); 00306 00307 // Within each column, mark the range table regions occupy based on the 00308 // table columns detected. table_regions are moved to a grid later which 00309 // takes the ownership 00310 ColSegment_LIST table_regions; 00311 GetTableRegions(&table_columns, &table_regions); 00312 00313 if (textord_tablefind_show_mark) { 00314 ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions"); 00315 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE); 00316 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW); 00317 } 00318 00319 // Merge table regions across columns for tables spanning multiple 00320 // columns 00321 MoveColSegmentsToGrid(&table_regions, &table_grid_); 00322 GridMergeTableRegions(); 00323 00324 // Adjust table boundaries by including nearby horizontal lines and left 00325 // out column headers 00326 AdjustTableBoundaries(); 00327 GridMergeTableRegions(); 00328 00329 if (textord_tablefind_recognize_tables) { 00330 // Remove false alarms consiting of a single column 00331 DeleteSingleColumnTables(); 00332 00333 if (textord_show_tables) { 00334 ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations"); 00335 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00336 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI); 00337 table_grid_.DisplayBoxes(table_win); 00338 } 00339 00340 // Find table grid structure and reject tables that are malformed. 00341 RecognizeTables(); 00342 GridMergeTableRegions(); 00343 RecognizeTables(); 00344 00345 if (textord_show_tables) { 00346 ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables"); 00347 DisplayColPartitions(table_win, &clean_part_grid_, 00348 ScrollView::BLUE, ScrollView::BLUE); 00349 table_grid_.DisplayBoxes(table_win); 00350 } 00351 } else { 00352 // Remove false alarms consiting of a single column 00353 // TODO(nbeato): verify this is a NOP after structured table rejection. 00354 // Right now it isn't. If the recognize function is doing what it is 00355 // supposed to do, this function is obsolete. 00356 DeleteSingleColumnTables(); 00357 00358 if (textord_show_tables) { 00359 ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables"); 00360 DisplayColPartitions(table_win, &clean_part_grid_, 00361 ScrollView::BLUE, ScrollView::BLUE); 00362 table_grid_.DisplayBoxes(table_win); 00363 } 00364 } 00365 00366 if (textord_dump_table_images) 00367 WriteToPix(reskew); 00368 00369 // Merge all colpartitions in table regions to make them a single 00370 // colpartition and revert types of isolated table cells not 00371 // assigned to any table to their original types. 00372 MakeTableBlocks(grid, all_columns, width_cb); 00373 } 00374 // All grids have the same dimensions. The clean_part_grid_ sizes are set from 00375 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as 00376 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_ 00377 // dimensions instead of duplicated memory. 00378 int TableFinder::gridsize() const { 00379 return clean_part_grid_.gridsize(); 00380 } 00381 int TableFinder::gridwidth() const { 00382 return clean_part_grid_.gridwidth(); 00383 } 00384 int TableFinder::gridheight() const { 00385 return clean_part_grid_.gridheight(); 00386 } 00387 const ICOORD& TableFinder::bleft() const { 00388 return clean_part_grid_.bleft(); 00389 } 00390 const ICOORD& TableFinder::tright() const { 00391 return clean_part_grid_.tright(); 00392 } 00393 00394 void TableFinder::InsertTextPartition(ColPartition* part) { 00395 ASSERT_HOST(part != NULL); 00396 if (AllowTextPartition(*part)) { 00397 clean_part_grid_.InsertBBox(true, true, part); 00398 } else { 00399 delete part; 00400 } 00401 } 00402 void TableFinder::InsertFragmentedTextPartition(ColPartition* part) { 00403 ASSERT_HOST(part != NULL); 00404 if (AllowTextPartition(*part)) { 00405 fragmented_text_grid_.InsertBBox(true, true, part); 00406 } else { 00407 delete part; 00408 } 00409 } 00410 void TableFinder::InsertLeaderPartition(ColPartition* part) { 00411 ASSERT_HOST(part != NULL); 00412 if (!part->IsEmpty() && part->bounding_box().area() > 0) { 00413 leader_and_ruling_grid_.InsertBBox(true, true, part); 00414 } else { 00415 delete part; 00416 } 00417 } 00418 void TableFinder::InsertRulingPartition(ColPartition* part) { 00419 leader_and_ruling_grid_.InsertBBox(true, true, part); 00420 } 00421 void TableFinder::InsertImagePartition(ColPartition* part) { 00422 // NOTE: If images are placed into a different grid in the future, 00423 // the function SetPartitionSpacings needs to be updated. It should 00424 // be the only thing that cares about image partitions. 00425 clean_part_grid_.InsertBBox(true, true, part); 00426 } 00427 00428 // Splits a partition into its "words". The splits happen 00429 // at locations with wide inter-blob spacing. This is useful 00430 // because it allows the table recognize to "cut through" the 00431 // text lines on the page. The assumption is that a table 00432 // will have several lines with similar overlapping whitespace 00433 // whereas text will not have this type of property. 00434 // Note: The code Assumes that blobs are sorted by the left side x! 00435 // This will not work (as well) if the blobs are sorted by center/right. 00436 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition* part) { 00437 ASSERT_HOST(part != NULL); 00438 // Bye bye empty partitions! 00439 if (part->boxes()->empty()) { 00440 delete part; 00441 return; 00442 } 00443 00444 // The AllowBlob function prevents this. 00445 ASSERT_HOST(part->median_width() > 0); 00446 const double kThreshold = part->median_width() * kSplitPartitionSize; 00447 00448 ColPartition* right_part = part; 00449 bool found_split = true; 00450 while (found_split) { 00451 found_split = false; 00452 BLOBNBOX_C_IT box_it(right_part->boxes()); 00453 // Blobs are sorted left side first. If blobs overlap, 00454 // the previous blob may have a "more right" right side. 00455 // Account for this by always keeping the largest "right" 00456 // so far. 00457 int previous_right = MIN_INT32; 00458 00459 // Look for the next split in the partition. 00460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { 00461 const TBOX& box = box_it.data()->bounding_box(); 00462 if (previous_right != MIN_INT32 && 00463 box.left() - previous_right > kThreshold) { 00464 // We have a split position. Split the partition in two pieces. 00465 // Insert the left piece in the grid and keep processing the right. 00466 int mid_x = (box.left() + previous_right) / 2; 00467 ColPartition* left_part = right_part; 00468 right_part = left_part->SplitAt(mid_x); 00469 00470 InsertFragmentedTextPartition(left_part); 00471 found_split = true; 00472 break; 00473 } 00474 00475 // The right side of the previous blobs. 00476 previous_right = MAX(previous_right, box.right()); 00477 } 00478 } 00479 // When a split is not found, the right part is minimized 00480 // as much as possible, so process it. 00481 InsertFragmentedTextPartition(right_part); 00482 } 00483 00484 // Some simple criteria to filter out now. We want to make sure the 00485 // average blob size in the partition is consistent with the 00486 // global page stats. 00487 // The area metric will almost always pass for multi-blob partitions. 00488 // It is useful when filtering out noise caused by an isolated blob. 00489 bool TableFinder::AllowTextPartition(const ColPartition& part) const { 00490 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight; 00491 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth; 00492 const int median_area = global_median_xheight_ * global_median_blob_width_; 00493 const double kAreaPerBlobRequired = median_area * kAllowTextArea; 00494 // Keep comparisons strictly greater to disallow 0! 00495 return part.median_size() > kHeightRequired && 00496 part.median_width() > kWidthRequired && 00497 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count(); 00498 } 00499 00500 // Same as above, applied to blobs. Keep in mind that 00501 // leaders, commas, and periods are important in tables. 00502 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const { 00503 const TBOX& box = blob.bounding_box(); 00504 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight; 00505 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth; 00506 const int median_area = global_median_xheight_ * global_median_blob_width_; 00507 const double kAreaRequired = median_area * kAllowBlobArea; 00508 // Keep comparisons strictly greater to disallow 0! 00509 return box.height() > kHeightRequired && 00510 box.width() > kWidthRequired && 00511 box.area() > kAreaRequired; 00512 } 00513 00514 // TODO(nbeato): The grid that makes the window doesn't seem to matter. 00515 // The only downside is that window messages will be caught by 00516 // clean_part_grid_ instead of a useful object. This is a temporary solution 00517 // for the debug windows created by the TableFinder. 00518 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) { 00519 return clean_part_grid_.MakeWindow(x, y, window_name); 00520 } 00521 00522 // Make single-column blocks from good_columns_ partitions. 00523 void TableFinder::GetColumnBlocks(ColPartitionSet** all_columns, 00524 ColSegment_LIST* column_blocks) { 00525 for (int i = 0; i < gridheight(); ++i) { 00526 ColPartitionSet* columns = all_columns[i]; 00527 if (columns != NULL) { 00528 ColSegment_LIST new_blocks; 00529 // Get boxes from the current vertical position on the grid 00530 columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks); 00531 // Merge the new_blocks boxes into column_blocks if they are well-aligned 00532 GroupColumnBlocks(&new_blocks, column_blocks); 00533 } 00534 } 00535 } 00536 00537 // Merge column segments into the current list if they are well aligned. 00538 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks, 00539 ColSegment_LIST* column_blocks) { 00540 ColSegment_IT src_it(new_blocks); 00541 ColSegment_IT dest_it(column_blocks); 00542 // iterate through the source list 00543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00544 ColSegment* src_seg = src_it.data(); 00545 TBOX src_box = src_seg->bounding_box(); 00546 bool match_found = false; 00547 // iterate through the destination list to find a matching column block 00548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { 00549 ColSegment* dest_seg = dest_it.data(); 00550 TBOX dest_box = dest_seg->bounding_box(); 00551 if (ConsecutiveBoxes(src_box, dest_box)) { 00552 // If matching block is found, insert the current block into it 00553 // and delete the soure block 00554 dest_seg->InsertBox(src_box); 00555 match_found = true; 00556 delete src_it.extract(); 00557 break; 00558 } 00559 } 00560 // If no match is found, just append the source block to column_blocks 00561 if (!match_found) { 00562 dest_it.add_after_then_move(src_it.extract()); 00563 } 00564 } 00565 } 00566 00567 // are the two boxes immediate neighbors along the vertical direction 00568 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) { 00569 int x_margin = 20; 00570 int y_margin = 5; 00571 return (abs(b1.left() - b2.left()) < x_margin) && 00572 (abs(b1.right() - b2.right()) < x_margin) && 00573 (abs(b1.top()-b2.bottom()) < y_margin || 00574 abs(b2.top()-b1.bottom()) < y_margin); 00575 } 00576 00577 // Set up info for clean_part_grid_ partitions to be valid during detection 00578 // code. 00579 void TableFinder::InitializePartitions(ColPartitionSet** all_columns) { 00580 FindNeighbors(); 00581 SetPartitionSpacings(&clean_part_grid_, all_columns); 00582 SetGlobalSpacings(&clean_part_grid_); 00583 } 00584 00585 // Set left, right and top, bottom spacings of each colpartition. 00586 void TableFinder::SetPartitionSpacings(ColPartitionGrid* grid, 00587 ColPartitionSet** all_columns) { 00588 // Iterate the ColPartitions in the grid. 00589 ColPartitionGridSearch gsearch(grid); 00590 gsearch.StartFullSearch(); 00591 ColPartition* part = NULL; 00592 while ((part = gsearch.NextFullSearch()) != NULL) { 00593 ColPartitionSet* columns = all_columns[gsearch.GridY()]; 00594 TBOX box = part->bounding_box(); 00595 int y = part->MidY(); 00596 ColPartition* left_column = columns->ColumnContaining(box.left(), y); 00597 ColPartition* right_column = columns->ColumnContaining(box.right(), y); 00598 // set distance from left column as space to the left 00599 if (left_column) { 00600 int left_space = MAX(0, box.left() - left_column->LeftAtY(y)); 00601 part->set_space_to_left(left_space); 00602 } 00603 // set distance from right column as space to the right 00604 if (right_column) { 00605 int right_space = MAX(0, right_column->RightAtY(y) - box.right()); 00606 part->set_space_to_right(right_space); 00607 } 00608 00609 // Look for images that may be closer. 00610 // NOTE: used to be part_grid_, might cause issues now 00611 ColPartitionGridSearch hsearch(grid); 00612 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); 00613 ColPartition* neighbor = NULL; 00614 while ((neighbor = hsearch.NextSideSearch(true)) != NULL) { 00615 if (neighbor->type() == PT_PULLOUT_IMAGE || 00616 neighbor->type() == PT_FLOWING_IMAGE || 00617 neighbor->type() == PT_HEADING_IMAGE) { 00618 int right = neighbor->bounding_box().right(); 00619 if (right < box.left()) { 00620 int space = MIN(box.left() - right, part->space_to_left()); 00621 part->set_space_to_left(space); 00622 } 00623 } 00624 } 00625 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); 00626 neighbor = NULL; 00627 while ((neighbor = hsearch.NextSideSearch(false)) != NULL) { 00628 if (neighbor->type() == PT_PULLOUT_IMAGE || 00629 neighbor->type() == PT_FLOWING_IMAGE || 00630 neighbor->type() == PT_HEADING_IMAGE) { 00631 int left = neighbor->bounding_box().left(); 00632 if (left > box.right()) { 00633 int space = MIN(left - box.right(), part->space_to_right()); 00634 part->set_space_to_right(space); 00635 } 00636 } 00637 } 00638 00639 ColPartition* upper_part = part->SingletonPartner(true); 00640 if (upper_part) { 00641 int space = MAX(0, upper_part->bounding_box().bottom() - 00642 part->bounding_box().bottom()); 00643 part->set_space_above(space); 00644 } else { 00645 // TODO(nbeato): What constitutes a good value? 00646 // 0 is the default value when not set, explicitly noting it needs to 00647 // be something else. 00648 part->set_space_above(MAX_INT32); 00649 } 00650 00651 ColPartition* lower_part = part->SingletonPartner(false); 00652 if (lower_part) { 00653 int space = MAX(0, part->bounding_box().bottom() - 00654 lower_part->bounding_box().bottom()); 00655 part->set_space_below(space); 00656 } else { 00657 // TODO(nbeato): What constitutes a good value? 00658 // 0 is the default value when not set, explicitly noting it needs to 00659 // be something else. 00660 part->set_space_below(MAX_INT32); 00661 } 00662 } 00663 } 00664 00665 // Set spacing and closest neighbors above and below a given colpartition. 00666 void TableFinder::SetVerticalSpacing(ColPartition* part) { 00667 TBOX box = part->bounding_box(); 00668 int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y()); 00669 int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y()); 00670 box.set_top(top_range); 00671 box.set_bottom(bottom_range); 00672 00673 TBOX part_box = part->bounding_box(); 00674 // Start a rect search 00675 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 00676 rectsearch(&clean_part_grid_); 00677 rectsearch.StartRectSearch(box); 00678 ColPartition* neighbor; 00679 int min_space_above = kMaxVerticalSpacing; 00680 int min_space_below = kMaxVerticalSpacing; 00681 ColPartition* above_neighbor = NULL; 00682 ColPartition* below_neighbor = NULL; 00683 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 00684 if (neighbor == part) 00685 continue; 00686 TBOX neighbor_box = neighbor->bounding_box(); 00687 if (neighbor_box.major_x_overlap(part_box)) { 00688 int gap = abs(part->median_bottom() - neighbor->median_bottom()); 00689 // If neighbor is below current partition 00690 if (neighbor_box.top() < part_box.bottom() && 00691 gap < min_space_below) { 00692 min_space_below = gap; 00693 below_neighbor = neighbor; 00694 } // If neighbor is above current partition 00695 else if (part_box.top() < neighbor_box.bottom() && 00696 gap < min_space_above) { 00697 min_space_above = gap; 00698 above_neighbor = neighbor; 00699 } 00700 } 00701 } 00702 part->set_space_above(min_space_above); 00703 part->set_space_below(min_space_below); 00704 part->set_nearest_neighbor_above(above_neighbor); 00705 part->set_nearest_neighbor_below(below_neighbor); 00706 } 00707 00708 // Set global spacing and x-height estimates 00709 void TableFinder::SetGlobalSpacings(ColPartitionGrid* grid) { 00710 STATS xheight_stats(0, kMaxVerticalSpacing + 1); 00711 STATS width_stats(0, kMaxBlobWidth + 1); 00712 STATS ledding_stats(0, kMaxVerticalSpacing + 1); 00713 // Iterate the ColPartitions in the grid. 00714 ColPartitionGridSearch gsearch(grid); 00715 gsearch.SetUniqueMode(true); 00716 gsearch.StartFullSearch(); 00717 ColPartition* part = NULL; 00718 while ((part = gsearch.NextFullSearch()) != NULL) { 00719 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length. 00720 // ComputeLimits needs to get called somewhere outside of TableFinder 00721 // to make sure the partitions are properly initialized. 00722 // When this is called, SmoothPartitionPartners dies in an assert after 00723 // table find runs. Alternative solution. 00724 // part->ComputeLimits(); 00725 if (part->IsTextType()) { 00726 // xheight_stats.add(part->median_size(), part->boxes_count()); 00727 // width_stats.add(part->median_width(), part->boxes_count()); 00728 00729 // This loop can be removed when above issues are fixed. 00730 // Replace it with the 2 lines commented out above. 00731 BLOBNBOX_C_IT it(part->boxes()); 00732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00733 xheight_stats.add(it.data()->bounding_box().height(), 1); 00734 width_stats.add(it.data()->bounding_box().width(), 1); 00735 } 00736 00737 ledding_stats.add(part->space_above(), 1); 00738 ledding_stats.add(part->space_below(), 1); 00739 } 00740 } 00741 // Set estimates based on median of statistics obtained 00742 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5)); 00743 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5)); 00744 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5)); 00745 #ifndef GRAPHICS_DISABLED 00746 if (textord_tablefind_show_stats) { 00747 const char* kWindowName = "X-height (R), X-width (G), and ledding (B)"; 00748 ScrollView* stats_win = MakeWindow(500, 10, kWindowName); 00749 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED); 00750 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN); 00751 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE); 00752 } 00753 #endif // GRAPHICS_DISABLED 00754 } 00755 00756 void TableFinder::set_global_median_xheight(int xheight) { 00757 global_median_xheight_ = xheight; 00758 } 00759 void TableFinder::set_global_median_blob_width(int width) { 00760 global_median_blob_width_ = width; 00761 } 00762 void TableFinder::set_global_median_ledding(int ledding) { 00763 global_median_ledding_ = ledding; 00764 } 00765 00766 void TableFinder::FindNeighbors() { 00767 ColPartitionGridSearch gsearch(&clean_part_grid_); 00768 gsearch.StartFullSearch(); 00769 ColPartition* part = NULL; 00770 while ((part = gsearch.NextFullSearch()) != NULL) { 00771 // TODO(nbeato): Rename this function, meaning is different now. 00772 // IT is finding nearest neighbors its own way 00773 //SetVerticalSpacing(part); 00774 00775 ColPartition* upper = part->SingletonPartner(true); 00776 if (upper) 00777 part->set_nearest_neighbor_above(upper); 00778 00779 ColPartition* lower = part->SingletonPartner(false); 00780 if (lower) 00781 part->set_nearest_neighbor_below(lower); 00782 } 00783 } 00784 00785 // High level interface. Input is an unmarked ColPartitionGrid 00786 // (namely, clean_part_grid_). Partitions are identified using local 00787 // information and filter/smoothed. The function exit should contain 00788 // a good sampling of the table partitions. 00789 void TableFinder::MarkTablePartitions() { 00790 MarkPartitionsUsingLocalInformation(); 00791 if (textord_tablefind_show_mark) { 00792 ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions"); 00793 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00794 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00795 ScrollView::AQUAMARINE); 00796 } 00797 FilterFalseAlarms(); 00798 if (textord_tablefind_show_mark) { 00799 ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions"); 00800 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00801 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00802 ScrollView::AQUAMARINE); 00803 } 00804 SmoothTablePartitionRuns(); 00805 if (textord_tablefind_show_mark) { 00806 ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions"); 00807 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00808 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00809 ScrollView::AQUAMARINE); 00810 } 00811 FilterFalseAlarms(); 00812 if (textord_tablefind_show_mark || textord_show_tables) { 00813 ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions"); 00814 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00815 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00816 ScrollView::AQUAMARINE); 00817 } 00818 } 00819 00820 // These types of partitions are marked as table partitions: 00821 // 1- Partitions that have at lease one large gap between words 00822 // 2- Partitions that consist of only one word (no significant gap 00823 // between components) 00824 // 3- Partitions that vertically overlap with other partitions within the 00825 // same column. 00826 // 4- Partitions with leaders before/after them. 00827 void TableFinder::MarkPartitionsUsingLocalInformation() { 00828 // Iterate the ColPartitions in the grid. 00829 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 00830 gsearch(&clean_part_grid_); 00831 gsearch.StartFullSearch(); 00832 ColPartition* part = NULL; 00833 while ((part = gsearch.NextFullSearch()) != NULL) { 00834 if (!part->IsTextType()) // Only consider text partitions 00835 continue; 00836 // Only consider partitions in dominant font size or smaller 00837 if (part->median_size() > kMaxTableCellXheight * global_median_xheight_) 00838 continue; 00839 // Mark partitions with a large gap, or no significant gap as 00840 // table partitions. 00841 // Comments: It produces several false alarms at: 00842 // - last line of a paragraph (fixed) 00843 // - single word section headings 00844 // - page headers and footers 00845 // - numbered equations 00846 // - line drawing regions 00847 // TODO(faisal): detect and fix above-mentioned cases 00848 if (HasWideOrNoInterWordGap(part) || 00849 HasLeaderAdjacent(*part)) { 00850 part->set_table_type(); 00851 } 00852 } 00853 } 00854 00855 // Check if the partition has at least one large gap between words or no 00856 // significant gap at all 00857 bool TableFinder::HasWideOrNoInterWordGap(ColPartition* part) const { 00858 // Should only get text partitions. 00859 ASSERT_HOST(part->IsTextType()); 00860 // Blob access 00861 BLOBNBOX_CLIST* part_boxes = part->boxes(); 00862 BLOBNBOX_C_IT it(part_boxes); 00863 // Check if this is a relatively small partition (such as a single word) 00864 if (part->bounding_box().width() < 00865 kMinBoxesInTextPartition * part->median_size() && 00866 part_boxes->length() < kMinBoxesInTextPartition) 00867 return true; 00868 00869 // Variables used to compute inter-blob spacing. 00870 int current_x0 = -1; 00871 int current_x1 = -1; 00872 int previous_x1 = -1; 00873 // Stores the maximum gap detected. 00874 int largest_partition_gap_found = -1; 00875 // Text partition gap limits. If this is text (and not a table), 00876 // there should be at least one gap larger than min_gap and no gap 00877 // larger than max_gap. 00878 const double max_gap = kMaxGapInTextPartition * part->median_size(); 00879 const double min_gap = kMinMaxGapInTextPartition * part->median_size(); 00880 00881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00882 BLOBNBOX* blob = it.data(); 00883 current_x0 = blob->bounding_box().left(); 00884 current_x1 = blob->bounding_box().right(); 00885 if (previous_x1 != -1) { 00886 int gap = current_x0 - previous_x1; 00887 00888 // TODO(nbeato): Boxes may overlap? Huh? 00889 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors 00890 // on the top right of the page are filtered out with this line. 00891 // Note 2: Iterating over blobs in a partition, so we are looking for 00892 // spacing between the words. 00893 if (gap < 0) { 00894 // More likely case, the blobs slightly overlap. This can happen 00895 // with diacritics (accents) or broken alphabet symbols (characters). 00896 // Merge boxes together by taking max of right sides. 00897 if (-gap < part->median_size() * kMaxBlobOverlapFactor) { 00898 previous_x1 = MAX(previous_x1, current_x1); 00899 continue; 00900 } 00901 // Extreme case, blobs overlap significantly in the same partition... 00902 // This should not happen often (if at all), but it does. 00903 // TODO(nbeato): investigate cases when this happens. 00904 else { 00905 // The behavior before was to completely ignore this case. 00906 } 00907 } 00908 00909 // If a large enough gap is found, mark it as a table cell (return true) 00910 if (gap > max_gap) 00911 return true; 00912 if (gap > largest_partition_gap_found) 00913 largest_partition_gap_found = gap; 00914 } 00915 previous_x1 = current_x1; 00916 } 00917 // Since no large gap was found, return false if the partition is too 00918 // long to be a data cell 00919 if (part->bounding_box().width() > 00920 kMaxBoxesInDataPartition * part->median_size() || 00921 part_boxes->length() > kMaxBoxesInDataPartition) 00922 return false; 00923 00924 // A partition may be a single blob. In this case, it's an isolated symbol 00925 // or non-text (such as a ruling or image). 00926 // Detect these as table partitions? Shouldn't this be case by case? 00927 // The behavior before was to ignore this, making max_partition_gap < 0 00928 // and implicitly return true. Just making it explicit. 00929 if (largest_partition_gap_found == -1) 00930 return true; 00931 00932 // return true if the maximum gap found is smaller than the minimum allowed 00933 // max_gap in a text partition. This indicates that there is no signficant 00934 // space in the partition, hence it is likely a single word. 00935 return largest_partition_gap_found < min_gap; 00936 } 00937 00938 // A criteria for possible tables is that a table may have leaders 00939 // between data cells. An aggressive solution to find such tables is to 00940 // explicitly mark partitions that have adjacent leaders. 00941 // Note that this includes overlapping leaders. However, it does not 00942 // include leaders in different columns on the page. 00943 // Possible false-positive will include lists, such as a table of contents. 00944 // As these arise, the agressive nature of this search may need to be 00945 // trimmed down. 00946 bool TableFinder::HasLeaderAdjacent(const ColPartition& part) { 00947 if (part.flow() == BTFT_LEADER) 00948 return true; 00949 // Search range is left and right bounded by an offset of the 00950 // median xheight. This offset is to allow some tolerance to the 00951 // the leaders on the page in the event that the alignment is still 00952 // a bit off. 00953 const TBOX& box = part.bounding_box(); 00954 const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_; 00955 const int top = box.top() + search_size; 00956 const int bottom = box.bottom() - search_size; 00957 ColPartitionGridSearch hsearch(&leader_and_ruling_grid_); 00958 for (int direction = 0; direction < 2; ++direction) { 00959 bool right_to_left = (direction == 0); 00960 int x = right_to_left ? box.right() : box.left(); 00961 hsearch.StartSideSearch(x, bottom, top); 00962 ColPartition* leader = NULL; 00963 while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) { 00964 // This should not happen, they are in different grids. 00965 ASSERT_HOST(&part != leader); 00966 // The leader could be a horizontal ruling in the grid. 00967 // Make sure it is actually a leader. 00968 if (leader->flow() != BTFT_LEADER) 00969 continue; 00970 // Make sure the leader shares a page column with the partition, 00971 // otherwise we are spreading across columns. 00972 if (!part.IsInSameColumnAs(*leader)) 00973 break; 00974 // There should be a significant vertical overlap 00975 if (!leader->VSignificantCoreOverlap(part)) 00976 continue; 00977 // Leader passed all tests, so it is adjacent. 00978 return true; 00979 } 00980 } 00981 // No leaders are adjacent to the given partition. 00982 return false; 00983 } 00984 00985 // Filter individual text partitions marked as table partitions 00986 // consisting of paragraph endings, small section headings, and 00987 // headers and footers. 00988 void TableFinder::FilterFalseAlarms() { 00989 FilterParagraphEndings(); 00990 FilterHeaderAndFooter(); 00991 // TODO(nbeato): Fully justified text as non-table? 00992 } 00993 00994 void TableFinder::FilterParagraphEndings() { 00995 // Detect last line of paragraph 00996 // Iterate the ColPartitions in the grid. 00997 ColPartitionGridSearch gsearch(&clean_part_grid_); 00998 gsearch.StartFullSearch(); 00999 ColPartition* part = NULL; 01000 while ((part = gsearch.NextFullSearch()) != NULL) { 01001 if (part->type() != PT_TABLE) 01002 continue; // Consider only table partitions 01003 01004 // Paragraph ending should have flowing text above it. 01005 ColPartition* upper_part = part->nearest_neighbor_above(); 01006 if (!upper_part) 01007 continue; 01008 if (upper_part->type() != PT_FLOWING_TEXT) 01009 continue; 01010 if (upper_part->bounding_box().width() < 01011 2 * part->bounding_box().width()) 01012 continue; 01013 // Check if its the last line of a paragraph. 01014 // In most cases, a paragraph ending should be left-aligned to text line 01015 // above it. Sometimes, it could be a 2 line paragraph, in which case 01016 // the line above it is indented. 01017 // To account for that, check if the partition center is to 01018 // the left of the one above it. 01019 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2; 01020 int upper_mid = (upper_part->bounding_box().left() + 01021 upper_part->bounding_box().right()) / 2; 01022 int current_spacing = 0; // spacing of the current line to margin 01023 int upper_spacing = 0; // spacing of the previous line to the margin 01024 if (left_to_right_language_) { 01025 // Left to right languages, use mid - left to figure out the distance 01026 // the middle is from the left margin. 01027 int left = MIN(part->bounding_box().left(), 01028 upper_part->bounding_box().left()); 01029 current_spacing = mid - left; 01030 upper_spacing = upper_mid - left; 01031 } else { 01032 // Right to left languages, use right - mid to figure out the distance 01033 // the middle is from the right margin. 01034 int right = MAX(part->bounding_box().right(), 01035 upper_part->bounding_box().right()); 01036 current_spacing = right - mid; 01037 upper_spacing = right - upper_mid; 01038 } 01039 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) 01040 continue; 01041 01042 // Paragraphs should have similar fonts. 01043 if (!part->MatchingSizes(*upper_part) || 01044 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance, 01045 kStrokeWidthConstantTolerance)) { 01046 continue; 01047 } 01048 01049 // The last line of a paragraph should be left aligned. 01050 // TODO(nbeato): This would be untrue if the text was right aligned. 01051 // How often is that? 01052 if (part->space_to_left() > 01053 kMaxParagraphEndingLeftSpaceMultiple * part->median_size()) 01054 continue; 01055 // The line above it should be right aligned (assuming justified format). 01056 // Since we can't assume justified text, we compare whitespace to text. 01057 // The above line should have majority spanning text (or the current 01058 // line could have fit on the previous line). So compare 01059 // whitespace to text. 01060 if (upper_part->bounding_box().width() < 01061 kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right()) 01062 continue; 01063 01064 // Ledding above the line should be less than ledding below 01065 if (part->space_above() >= part->space_below() || 01066 part->space_above() > 2 * global_median_ledding_) 01067 continue; 01068 01069 // If all checks failed, it is probably text. 01070 part->clear_table_type(); 01071 } 01072 } 01073 01074 void TableFinder::FilterHeaderAndFooter() { 01075 // Consider top-most text colpartition as header and bottom most as footer 01076 ColPartition* header = NULL; 01077 ColPartition* footer = NULL; 01078 int max_top = MIN_INT32; 01079 int min_bottom = MAX_INT32; 01080 ColPartitionGridSearch gsearch(&clean_part_grid_); 01081 gsearch.StartFullSearch(); 01082 ColPartition* part = NULL; 01083 while ((part = gsearch.NextFullSearch()) != NULL) { 01084 if (!part->IsTextType()) 01085 continue; // Consider only text partitions 01086 int top = part->bounding_box().top(); 01087 int bottom = part->bounding_box().bottom(); 01088 if (top > max_top) { 01089 max_top = top; 01090 header = part; 01091 } 01092 if (bottom < min_bottom) { 01093 min_bottom = bottom; 01094 footer = part; 01095 } 01096 } 01097 if (header) 01098 header->clear_table_type(); 01099 if (footer) 01100 footer->clear_table_type(); 01101 } 01102 01103 // Mark all ColPartitions as table cells that have a table cell above 01104 // and below them 01105 // TODO(faisal): This is too aggressive at the moment. The method needs to 01106 // consider spacing and alignment as well. Detection of false alarm table cells 01107 // should also be done as part of it. 01108 void TableFinder::SmoothTablePartitionRuns() { 01109 // Iterate the ColPartitions in the grid. 01110 ColPartitionGridSearch gsearch(&clean_part_grid_); 01111 gsearch.StartFullSearch(); 01112 ColPartition* part = NULL; 01113 while ((part = gsearch.NextFullSearch()) != NULL) { 01114 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) 01115 continue; // Consider only text partitions 01116 ColPartition* upper_part = part->nearest_neighbor_above(); 01117 ColPartition* lower_part = part->nearest_neighbor_below(); 01118 if (!upper_part || !lower_part) 01119 continue; 01120 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) 01121 part->set_table_type(); 01122 } 01123 01124 // Pass 2, do the opposite. If both the upper and lower neighbors 01125 // exist and are not tables, this probably shouldn't be a table. 01126 gsearch.StartFullSearch(); 01127 part = NULL; 01128 while ((part = gsearch.NextFullSearch()) != NULL) { 01129 if (part->type() != PT_TABLE) 01130 continue; // Consider only text partitions 01131 ColPartition* upper_part = part->nearest_neighbor_above(); 01132 ColPartition* lower_part = part->nearest_neighbor_below(); 01133 01134 // table can't be by itself 01135 if ((upper_part && upper_part->type() != PT_TABLE) && 01136 (lower_part && lower_part->type() != PT_TABLE)) { 01137 part->clear_table_type(); 01138 } 01139 } 01140 } 01141 01142 // Set the type of a column segment based on the ratio of table to text cells 01143 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) { 01144 ColSegment_IT it(column_blocks); 01145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01146 ColSegment* seg = it.data(); 01147 TBOX box = seg->bounding_box(); 01148 int num_table_cells = 0; 01149 int num_text_cells = 0; 01150 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01151 rsearch(&clean_part_grid_); 01152 rsearch.SetUniqueMode(true); 01153 rsearch.StartRectSearch(box); 01154 ColPartition* part = NULL; 01155 while ((part = rsearch.NextRectSearch()) != NULL) { 01156 if (part->type() == PT_TABLE) { 01157 num_table_cells++; 01158 } else if (part->type() == PT_FLOWING_TEXT) { 01159 num_text_cells++; 01160 } 01161 } 01162 // If a column block has no text or table partition in it, it is not needed 01163 // for table detection. 01164 if (!num_table_cells && !num_text_cells) { 01165 delete it.extract(); 01166 } else { 01167 seg->set_num_table_cells(num_table_cells); 01168 seg->set_num_text_cells(num_text_cells); 01169 // set column type based on the ratio of table to text cells 01170 seg->set_type(); 01171 } 01172 } 01173 } 01174 01175 // Move column blocks to grid 01176 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments, 01177 ColSegmentGrid *col_seg_grid) { 01178 ColSegment_IT it(segments); 01179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01180 ColSegment* seg = it.extract(); 01181 col_seg_grid->InsertBBox(true, true, seg); 01182 } 01183 } 01184 01185 // Merge column blocks if a split is detected due to the presence of a 01186 // table. A text block is considered split if it has multiple 01187 // neighboring blocks above/below it, and at least one of the 01188 // neighboring blocks is of table type (has a high density of table 01189 // partitions). In this case neighboring blocks in the direction 01190 // (above/below) of the table block are merged with the text block. 01191 01192 // Comment: This method does not handle split due to a full page table 01193 // since table columns in this case do not have a text column on which 01194 // split decision can be based. 01195 void TableFinder::GridMergeColumnBlocks() { 01196 int margin = gridsize(); 01197 01198 // Iterate the Column Blocks in the grid. 01199 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01200 gsearch(&col_seg_grid_); 01201 gsearch.StartFullSearch(); 01202 ColSegment* seg; 01203 while ((seg = gsearch.NextFullSearch()) != NULL) { 01204 if (seg->type() != COL_TEXT) 01205 continue; // only consider text blocks for split detection 01206 bool neighbor_found = false; 01207 bool modified = false; // Modified at least once 01208 // keep expanding current box as long as neighboring table columns 01209 // are found above or below it. 01210 do { 01211 TBOX box = seg->bounding_box(); 01212 // slightly expand the search region vertically 01213 int top_range = MIN(box.top() + margin, tright().y()); 01214 int bottom_range = MAX(box.bottom() - margin, bleft().y()); 01215 box.set_top(top_range); 01216 box.set_bottom(bottom_range); 01217 neighbor_found = false; 01218 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01219 rectsearch(&col_seg_grid_); 01220 rectsearch.StartRectSearch(box); 01221 ColSegment* neighbor = NULL; 01222 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 01223 if (neighbor == seg) 01224 continue; 01225 const TBOX& neighbor_box = neighbor->bounding_box(); 01226 // If the neighbor box significantly overlaps with the current 01227 // box (due to the expansion of the current box in the 01228 // previous iteration of this loop), remove the neighbor box 01229 // and expand the current box to include it. 01230 if (neighbor_box.overlap_fraction(box) >= 0.9) { 01231 seg->InsertBox(neighbor_box); 01232 modified = true; 01233 rectsearch.RemoveBBox(); 01234 gsearch.RepositionIterator(); 01235 delete neighbor; 01236 continue; 01237 } 01238 // Only expand if the neighbor box is of table type 01239 if (neighbor->type() != COL_TABLE) 01240 continue; 01241 // Insert the neighbor box into the current column block 01242 if (neighbor_box.major_x_overlap(box) && 01243 !box.contains(neighbor_box)) { 01244 seg->InsertBox(neighbor_box); 01245 neighbor_found = true; 01246 modified = true; 01247 rectsearch.RemoveBBox(); 01248 gsearch.RepositionIterator(); 01249 delete neighbor; 01250 } 01251 } 01252 } while (neighbor_found); 01253 if (modified) { 01254 // Because the box has changed, it has to be removed first. 01255 gsearch.RemoveBBox(); 01256 col_seg_grid_.InsertBBox(true, true, seg); 01257 gsearch.RepositionIterator(); 01258 } 01259 } 01260 } 01261 01262 // Group horizontally overlapping table partitions into table columns. 01263 // TODO(faisal): This is too aggressive at the moment. The method should 01264 // consider more attributes to group table partitions together. Some common 01265 // errors are: 01266 // 1- page number is merged with a table column above it even 01267 // if there is a large vertical gap between them. 01268 // 2- column headers go on to catch one of the columns arbitrarily 01269 // 3- an isolated noise blob near page top or bottom merges with the table 01270 // column below/above it 01271 // 4- cells from two vertically adjacent tables merge together to make a 01272 // single column resulting in merging of the two tables 01273 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) { 01274 ColSegment_IT it(table_columns); 01275 // Iterate the ColPartitions in the grid. 01276 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01277 gsearch(&clean_part_grid_); 01278 gsearch.StartFullSearch(); 01279 ColPartition* part; 01280 while ((part = gsearch.NextFullSearch()) != NULL) { 01281 if (part->inside_table_column() || part->type() != PT_TABLE) 01282 continue; // prevent a partition to be assigned to multiple columns 01283 const TBOX& box = part->bounding_box(); 01284 ColSegment* col = new ColSegment(); 01285 col->InsertBox(box); 01286 part->set_inside_table_column(true); 01287 // Start a search below the current cell to find bottom neighbours 01288 // Note: a full search will always process things above it first, so 01289 // this should be starting at the highest cell and working its way down. 01290 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01291 vsearch(&clean_part_grid_); 01292 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom()); 01293 ColPartition* neighbor = NULL; 01294 bool found_neighbours = false; 01295 while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) { 01296 // only consider neighbors not assigned to any column yet 01297 if (neighbor->inside_table_column()) 01298 continue; 01299 // Horizontal lines should not break the flow 01300 if (neighbor->IsHorizontalLine()) 01301 continue; 01302 // presence of a non-table neighbor marks the end of current 01303 // table column 01304 if (neighbor->type() != PT_TABLE) 01305 break; 01306 // add the neighbor partition to the table column 01307 const TBOX& neighbor_box = neighbor->bounding_box(); 01308 col->InsertBox(neighbor_box); 01309 neighbor->set_inside_table_column(true); 01310 found_neighbours = true; 01311 } 01312 if (found_neighbours) { 01313 it.add_after_then_move(col); 01314 } else { 01315 part->set_inside_table_column(false); 01316 delete col; 01317 } 01318 } 01319 } 01320 01321 // Mark regions in a column that are x-bounded by the column boundaries and 01322 // y-bounded by the table columns' projection on the y-axis as table regions 01323 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns, 01324 ColSegment_LIST* table_regions) { 01325 ColSegment_IT cit(table_columns); 01326 ColSegment_IT rit(table_regions); 01327 // Iterate through column blocks 01328 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01329 gsearch(&col_seg_grid_); 01330 gsearch.StartFullSearch(); 01331 ColSegment* part; 01332 int page_height = tright().y() - bleft().y(); 01333 ASSERT_HOST(page_height > 0); 01334 // create a bool array to hold projection on y-axis 01335 bool* table_region = new bool[page_height]; 01336 while ((part = gsearch.NextFullSearch()) != NULL) { 01337 TBOX part_box = part->bounding_box(); 01338 // reset the projection array 01339 for (int i = 0; i < page_height; i++) { 01340 table_region[i] = false; 01341 } 01342 // iterate through all table columns to find regions in the current 01343 // page column block 01344 cit.move_to_first(); 01345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) { 01346 TBOX col_box = cit.data()->bounding_box(); 01347 // find intersection region of table column and page column 01348 TBOX intersection_box = col_box.intersection(part_box); 01349 // project table column on the y-axis 01350 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) { 01351 table_region[i - bleft().y()] = true; 01352 } 01353 } 01354 // set x-limits of table regions to page column width 01355 TBOX current_table_box; 01356 current_table_box.set_left(part_box.left()); 01357 current_table_box.set_right(part_box.right()); 01358 // go through the y-axis projection to find runs of table 01359 // regions. Each run makes one table region. 01360 for (int i = 1; i < page_height; i++) { 01361 // detect start of a table region 01362 if (!table_region[i - 1] && table_region[i]) { 01363 current_table_box.set_bottom(i + bleft().y()); 01364 } 01365 // TODO(nbeato): Is it guaranteed that the last row is not a table region? 01366 // detect end of a table region 01367 if (table_region[i - 1] && !table_region[i]) { 01368 current_table_box.set_top(i + bleft().y()); 01369 if (!current_table_box.null_box()) { 01370 ColSegment* seg = new ColSegment(); 01371 seg->InsertBox(current_table_box); 01372 rit.add_after_then_move(seg); 01373 } 01374 } 01375 } 01376 } 01377 delete[] table_region; 01378 } 01379 01380 // Merge table regions corresponding to tables spanning multiple columns if 01381 // there is a colpartition (horizontal ruling line or normal text) that 01382 // touches both regions. 01383 // TODO(faisal): A rare error occurs if there are two horizontally adjacent 01384 // tables with aligned ruling lines. In this case, line finder returns a 01385 // single line and hence the tables get merged together 01386 void TableFinder::GridMergeTableRegions() { 01387 // Iterate the table regions in the grid. 01388 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01389 gsearch(&table_grid_); 01390 gsearch.StartFullSearch(); 01391 ColSegment* seg = NULL; 01392 while ((seg = gsearch.NextFullSearch()) != NULL) { 01393 bool neighbor_found = false; 01394 bool modified = false; // Modified at least once 01395 do { 01396 // Start a rectangle search x-bounded by the image and y by the table 01397 const TBOX& box = seg->bounding_box(); 01398 TBOX search_region(box); 01399 search_region.set_left(bleft().x()); 01400 search_region.set_right(tright().x()); 01401 neighbor_found = false; 01402 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01403 rectsearch(&table_grid_); 01404 rectsearch.StartRectSearch(search_region); 01405 ColSegment* neighbor = NULL; 01406 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 01407 if (neighbor == seg) 01408 continue; 01409 const TBOX& neighbor_box = neighbor->bounding_box(); 01410 // Check if a neighbor box has a large overlap with the table 01411 // region. This may happen as a result of merging two table 01412 // regions in the previous iteration. 01413 if (neighbor_box.overlap_fraction(box) >= 0.9) { 01414 seg->InsertBox(neighbor_box); 01415 rectsearch.RemoveBBox(); 01416 gsearch.RepositionIterator(); 01417 delete neighbor; 01418 modified = true; 01419 continue; 01420 } 01421 // Check if two table regions belong together based on a common 01422 // horizontal ruling line 01423 if (BelongToOneTable(box, neighbor_box)) { 01424 seg->InsertBox(neighbor_box); 01425 neighbor_found = true; 01426 modified = true; 01427 rectsearch.RemoveBBox(); 01428 gsearch.RepositionIterator(); 01429 delete neighbor; 01430 } 01431 } 01432 } while (neighbor_found); 01433 if (modified) { 01434 // Because the box has changed, it has to be removed first. 01435 gsearch.RemoveBBox(); 01436 table_grid_.InsertBBox(true, true, seg); 01437 gsearch.RepositionIterator(); 01438 } 01439 } 01440 } 01441 01442 // Decide if two table regions belong to one table based on a common 01443 // horizontal ruling line or another colpartition 01444 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) { 01445 // Check the obvious case. Most likely not true because overlapping boxes 01446 // should already be merged, but seems like a good thing to do in case things 01447 // change. 01448 if (box1.overlap(box2)) 01449 return true; 01450 // Check for ColPartitions spanning both table regions 01451 TBOX bbox = box1.bounding_union(box2); 01452 // Start a rect search on bbox 01453 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01454 rectsearch(&clean_part_grid_); 01455 rectsearch.StartRectSearch(bbox); 01456 ColPartition* part = NULL; 01457 while ((part = rectsearch.NextRectSearch()) != NULL) { 01458 const TBOX& part_box = part->bounding_box(); 01459 // return true if a colpartition spanning both table regions is found 01460 if (part_box.overlap(box1) && part_box.overlap(box2)) 01461 return true; 01462 } 01463 return false; 01464 } 01465 01466 // Adjust table boundaries by: 01467 // - building a tight bounding box around all ColPartitions contained in it. 01468 // - expanding table boundaries to include all colpartitions that overlap the 01469 // table by more than half of their area 01470 // - expanding table boundaries to include nearby horizontal rule lines 01471 // - expanding table vertically to include left out column headers 01472 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually 01473 // makes following errors: 01474 // 1- horizontal lines consisting of underlines are included in the table if 01475 // they are close enough 01476 // 2- horizontal lines originating from noise tend to get merged with a table 01477 // near the top of the page 01478 // 3- the criteria for including horizontal lines is very generous. Many times 01479 // horizontal lines separating headers and footers get merged with a 01480 // single-column table in a multi-column page thereby including text 01481 // from the neighboring column inside the table 01482 // 4- the criteria for including left out column headers also tends to 01483 // occasionally include text-lines above the tables, typically from 01484 // table caption 01485 void TableFinder::AdjustTableBoundaries() { 01486 // Iterate the table regions in the grid 01487 ColSegment_CLIST adjusted_tables; 01488 ColSegment_C_IT it(&adjusted_tables); 01489 ColSegmentGridSearch gsearch(&table_grid_); 01490 gsearch.StartFullSearch(); 01491 ColSegment* table = NULL; 01492 while ((table = gsearch.NextFullSearch()) != NULL) { 01493 const TBOX& table_box = table->bounding_box(); 01494 TBOX grown_box = table_box; 01495 GrowTableBox(table_box, &grown_box); 01496 // To prevent a table from expanding again, do not insert the 01497 // modified box back to the grid. Instead move it to a list and 01498 // and remove it from the grid. The list is moved later back to the grid. 01499 if (!grown_box.null_box()) { 01500 ColSegment* col = new ColSegment(); 01501 col->InsertBox(grown_box); 01502 it.add_after_then_move(col); 01503 } 01504 gsearch.RemoveBBox(); 01505 delete table; 01506 } 01507 // clear table grid to move final tables in it 01508 // TODO(nbeato): table_grid_ should already be empty. The above loop 01509 // removed everything. Maybe just assert it is empty? 01510 table_grid_.Clear(); 01511 it.move_to_first(); 01512 // move back final tables to table_grid_ 01513 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01514 ColSegment* seg = it.extract(); 01515 table_grid_.InsertBBox(true, true, seg); 01516 } 01517 } 01518 01519 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) { 01520 // TODO(nbeato): The growing code is a bit excessive right now. 01521 // By removing these lines, the partitions considered need 01522 // to have some overlap or be special cases. These lines could 01523 // be added again once a check is put in place to make sure that 01524 // growing tables don't stomp on a lot of non-table partitions. 01525 01526 // search for horizontal ruling lines within the vertical margin 01527 // int vertical_margin = kRulingVerticalMargin * gridsize(); 01528 TBOX search_box = table_box; 01529 // int top = MIN(search_box.top() + vertical_margin, tright().y()); 01530 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y()); 01531 // search_box.set_top(top); 01532 // search_box.set_bottom(bottom); 01533 01534 GrowTableToIncludePartials(table_box, search_box, result_box); 01535 GrowTableToIncludeLines(table_box, search_box, result_box); 01536 IncludeLeftOutColumnHeaders(result_box); 01537 } 01538 01539 // Grow a table by increasing the size of the box to include 01540 // partitions with significant overlap with the table. 01541 void TableFinder::GrowTableToIncludePartials(const TBOX& table_box, 01542 const TBOX& search_range, 01543 TBOX* result_box) { 01544 // Rulings are in a different grid, so search 2 grids for rulings, text, 01545 // and table partitions that are not entirely within the new box. 01546 for (int i = 0; i < 2; ++i) { 01547 ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ : 01548 &leader_and_ruling_grid_; 01549 ColPartitionGridSearch rectsearch(grid); 01550 rectsearch.StartRectSearch(search_range); 01551 ColPartition* part = NULL; 01552 while ((part = rectsearch.NextRectSearch()) != NULL) { 01553 // Only include text and table types. 01554 if (part->IsImageType()) 01555 continue; 01556 const TBOX& part_box = part->bounding_box(); 01557 // Include partition in the table if more than half of it 01558 // is covered by the table 01559 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { 01560 *result_box = result_box->bounding_union(part_box); 01561 continue; 01562 } 01563 } 01564 } 01565 } 01566 01567 // Grow a table by expanding to the extents of significantly 01568 // overlapping lines. 01569 void TableFinder::GrowTableToIncludeLines(const TBOX& table_box, 01570 const TBOX& search_range, 01571 TBOX* result_box) { 01572 ColPartitionGridSearch rsearch(&leader_and_ruling_grid_); 01573 rsearch.SetUniqueMode(true); 01574 rsearch.StartRectSearch(search_range); 01575 ColPartition* part = NULL; 01576 while ((part = rsearch.NextRectSearch()) != NULL) { 01577 // TODO(nbeato) This should also do vertical, but column 01578 // boundaries are breaking things. This function needs to be 01579 // updated to allow vertical lines as well. 01580 if (!part->IsLineType()) 01581 continue; 01582 // Avoid the following function call if the result of the 01583 // function is irrelevant. 01584 const TBOX& part_box = part->bounding_box(); 01585 if (result_box->contains(part_box)) 01586 continue; 01587 // Include a partially overlapping horizontal line only if the 01588 // extra ColPartitions that will be included due to expansion 01589 // have large side spacing w.r.t. columns containing them. 01590 if (HLineBelongsToTable(*part, table_box)) 01591 *result_box = result_box->bounding_union(part_box); 01592 // TODO(nbeato): Vertical 01593 } 01594 } 01595 01596 // Checks whether the horizontal line belong to the table by looking at the 01597 // side spacing of extra ColParitions that will be included in the table 01598 // due to expansion 01599 bool TableFinder::HLineBelongsToTable(const ColPartition& part, 01600 const TBOX& table_box) { 01601 if (!part.IsHorizontalLine()) 01602 return false; 01603 const TBOX& part_box = part.bounding_box(); 01604 if (!part_box.major_x_overlap(table_box)) 01605 return false; 01606 // Do not consider top-most horizontal line since it usually 01607 // originates from noise. 01608 // TODO(nbeato): I had to comment this out because the ruling grid doesn't 01609 // have neighbors solved. 01610 // if (!part.nearest_neighbor_above()) 01611 // return false; 01612 const TBOX bbox = part_box.bounding_union(table_box); 01613 // In the "unioned table" box (the table extents expanded by the line), 01614 // keep track of how many partitions have significant padding to the left 01615 // and right. If more than half of the partitions covered by the new table 01616 // have significant spacing, the line belongs to the table and the table 01617 // grows to include all of the partitions. 01618 int num_extra_partitions = 0; 01619 int extra_space_to_right = 0; 01620 int extra_space_to_left = 0; 01621 // Rulings are in a different grid, so search 2 grids for rulings, text, 01622 // and table partitions that are introduced by the new box. 01623 for (int i = 0; i < 2; ++i) { 01624 ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ : 01625 &leader_and_ruling_grid_; 01626 // Start a rect search on bbox 01627 ColPartitionGridSearch rectsearch(grid); 01628 rectsearch.SetUniqueMode(true); 01629 rectsearch.StartRectSearch(bbox); 01630 ColPartition* extra_part = NULL; 01631 while ((extra_part = rectsearch.NextRectSearch()) != NULL) { 01632 // ColPartition already in table 01633 const TBOX& extra_part_box = extra_part->bounding_box(); 01634 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) 01635 continue; 01636 // Non-text ColPartitions do not contribute 01637 if (extra_part->IsImageType()) 01638 continue; 01639 // Consider this partition. 01640 num_extra_partitions++; 01641 // presence of a table cell is a strong hint, so just increment the scores 01642 // without looking at the spacing. 01643 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) { 01644 extra_space_to_right++; 01645 extra_space_to_left++; 01646 continue; 01647 } 01648 int space_threshold = kSideSpaceMargin * part.median_size(); 01649 if (extra_part->space_to_right() > space_threshold) 01650 extra_space_to_right++; 01651 if (extra_part->space_to_left() > space_threshold) 01652 extra_space_to_left++; 01653 } 01654 } 01655 // tprintf("%d %d %d\n", 01656 // num_extra_partitions,extra_space_to_right,extra_space_to_left); 01657 return (extra_space_to_right > num_extra_partitions / 2) || 01658 (extra_space_to_left > num_extra_partitions / 2); 01659 } 01660 01661 // Look for isolated column headers above the given table box and 01662 // include them in the table 01663 void TableFinder::IncludeLeftOutColumnHeaders(TBOX* table_box) { 01664 // Start a search above the current table to look for column headers 01665 ColPartitionGridSearch vsearch(&clean_part_grid_); 01666 vsearch.StartVerticalSearch(table_box->left(), table_box->right(), 01667 table_box->top()); 01668 ColPartition* neighbor = NULL; 01669 ColPartition* previous_neighbor = NULL; 01670 while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) { 01671 // Max distance to find a table heading. 01672 const int max_distance = kMaxColumnHeaderDistance * 01673 neighbor->median_size(); 01674 int table_top = table_box->top(); 01675 const TBOX& box = neighbor->bounding_box(); 01676 // Do not continue if the next box is way above 01677 if (box.bottom() - table_top > max_distance) 01678 break; 01679 // Unconditionally include partitions of type TABLE or LINE 01680 // TODO(faisal): add some reasonable conditions here 01681 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) { 01682 table_box->set_top(box.top()); 01683 previous_neighbor = NULL; 01684 continue; 01685 } 01686 // If there are two text partitions, one above the other, without a table 01687 // cell on their left or right side, consider them a barrier and quit 01688 if (previous_neighbor == NULL) { 01689 previous_neighbor = neighbor; 01690 } else { 01691 const TBOX& previous_box = previous_neighbor->bounding_box(); 01692 if (!box.major_y_overlap(previous_box)) 01693 break; 01694 } 01695 } 01696 } 01697 01698 // Remove false alarms consiting of a single column based on their 01699 // projection on the x-axis. Projection of a real table on the x-axis 01700 // should have at least one zero-valley larger than the global median 01701 // x-height of the page. 01702 void TableFinder::DeleteSingleColumnTables() { 01703 int page_width = tright().x() - bleft().x(); 01704 ASSERT_HOST(page_width > 0); 01705 // create an integer array to hold projection on x-axis 01706 int* table_xprojection = new int[page_width]; 01707 // Iterate through all tables in the table grid 01708 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01709 table_search(&table_grid_); 01710 table_search.StartFullSearch(); 01711 ColSegment* table; 01712 while ((table = table_search.NextFullSearch()) != NULL) { 01713 TBOX table_box = table->bounding_box(); 01714 // reset the projection array 01715 for (int i = 0; i < page_width; i++) { 01716 table_xprojection[i] = 0; 01717 } 01718 // Start a rect search on table_box 01719 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01720 rectsearch(&clean_part_grid_); 01721 rectsearch.SetUniqueMode(true); 01722 rectsearch.StartRectSearch(table_box); 01723 ColPartition* part; 01724 while ((part = rectsearch.NextRectSearch()) != NULL) { 01725 if (!part->IsTextType()) 01726 continue; // Do not consider non-text partitions 01727 if (part->flow() == BTFT_LEADER) 01728 continue; // Assume leaders are in tables 01729 TBOX part_box = part->bounding_box(); 01730 // Do not consider partitions partially covered by the table 01731 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) 01732 continue; 01733 BLOBNBOX_CLIST* part_boxes = part->boxes(); 01734 BLOBNBOX_C_IT pit(part_boxes); 01735 01736 // Make sure overlapping blobs don't artificially inflate the number 01737 // of rows in the table. This happens frequently with things such as 01738 // decimals and split characters. Do this by assuming the column 01739 // partition is sorted mostly left to right and just clip 01740 // bounding boxes by the previous box's extent. 01741 int next_position_to_write = 0; 01742 01743 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { 01744 BLOBNBOX *pblob = pit.data(); 01745 // ignore blob height for the purpose of projection since we 01746 // are only interested in finding valleys 01747 int xstart = pblob->bounding_box().left(); 01748 int xend = pblob->bounding_box().right(); 01749 01750 xstart = MAX(xstart, next_position_to_write); 01751 for (int i = xstart; i < xend; i++) 01752 table_xprojection[i - bleft().x()]++; 01753 next_position_to_write = xend; 01754 } 01755 } 01756 // Find largest valley between two reasonable peaks in the table 01757 if (!GapInXProjection(table_xprojection, page_width)) { 01758 table_search.RemoveBBox(); 01759 delete table; 01760 } 01761 } 01762 delete[] table_xprojection; 01763 } 01764 01765 // Return true if at least one gap larger than the global x-height 01766 // exists in the horizontal projection 01767 bool TableFinder::GapInXProjection(int* xprojection, int length) { 01768 // Find peak value of the histogram 01769 int peak_value = 0; 01770 for (int i = 0; i < length; i++) { 01771 if (xprojection[i] > peak_value) { 01772 peak_value = xprojection[i]; 01773 } 01774 } 01775 // Peak value represents the maximum number of horizontally 01776 // overlapping colpartitions, so this can be considered as the 01777 // number of rows in the table 01778 if (peak_value < kMinRowsInTable) 01779 return false; 01780 double projection_threshold = kSmallTableProjectionThreshold * peak_value; 01781 if (peak_value >= kLargeTableRowCount) 01782 projection_threshold = kLargeTableProjectionThreshold * peak_value; 01783 // Threshold the histogram 01784 for (int i = 0; i < length; i++) { 01785 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0; 01786 } 01787 // Find the largest run of zeros between two ones 01788 int largest_gap = 0; 01789 int run_start = -1; 01790 for (int i = 1; i < length; i++) { 01791 // detect start of a run of zeros 01792 if (xprojection[i - 1] && !xprojection[i]) { 01793 run_start = i; 01794 } 01795 // detect end of a run of zeros and update the value of largest gap 01796 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) { 01797 int gap = i - run_start; 01798 if (gap > largest_gap) 01799 largest_gap = gap; 01800 run_start = -1; 01801 } 01802 } 01803 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_; 01804 } 01805 01806 // Given the location of a table "guess", try to overlay a cellular 01807 // grid in the location, adjusting the boundaries. 01808 // TODO(nbeato): Falsely introduces: 01809 // -headers/footers (not any worse, too much overlap destroys cells) 01810 // -page numbers (not worse, included because maximize margins) 01811 // -equations (nicely fit into a celluar grid, but more sparsely) 01812 // -figures (random text box, also sparse) 01813 // -small left-aligned text areas with overlapping positioned whitespace 01814 // (rejected before) 01815 // Overall, this just needs some more work. 01816 void TableFinder::RecognizeTables() { 01817 ScrollView* table_win = NULL; 01818 if (textord_show_tables) { 01819 table_win = MakeWindow(0, 0, "Table Structure"); 01820 DisplayColPartitions(table_win, &fragmented_text_grid_, 01821 ScrollView::BLUE, ScrollView::LIGHT_BLUE); 01822 // table_grid_.DisplayBoxes(table_win); 01823 } 01824 01825 01826 TableRecognizer recognizer; 01827 recognizer.Init(); 01828 recognizer.set_line_grid(&leader_and_ruling_grid_); 01829 recognizer.set_text_grid(&fragmented_text_grid_); 01830 recognizer.set_max_text_height(global_median_xheight_ * 2.0); 01831 recognizer.set_min_height(1.5 * gridheight()); 01832 // Loop over all of the tables and try to fit them. 01833 // Store the good tables here. 01834 ColSegment_CLIST good_tables; 01835 ColSegment_C_IT good_it(&good_tables); 01836 01837 ColSegmentGridSearch gsearch(&table_grid_); 01838 gsearch.StartFullSearch(); 01839 ColSegment* found_table = NULL; 01840 while ((found_table = gsearch.NextFullSearch()) != NULL) { 01841 gsearch.RemoveBBox(); 01842 01843 // The goal is to make the tables persistent in a list. 01844 // When that happens, this will move into the search loop. 01845 const TBOX& found_box = found_table->bounding_box(); 01846 StructuredTable* table_structure = recognizer.RecognizeTable(found_box); 01847 01848 // Process a table. Good tables are inserted into the grid again later on 01849 // We can't change boxes in the grid while it is running a search. 01850 if (table_structure != NULL) { 01851 if (textord_show_tables) { 01852 table_structure->Display(table_win, ScrollView::LIME_GREEN); 01853 } 01854 found_table->set_bounding_box(table_structure->bounding_box()); 01855 delete table_structure; 01856 good_it.add_after_then_move(found_table); 01857 } else { 01858 delete found_table; 01859 } 01860 } 01861 // TODO(nbeato): MERGE!! There is awesome info now available for merging. 01862 01863 // At this point, the grid is empty. We can safely insert the good tables 01864 // back into grid. 01865 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) 01866 table_grid_.InsertBBox(true, true, good_it.extract()); 01867 } 01868 01869 // Displays the column segments in some window. 01870 void TableFinder::DisplayColSegments(ScrollView* win, 01871 ColSegment_LIST *segments, 01872 ScrollView::Color color) { 01873 #ifndef GRAPHICS_DISABLED 01874 win->Pen(color); 01875 win->Brush(ScrollView::NONE); 01876 ColSegment_IT it(segments); 01877 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01878 ColSegment* col = it.data(); 01879 const TBOX& box = col->bounding_box(); 01880 int left_x = box.left(); 01881 int right_x = box.right(); 01882 int top_y = box.top(); 01883 int bottom_y = box.bottom(); 01884 win->Rectangle(left_x, bottom_y, right_x, top_y); 01885 } 01886 win->UpdateWindow(); 01887 #endif 01888 } 01889 01890 void TableFinder::DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid, 01891 ScrollView::Color color) { 01892 #ifndef GRAPHICS_DISABLED 01893 // Iterate the ColPartitions in the grid. 01894 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01895 gsearch(grid); 01896 gsearch.StartFullSearch(); 01897 ColSegment* seg = NULL; 01898 while ((seg = gsearch.NextFullSearch()) != NULL) { 01899 const TBOX& box = seg->bounding_box(); 01900 int left_x = box.left(); 01901 int right_x = box.right(); 01902 int top_y = box.top(); 01903 int bottom_y = box.bottom(); 01904 win->Brush(ScrollView::NONE); 01905 win->Pen(color); 01906 win->Rectangle(left_x, bottom_y, right_x, top_y); 01907 } 01908 win->UpdateWindow(); 01909 #endif 01910 } 01911 01912 // Displays the colpartitions using a new coloring on an existing window. 01913 // Note: This method is only for debug purpose during development and 01914 // would not be part of checked in code 01915 void TableFinder::DisplayColPartitions(ScrollView* win, 01916 ColPartitionGrid* grid, 01917 ScrollView::Color default_color, 01918 ScrollView::Color table_color) { 01919 #ifndef GRAPHICS_DISABLED 01920 ScrollView::Color color = default_color; 01921 // Iterate the ColPartitions in the grid. 01922 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01923 gsearch(grid); 01924 gsearch.StartFullSearch(); 01925 ColPartition* part = NULL; 01926 while ((part = gsearch.NextFullSearch()) != NULL) { 01927 color = default_color; 01928 if (part->type() == PT_TABLE) 01929 color = table_color; 01930 01931 const TBOX& box = part->bounding_box(); 01932 int left_x = box.left(); 01933 int right_x = box.right(); 01934 int top_y = box.top(); 01935 int bottom_y = box.bottom(); 01936 win->Brush(ScrollView::NONE); 01937 win->Pen(color); 01938 win->Rectangle(left_x, bottom_y, right_x, top_y); 01939 } 01940 win->UpdateWindow(); 01941 #endif 01942 } 01943 void TableFinder::DisplayColPartitions(ScrollView* win, 01944 ColPartitionGrid* grid, 01945 ScrollView::Color default_color) { 01946 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW); 01947 } 01948 01949 void TableFinder::DisplayColPartitionConnections( 01950 ScrollView* win, 01951 ColPartitionGrid* grid, 01952 ScrollView::Color color) { 01953 #ifndef GRAPHICS_DISABLED 01954 // Iterate the ColPartitions in the grid. 01955 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01956 gsearch(grid); 01957 gsearch.StartFullSearch(); 01958 ColPartition* part = NULL; 01959 while ((part = gsearch.NextFullSearch()) != NULL) { 01960 const TBOX& box = part->bounding_box(); 01961 int left_x = box.left(); 01962 int right_x = box.right(); 01963 int top_y = box.top(); 01964 int bottom_y = box.bottom(); 01965 01966 ColPartition* upper_part = part->nearest_neighbor_above(); 01967 if (upper_part) { 01968 TBOX upper_box = upper_part->bounding_box(); 01969 int mid_x = (left_x + right_x) / 2; 01970 int mid_y = (top_y + bottom_y) / 2; 01971 int other_x = (upper_box.left() + upper_box.right()) / 2; 01972 int other_y = (upper_box.top() + upper_box.bottom()) / 2; 01973 win->Brush(ScrollView::NONE); 01974 win->Pen(color); 01975 win->Line(mid_x, mid_y, other_x, other_y); 01976 } 01977 ColPartition* lower_part = part->nearest_neighbor_below(); 01978 if (lower_part) { 01979 TBOX lower_box = lower_part->bounding_box(); 01980 int mid_x = (left_x + right_x) / 2; 01981 int mid_y = (top_y + bottom_y) / 2; 01982 int other_x = (lower_box.left() + lower_box.right()) / 2; 01983 int other_y = (lower_box.top() + lower_box.bottom()) / 2; 01984 win->Brush(ScrollView::NONE); 01985 win->Pen(color); 01986 win->Line(mid_x, mid_y, other_x, other_y); 01987 } 01988 } 01989 win->UpdateWindow(); 01990 #endif 01991 } 01992 01993 01994 // Write debug image and text file. 01995 // Note: This method is only for debug purpose during development and 01996 // would not be part of checked in code 01997 void TableFinder::WriteToPix(const FCOORD& reskew) { 01998 // Input file must be named test1.tif 01999 PIX* pix = pixRead("test1.tif"); 02000 if (!pix) { 02001 tprintf("Input file test1.tif not found.\n"); 02002 return; 02003 } 02004 int img_height = pixGetHeight(pix); 02005 int img_width = pixGetWidth(pix); 02006 // Maximum number of text or table partitions 02007 int num_boxes = 10; 02008 BOXA* text_box_array = boxaCreate(num_boxes); 02009 BOXA* table_box_array = boxaCreate(num_boxes); 02010 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02011 gsearch(&clean_part_grid_); 02012 gsearch.StartFullSearch(); 02013 ColPartition* part; 02014 // load colpartitions into text_box_array and table_box_array 02015 while ((part = gsearch.NextFullSearch()) != NULL) { 02016 TBOX box = part->bounding_box(); 02017 box.rotate_large(reskew); 02018 BOX* lept_box = boxCreate(box.left(), img_height - box.top(), 02019 box.right() - box.left(), 02020 box.top() - box.bottom()); 02021 if (part->type() == PT_TABLE) 02022 boxaAddBox(table_box_array, lept_box, L_INSERT); 02023 else 02024 boxaAddBox(text_box_array, lept_box, L_INSERT); 02025 } 02026 // draw colpartitions on the output image 02027 PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000); 02028 out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00); 02029 02030 BOXA* table_array = boxaCreate(num_boxes); 02031 // text file containing detected table bounding boxes 02032 FILE* fptr = fopen("tess-table.txt", "wb"); 02033 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 02034 table_search(&table_grid_); 02035 table_search.StartFullSearch(); 02036 ColSegment* table; 02037 // load table boxes to table_array and write them to text file as well 02038 while ((table = table_search.NextFullSearch()) != NULL) { 02039 TBOX box = table->bounding_box(); 02040 box.rotate_large(reskew); 02041 // Since deskewing introduces negative coordinates, reskewing 02042 // might not completely recover from that since both steps enlarge 02043 // the actual box. Hence a box that undergoes deskewing/reskewing 02044 // may go out of image boundaries. Crop a table box if needed to 02045 // contain it inside the image dimensions. 02046 box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1)); 02047 BOX* lept_box = boxCreate(box.left(), img_height - box.top(), 02048 box.right() - box.left(), 02049 box.top() - box.bottom()); 02050 boxaAddBox(table_array, lept_box, L_INSERT); 02051 fprintf(fptr, "%d %d %d %d TABLE\n", box.left(), 02052 img_height - box.top(), box.right(), img_height - box.bottom()); 02053 } 02054 fclose(fptr); 02055 // paint table boxes on the debug image 02056 out = pixDrawBoxa(out, table_array, 5, 0x7fff0000); 02057 02058 pixWrite("out.png", out, IFF_PNG); 02059 // memory cleanup 02060 boxaDestroy(&text_box_array); 02061 boxaDestroy(&table_box_array); 02062 boxaDestroy(&table_array); 02063 pixDestroy(&pix); 02064 pixDestroy(&out); 02065 } 02066 02067 // Merge all colpartitions in table regions to make them a single 02068 // colpartition and revert types of isolated table cells not 02069 // assigned to any table to their original types. 02070 void TableFinder::MakeTableBlocks(ColPartitionGrid* grid, 02071 ColPartitionSet** all_columns, 02072 WidthCallback* width_cb) { 02073 // Since we have table blocks already, remove table tags from all 02074 // colpartitions 02075 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02076 gsearch(grid); 02077 gsearch.StartFullSearch(); 02078 ColPartition* part = NULL; 02079 02080 while ((part = gsearch.NextFullSearch()) != NULL) { 02081 if (part->type() == PT_TABLE) { 02082 part->clear_table_type(); 02083 } 02084 } 02085 // Now make a single colpartition out of each table block and remove 02086 // all colpartitions contained within a table 02087 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 02088 table_search(&table_grid_); 02089 table_search.StartFullSearch(); 02090 ColSegment* table; 02091 while ((table = table_search.NextFullSearch()) != NULL) { 02092 TBOX table_box = table->bounding_box(); 02093 // Start a rect search on table_box 02094 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02095 rectsearch(grid); 02096 rectsearch.StartRectSearch(table_box); 02097 ColPartition* part; 02098 ColPartition* table_partition = NULL; 02099 while ((part = rectsearch.NextRectSearch()) != NULL) { 02100 // Do not consider image partitions 02101 if (!part->IsTextType()) 02102 continue; 02103 TBOX part_box = part->bounding_box(); 02104 // Include partition in the table if more than half of it 02105 // is covered by the table 02106 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { 02107 rectsearch.RemoveBBox(); 02108 if (table_partition) { 02109 table_partition->Absorb(part, width_cb); 02110 } else { 02111 table_partition = part; 02112 } 02113 } 02114 } 02115 // Insert table colpartition back to part_grid_ 02116 if (table_partition) { 02117 // To match the columns used when transforming to blocks, the new table 02118 // partition must have its first and last column set at the grid y that 02119 // corresponds to its bottom. 02120 const TBOX& table_box = table_partition->bounding_box(); 02121 int grid_x, grid_y; 02122 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y); 02123 table_partition->SetPartitionType(resolution_, all_columns[grid_y]); 02124 table_partition->set_table_type(); 02125 table_partition->set_blob_type(BRT_TEXT); 02126 table_partition->set_flow(BTFT_CHAIN); 02127 table_partition->SetBlobTypes(); 02128 grid->InsertBBox(true, true, table_partition); 02129 } 02130 } 02131 } 02132 02135 ColSegment::ColSegment() 02136 : ELIST_LINK(), 02137 num_table_cells_(0), 02138 num_text_cells_(0), 02139 type_(COL_UNKNOWN) { 02140 } 02141 ColSegment::~ColSegment() { 02142 } 02143 02144 // Provides a color for BBGrid to draw the rectangle. 02145 ScrollView::Color ColSegment::BoxColor() const { 02146 const ScrollView::Color kBoxColors[PT_COUNT] = { 02147 ScrollView::YELLOW, 02148 ScrollView::BLUE, 02149 ScrollView::YELLOW, 02150 ScrollView::MAGENTA, 02151 }; 02152 return kBoxColors[type_]; 02153 } 02154 02155 // Insert a box into this column segment 02156 void ColSegment::InsertBox(const TBOX& other) { 02157 bounding_box_ = bounding_box_.bounding_union(other); 02158 } 02159 02160 // Set column segment type based on the ratio of text and table partitions 02161 // in it. 02162 void ColSegment::set_type() { 02163 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) 02164 type_ = COL_TABLE; 02165 else if (num_text_cells_ > num_table_cells_) 02166 type_ = COL_TEXT; 02167 else 02168 type_ = COL_MIXED; 02169 } 02170 02171 } // namespace tesseract.