Tesseract
3.02
|
00001 00002 // File: tablerecog.h 00003 // Description: Functions to detect structure of tables. 00004 // Author: Nicholas Beato 00005 // Created: Aug 17, 2010 00006 // 00007 // (C) Copyright 2010, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TABLERECOG_H_ 00021 #define TABLERECOG_H_ 00022 00023 #include "colpartitiongrid.h" 00024 #include "genericvector.h" 00025 00026 namespace tesseract { 00027 00028 // There are 2 classes in this file. They have 2 different purposes. 00029 // - StructuredTable contains the methods to find the structure given 00030 // a specific bounding box and grow that structure. 00031 // - TableRecognizer contains the methods to adjust the possible positions 00032 // of a table without worrying about structure. 00033 // 00034 // To use these classes, the assumption is that the TableFinder will 00035 // have a guess of the location of a table (or possibly over/undersegmented 00036 // tables). The TableRecognizer is responsible for finding the table boundaries 00037 // at a high level. The StructuredTable class is responsible for determining 00038 // the structure of the table and trying to maximize its bounds while retaining 00039 // the structure. 00040 // (The latter part is not implemented yet, but that was the goal). 00041 // 00042 // While on the boundary discussion, keep in mind that this is a first pass. 00043 // There should eventually be some things like internal structure checks, 00044 // and, more importantly, surrounding text flow checks. 00045 // 00046 00047 // Usage: 00048 // The StructuredTable class contains methods to query a potential table. 00049 // It has functions to find structure, count rows, find ColPartitions that 00050 // intersect gridlines, etc. It is not meant to blindly find a table. It 00051 // is meant to start with a known table location and enhance it. 00052 // Usage: 00053 // ColPartitionGrid text_grid, line_grid; // init 00054 // TBOX table_box; // known location of table location 00055 // 00056 // StructuredTable table; 00057 // table.Init(); // construction code 00058 // table.set_text_grid(/* text */); // These 2 grids can be the same! 00059 // table.set_line_grid(/* lines */); 00060 // table.set_min_text_height(10); // Filter vertical and tall text. 00061 // // IMPORTANT! The table needs to be told where it is! 00062 // table.set_bounding_box(table_box); // Set initial table location. 00063 // if (table.FindWhitespacedStructure()) { 00064 // // process table 00065 // table.column_count(); // number of columns 00066 // table.row_count(); // number of rows 00067 // table.cells_count(); // number of cells 00068 // table.bounding_box(); // updated bounding box 00069 // // etc. 00070 // } 00071 // 00072 class StructuredTable { 00073 public: 00074 StructuredTable(); 00075 ~StructuredTable(); 00076 00077 // Initialization code. Must be called after the constructor. 00078 void Init(); 00079 00080 // Sets the grids used by the table. These can be changed between 00081 // calls to Recognize. They are treated as read-only data. 00082 void set_text_grid(ColPartitionGrid* text); 00083 void set_line_grid(ColPartitionGrid* lines); 00084 // Filters text partitions that are ridiculously tall to prevent 00085 // merging rows. 00086 void set_max_text_height(int height); 00087 00088 // Basic accessors. Some are treated as attributes despite having indirect 00089 // representation. 00090 bool is_lined() const; 00091 int row_count() const; 00092 int column_count() const; 00093 int cell_count() const; 00094 void set_bounding_box(const TBOX& box); 00095 const TBOX& bounding_box() const; 00096 int median_cell_height(); 00097 int median_cell_width(); 00098 int row_height(int row) const; 00099 int column_width(int column) const; 00100 int space_above() const; 00101 int space_below() const; 00102 00103 // Given enough horizontal and vertical lines in a region, create this table 00104 // based on the structure given by the lines. Return true if it worked out. 00105 // Code assumes the lines exist. It is the caller's responsibility to check 00106 // for lines and find an appropriate bounding box. 00107 bool FindLinedStructure(); 00108 00109 // The main subroutine for finding generic table structure. The function 00110 // finds the grid structure in the given box. Returns true if a good grid 00111 // exists, implying that "this" table is valid. 00112 bool FindWhitespacedStructure(); 00113 00117 00118 // Returns true if inserting part into the table does not cause any 00119 // cell merges. 00120 bool DoesPartitionFit(const ColPartition& part) const; 00121 // Checks if a sub-table has multiple data cells filled. 00122 int CountFilledCells(); 00123 int CountFilledCellsInRow(int row); 00124 int CountFilledCellsInColumn(int column); 00125 int CountFilledCells(int row_start, int row_end, 00126 int column_start, int column_end); 00127 00128 // Makes sure that at least one cell in a row has substantial area filled. 00129 // This can filter out large whitespace caused by growing tables too far 00130 // and page numbers. 00131 // (currently bugged for some reason). 00132 bool VerifyRowFilled(int row); 00133 // Finds the filled area in a cell. 00134 double CalculateCellFilledPercentage(int row, int column); 00135 00136 // Debug display, draws the table in the given color. If the table is not 00137 // valid, the table and "best" grid lines are still drawn in the given color. 00138 void Display(ScrollView* window, ScrollView::Color color); 00139 00140 protected: 00141 // Clear the structure information. 00142 void ClearStructure(); 00143 00147 00148 // Verifies the lines do not intersect partitions. This happens when 00149 // the lines are in column boundaries and extend the full page. As a result, 00150 // the grid lines go through column text. The condition is detectable. 00151 bool VerifyLinedTableCells(); 00152 00156 00157 // This is the function to change if you want to filter resulting tables 00158 // better. Right now it just checks for a minimum cell count and such. 00159 // You could add things like maximum number of ColPartitions per cell or 00160 // similar. 00161 bool VerifyWhitespacedTable(); 00162 // Find the columns of a table using whitespace. 00163 void FindWhitespacedColumns(); 00164 // Find the rows of a table using whitespace. 00165 void FindWhitespacedRows(); 00166 00170 00171 // Calculates the whitespace around the table using the table boundary and 00172 // the supplied grids (set_text_grid and set_line_grid). 00173 void CalculateMargins(); 00174 // Update the table margins with the supplied grid. This is 00175 // only called by calculate margins to use multiple grid sources. 00176 void UpdateMargins(ColPartitionGrid* grid); 00177 int FindVerticalMargin(ColPartitionGrid* grid, int start_x, 00178 bool decrease) const; 00179 int FindHorizontalMargin(ColPartitionGrid* grid, int start_y, 00180 bool decrease) const; 00181 // Calculates stats on the table, namely the median cell height and width. 00182 void CalculateStats(); 00183 00187 00188 // Given a whitespaced table, this looks for bordering lines that might 00189 // be page layout boxes around the table. It is necessary to get the margins 00190 // correct on the table. If the lines are not joined, the margins will be 00191 // the distance to the line, which is not right. 00192 void AbsorbNearbyLines(); 00193 00194 // Nice utility function for finding partition gaps. You feed it a sorted 00195 // list of all of the mins/maxes of the partitions in the table, and it gives 00196 // you the gaps (middle). This works for both vertical and horizontal 00197 // gaps. 00198 // 00199 // If you want to allow slight overlap in the division and the partitions, 00200 // just scale down the partitions before inserting them in the list. 00201 // Likewise, you can force at least some space between partitions. 00202 // This trick is how the horizontal partitions are done (since the page 00203 // skew could make it hard to find splits in the text). 00204 // 00205 // As a result, "0 distance" between closest partitions causes a gap. 00206 // This is not a programmatic assumption. It is intentional and simplifies 00207 // things. 00208 // 00209 // "max_merged" indicates both the minimum number of stacked partitions 00210 // to cause a cell (add 1 to it), and the maximum number of partitions that 00211 // a grid line can intersect. For example, if max_merged is 0, then lines 00212 // are inserted wherever space exists between partitions. If it is 2, 00213 // lines may intersect 2 partitions at most, but you also need at least 00214 // 2 partitions to generate a line. 00215 static void FindCellSplitLocations(const GenericVector<int>& min_list, 00216 const GenericVector<int>& max_list, 00217 int max_merged, 00218 GenericVector<int>* locations); 00219 00223 00224 // Counts the number of ColPartitions that intersect vertical cell 00225 // division at this x value. Used by VerifyLinedTable. 00226 int CountVerticalIntersections(int x); 00227 int CountHorizontalIntersections(int y); 00228 00229 // Counts how many text partitions are in this box. 00230 int CountPartitions(const TBOX& box); 00231 00235 00236 // Input data, used as read only data to make decisions. 00237 ColPartitionGrid* text_grid_; // Text ColPartitions 00238 ColPartitionGrid* line_grid_; // Line ColPartitions 00239 // Table structure. 00240 // bounding box is a convenient external representation. 00241 // cell_x_ and cell_y_ indicate the grid lines. 00242 TBOX bounding_box_; // Bounding box 00243 GenericVectorEqEq<int> cell_x_; // Locations of vertical divisions (sorted) 00244 GenericVectorEqEq<int> cell_y_; // Locations of horizontal divisions (sorted) 00245 bool is_lined_; // Is the table backed up by a line structure 00246 // Table margins, set via CalculateMargins 00247 int space_above_; 00248 int space_below_; 00249 int space_left_; 00250 int space_right_; 00251 int median_cell_height_; 00252 int median_cell_width_; 00253 // Filters, used to prevent awkward partitions from destroying structure. 00254 int max_text_height_; 00255 }; 00256 00257 class TableRecognizer { 00258 public: 00259 TableRecognizer(); 00260 ~TableRecognizer(); 00261 00262 // Initialization code. Must be called after the constructor. 00263 void Init(); 00264 00268 00269 // Sets the grids used by the table. These can be changed between 00270 // calls to Recognize. They are treated as read-only data. 00271 void set_text_grid(ColPartitionGrid* text); 00272 void set_line_grid(ColPartitionGrid* lines); 00273 // Sets some additional constraints on the table. 00274 void set_min_height(int height); 00275 void set_min_width(int width); 00276 // Filters text partitions that are ridiculously tall to prevent 00277 // merging rows. Note that "filters" refers to allowing horizontal 00278 // cells to slice through them on the premise that they were 00279 // merged text rows during previous layout. 00280 void set_max_text_height(int height); 00281 00282 // Given a guess location, the RecognizeTable function will try to find a 00283 // structured grid in the area. On success, it will return a new 00284 // StructuredTable (and assumes you will delete it). Otherwise, 00285 // NULL is returned. 00286 // 00287 // Keep in mind, this may "overgrow" or "undergrow" the size of guess. 00288 // Ideally, there is a either a one-to-one correspondence between 00289 // the guess and table or no table at all. This is not the best of 00290 // assumptions right now, but was made to try to keep things simple in 00291 // the first pass. 00292 // 00293 // If a line structure is available on the page in the given region, 00294 // the table will use the linear structure as it is. 00295 // Otherwise, it will try to maximize the whitespace around it while keeping 00296 // a grid structure. This is somewhat working. 00297 // 00298 // Since the combination of adjustments can get high, effort was 00299 // originally made to keep the number of adjustments linear in the number 00300 // of partitions. The underlying structure finding code used to be 00301 // much more complex. I don't know how necessary this constraint is anymore. 00302 // The evaluation of a possible table is kept within O(nlogn) in the size of 00303 // the table (where size is the number of partitions in the table). 00304 // As a result, the algorithm is capable of O(n^2 log n). Depending 00305 // on the grid search size, it may be higher. 00306 // 00307 // Last note: it is possible to just try all partition boundaries at a high 00308 // level O(n^4) and do a verification scheme (at least O(nlogn)). If there 00309 // area 200 partitions on a page, this could be too costly. Effort could go 00310 // into pruning the search, but I opted for something quicker. I'm confident 00311 // that the independent adjustments can get similar results and keep the 00312 // complextiy down. However, the other approach could work without using 00313 // TableFinder at all if it is fast enough. It comes down to properly 00314 // deciding what is a table. The code currently relies on TableFinder's 00315 // guess to the location of a table for that. 00316 StructuredTable* RecognizeTable(const TBOX& guess_box); 00317 00318 protected: 00322 00323 // Returns true if the given box has a lined table within it. The 00324 // table argument will be updated with the table if the table exists. 00325 bool RecognizeLinedTable(const TBOX& guess_box, StructuredTable* table); 00326 // Returns true if the given box has a large number of horizontal and 00327 // vertical lines present. If so, we assume the extent of these lines 00328 // uniquely defines a table and find that table via SolveLinedTable. 00329 bool HasSignificantLines(const TBOX& guess); 00330 00331 // Given enough horizontal and vertical lines in a region, find a bounding 00332 // box that encloses all of them (as well as newly introduced lines). 00333 // The bounding box is the smallest box that encloses the lines in guess 00334 // without having any lines sticking out of it. 00335 // bounding_box is an in/out parameter. 00336 // On input, it in the extents of the box to search. 00337 // On output, it is the resulting bounding box. 00338 bool FindLinesBoundingBox(TBOX* bounding_box); 00339 // Iteration in above search. 00340 // bounding_box is an in/out parameter. 00341 // On input, it in the extents of the box to search. 00342 // On output, it is the resulting bounding box. 00343 bool FindLinesBoundingBoxIteration(TBOX* bounding_box); 00344 00348 00349 // Returns true if the given box has a whitespaced table within it. The 00350 // table argument will be updated if the table exists. Also note 00351 // that this method will fail if the guess_box center is not 00352 // mostly within the table. 00353 bool RecognizeWhitespacedTable(const TBOX& guess_box, StructuredTable* table); 00354 00355 // Finds the location of a horizontal split relative to y. 00356 // This function is mostly unused now. If the SolveWhitespacedTable 00357 // changes much, it can be removed. Note, it isn't really as reliable 00358 // as I thought. I went with alternatives for most of the other uses. 00359 int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom); 00360 00361 // Indicates that a table row is weak. This means that it has 00362 // many missing data cells or very large cell heights compared. 00363 // to the rest of the table. 00364 static bool IsWeakTableRow(StructuredTable* table, int row); 00365 00366 // Input data, used as read only data to make decisions. 00367 ColPartitionGrid* text_grid_; // Text ColPartitions 00368 ColPartitionGrid* line_grid_; // Line ColPartitions 00369 // Table constraints, a "good" table must satisfy these. 00370 int min_height_; 00371 int min_width_; 00372 // Filters, used to prevent awkward partitions from destroying structure. 00373 int max_text_height_; // Horizontal lines may intersect taller text. 00374 }; 00375 00376 } // namespace tesseract 00377 00378 #endif /* TABLERECOG_H_ */