Tesseract  3.02
tesseract-ocr/textord/tablerecog.h
Go to the documentation of this file.
00001 
00002 // File:        tablerecog.h
00003 // Description: Functions to detect structure of tables.
00004 // Author:    Nicholas Beato
00005 // Created:   Aug 17, 2010
00006 //
00007 // (C) Copyright 2010, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TABLERECOG_H_
00021 #define TABLERECOG_H_
00022 
00023 #include "colpartitiongrid.h"
00024 #include "genericvector.h"
00025 
00026 namespace tesseract {
00027 
00028 // There are 2 classes in this file. They have 2 different purposes.
00029 //  - StructuredTable contains the methods to find the structure given
00030 //    a specific bounding box and grow that structure.
00031 //  - TableRecognizer contains the methods to adjust the possible positions
00032 //    of a table without worrying about structure.
00033 //
00034 // To use these classes, the assumption is that the TableFinder will
00035 // have a guess of the location of a table (or possibly over/undersegmented
00036 // tables). The TableRecognizer is responsible for finding the table boundaries
00037 // at a high level. The StructuredTable class is responsible for determining
00038 // the structure of the table and trying to maximize its bounds while retaining
00039 // the structure.
00040 // (The latter part is not implemented yet, but that was the goal).
00041 //
00042 // While on the boundary discussion, keep in mind that this is a first pass.
00043 // There should eventually be some things like internal structure checks,
00044 // and, more importantly, surrounding text flow checks.
00045 //
00046 
00047 // Usage:
00048 // The StructuredTable class contains methods to query a potential table.
00049 // It has functions to find structure, count rows, find ColPartitions that
00050 // intersect gridlines, etc. It is not meant to blindly find a table. It
00051 // is meant to start with a known table location and enhance it.
00052 // Usage:
00053 //    ColPartitionGrid text_grid, line_grid;  // init
00054 //    TBOX table_box;  // known location of table location
00055 //
00056 //    StructuredTable table;
00057 //    table.Init();  // construction code
00058 //    table.set_text_grid(/* text */);  // These 2 grids can be the same!
00059 //    table.set_line_grid(/* lines */);
00060 //    table.set_min_text_height(10);    // Filter vertical and tall text.
00061 //    // IMPORTANT! The table needs to be told where it is!
00062 //    table.set_bounding_box(table_box);  // Set initial table location.
00063 //    if (table.FindWhitespacedStructure()) {
00064 //      // process table
00065 //      table.column_count();  // number of columns
00066 //      table.row_count();     // number of rows
00067 //      table.cells_count();   // number of cells
00068 //      table.bounding_box();  // updated bounding box
00069 //      // etc.
00070 //    }
00071 //
00072 class StructuredTable {
00073  public:
00074   StructuredTable();
00075   ~StructuredTable();
00076 
00077   // Initialization code. Must be called after the constructor.
00078   void Init();
00079 
00080   // Sets the grids used by the table. These can be changed between
00081   // calls to Recognize. They are treated as read-only data.
00082   void set_text_grid(ColPartitionGrid* text);
00083   void set_line_grid(ColPartitionGrid* lines);
00084   // Filters text partitions that are ridiculously tall to prevent
00085   // merging rows.
00086   void set_max_text_height(int height);
00087 
00088   // Basic accessors. Some are treated as attributes despite having indirect
00089   // representation.
00090   bool is_lined() const;
00091   int row_count() const;
00092   int column_count() const;
00093   int cell_count() const;
00094   void set_bounding_box(const TBOX& box);
00095   const TBOX& bounding_box() const;
00096   int median_cell_height();
00097   int median_cell_width();
00098   int row_height(int row) const;
00099   int column_width(int column) const;
00100   int space_above() const;
00101   int space_below() const;
00102 
00103   // Given enough horizontal and vertical lines in a region, create this table
00104   // based on the structure given by the lines. Return true if it worked out.
00105   // Code assumes the lines exist. It is the caller's responsibility to check
00106   // for lines and find an appropriate bounding box.
00107   bool FindLinedStructure();
00108 
00109   // The main subroutine for finding generic table structure. The function
00110   // finds the grid structure in the given box. Returns true if a good grid
00111   // exists, implying that "this" table is valid.
00112   bool FindWhitespacedStructure();
00113 
00117 
00118   // Returns true if inserting part into the table does not cause any
00119   // cell merges.
00120   bool DoesPartitionFit(const ColPartition& part) const;
00121   // Checks if a sub-table has multiple data cells filled.
00122   int CountFilledCells();
00123   int CountFilledCellsInRow(int row);
00124   int CountFilledCellsInColumn(int column);
00125   int CountFilledCells(int row_start, int row_end,
00126                        int column_start, int column_end);
00127 
00128   // Makes sure that at least one cell in a row has substantial area filled.
00129   // This can filter out large whitespace caused by growing tables too far
00130   // and page numbers.
00131   // (currently bugged for some reason).
00132   bool VerifyRowFilled(int row);
00133   // Finds the filled area in a cell.
00134   double CalculateCellFilledPercentage(int row, int column);
00135 
00136   // Debug display, draws the table in the given color. If the table is not
00137   // valid, the table and "best" grid lines are still drawn in the given color.
00138   void Display(ScrollView* window, ScrollView::Color color);
00139 
00140  protected:
00141   // Clear the structure information.
00142   void ClearStructure();
00143 
00147 
00148   // Verifies the lines do not intersect partitions. This happens when
00149   // the lines are in column boundaries and extend the full page. As a result,
00150   // the grid lines go through column text. The condition is detectable.
00151   bool VerifyLinedTableCells();
00152 
00156 
00157   // This is the function to change if you want to filter resulting tables
00158   // better. Right now it just checks for a minimum cell count and such.
00159   // You could add things like maximum number of ColPartitions per cell or
00160   // similar.
00161   bool VerifyWhitespacedTable();
00162   // Find the columns of a table using whitespace.
00163   void FindWhitespacedColumns();
00164   // Find the rows of a table using whitespace.
00165   void FindWhitespacedRows();
00166 
00170 
00171   // Calculates the whitespace around the table using the table boundary and
00172   // the supplied grids (set_text_grid and set_line_grid).
00173   void CalculateMargins();
00174   // Update the table margins with the supplied grid. This is
00175   // only called by calculate margins to use multiple grid sources.
00176   void UpdateMargins(ColPartitionGrid* grid);
00177   int FindVerticalMargin(ColPartitionGrid* grid, int start_x,
00178                          bool decrease) const;
00179   int FindHorizontalMargin(ColPartitionGrid* grid, int start_y,
00180                            bool decrease) const;
00181   // Calculates stats on the table, namely the median cell height and width.
00182   void CalculateStats();
00183 
00187 
00188   // Given a whitespaced table, this looks for bordering lines that might
00189   // be page layout boxes around the table. It is necessary to get the margins
00190   // correct on the table. If the lines are not joined, the margins will be
00191   // the distance to the line, which is not right.
00192   void AbsorbNearbyLines();
00193 
00194   // Nice utility function for finding partition gaps. You feed it a sorted
00195   // list of all of the mins/maxes of the partitions in the table, and it gives
00196   // you the gaps (middle). This works for both vertical and horizontal
00197   // gaps.
00198   //
00199   // If you want to allow slight overlap in the division and the partitions,
00200   // just scale down the partitions before inserting them in the list.
00201   // Likewise, you can force at least some space between partitions.
00202   // This trick is how the horizontal partitions are done (since the page
00203   // skew could make it hard to find splits in the text).
00204   //
00205   // As a result, "0 distance" between closest partitions causes a gap.
00206   // This is not a programmatic assumption. It is intentional and simplifies
00207   // things.
00208   //
00209   // "max_merged" indicates both the minimum number of stacked partitions
00210   // to cause a cell (add 1 to it), and the maximum number of partitions that
00211   // a grid line can intersect. For example, if max_merged is 0, then lines
00212   // are inserted wherever space exists between partitions. If it is 2,
00213   // lines may intersect 2 partitions at most, but you also need at least
00214   // 2 partitions to generate a line.
00215   static void FindCellSplitLocations(const GenericVector<int>& min_list,
00216                                      const GenericVector<int>& max_list,
00217                                      int max_merged,
00218                                      GenericVector<int>* locations);
00219 
00223 
00224   // Counts the number of ColPartitions that intersect vertical cell
00225   // division at this x value. Used by VerifyLinedTable.
00226   int CountVerticalIntersections(int x);
00227   int CountHorizontalIntersections(int y);
00228 
00229   // Counts how many text partitions are in this box.
00230   int CountPartitions(const TBOX& box);
00231 
00235 
00236   // Input data, used as read only data to make decisions.
00237   ColPartitionGrid* text_grid_;    // Text ColPartitions
00238   ColPartitionGrid* line_grid_;    // Line ColPartitions
00239   // Table structure.
00240   // bounding box is a convenient external representation.
00241   // cell_x_ and cell_y_ indicate the grid lines.
00242   TBOX bounding_box_;              // Bounding box
00243   GenericVectorEqEq<int> cell_x_;  // Locations of vertical divisions (sorted)
00244   GenericVectorEqEq<int> cell_y_;  // Locations of horizontal divisions (sorted)
00245   bool is_lined_;                  // Is the table backed up by a line structure
00246   // Table margins, set via CalculateMargins
00247   int space_above_;
00248   int space_below_;
00249   int space_left_;
00250   int space_right_;
00251   int median_cell_height_;
00252   int median_cell_width_;
00253   // Filters, used to prevent awkward partitions from destroying structure.
00254   int max_text_height_;
00255 };
00256 
00257 class TableRecognizer {
00258  public:
00259   TableRecognizer();
00260   ~TableRecognizer();
00261 
00262   // Initialization code. Must be called after the constructor.
00263   void Init();
00264 
00268 
00269   // Sets the grids used by the table. These can be changed between
00270   // calls to Recognize. They are treated as read-only data.
00271   void set_text_grid(ColPartitionGrid* text);
00272   void set_line_grid(ColPartitionGrid* lines);
00273   // Sets some additional constraints on the table.
00274   void set_min_height(int height);
00275   void set_min_width(int width);
00276   // Filters text partitions that are ridiculously tall to prevent
00277   // merging rows. Note that "filters" refers to allowing horizontal
00278   // cells to slice through them on the premise that they were
00279   // merged text rows during previous layout.
00280   void set_max_text_height(int height);
00281 
00282   // Given a guess location, the RecognizeTable function will try to find a
00283   // structured grid in the area. On success, it will return a new
00284   // StructuredTable (and assumes you will delete it). Otherwise,
00285   // NULL is returned.
00286   //
00287   // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
00288   // Ideally, there is a either a one-to-one correspondence between
00289   // the guess and table or no table at all. This is not the best of
00290   // assumptions right now, but was made to try to keep things simple in
00291   // the first pass.
00292   //
00293   // If a line structure is available on the page in the given region,
00294   // the table will use the linear structure as it is.
00295   // Otherwise, it will try to maximize the whitespace around it while keeping
00296   // a grid structure. This is somewhat working.
00297   //
00298   // Since the combination of adjustments can get high, effort was
00299   // originally made to keep the number of adjustments linear in the number
00300   // of partitions. The underlying structure finding code used to be
00301   // much more complex. I don't know how necessary this constraint is anymore.
00302   // The evaluation of a possible table is kept within O(nlogn) in the size of
00303   // the table (where size is the number of partitions in the table).
00304   // As a result, the algorithm is capable of O(n^2 log n). Depending
00305   // on the grid search size, it may be higher.
00306   //
00307   // Last note: it is possible to just try all partition boundaries at a high
00308   // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
00309   // area 200 partitions on a page, this could be too costly. Effort could go
00310   // into pruning the search, but I opted for something quicker. I'm confident
00311   // that the independent adjustments can get similar results and keep the
00312   // complextiy down. However, the other approach could work without using
00313   // TableFinder at all if it is fast enough.  It comes down to properly
00314   // deciding what is a table. The code currently relies on TableFinder's
00315   // guess to the location of a table for that.
00316   StructuredTable* RecognizeTable(const TBOX& guess_box);
00317 
00318  protected:
00322 
00323   // Returns true if the given box has a lined table within it. The
00324   // table argument will be updated with the table if the table exists.
00325   bool RecognizeLinedTable(const TBOX& guess_box, StructuredTable* table);
00326   // Returns true if the given box has a large number of horizontal and
00327   // vertical lines present. If so, we assume the extent of these lines
00328   // uniquely defines a table and find that table via SolveLinedTable.
00329   bool HasSignificantLines(const TBOX& guess);
00330 
00331   // Given enough horizontal and vertical lines in a region, find a bounding
00332   // box that encloses all of them (as well as newly introduced lines).
00333   // The bounding box is the smallest box that encloses the lines in guess
00334   // without having any lines sticking out of it.
00335   // bounding_box is an in/out parameter.
00336   // On input, it in the extents of the box to search.
00337   // On output, it is the resulting bounding box.
00338   bool FindLinesBoundingBox(TBOX* bounding_box);
00339   // Iteration in above search.
00340   // bounding_box is an in/out parameter.
00341   // On input, it in the extents of the box to search.
00342   // On output, it is the resulting bounding box.
00343   bool FindLinesBoundingBoxIteration(TBOX* bounding_box);
00344 
00348 
00349   // Returns true if the given box has a whitespaced table within it. The
00350   // table argument will be updated if the table exists. Also note
00351   // that this method will fail if the guess_box center is not
00352   // mostly within the table.
00353   bool RecognizeWhitespacedTable(const TBOX& guess_box, StructuredTable* table);
00354 
00355   // Finds the location of a horizontal split relative to y.
00356   // This function is mostly unused now. If the SolveWhitespacedTable
00357   // changes much, it can be removed. Note, it isn't really as reliable
00358   // as I thought. I went with alternatives for most of the other uses.
00359   int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
00360 
00361   // Indicates that a table row is weak. This means that it has
00362   // many missing data cells or very large cell heights compared.
00363   // to the rest of the table.
00364   static bool IsWeakTableRow(StructuredTable* table, int row);
00365 
00366   // Input data, used as read only data to make decisions.
00367   ColPartitionGrid* text_grid_;    // Text ColPartitions
00368   ColPartitionGrid* line_grid_;    // Line ColPartitions
00369   // Table constraints, a "good" table must satisfy these.
00370   int min_height_;
00371   int min_width_;
00372   // Filters, used to prevent awkward partitions from destroying structure.
00373   int max_text_height_;  // Horizontal lines may intersect taller text.
00374 };
00375 
00376 }  // namespace tesseract
00377 
00378 #endif  /* TABLERECOG_H_ */