Tesseract
3.02
|
00001 00002 // File: colfind.h 00003 // Description: Class to find columns in the grid of BLOBNBOXes. 00004 // Author: Ray Smith 00005 // Created: Thu Feb 21 14:04:01 PST 2008 00006 // 00007 // (C) Copyright 2008, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_TEXTORD_COLFIND_H__ 00021 #define TESSERACT_TEXTORD_COLFIND_H__ 00022 00023 #include "tabfind.h" 00024 #include "imagefind.h" 00025 #include "colpartitiongrid.h" 00026 #include "colpartitionset.h" 00027 #include "ocrblock.h" 00028 #include "textlineprojection.h" 00029 00030 class BLOCK_LIST; 00031 struct Boxa; 00032 struct Pixa; 00033 class DENORM; 00034 class ScrollView; 00035 class STATS; 00036 class TO_BLOCK; 00037 00038 namespace tesseract { 00039 00040 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection"); 00041 00042 class ColPartitionSet; 00043 class ColPartitionSet_LIST; 00044 class ColSegment_LIST; 00045 class ColumnGroup_LIST; 00046 class LineSpacing; 00047 class StrokeWidth; 00048 class TempColumn_LIST; 00049 class EquationDetectBase; 00050 00051 // The ColumnFinder class finds columns in the grid. 00052 class ColumnFinder : public TabFind { 00053 public: 00054 // Gridsize is an estimate of the text size in the image. A suitable value 00055 // is in TO_BLOCK::line_size after find_components has been used to make 00056 // the blobs. 00057 // bleft and tright are the bounds of the image (rectangle) being processed. 00058 // vlines is a (possibly empty) list of TabVector and vertical_x and y are 00059 // the sum logical vertical vector produced by LineFinder::FindVerticalLines. 00060 ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright, 00061 int resolution, TabVector_LIST* vlines, TabVector_LIST* hlines, 00062 int vertical_x, int vertical_y); 00063 virtual ~ColumnFinder(); 00064 00065 // Accessors for testing 00066 const DENORM* denorm() const { 00067 return denorm_; 00068 } 00069 const TextlineProjection* projection() const { 00070 return &projection_; 00071 } 00072 00073 // ====================================================================== 00074 // The main function of ColumnFinder is broken into pieces to facilitate 00075 // optional insertion of orientation and script detection in an efficient 00076 // way. The calling sequence IS MANDATORY however, whether or not 00077 // OSD is being used: 00078 // 1. Construction. 00079 // 2. SetupAndFilterNoise. 00080 // 3. IsVerticallyAlignedText. 00081 // 4. CorrectOrientation. 00082 // 5. FindBlocks. 00083 // 6. Destruction. Use of a single column finder for multiple images does not 00084 // make sense. 00085 // Throughout these steps, the ColPartitions are owned by part_grid_, which 00086 // means that that it must be kept correct. Exception: big_parts_ owns its 00087 // own ColPartitions. 00088 // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except 00089 // for a phase in FindBlocks before TransformToBlocks, when they become 00090 // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX 00091 // indicates more of a betrothal for the majority of layout analysis, ie 00092 // which ColPartition will take ownership when the blobs are release from 00093 // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that 00094 // are part of the image regions, as they are not on any TO_BLOCK list. 00095 // TODO(rays) break up column finder further into smaller classes, as 00096 // there is a lot more to it than column finding now. 00097 // ====================================================================== 00098 00099 // Performs initial processing on the blobs in the input_block: 00100 // Setup the part_grid, stroke_width_, nontext_map_. 00101 // Obvious noise blobs are filtered out and used to mark the nontext_map_. 00102 // Initial stroke-width analysis is used to get local text alignment 00103 // direction, so the textline projection_ map can be setup. 00104 // On return, IsVerticallyAlignedText may be called (now optionally) to 00105 // determine the gross textline alignment of the page. 00106 void SetupAndFilterNoise(Pix* photo_mask_pix, TO_BLOCK* input_block); 00107 00108 // Tests for vertical alignment of text (returning true if so), and generates 00109 // a list of blobs (in osd_blobs) for orientation and script detection. 00110 // block is the single block for the whole page or rectangle to be OCRed. 00111 // Note that the vertical alignment may be due to text whose writing direction 00112 // is vertical, like say Japanese, or due to text whose writing direction is 00113 // horizontal but whose text appears vertically aligned because the image is 00114 // not the right way up. 00115 bool IsVerticallyAlignedText(TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs); 00116 00117 // Rotates the blobs and the TabVectors so that the gross writing direction 00118 // (text lines) are horizontal and lines are read down the page. 00119 // Applied rotation stored in rotation_. 00120 // A second rotation is calculated for application during recognition to 00121 // make the rotated blobs upright for recognition. 00122 // Subsequent rotation stored in text_rotation_. 00123 // 00124 // Arguments: 00125 // vertical_text_lines is true if the text lines are vertical. 00126 // recognition_rotation [0..3] is the number of anti-clockwise 90 degree 00127 // rotations from osd required for the text to be upright and readable. 00128 void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines, 00129 int recognition_rotation); 00130 00131 // Finds blocks of text, image, rule line, table etc, returning them in the 00132 // blocks and to_blocks 00133 // (Each TO_BLOCK points to the basic BLOCK and adds more information.) 00134 // Image blocks are generated by a combination of photo_mask_pix (which may 00135 // NOT be NULL) and the rejected text found during preliminary textline 00136 // finding. 00137 // The input_block is the result of a call to find_components, and contains 00138 // the blobs found in the image or rectangle to be OCRed. These blobs will be 00139 // removed and placed in the output blocks, while unused ones will be deleted. 00140 // If single_column is true, the input is treated as single column, but 00141 // it is still divided into blocks of equal line spacing/text size. 00142 // scaled_color is scaled down by scaled_factor from the input color image, 00143 // and may be NULL if the input was not color. 00144 // Returns -1 if the user hits the 'd' key in the blocks window while running 00145 // in debug mode, which requests a retry with more debug info. 00146 int FindBlocks(bool single_column, 00147 Pix* scaled_color, int scaled_factor, 00148 TO_BLOCK* block, Pix* photo_mask_pix, 00149 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); 00150 00151 // Get the rotation required to deskew, and its inverse rotation. 00152 void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); 00153 00154 // Set the equation detection pointer. 00155 void SetEquationDetect(EquationDetectBase* detect); 00156 00157 private: 00158 // Displays the blob and block bounding boxes in a window called Blocks. 00159 void DisplayBlocks(BLOCK_LIST* blocks); 00160 // Displays the column edges at each grid y coordinate defined by 00161 // best_columns_. 00162 void DisplayColumnBounds(PartSetVector* sets); 00163 00165 00166 // Sets up column_sets_ (the determined column layout at each horizontal 00167 // slice). Returns false if the page is empty. 00168 bool MakeColumns(bool single_column); 00169 // Attempt to improve the column_candidates by expanding the columns 00170 // and adding new partitions from the partition sets in src_sets. 00171 // Src_sets may be equal to column_candidates, in which case it will 00172 // use them as a source to improve themselves. 00173 void ImproveColumnCandidates(PartSetVector* src_sets, 00174 PartSetVector* column_sets); 00175 // Prints debug information on the column candidates. 00176 void PrintColumnCandidates(const char* title); 00177 // Finds the optimal set of columns that cover the entire image with as 00178 // few changes in column partition as possible. 00179 void AssignColumns(const PartSetVector& part_sets); 00180 // Finds the biggest range in part_sets_ that has no assigned column, but 00181 // column assignment is possible. 00182 bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible, 00183 int* start, int* end); 00184 // Finds the modal compatible column_set_ index within the given range. 00185 int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs, 00186 int start, int end); 00187 // Given that there are many column_set_id compatible columns in the range, 00188 // shrinks the range to the longest contiguous run of compatibility, allowing 00189 // gaps where no columns are possible, but not where competing columns are 00190 // possible. 00191 void ShrinkRangeToLongestRun(int** column_set_costs, 00192 const int* assigned_costs, 00193 const bool* any_columns_possible, 00194 int column_set_id, 00195 int* best_start, int* best_end); 00196 // Moves start in the direction of step, upto, but not including end while 00197 // the only incompatible regions are no more than kMaxIncompatibleColumnCount 00198 // in size, and the compatible regions beyond are bigger. 00199 void ExtendRangePastSmallGaps(int** column_set_costs, 00200 const int* assigned_costs, 00201 const bool* any_columns_possible, 00202 int column_set_id, 00203 int step, int end, int* start); 00204 // Assigns the given column_set_id to the part_sets_ in the given range. 00205 void AssignColumnToRange(int column_set_id, int start, int end, 00206 int** column_set_costs, int* assigned_costs); 00207 00208 // Computes the mean_column_gap_. 00209 void ComputeMeanColumnGap(); 00210 00213 00214 // Hoovers up all un-owned blobs and deletes them. 00215 // The rest get released from the block so the ColPartitions can pass 00216 // ownership to the output blocks. 00217 void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block); 00218 // Splits partitions that cross columns where they have nothing in the gap. 00219 void GridSplitPartitions(); 00220 // Merges partitions where there is vertical overlap, within a single column, 00221 // and the horizontal gap is small enough. 00222 void GridMergePartitions(); 00223 // Inserts remaining noise blobs into the most applicable partition if any. 00224 // If there is no applicable partition, then the blobs are deleted. 00225 void InsertRemainingNoise(TO_BLOCK* block); 00226 // Remove partitions that come from horizontal lines that look like 00227 // underlines, but are not part of a table. 00228 void GridRemoveUnderlinePartitions(); 00229 // Add horizontal line separators as partitions. 00230 void GridInsertHLinePartitions(); 00231 // Add vertical line separators as partitions. 00232 void GridInsertVLinePartitions(); 00233 // For every ColPartition in the grid, sets its type based on position 00234 // in the columns. 00235 void SetPartitionTypes(); 00236 // Only images remain with multiple types in a run of partners. 00237 // Sets the type of all in the group to the maximum of the group. 00238 void SmoothPartnerRuns(); 00239 00241 00242 // Helper functions for TransformToBlocks. 00243 // Add the part to the temp list in the correct order. 00244 void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list); 00245 // Add everything from the temp list to the work_set assuming correct order. 00246 void EmptyTempPartList(ColPartition_CLIST* temp_list, 00247 WorkingPartSet_LIST* work_set); 00248 00249 // Transform the grid of partitions to the output blocks. 00250 void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); 00251 00252 // Reflect the blob boxes (but not the outlines) in the y-axis so that 00253 // the blocks get created in the correct RTL order. Rotates the blobs 00254 // in the input_block and the bblobs list. 00255 // The reflection is undone in RotateAndReskewBlocks by 00256 // reflecting the blocks themselves, and then recomputing the blob bounding 00257 // boxes. 00258 void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs); 00259 00260 // Undo the deskew that was done in FindTabVectors, as recognition is done 00261 // without correcting blobs or blob outlines for skew. 00262 // Reskew the completed blocks to put them back to the original rotated coords 00263 // that were created by CorrectOrientation. 00264 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the 00265 // reflection that was done before FindTabVectors. 00266 // Blocks that were identified as vertical text (relative to the rotated 00267 // coordinates) are further rotated so the text lines are horizontal. 00268 // blob polygonal outlines are rotated to match the position of the blocks 00269 // that they are in, and their bounding boxes are recalculated to be accurate. 00270 // Record appropriate inverse transformations and required 00271 // classifier transformation in the blocks. 00272 void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks); 00273 00274 // Computes the rotations for the block (to make textlines horizontal) and 00275 // for the blobs (for classification) and sets the appropriate members 00276 // of the given block. 00277 // Returns the rotation that needs to be applied to the blobs to make 00278 // them sit in the rotated block. 00279 FCOORD ComputeBlockAndClassifyRotation(BLOCK* block); 00280 00281 // The minimum gutter width to apply for finding columns. 00282 // Modified when vertical text is detected to prevent detection of 00283 // vertical text lines as columns. 00284 int min_gutter_width_; 00285 // The mean gap between columns over the page. 00286 int mean_column_gap_; 00287 // The rotation vector needed to convert original coords to deskewed. 00288 FCOORD deskew_; 00289 // The rotation vector needed to convert deskewed back to original coords. 00290 FCOORD reskew_; 00291 // The rotation vector used to rotate vertically oriented pages. 00292 FCOORD rotation_; 00293 // The rotation vector needed to convert the rotated back to original coords. 00294 FCOORD rerotate_; 00295 // The additional rotation vector needed to rotate text for recognition. 00296 FCOORD text_rotation_; 00297 // The column_sets_ contain the ordered candidate ColPartitionSets that 00298 // define the possible divisions of the page into columns. 00299 PartSetVector column_sets_; 00300 // A simple array of pointers to the best assigned column division at 00301 // each grid y coordinate. 00302 ColPartitionSet** best_columns_; 00303 // The grid used for creating initial partitions with strokewidth. 00304 StrokeWidth* stroke_width_; 00305 // The grid used to hold ColPartitions after the columns have been determined. 00306 ColPartitionGrid part_grid_; 00307 // List of ColPartitions that are no longer needed after they have been 00308 // turned into regions, but are kept around because they are referenced 00309 // by the part_grid_. 00310 ColPartition_LIST good_parts_; 00311 // List of ColPartitions that are big and might be dropcap or vertically 00312 // joined. 00313 ColPartition_LIST big_parts_; 00314 // List of ColPartitions that have been declared noise. 00315 ColPartition_LIST noise_parts_; 00316 // The fake blobs that are made from the images. 00317 BLOBNBOX_LIST image_bblobs_; 00318 // Horizontal line separators. 00319 TabVector_LIST horizontal_lines_; 00320 // Image map of photo/noise areas on the page. 00321 Pix* nontext_map_; 00322 // Textline projection map. 00323 TextlineProjection projection_; 00324 // Sequence of DENORMS that indicate how to get back to the original image 00325 // coordinate space. The destructor must delete all the DENORMs in the chain. 00326 DENORM* denorm_; 00327 00328 // Various debug windows that automatically go away on completion. 00329 ScrollView* input_blobs_win_; 00330 00331 // The equation region detector pointer. Note: This pointer is passed in by 00332 // member function SetEquationDetect, and releasing it is NOT owned by this 00333 // class. 00334 EquationDetectBase* equation_detect_; 00335 00336 // Allow a subsequent instance to reuse the blocks window. 00337 // Not thread-safe, but multiple threads shouldn't be using windows anyway. 00338 static ScrollView* blocks_win_; 00339 }; 00340 00341 } // namespace tesseract. 00342 00343 #endif // TESSERACT_TEXTORD_COLFIND_H__