Tesseract  3.02
tesseract-ocr/textord/textlineprojection.h
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 #ifndef TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_
00015 #define TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_
00016 
00017 #include "blobgrid.h"      // For BlobGrid
00018 
00019 class DENORM;
00020 struct Pix;
00021 struct TPOINT;
00022 
00023 namespace tesseract {
00024 
00025 class ColPartition;
00026 
00027 // Simple class to encapsulate the computation of an image representing
00028 // local textline density, and function(s) to make use of it.
00029 // The underlying principle is that if you smear connected components
00030 // horizontally (vertically for components on a vertically written textline)
00031 // and count the number of smeared components in an image, then the resulting
00032 // image shows the density of the textlines at each image position.
00033 class TextlineProjection {
00034  public:
00035   // The down-scaling factor is computed (presumably from the given resolution,
00036   // in dpi) to obtain a projection resolution of about 100 dpi for any input.
00037   explicit TextlineProjection(int resolution);
00038   ~TextlineProjection();
00039 
00040   // Builds the projection profile from the input_block containing lists of
00041   // blobs, a rotation to convert to image coords, and a full-resolution
00042   // nontext_map marking out areas to avoid.
00043   // Construction relies on the following assumptions:
00044   //  - The rotation is a multiple of 90 degrees, i.e. no deskew yet.
00045   //  - The blobs have had their left and right rules set to also limit
00046   //    the range of projection.
00047   void ConstructProjection(TO_BLOCK* input_block,
00048                            const FCOORD& rotation, Pix* nontext_map);
00049 
00050   // Displays the blobs in the window colored according to textline quality.
00051   void PlotGradedBlobs(BLOBNBOX_LIST* blobs, ScrollView* win);
00052 
00053   // Moves blobs that look like they don't sit well on a textline from the
00054   // input blobs list to the output small_blobs list.
00055   // This keeps them away from initial textline finding to stop diacritics
00056   // from forming incorrect textlines. (Introduced mainly to fix Thai.)
00057   void MoveNonTextlineBlobs(BLOBNBOX_LIST* blobs,
00058                             BLOBNBOX_LIST* small_blobs) const;
00059 
00060   // Creates a window and displays the projection in it.
00061   void DisplayProjection() const;
00062 
00063   // Computes the distance of the box from the partition using curved projection
00064   // space. Same as DistanceOfBoxFromBox, except that the direction is taken
00065   // from the ColPartition and the median bounds of the ColPartition are used
00066   // as the to_box.
00067   int DistanceOfBoxFromPartition(const TBOX& box, const ColPartition& part,
00068                                  const DENORM* denorm, bool debug) const;
00069 
00070   // Computes the distance from the from_box to the to_box using curved
00071   // projection space. Separation that involves a decrease in projection
00072   // density (moving from the from_box to the to_box) is weighted more heavily
00073   // than constant density, and an increase is weighted less.
00074   // If horizontal_textline is true, then curved space is used vertically,
00075   // as for a diacritic on the edge of a textline.
00076   // The projection uses original image coords, so denorm is used to get
00077   // back to the image coords from box/part space.
00078   int DistanceOfBoxFromBox(const TBOX& from_box, const TBOX& to_box,
00079                            bool horizontal_textline,
00080                            const DENORM* denorm, bool debug) const;
00081 
00082   // Computes the distance between (x, y1) and (x, y2) using the rule that
00083   // a decrease in textline density is weighted more heavily than an increase.
00084   // The coordinates are in source image space, i.e. already processed by any
00085   // denorm, but not yet scaled by scale_factor_.
00086   // Going from the outside of a textline to the inside should measure much
00087   // less distance than going from the inside of a textline to the outside.
00088   int VerticalDistance(bool debug, int x, int y1, int y2) const;
00089 
00090   // Computes the distance between (x1, y) and (x2, y) using the rule that
00091   // a decrease in textline density is weighted more heavily than an increase.
00092   int HorizontalDistance(bool debug, int x1, int x2, int y) const;
00093 
00094   // Returns true if the blob appears to be outside of a horizontal textline.
00095   // Such blobs are potentially diacritics (even if large in Thai) and should
00096   // be kept away from initial textline finding.
00097   bool BoxOutOfHTextline(const TBOX& box, const DENORM* denorm,
00098                         bool debug) const;
00099 
00100   // Evaluates how textline-like a ColPartition is. Uses EvaluateBox below,
00101   // but uses the median top/bottom for horizontal and median left/right for
00102   // vertical instead of the bounding box edges.
00103   // Evaluates for both horizontal and vertical and returns the best result,
00104   // with a positive value for horizontal and a negative value for vertical.
00105   int EvaluateColPartition(const ColPartition& part, const DENORM* denorm,
00106                            bool debug) const;
00107 
00108   // Computes the mean projection gradients over the horizontal and vertical
00109   // edges of the box:
00110   //   -h-h-h-h-h-h
00111   //  |------------| mean=htop   -v|+v--------+v|-v
00112   //  |+h+h+h+h+h+h|             -v|+v        +v|-v
00113   //  |            |             -v|+v        +v|-v
00114   //  |    box     |             -v|+v  box   +v|-v
00115   //  |            |             -v|+v        +v|-v
00116   //  |+h+h+h+h+h+h|             -v|+v        +v|-v
00117   //  |------------| mean=hbot   -v|+v--------+v|-v
00118   //   -h-h-h-h-h-h
00119   //                           mean=vleft  mean=vright
00120   //
00121   // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number
00122   // for a horizontal textline, a negative number for a vertical textline,
00123   // and near zero for undecided. Undecided is most likely non-text.
00124   int EvaluateBox(const TBOX& box, const DENORM* denorm, bool debug) const;
00125 
00126  private:
00127   // Internal version of EvaluateBox returns the unclipped gradients as well
00128   // as the result of EvaluateBox. hgrad1 and hgrad2 are the gradients for the
00129   // horizontal textline; vgrad1 and vgrad2 presumably the vertical (TODO confirm).
00130   int EvaluateBoxInternal(const TBOX& box, const DENORM* denorm, bool debug,
00131                           int* hgrad1, int* hgrad2,
00132                           int* vgrad1, int* vgrad2) const;
00133 
00134   // Helper returns the mean gradient for the horizontal row at the given y (in
00135   // external coordinates): the mean of the transformed row 2 pixels below minus
00136   // that of the row 2 pixels above. This gives a positive value for a good top
00137   // edge and negative for bottom. Returns the best of +2/-2, +3/-1, +1/-3
00138   // pixels from the edge; best_is_max presumably picks max vs min (TODO confirm).
00139   int BestMeanGradientInRow(const DENORM* denorm, inT16 min_x, inT16 max_x,
00140                             inT16 y, bool best_is_max) const;
00141 
00142   // Helper returns the mean gradient for the vertical column at the given x
00143   // (in external coordinates): the mean of the transformed column 2 pixels to
00144   // the right minus that of the column 2 pixels to the left.
00145   // This gives a positive value for a good left edge and negative for right.
00146   // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge;
00147   // best_is_max presumably picks max over min as "best" (TODO confirm).
00148   int BestMeanGradientInColumn(const DENORM* denorm, inT16 x, inT16 min_y,
00149                                inT16 max_y, bool best_is_max) const;
00150 
00151   // Helper returns the mean pixel value over the line between the start_pt and
00152   // end_pt (inclusive), but shifted perpendicular to the line in the projection
00153   // image by offset pixels. For simplicity, it is assumed that the vector is
00154   // either nearly horizontal or nearly vertical. It works on skewed textlines!
00155   // The end points are in external coordinates, and will be denormalized with
00156   // the denorm if not NULL before further conversion to pix coordinates.
00157   // After all the conversions, the offset is added to the direction
00158   // perpendicular to the line direction. The offset is thus in projection image
00159   // coordinates, which allows the caller to get a guaranteed displacement
00160   // between pixels used to calculate gradients.
00161   int MeanPixelsInLineSegment(const DENORM* denorm, int offset,
00162                               TPOINT start_pt, TPOINT end_pt) const;
00163 
00164   // Helper adds 1 to the region of the internal projection pix_ that
00165   // corresponds to the given rectangle, which is in source image coords.
00166   void IncrementRectangle8Bit(const TBOX& box);
00167   // Inserts a list of blobs into the projection.
00168   // Rotation is a multiple of 90 degrees to get from blob coords to
00169   // nontext_map coords, image_box is the bounds of the nontext_map.
00170   // Blobs are spread horizontally or vertically according to their internal
00171   // flags, but the spreading is truncated by set pixels in the nontext_map
00172   // and also by the horizontal rule line limits on the blobs.
00173   void ProjectBlobs(BLOBNBOX_LIST* blobs, const FCOORD& rotation,
00174                     const TBOX& image_box, Pix* nontext_map);
00175   // Pads the bounding box of the given blob according to whether it is on
00176   // a horizontal or vertical text line, taking into account tab-stops near
00177   // the blob. Returns true if padding was in the horizontal direction.
00178   bool PadBlobBox(BLOBNBOX* blob, TBOX* bbox);
00179 
00180   // Helper denormalizes the TPOINT with the denorm if not NULL, then
00181   // converts to pix_ coordinates.
00182   void TransformToPixCoords(const DENORM* denorm, TPOINT* pt) const;
00183 
00184   // Helper truncates the TPOINT to be within the pix_.
00185   void TruncateToImageBounds(TPOINT* pt) const;
00186 
00187   // Transforms tesseract coordinates to the coordinates used in pix_.
00188   int ImageXToProjectionX(int x) const;
00189   int ImageYToProjectionY(int y) const;
00190 
00191   // The down-sampling scale factor used in building the image.
00192   int scale_factor_;
00193   // The blob coordinates of the top-left (origin of the pix_) in tesseract
00194   // coordinates. Used to transform the bottom-up tesseract coordinates to
00195   // the top-down coordinates of the pix.
00196   int x_origin_;
00197   int y_origin_;
00198   // The image of horizontally smeared blob boxes summed to provide a textline
00199   // density map. As with a horizontal projection, it dips between textlines.
00200   // NOTE(review): raw Pix* plus a destructor but no copy control; confirm owner.
00201   Pix* pix_;
00202 };
00203 
00204 }  // namespace tesseract.
00205 
00206 #endif  // TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_