// Tesseract 3.02
00001 // Copyright 2011 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 00014 #ifndef TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_ 00015 #define TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_ 00016 00017 #include "blobgrid.h" // For BlobGrid 00018 00019 class DENORM; 00020 struct Pix; 00021 struct TPOINT; 00022 00023 namespace tesseract { 00024 00025 class ColPartition; 00026 00027 // Simple class to encapsulate the computation of an image representing 00028 // local textline density, and function(s) to make use of it. 00029 // The underlying principle is that if you smear connected components 00030 // horizontally (vertically for components on a vertically written textline) 00031 // and count the number of smeared components in an image, then the resulting 00032 // image shows the density of the textlines at each image position. 00033 class TextlineProjection { 00034 public: 00035 // The down-scaling factor is computed to obtain a projection resolution 00036 // of about 100 dpi, whatever the input. 00037 explicit TextlineProjection(int resolution); 00038 ~TextlineProjection(); 00039 00040 // Build the projection profile given the input_block containing lists of 00041 // blobs, a rotation to convert to image coords, 00042 // and a full-resolution nontext_map, marking out areas to avoid. 
00043 // During construction, we have the following assumptions: 00044 // The rotation is a multiple of 90 degrees, ie no deskew yet. 00045 // The blobs have had their left and right rules set to also limit 00046 // the range of projection. 00047 void ConstructProjection(TO_BLOCK* input_block, 00048 const FCOORD& rotation, Pix* nontext_map); 00049 00050 // Display the blobs in the window colored according to textline quality. 00051 void PlotGradedBlobs(BLOBNBOX_LIST* blobs, ScrollView* win); 00052 00053 // Moves blobs that look like they don't sit well on a textline from the 00054 // input blobs list to the output small_blobs list. 00055 // This gets them away from initial textline finding to stop diacritics 00056 // from forming incorrect textlines. (Introduced mainly to fix Thai.) 00057 void MoveNonTextlineBlobs(BLOBNBOX_LIST* blobs, 00058 BLOBNBOX_LIST* small_blobs) const; 00059 00060 // Create a window and display the projection in it. 00061 void DisplayProjection() const; 00062 00063 // Compute the distance of the box from the partition using curved projection 00064 // space. As DistanceOfBoxFromBox, except that the direction is taken from 00065 // the ColPartition and the median bounds of the ColPartition are used as 00066 // the to_box. 00067 int DistanceOfBoxFromPartition(const TBOX& box, const ColPartition& part, 00068 const DENORM* denorm, bool debug) const; 00069 00070 // Compute the distance from the from_box to the to_box using curved 00071 // projection space. Separation that involves a decrease in projection 00072 // density (moving from the from_box to the to_box) is weighted more heavily 00073 // than constant density, and an increase is weighted less. 00074 // If horizontal_textline is true, then curved space is used vertically, 00075 // as for a diacritic on the edge of a textline. 00076 // The projection uses original image coords, so denorm is used to get 00077 // back to the image coords from box/part space. 
00078 int DistanceOfBoxFromBox(const TBOX& from_box, const TBOX& to_box, 00079 bool horizontal_textline, 00080 const DENORM* denorm, bool debug) const; 00081 00082 // Compute the distance between (x, y1) and (x, y2) using the rule that 00083 // a decrease in textline density is weighted more heavily than an increase. 00084 // The coordinates are in source image space, ie processed by any denorm 00085 // already, but not yet scaled by scale_factor_. 00086 // Going from the outside of a textline to the inside should measure much 00087 // less distance than going from the inside of a textline to the outside. 00088 int VerticalDistance(bool debug, int x, int y1, int y2) const; 00089 00090 // Compute the distance between (x1, y) and (x2, y) using the rule that 00091 // a decrease in textline density is weighted more heavily than an increase. 00092 int HorizontalDistance(bool debug, int x1, int x2, int y) const; 00093 00094 // Returns true if the blob appears to be outside of a horizontal textline. 00095 // Such blobs are potentially diacritics (even if large in Thai) and should 00096 // be kept away from initial textline finding. 00097 bool BoxOutOfHTextline(const TBOX& box, const DENORM* denorm, 00098 bool debug) const; 00099 00100 // Evaluates the textlineiness of a ColPartition. Uses EvaluateBox below, 00101 // but uses the median top/bottom for horizontal and median left/right for 00102 // vertical instead of the bounding box edges. 00103 // Evaluates for both horizontal and vertical and returns the best result, 00104 // with a positive value for horizontal and a negative value for vertical. 
00105 int EvaluateColPartition(const ColPartition& part, const DENORM* denorm, 00106 bool debug) const; 00107 00108 // Computes the mean projection gradients over the horizontal and vertical 00109 // edges of the box: 00110 // -h-h-h-h-h-h 00111 // |------------| mean=htop -v|+v--------+v|-v 00112 // |+h+h+h+h+h+h| -v|+v +v|-v 00113 // | | -v|+v +v|-v 00114 // | box | -v|+v box +v|-v 00115 // | | -v|+v +v|-v 00116 // |+h+h+h+h+h+h| -v|+v +v|-v 00117 // |------------| mean=hbot -v|+v--------+v|-v 00118 // -h-h-h-h-h-h 00119 // mean=vleft mean=vright 00120 // 00121 // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number 00122 // for a horizontal textline, a negative number for a vertical textline, 00123 // and near zero for undecided. Undecided is most likely non-text. 00124 int EvaluateBox(const TBOX& box, const DENORM* denorm, bool debug) const; 00125 00126 private: 00127 // Internal version of EvaluateBox returns the unclipped gradients as well 00128 // as the result of EvaluateBox. 00129 // hgrad1 and hgrad2 are the gradients for the horizontal textline. 00130 int EvaluateBoxInternal(const TBOX& box, const DENORM* denorm, bool debug, 00131 int* hgrad1, int* hgrad2, 00132 int* vgrad1, int* vgrad2) const; 00133 00134 // Helper returns the mean gradient value for the horizontal row at the given 00135 // y, (in the external coordinates) by subtracting the mean of the transformed 00136 // row 2 pixels above from the mean of the transformed row 2 pixels below. 00137 // This gives a positive value for a good top edge and negative for bottom. 00138 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. 
00139 int BestMeanGradientInRow(const DENORM* denorm, inT16 min_x, inT16 max_x, 00140 inT16 y, bool best_is_max) const; 00141 00142 // Helper returns the mean gradient value for the vertical column at the 00143 // given x, (in the external coordinates) by subtracting the mean of the 00144 // transformed column 2 pixels left from the mean of the transformed column 00145 // 2 pixels to the right. 00146 // This gives a positive value for a good left edge and negative for right. 00147 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. 00148 int BestMeanGradientInColumn(const DENORM* denorm, inT16 x, inT16 min_y, 00149 inT16 max_y, bool best_is_max) const; 00150 00151 // Helper returns the mean pixel value over the line between the start_pt and 00152 // end_pt (inclusive), but shifted perpendicular to the line in the projection 00153 // image by offset pixels. For simplicity, it is assumed that the vector is 00154 // either nearly horizontal or nearly vertical. It works on skewed textlines! 00155 // The end points are in external coordinates, and will be denormalized with 00156 // the denorm if not NULL before further conversion to pix coordinates. 00157 // After all the conversions, the offset is added to the direction 00158 // perpendicular to the line direction. The offset is thus in projection image 00159 // coordinates, which allows the caller to get a guaranteed displacement 00160 // between pixels used to calculate gradients. 00161 int MeanPixelsInLineSegment(const DENORM* denorm, int offset, 00162 TPOINT start_pt, TPOINT end_pt) const; 00163 00164 // Helper function to add 1 to a rectangle in source image coords to the 00165 // internal projection pix_. 00166 void IncrementRectangle8Bit(const TBOX& box); 00167 // Inserts a list of blobs into the projection. 00168 // Rotation is a multiple of 90 degrees to get from blob coords to 00169 // nontext_map coords, image_box is the bounds of the nontext_map. 
00170 // Blobs are spread horizontally or vertically according to their internal 00171 // flags, but the spreading is truncated by set pixels in the nontext_map 00172 // and also by the horizontal rule line limits on the blobs. 00173 void ProjectBlobs(BLOBNBOX_LIST* blobs, const FCOORD& rotation, 00174 const TBOX& image_box, Pix* nontext_map); 00175 // Pads the bounding box of the given blob according to whether it is on 00176 // a horizontal or vertical text line, taking into account tab-stops near 00177 // the blob. Returns true if padding was in the horizontal direction. 00178 bool PadBlobBox(BLOBNBOX* blob, TBOX* bbox); 00179 00180 // Helper denormalizes the TPOINT with the denorm if not NULL, then 00181 // converts to pix_ coordinates. 00182 void TransformToPixCoords(const DENORM* denorm, TPOINT* pt) const; 00183 00184 // Helper truncates the TPOINT to be within the pix_. 00185 void TruncateToImageBounds(TPOINT* pt) const; 00186 00187 // Transform tesseract coordinates to coordinates used in the pix. 00188 int ImageXToProjectionX(int x) const; 00189 int ImageYToProjectionY(int y) const; 00190 00191 // The down-sampling scale factor used in building the image. 00192 int scale_factor_; 00193 // The blob coordinates of the top-left (origin of the pix_) in tesseract 00194 // coordinates. Used to transform the bottom-up tesseract coordinates to 00195 // the top-down coordinates of the pix. 00196 int x_origin_; 00197 int y_origin_; 00198 // The image of horizontally smeared blob boxes summed to provide a 00199 // textline density map. As with a horizontal projection, the map has 00200 // dips in the gaps between textlines. 00201 Pix* pix_; 00202 }; 00203 00204 } // namespace tesseract. 00205 00206 #endif // TESSERACT_TEXTORD_TEXTLINEPROJECTION_H_