Tesseract
3.02
|
00001 00002 // File: ocrpara.h 00003 // Description: OCR Paragraph Output Type 00004 // Author: David Eger 00005 // Created: 2010-11-15 00006 // 00007 // (C) Copyright 2010, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_ 00021 #define TESSERACT_CCSTRUCT_OCRPARA_H_ 00022 00023 #include "publictypes.h" 00024 #include "elst.h" 00025 #include "strngs.h" 00026 00027 class ParagraphModel; 00028 00029 struct PARA : public ELIST_LINK { 00030 public: 00031 PARA() : model(NULL), is_list_item(false), 00032 is_very_first_or_continuation(false), has_drop_cap(false) {} 00033 00034 // We do not own the model, we just reference it. 00035 // model may be NULL if there is not a good model for this paragraph. 00036 const ParagraphModel *model; 00037 00038 bool is_list_item; 00039 00040 // The first paragraph on a page often lacks a first line indent, but should 00041 // still be modeled by the same model as other body text paragraphs on the 00042 // page. 00043 bool is_very_first_or_continuation; 00044 00045 // Does this paragraph begin with a drop cap? 00046 bool has_drop_cap; 00047 }; 00048 00049 ELISTIZEH(PARA) 00050 00051 // A geometric model of paragraph indentation and alignment. 00052 // 00053 // Measurements are in pixels. The meaning of the integer arguments changes 00054 // depending upon the value of justification. Distances less than or equal 00055 // to tolerance apart we take as "equivalent" for the purpose of model 00056 // matching, and in the examples below, we assume tolerance is zero. 00057 // 00058 // justification = LEFT: 00059 // margin the "ignored" margin to the left block edge. 00060 // first_indent indent from the left margin to a typical first text line. 00061 // body_indent indent from the left margin of a typical body text line. 00062 // 00063 // justification = RIGHT: 00064 // margin the "ignored" margin to the right block edge. 00065 // first_indent indent from the right margin to a typical first text line. 00066 // body_indent indent from the right margin of a typical body text line. 00067 // 00068 // justification = CENTER: 00069 // margin ignored 00070 // first_indent ignored 00071 // body_indent ignored 00072 // 00073 // ====== Extended example, assuming each letter is ten pixels wide: ======= 00074 // 00075 // +--------------------------------+ 00076 // | Awesome | ParagraphModel(CENTER, 0, 0, 0) 00077 // | Centered Title | 00078 // | Paragraph Detection | 00079 // | OCR TEAM | 00080 // | 10 November 2010 | 00081 // | | 00082 // | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0) 00083 // |This paragraph starts at the top| 00084 // |of the page and takes 3 lines. | 00085 // | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0) 00086 // |which indicates that the first | 00087 // |paragraph is not a continuation | 00088 // |from a previous page, as it is | 00089 // |indented just like this second | 00090 // |paragraph. | 00091 // | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0) 00092 // | looks like the prior text | 00093 // | but it is indented more | 00094 // | and is fully justified. | 00095 // | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0) 00096 // |centered text, block quotes, | 00097 // |normal paragraphs, and lists | 00098 // |like what follows? | 00099 // |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30) 00100 // |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30) 00101 // | looking for lines where the | 00102 // | first word of the next line | 00103 // | would fit on the previous | 00104 // | line. | 00105 // |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30) 00106 // | Python and try it out. | 00107 // |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30) 00108 // | mistakes. | 00109 // |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30) 00110 // | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0) 00111 // |you can try to identify source | 00112 // |code. Ouch! | 00113 // +--------------------------------+ 00114 class ParagraphModel { 00115 public: 00116 ParagraphModel(tesseract::ParagraphJustification justification, 00117 int margin, 00118 int first_indent, 00119 int body_indent, 00120 int tolerance) 00121 : justification_(justification), 00122 margin_(margin), 00123 first_indent_(first_indent), 00124 body_indent_(body_indent), 00125 tolerance_(tolerance) { 00126 // Make one of {first_indent, body_indent} is 0. 00127 int added_margin = first_indent; 00128 if (body_indent < added_margin) 00129 added_margin = body_indent; 00130 margin_ += added_margin; 00131 first_indent_ -= added_margin; 00132 body_indent_ -= added_margin; 00133 } 00134 00135 ParagraphModel() 00136 : justification_(tesseract::JUSTIFICATION_UNKNOWN), 00137 margin_(0), 00138 first_indent_(0), 00139 body_indent_(0), 00140 tolerance_(0) { } 00141 00142 // ValidFirstLine() and ValidBodyLine() take arguments describing a text line 00143 // in a block of text which we are trying to model: 00144 // lmargin, lindent: these add up to the distance from the leftmost ink 00145 // in the text line to the surrounding text block's left 00146 // edge. 00147 // rmargin, rindent: these add up to the distance from the rightmost ink 00148 // in the text line to the surrounding text block's right 00149 // edge. 00150 // The caller determines the division between "margin" and "indent", which 00151 // only actually affect whether we think the line may be centered. 00152 // 00153 // If the amount of whitespace matches the amount of whitespace expected on 00154 // the relevant side of the line (within tolerance_) we say it matches. 00155 00156 // Return whether a given text line could be a first paragraph line according 00157 // to this paragraph model. 00158 bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const; 00159 00160 // Return whether a given text line could be a first paragraph line according 00161 // to this paragraph model. 00162 bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const; 00163 00164 tesseract::ParagraphJustification justification() const { 00165 return justification_; 00166 } 00167 int margin() const { return margin_; } 00168 int first_indent() const { return first_indent_; } 00169 int body_indent() const { return body_indent_; } 00170 int tolerance() const { return tolerance_; } 00171 bool is_flush() const { 00172 return (justification_ == tesseract::JUSTIFICATION_LEFT || 00173 justification_ == tesseract::JUSTIFICATION_RIGHT) && 00174 abs(first_indent_ - body_indent_) <= tolerance_; 00175 } 00176 00177 // Return whether this model is likely to agree with the other model on most 00178 // paragraphs they are marked. 00179 bool Comparable(const ParagraphModel &other) const; 00180 00181 STRING ToString() const; 00182 00183 private: 00184 tesseract::ParagraphJustification justification_; 00185 int margin_; 00186 int first_indent_; 00187 int body_indent_; 00188 int tolerance_; 00189 }; 00190 00191 #endif // TESSERACT_CCSTRUCT_OCRPARA_H_