Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: paragraphs.h 00003 * Description: Paragraph Detection internal data structures. 00004 * Author: David Eger 00005 * Created: 11 March 2011 00006 * 00007 * (C) Copyright 2011, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ 00021 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ 00022 00023 #include "paragraphs.h" 00024 #ifdef _MSC_VER 00025 #include <string> 00026 #else 00027 #include "strings.h" 00028 #endif 00029 00030 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS 00031 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE. 00032 00033 class WERD_CHOICE; 00034 00035 namespace tesseract { 00036 00037 // Return whether the given word is likely to be a list item start word. 00038 bool AsciiLikelyListItem(const STRING &word); 00039 00040 // Return the first Unicode Codepoint from werd[pos]. 00041 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos); 00042 00043 // Set right word attributes given either a unicharset and werd or a utf8 00044 // string. 00045 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00046 const STRING &utf8, 00047 bool *is_list, bool *starts_idea, bool *ends_idea); 00048 00049 // Set left word attributes given either a unicharset and werd or a utf8 string. 00050 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00051 const STRING &utf8, 00052 bool *is_list, bool *starts_idea, bool *ends_idea); 00053 00054 enum LineType { 00055 LT_START = 'S', // First line of a paragraph. 00056 LT_BODY = 'C', // Continuation line of a paragraph. 00057 LT_UNKNOWN = 'U', // No clues. 00058 LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY. 00059 }; 00060 00061 // The first paragraph in a page of body text is often un-indented. 00062 // This is a typographic convention which is common to indicate either that: 00063 // (1) The paragraph is the continuation of a previous paragraph, or 00064 // (2) The paragraph is the first paragraph in a chapter. 00065 // 00066 // I refer to such paragraphs as "crown"s, and the output of the paragraph 00067 // detection algorithm attempts to give them the same paragraph model as 00068 // the rest of the body text. 00069 // 00070 // Nonetheless, while building hypotheses, it is useful to mark the lines 00071 // of crown paragraphs temporarily as crowns, either aligned left or right. 00072 extern const ParagraphModel *kCrownLeft; 00073 extern const ParagraphModel *kCrownRight; 00074 00075 inline bool StrongModel(const ParagraphModel *model) { 00076 return model != NULL && model != kCrownLeft && model != kCrownRight; 00077 } 00078 00079 struct LineHypothesis { 00080 LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {} 00081 LineHypothesis(LineType line_type, const ParagraphModel *m) 00082 : ty(line_type), model(m) {} 00083 LineHypothesis(const LineHypothesis &other) 00084 : ty(other.ty), model(other.model) {} 00085 00086 bool operator==(const LineHypothesis &other) const { 00087 return ty == other.ty && model == other.model; 00088 } 00089 00090 LineType ty; 00091 const ParagraphModel *model; 00092 }; 00093 00094 class ParagraphTheory; // Forward Declaration 00095 00096 typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels; 00097 00098 // Row Scratch Registers are data generated by the paragraph detection 00099 // algorithm based on a RowInfo input. 00100 class RowScratchRegisters { 00101 public: 00102 // We presume row will outlive us. 00103 void Init(const RowInfo &row); 00104 00105 LineType GetLineType() const; 00106 00107 LineType GetLineType(const ParagraphModel *model) const; 00108 00109 // Mark this as a start line type, sans model. This is useful for the 00110 // initial marking of probable body lines or paragraph start lines. 00111 void SetStartLine(); 00112 00113 // Mark this as a body line type, sans model. This is useful for the 00114 // initial marking of probably body lines or paragraph start lines. 00115 void SetBodyLine(); 00116 00117 // Record that this row fits as a paragraph start line in the given model, 00118 void AddStartLine(const ParagraphModel *model); 00119 // Record that this row fits as a paragraph body line in the given model, 00120 void AddBodyLine(const ParagraphModel *model); 00121 00122 // Clear all hypotheses about this line. 00123 void SetUnknown() { hypotheses_.truncate(0); } 00124 00125 // Append all hypotheses of strong models that match this row as a start. 00126 void StartHypotheses(SetOfModels *models) const; 00127 00128 // Append all hypotheses of strong models matching this row. 00129 void StrongHypotheses(SetOfModels *models) const; 00130 00131 // Append all hypotheses for this row. 00132 void NonNullHypotheses(SetOfModels *models) const; 00133 00134 // Discard any hypotheses whose model is not in the given list. 00135 void DiscardNonMatchingHypotheses(const SetOfModels &models); 00136 00137 // If we have only one hypothesis and that is that this line is a paragraph 00138 // start line of a certain model, return that model. Else return NULL. 00139 const ParagraphModel *UniqueStartHypothesis() const; 00140 00141 // If we have only one hypothesis and that is that this line is a paragraph 00142 // body line of a certain model, return that model. Else return NULL. 00143 const ParagraphModel *UniqueBodyHypothesis() const; 00144 00145 // Return the indentation for the side opposite of the aligned side. 00146 int OffsideIndent(tesseract::ParagraphJustification just) const { 00147 switch (just) { 00148 case tesseract::JUSTIFICATION_RIGHT: return lindent_; 00149 case tesseract::JUSTIFICATION_LEFT: return rindent_; 00150 default: return lindent_ > rindent_ ? lindent_ : rindent_; 00151 } 00152 } 00153 00154 // Return the indentation for the side the text is aligned to. 00155 int AlignsideIndent(tesseract::ParagraphJustification just) const { 00156 switch (just) { 00157 case tesseract::JUSTIFICATION_RIGHT: return rindent_; 00158 case tesseract::JUSTIFICATION_LEFT: return lindent_; 00159 default: return lindent_ > rindent_ ? lindent_ : rindent_; 00160 } 00161 } 00162 00163 // Append header fields to a vector of row headings. 00164 static void AppendDebugHeaderFields(GenericVector<STRING> *header); 00165 00166 // Append data for this row to a vector of debug strings. 00167 void AppendDebugInfo(const ParagraphTheory &theory, 00168 GenericVector<STRING> *dbg) const; 00169 00170 const RowInfo *ri_; 00171 00172 // These four constants form a horizontal box model for the white space 00173 // on the edges of each line. At each point in the algorithm, the following 00174 // shall hold: 00175 // ri_->pix_ldistance = lmargin_ + lindent_ 00176 // ri_->pix_rdistance = rindent_ + rmargin_ 00177 int lmargin_; 00178 int lindent_; 00179 int rindent_; 00180 int rmargin_; 00181 00182 private: 00183 // Hypotheses of either LT_START or LT_BODY 00184 GenericVectorEqEq<LineHypothesis> hypotheses_; 00185 }; 00186 00187 // A collection of convenience functions for wrapping the set of 00188 // Paragraph Models we believe correctly model the paragraphs in the image. 00189 class ParagraphTheory { 00190 public: 00191 // We presume models will outlive us, and that models will take ownership 00192 // of any ParagraphModel *'s we add. 00193 explicit ParagraphTheory(GenericVector<ParagraphModel *> *models) 00194 : models_(models) {} 00195 GenericVector<ParagraphModel *> &models() { return *models_; } 00196 const GenericVector<ParagraphModel *> &models() const { return *models_; } 00197 00198 // Return an existing model if one that is Comparable() can be found. 00199 // Else, allocate a new copy of model to save and return a pointer to it. 00200 const ParagraphModel *AddModel(const ParagraphModel &model); 00201 00202 // Discard any models we've made that are not in the list of used models. 00203 void DiscardUnusedModels(const SetOfModels &used_models); 00204 00205 // Return the set of all non-centered models. 00206 void NonCenteredModels(SetOfModels *models); 00207 00208 // If any of the non-centered paragraph models we know about fit 00209 // rows[start, end), return it. Else NULL. 00210 const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows, 00211 int start, int end) const; 00212 00213 int IndexOf(const ParagraphModel *model) const; 00214 00215 private: 00216 GenericVector<ParagraphModel *> *models_; 00217 GenericVectorEqEq<ParagraphModel *> models_we_added_; 00218 }; 00219 00220 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, 00221 int row, const ParagraphModel *model); 00222 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, 00223 int row, const ParagraphModel *model); 00224 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, 00225 int a, int b, const ParagraphModel *model); 00226 00227 // A class for smearing Paragraph Model hypotheses to surrounding rows. 00228 // The idea here is that StrongEvidenceClassify first marks only exceedingly 00229 // obvious start and body rows and constructs models of them. Thereafter, 00230 // we may have left over unmarked lines (mostly end-of-paragraph lines) which 00231 // were too short to have much confidence about, but which fit the models we've 00232 // constructed perfectly and which we ought to mark. This class is used to 00233 // "smear" our models over the text. 00234 class ParagraphModelSmearer { 00235 public: 00236 ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows, 00237 int row_start, int row_end, 00238 ParagraphTheory *theory); 00239 00240 // Smear forward paragraph models from existing row markings to subsequent 00241 // text lines if they fit, and mark any thereafter still unmodeled rows 00242 // with any model in the theory that fits them. 00243 void Smear(); 00244 00245 private: 00246 // Record in open_models_ for rows [start_row, end_row) the list of models 00247 // currently open at each row. 00248 // A model is still open in a row if some previous row has said model as a 00249 // start hypothesis, and all rows since (including this row) would fit as 00250 // either a body or start line in that model. 00251 void CalculateOpenModels(int row_start, int row_end); 00252 00253 SetOfModels &OpenModels(int row) { 00254 return open_models_[row - row_start_ + 1]; 00255 } 00256 00257 ParagraphTheory *theory_; 00258 GenericVector<RowScratchRegisters> *rows_; 00259 int row_start_; 00260 int row_end_; 00261 00262 // open_models_ corresponds to rows[start_row_ - 1, end_row_] 00263 // 00264 // open_models_: Contains models which there was an active (open) paragraph 00265 // as of the previous line and for which the left and right 00266 // indents admit the possibility that this text line continues 00267 // to fit the same model. 00268 // TODO(eger): Think about whether we can get rid of "Open" models and just 00269 // use the current hypotheses on RowScratchRegisters. 00270 GenericVector<SetOfModels> open_models_; 00271 }; 00272 00273 // Clear all hypotheses about lines [start, end) and reset the margins to the 00274 // percentile (0..100) value of the left and right row edges for this run of 00275 // rows. 00276 void RecomputeMarginsAndClearHypotheses( 00277 GenericVector<RowScratchRegisters> *rows, int start, int end, 00278 int percentile); 00279 00280 // Return the minimum inter-word space in rows[row_start, row_end). 00281 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, 00282 int row_start, int row_end); 00283 00284 // Return whether the first word on the after line can fit in the space at 00285 // the end of the before line (knowing which way the text is aligned and read). 00286 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 00287 const RowScratchRegisters &after, 00288 tesseract::ParagraphJustification justification); 00289 00290 // Return whether the first word on the after line can fit in the space at 00291 // the end of the before line (not knowing the text alignment). 00292 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 00293 const RowScratchRegisters &after); 00294 00295 // Do rows[start, end) form a single instance of the given paragraph model? 00296 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, 00297 int start, int end, const ParagraphModel *model); 00298 00299 // Do the text and geometry of two rows support a paragraph break between them? 00300 bool LikelyParagraphStart(const RowScratchRegisters &before, 00301 const RowScratchRegisters &after, 00302 tesseract::ParagraphJustification j); 00303 00304 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known), 00305 // normalize each row_owner to point to an actual PARA, and output the 00306 // paragraphs in order onto paragraphs. 00307 void CanonicalizeDetectionResults( 00308 GenericVector<PARA *> *row_owners, 00309 PARA_LIST *paragraphs); 00310 00311 } // namespace 00312 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_