tesseract-doc/paragraphs__internal_8h_source.html

00001 /**********************************************************************
00002  * File:        paragraphs_internal.h
00003  * Description: Paragraph Detection internal data structures.
00004  * Author:      David Eger
00005  * Created:     11 March 2011
00006  *
00007  * (C) Copyright 2011, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
00021 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
00022
00023 #include "paragraphs.h"
00024 #ifdef _MSC_VER
00025 #include <string>
00026 #else
00027 #include "strings.h"
00028 #endif
00029
00030 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
00031 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
00032
00033 class WERD_CHOICE;
00034
00035 namespace tesseract {
00036
00037 // Return whether the given word is likely to start a list item.
00038 bool AsciiLikelyListItem(const STRING &word);
00039
00040 // Return the first Unicode codepoint from werd[pos].
00041 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
00042
00043 // Set the right word attributes (*is_list, *starts_idea, *ends_idea) given
00044 // either a unicharset and werd, or a utf8 string.
00045 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00046                          const STRING &utf8,
00047                          bool *is_list, bool *starts_idea, bool *ends_idea);
00048
00049 // Same as RightWordAttributes, but sets the left word attributes.
00050 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00051                         const STRING &utf8,
00052                         bool *is_list, bool *starts_idea, bool *ends_idea);
00053
00054 enum LineType {     // Paragraph role of a text line (mnemonic char values).
00055   LT_START = 'S',     // First line of a paragraph.
00056   LT_BODY = 'C',      // Continuation line of a paragraph.
00057   LT_UNKNOWN = 'U',   // No clues.
00058   LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
00059 };
00060
00061 // The first paragraph in a page of body text is often un-indented.
00062 // This is a common typographic convention used to indicate either that:
00063 // (1) The paragraph is the continuation of a previous paragraph, or
00064 // (2) The paragraph is the first paragraph in a chapter.
00065 //
00066 // I refer to such paragraphs as "crowns", and the output of the paragraph
00067 // detection algorithm attempts to give them the same paragraph model as
00068 // the rest of the body text.
00069 //
00070 // Nonetheless, while building hypotheses, it is useful to mark the lines
00071 // of crown paragraphs temporarily as crowns, either aligned left or right.
00072 extern const ParagraphModel *kCrownLeft;   // Marks a left-aligned crown.
00073 extern const ParagraphModel *kCrownRight;  // Marks a right-aligned crown.
00074 // A strong model is any model other than NULL or the two crown markers.
00075 inline bool StrongModel(const ParagraphModel *model) {
00076   return !(model == NULL || model == kCrownLeft || model == kCrownRight);
00077 }
00078
00079 struct LineHypothesis {  // A (line type, paragraph model) guess for a row.
00080   LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}  // No clues, no model.
00081   LineHypothesis(LineType line_type, const ParagraphModel *m)
00082       : ty(line_type), model(m) {}
00083   LineHypothesis(const LineHypothesis &other)
00084       : ty(other.ty), model(other.model) {}
00085
00086   bool operator==(const LineHypothesis &other) const {  // Memberwise equality.
00087     return ty == other.ty && model == other.model;
00088   }
00089
00090   LineType ty;                  // Hypothesized role of the line.
00091   const ParagraphModel *model;  // Model the hypothesis refers to; may be NULL.
00092 };
00093
00094 class ParagraphTheory;  // Forward declaration (used by RowScratchRegisters).
00095 // A collection of const ParagraphModel pointers, treated as a set of models.
00096 typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;
00097
00098 // Row Scratch Registers are data generated by the paragraph detection
00099 // algorithm based on a RowInfo input.
00100 class RowScratchRegisters {
00101  public:
00102   // We presume row will outlive us.
00103   void Init(const RowInfo &row);
00104   // NOTE(review): presumably summarizes hypotheses_ as one LineType -- confirm.
00105   LineType GetLineType() const;
00106   // NOTE(review): presumably the same, restricted to the given model -- confirm.
00107   LineType GetLineType(const ParagraphModel *model) const;
00108
00109   // Mark this as a start line type, sans model.  This is useful for the
00110   // initial marking of probable body lines or paragraph start lines.
00111   void SetStartLine();
00112
00113   // Mark this as a body line type, sans model.  This is useful for the
00114   // initial marking of probable body lines or paragraph start lines.
00115   void SetBodyLine();
00116
00117   // Record that this row fits as a paragraph start line in the given model.
00118   void AddStartLine(const ParagraphModel *model);
00119   // Record that this row fits as a paragraph body line in the given model.
00120   void AddBodyLine(const ParagraphModel *model);
00121
00122   // Clear all hypotheses about this line (empties hypotheses_).
00123   void SetUnknown() { hypotheses_.truncate(0); }
00124
00125   // Append all hypotheses of strong models that match this row as a start.
00126   void StartHypotheses(SetOfModels *models) const;
00127
00128   // Append all hypotheses of strong models matching this row.
00129   void StrongHypotheses(SetOfModels *models) const;
00130
00131   // Append all hypotheses for this row.
00132   void NonNullHypotheses(SetOfModels *models) const;
00133
00134   // Discard any hypotheses whose model is not in the given list.
00135   void DiscardNonMatchingHypotheses(const SetOfModels &models);
00136
00137   // If we have only one hypothesis and that is that this line is a paragraph
00138   // start line of a certain model, return that model.  Else return NULL.
00139   const ParagraphModel *UniqueStartHypothesis() const;
00140
00141   // If we have only one hypothesis and that is that this line is a paragraph
00142   // body line of a certain model, return that model.  Else return NULL.
00143   const ParagraphModel *UniqueBodyHypothesis() const;
00144
00145   // Return the indentation for the side opposite of the aligned side.
00146   int OffsideIndent(tesseract::ParagraphJustification just) const {
00147     switch (just) {
00148       case tesseract::JUSTIFICATION_RIGHT: return lindent_;
00149       case tesseract::JUSTIFICATION_LEFT: return rindent_;
00150       default: return lindent_ > rindent_ ? lindent_ : rindent_;  // Larger one.
00151     }
00152   }
00153
00154   // Return the indentation for the side the text is aligned to.
00155   int AlignsideIndent(tesseract::ParagraphJustification just) const {
00156     switch (just) {
00157       case tesseract::JUSTIFICATION_RIGHT: return rindent_;
00158       case tesseract::JUSTIFICATION_LEFT: return lindent_;
00159       default: return lindent_ > rindent_ ? lindent_ : rindent_;  // Larger one.
00160     }
00161   }
00162
00163   // Append header fields to a vector of row headings.
00164   static void AppendDebugHeaderFields(GenericVector<STRING> *header);
00165
00166   // Append data for this row to a vector of debug strings.
00167   void AppendDebugInfo(const ParagraphTheory &theory,
00168                        GenericVector<STRING> *dbg) const;
00169
00170   const RowInfo *ri_;  // Source row; presumably set by Init() and not owned.
00171
00172   // These four values form a horizontal box model for the white space
00173   // on the edges of each line.  At each point in the algorithm, the following
00174   // shall hold:
00175   //   ri_->pix_ldistance = lmargin_ + lindent_
00176   //   ri_->pix_rdistance = rindent_ + rmargin_
00177   int lmargin_;
00178   int lindent_;
00179   int rindent_;
00180   int rmargin_;
00181
00182  private:
00183   // Hypotheses of either LT_START or LT_BODY
00184   GenericVectorEqEq<LineHypothesis> hypotheses_;
00185 };
00186
00187 // A collection of convenience functions for wrapping the set of
00188 // Paragraph Models we believe correctly model the paragraphs in the image.
00189 class ParagraphTheory {
00190  public:
00191   // We presume models is non-NULL (models() dereferences it) and outlives us,
00192   // and that models will take ownership of any ParagraphModel *'s we add.
00193   explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
00194       : models_(models) {}
00195   GenericVector<ParagraphModel *> &models() { return *models_; }
00196   const GenericVector<ParagraphModel *> &models() const { return *models_; }
00197
00198   // Return an existing model if one that is Comparable() can be found.
00199   // Else, allocate a new copy of model to save and return a pointer to it.
00200   const ParagraphModel *AddModel(const ParagraphModel &model);
00201
00202   // Discard any models we've made that are not in the list of used models.
00203   void DiscardUnusedModels(const SetOfModels &used_models);
00204
00205   // Output the set of all non-centered models via *models.
00206   void NonCenteredModels(SetOfModels *models);
00207
00208   // If any of the non-centered paragraph models we know about fit
00209   // rows[start, end), return it.  Else NULL.
00210   const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
00211                              int start, int end) const;
00212   // NOTE(review): presumably the position of model in models() -- confirm.
00213   int IndexOf(const ParagraphModel *model) const;
00214
00215  private:
00216   GenericVector<ParagraphModel *> *models_;  // Not owned; see constructor.
00217   GenericVectorEqEq<ParagraphModel *> models_we_added_;  // See AddModel().
00218 };
00219 // Row-vs-model fit predicates; the exact criteria live with the definitions.
00220 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
00221                     int row, const ParagraphModel *model);
00222 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
00223                    int row, const ParagraphModel *model);
00224 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
00225                      int a, int b, const ParagraphModel *model);
00226
00227 // A class for smearing Paragraph Model hypotheses to surrounding rows.
00228 // The idea here is that StrongEvidenceClassify first marks only exceedingly
00229 // obvious start and body rows and constructs models of them.  Thereafter,
00230 // we may have left over unmarked lines (mostly end-of-paragraph lines) which
00231 // were too short to have much confidence about, but which fit the models we've
00232 // constructed perfectly and which we ought to mark.  This class is used to
00233 // "smear" our models over the text.
00234 class ParagraphModelSmearer {
00235  public:
00236   ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
00237                         int row_start, int row_end,
00238                         ParagraphTheory *theory);
00239
00240   // Smear forward paragraph models from existing row markings to subsequent
00241   // text lines if they fit, and mark any thereafter still unmodeled rows
00242   // with any model in the theory that fits them.
00243   void Smear();
00244
00245  private:
00246   // Record in open_models_ for rows [row_start, row_end) the list of models
00247   // currently open at each row.
00248   // A model is still open in a row if some previous row has said model as a
00249   // start hypothesis, and all rows since (including this row) would fit as
00250   // either a body or start line in that model.
00251   void CalculateOpenModels(int row_start, int row_end);
00252   // Models open at row; slot 0 of open_models_ stands for row_start_ - 1.
00253   SetOfModels &OpenModels(int row) {
00254     return open_models_[row - row_start_ + 1];
00255   }
00256
00257   ParagraphTheory *theory_;
00258   GenericVector<RowScratchRegisters> *rows_;
00259   int row_start_;
00260   int row_end_;
00261
00262   // open_models_ corresponds to rows[row_start_ - 1, row_end_]
00263   //
00264   // open_models_:  Contains models for which there was an active (open)
00265   //                paragraph as of the previous line and for which the left and
00266   //                right indents admit the possibility that this text line
00267   //                continues to fit the same model.
00268   // TODO(eger): Think about whether we can get rid of "Open" models and just
00269   //   use the current hypotheses on RowScratchRegisters.
00270   GenericVector<SetOfModels> open_models_;
00271 };
00272
00273 // Clear all hypotheses about rows [start, end) and reset the margins to the
00274 // given percentile (0..100) value of the left and right row edges for this run
00275 // of rows.
00276 void RecomputeMarginsAndClearHypotheses(
00277     GenericVector<RowScratchRegisters> *rows, int start, int end,
00278     int percentile);
00279
00280 // Return the minimum inter-word space in rows[row_start, row_end).
00281 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
00282                    int row_start, int row_end);
00283
00284 // Return whether the first word on the after line can fit in the space at
00285 // the end of the before line (knowing which way the text is aligned and read).
00286 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
00287                            const RowScratchRegisters &after,
00288                            tesseract::ParagraphJustification justification);
00289
00290 // Return whether the first word on the after line can fit in the space at
00291 // the end of the before line (not knowing the text alignment).
00292 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
00293                            const RowScratchRegisters &after);
00294
00295 // Do rows[start, end) form a single instance of the given paragraph model?
00296 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
00297                   int start, int end, const ParagraphModel *model);
00298
00299 // Do the text and geometry of two rows support a paragraph break between them?
00300 bool LikelyParagraphStart(const RowScratchRegisters &before,
00301                           const RowScratchRegisters &after,
00302                           tesseract::ParagraphJustification j);
00303
00304 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
00305 // normalize each row_owner in place to point to an actual PARA, and output the
00306 // paragraphs in order onto *paragraphs.
00307 void CanonicalizeDetectionResults(
00308     GenericVector<PARA *> *row_owners,
00309     PARA_LIST *paragraphs);
00310
00311 }  // namespace tesseract
00312 #endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_