tesseract-doc/ocrpara_8h_source.html

00001
00002 // File:        ocrpara.h
00003 // Description: OCR Paragraph Output Type
00004 // Author:      David Eger
00005 // Created:     2010-11-15
00006 //
00007 // (C) Copyright 2010, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019
00020 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
00021 #define TESSERACT_CCSTRUCT_OCRPARA_H_
00022
00023 #include "publictypes.h"
00024 #include "elst.h"
00025 #include "strngs.h"
00026
00027 class ParagraphModel;
00028
00029 struct PARA : public ELIST_LINK {
00030  public:
00031   PARA() : model(NULL), is_list_item(false),
00032            is_very_first_or_continuation(false), has_drop_cap(false) {}
00033
00034   // We do not own the model, we just reference it.
00035   // model may be NULL if there is not a good model for this paragraph.
00036   const ParagraphModel *model;
00037
00038   bool is_list_item;
00039
00040   // The first paragraph on a page often lacks a first line indent, but should
00041   // still be modeled by the same model as other body text paragraphs on the
00042   // page.
00043   bool is_very_first_or_continuation;
00044
00045   // Does this paragraph begin with a drop cap?
00046   bool has_drop_cap;
00047 };
00048
00049 ELISTIZEH(PARA)
00050
00051 // A geometric model of paragraph indentation and alignment.
00052 //
00053 // Measurements are in pixels. The meaning of the integer arguments changes
00054 // depending upon the value of justification.  Distances less than or equal
00055 // to tolerance apart we take as "equivalent" for the purpose of model
00056 // matching, and in the examples below, we assume tolerance is zero.
00057 //
00058 // justification = LEFT:
00059 //   margin       the "ignored" margin to the left block edge.
00060 //   first_indent indent from the left margin to a typical first text line.
00061 //   body_indent  indent from the left margin of a typical body text line.
00062 //
00063 // justification = RIGHT:
00064 //   margin       the "ignored" margin to the right block edge.
00065 //   first_indent indent from the right margin to a typical first text line.
00066 //   body_indent  indent from the right margin of a typical body text line.
00067 //
00068 // justification = CENTER:
00069 //   margin       ignored
00070 //   first_indent ignored
00071 //   body_indent  ignored
00072 //
00073 //  ====== Extended example, assuming each letter is ten pixels wide: =======
00074 //
00075 // +--------------------------------+
00076 // |      Awesome                   | ParagraphModel(CENTER, 0, 0, 0)
00077 // |   Centered Title               |
00078 // | Paragraph Detection            |
00079 // |      OCR TEAM                  |
00080 // |  10 November 2010              |
00081 // |                                |
00082 // |  Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
00083 // |This paragraph starts at the top|
00084 // |of the page and takes 3 lines.  |
00085 // |  Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
00086 // |which indicates that the first  |
00087 // |paragraph is not a continuation |
00088 // |from a previous page, as it is  |
00089 // |indented just like this second  |
00090 // |paragraph.                      |
00091 // |   Here is a block quote. It    | ParagraphModel(LEFT, 30, 0, 0)
00092 // |   looks like the prior text    |
00093 // |   but it  is indented  more    |
00094 // |   and is fully justified.      |
00095 // |  So how does one deal with     | ParagraphModel(LEFT, 0, 20, 0)
00096 // |centered text, block quotes,    |
00097 // |normal paragraphs, and lists    |
00098 // |like what follows?              |
00099 // |1. Make a plan.                 | ParagraphModel(LEFT, 0, 0, 30)
00100 // |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
00101 // |   looking for lines where the  |
00102 // |   first word of the next line  |
00103 // |   would fit on the previous    |
00104 // |   line.                        |
00105 // |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
00106 // |   Python and try it out.       |
00107 // |4. Determine how to fix the     | ParagraphModel(LEFT, 0, 0, 30)
00108 // |   mistakes.                    |
00109 // |5. Repeat.                      | ParagraphModel(LEFT, 0, 0, 30)
00110 // |  For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
00111 // |you can try to identify source  |
00112 // |code.  Ouch!                    |
00113 // +--------------------------------+
00114 class ParagraphModel {
00115  public:
00116   ParagraphModel(tesseract::ParagraphJustification justification,
00117                  int margin,
00118                  int first_indent,
00119                  int body_indent,
00120                  int tolerance)
00121       : justification_(justification),
00122         margin_(margin),
00123         first_indent_(first_indent),
00124         body_indent_(body_indent),
00125         tolerance_(tolerance) {
00126     // Make one of {first_indent, body_indent} is 0.
00127     int added_margin = first_indent;
00128     if (body_indent < added_margin)
00129       added_margin = body_indent;
00130     margin_ += added_margin;
00131     first_indent_ -= added_margin;
00132     body_indent_ -= added_margin;
00133   }
00134
00135   ParagraphModel()
00136       : justification_(tesseract::JUSTIFICATION_UNKNOWN),
00137          margin_(0),
00138          first_indent_(0),
00139          body_indent_(0),
00140          tolerance_(0) { }
00141
00142   // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
00143   // in a block of text which we are trying to model:
00144   //   lmargin, lindent:  these add up to the distance from the leftmost ink
00145   //                      in the text line to the surrounding text block's left
00146   //                      edge.
00147   //   rmargin, rindent:  these add up to the distance from the rightmost ink
00148   //                      in the text line to the surrounding text block's right
00149   //                      edge.
00150   // The caller determines the division between "margin" and "indent", which
00151   // only actually affect whether we think the line may be centered.
00152   //
00153   // If the amount of whitespace matches the amount of whitespace expected on
00154   // the relevant side of the line (within tolerance_) we say it matches.
00155
00156   // Return whether a given text line could be a first paragraph line according
00157   // to this paragraph model.
00158   bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
00159
00160   // Return whether a given text line could be a first paragraph line according
00161   // to this paragraph model.
00162   bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
00163
00164   tesseract::ParagraphJustification justification() const {
00165     return justification_;
00166   }
00167   int margin() const { return margin_; }
00168   int first_indent() const { return first_indent_; }
00169   int body_indent() const { return body_indent_; }
00170   int tolerance() const { return tolerance_; }
00171   bool is_flush() const {
00172     return (justification_ == tesseract::JUSTIFICATION_LEFT ||
00173             justification_ == tesseract::JUSTIFICATION_RIGHT) &&
00174         abs(first_indent_ - body_indent_) <= tolerance_;
00175   }
00176
00177   // Return whether this model is likely to agree with the other model on most
00178   // paragraphs they are marked.
00179   bool Comparable(const ParagraphModel &other) const;
00180
00181   STRING ToString() const;
00182
00183  private:
00184   tesseract::ParagraphJustification justification_;
00185   int margin_;
00186   int first_indent_;
00187   int body_indent_;
00188   int tolerance_;
00189 };
00190
00191 #endif  // TESSERACT_CCSTRUCT_OCRPARA_H_