Tesseract  3.02
tesseract-ocr/ccmain/paragraphs.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        paragraphs.h
00003  * Description: Paragraph Detection data structures.
00004  * Author:      David Eger
00005  * Created:     25 February 2011
00006  *
00007  * (C) Copyright 2011, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
00021 #define TESSERACT_CCMAIN_PARAGRAPHS_H_
00022 
00023 #include "rect.h"
00024 #include "ocrpara.h"
00025 #include "genericvector.h"
00026 #include "strngs.h"
00027 
00028 
00029 class WERD;
00030 class UNICHARSET;
00031 
00032 namespace tesseract {
00033 
00034 class MutableIterator;
00035 
00036 // Per-text-line summary holding everything paragraph detection needs.  Kept
00037 // exceedingly light-weight so paragraph detection can be tested independent
00038 // of the rest of Tesseract.  NOTE: no constructor is declared here, so the
00039 // bool/int/float members are not set by default-initialization.
00040 class RowInfo {
00041  public:
00042   // Constant data derived from Tesseract output.
00043   STRING text;        // the full UTF-8 text of the line.
00044   bool ltr;           // whether the majority of the text is left-to-right
00045                       // TODO(eger) make this more fine-grained.
00046 
00047   bool has_leaders;   // does the line contain leader dots (.....)?
00048   bool has_drop_cap;  // does the line have a drop cap?
00049   int pix_ldistance;  // distance to the left pblock boundary in pixels
00050   int pix_rdistance;  // distance to the right pblock boundary in pixels
00051   float pix_xheight;  // guessed xheight for the line
00052   int average_interword_space; // average space between words in pixels.
00053 
00054   int num_words;      // presumably the number of words on the line
00055   TBOX lword_box;     // leftmost word's box, normalized (horiz text rows) space
00056   TBOX rword_box;     // rightmost word's box, normalized (horiz text rows) space
00057 
00058   STRING lword_text;   // the UTF-8 text of the leftmost word
00059   STRING rword_text;   // the UTF-8 text of the rightmost word
00060 
00061   // The text of a paragraph typically starts with the start of an idea and
00062   // ends with the end of an idea.  Here we define a paragraph as something
00063   // that may have a first line indent and a body indent which may differ.
00064   // Typical words that start an idea are:
00065   //   1. Words in western scripts that start with
00066   //      a capital letter, for example "The"
00067   //   2. Bulleted or numbered list items, for
00068   //      example "2."
00069   // Typical words which end an idea are words ending in punctuation marks. In
00070   // this vocabulary, each list item is represented as a paragraph.
00071   bool lword_indicates_list_item;  // the three flags for the leftmost word
00072   bool lword_likely_starts_idea;
00073   bool lword_likely_ends_idea;
00074 
00075   bool rword_indicates_list_item;  // the same three flags, rightmost word
00076   bool rword_likely_starts_idea;
00077   bool rword_likely_ends_idea;
00078 };
00079 
00080 // Main entry point for the Paragraph Detection Algorithm.
00081 //
00082 // Given a set of equally spaced text lines (described by row_infos),
00083 // split them into paragraphs.  See http://goto/paragraphstalk
00084 // (NOTE(review): that looks like an internal short link; it may not resolve).
00085 // Output:
00086 //   row_owners - one pointer for each row, to the paragraph it belongs to.
00087 //   paragraphs - this is the actual list of PARA objects.
00088 //   models - the list of paragraph models referenced by the PARA objects.
00089 //            The caller is responsible for deleting the models.
00090 void DetectParagraphs(int debug_level,
00091                       GenericVector<RowInfo> *row_infos,
00092                       GenericVector<PARA *> *row_owners,
00093                       PARA_LIST *paragraphs,
00094                       GenericVector<ParagraphModel *> *models);
00095 
00096 // Given a MutableIterator to the start of a block, run DetectParagraphs on
00097 // that block and commit the results to the underlying ROW and BLOCK structs,
00098 // saving the ParagraphModels in models.  Caller owns the models.
00099 // A unicharset (not a parameter; presumably reached via block_start) is used
00100 // to answer questions such as "is the first letter of this word upper case?"
00101 void DetectParagraphs(int debug_level,
00102                       const MutableIterator *block_start,
00103                       GenericVector<ParagraphModel *> *models);
00104 
00105 }  // namespace
00106 
00107 #endif  // TESSERACT_CCMAIN_PARAGRAPHS_H_