// Tesseract 3.02
00001 /********************************************************************** 00002 * File: paragraphs.h 00003 * Description: Paragraph Detection data structures. 00004 * Author: David Eger 00005 * Created: 25 February 2011 00006 * 00007 * (C) Copyright 2011, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_ 00021 #define TESSERACT_CCMAIN_PARAGRAPHS_H_ 00022 00023 #include "rect.h" 00024 #include "ocrpara.h" 00025 #include "genericvector.h" 00026 #include "strngs.h" 00027 00028 00029 class WERD; 00030 class UNICHARSET; 00031 00032 namespace tesseract { 00033 00034 class MutableIterator; 00035 00036 // This structure captures all information needed about a text line for the 00037 // purposes of paragraph detection. It is meant to be exceedingly light-weight 00038 // so that we can easily test paragraph detection independent of the rest of 00039 // Tesseract. 00040 class RowInfo { 00041 public: 00042 // Constant data derived from Tesseract output. 00043 STRING text; // the full UTF-8 text of the line. 00044 bool ltr; // whether the majority of the text is left-to-right 00045 // TODO(eger) make this more fine-grained. 00046 00047 bool has_leaders; // does the line contain leader dots (.....)? 00048 bool has_drop_cap; // does the line have a drop cap? 
00049 int pix_ldistance; // distance to the left pblock boundary in pixels 00050 int pix_rdistance; // distance to the right pblock boundary in pixels 00051 float pix_xheight; // guessed xheight for the line 00052 int average_interword_space; // average space between words in pixels. 00053 00054 int num_words; 00055 TBOX lword_box; // in normalized (horiz text rows) space 00056 TBOX rword_box; // in normalized (horiz text rows) space 00057 00058 STRING lword_text; // the UTF-8 text of the leftmost werd 00059 STRING rword_text; // the UTF-8 text of the rightmost werd 00060 00061 // The text of a paragraph typically starts with the start of an idea and 00062 // ends with the end of an idea. Here we define paragraph as something that 00063 // may have a first line indent and a body indent which may be different. 00064 // Typical words that start an idea are: 00065 // 1. Words in western scripts that start with 00066 // a capital letter, for example "The" 00067 // 2. Bulleted or numbered list items, for 00068 // example "2." 00069 // Typical words which end an idea are words ending in punctuation marks. In 00070 // this vocabulary, each list item is represented as a paragraph. 00071 bool lword_indicates_list_item; 00072 bool lword_likely_starts_idea; 00073 bool lword_likely_ends_idea; 00074 00075 bool rword_indicates_list_item; 00076 bool rword_likely_starts_idea; 00077 bool rword_likely_ends_idea; 00078 }; 00079 00080 // Main entry point for Paragraph Detection Algorithm. 00081 // 00082 // Given a set of equally spaced textlines (described by row_infos), 00083 // Split them into paragraphs. See http://goto/paragraphstalk 00084 // 00085 // Output: 00086 // row_owners - one pointer for each row, to the paragraph it belongs to. 00087 // paragraphs - this is the actual list of PARA objects. 00088 // models - the list of paragraph models referenced by the PARA objects. 00089 // caller is responsible for deleting the models. 
00090 void DetectParagraphs(int debug_level, 00091 GenericVector<RowInfo> *row_infos, 00092 GenericVector<PARA *> *row_owners, 00093 PARA_LIST *paragraphs, 00094 GenericVector<ParagraphModel *> *models); 00095 00096 // Given a MutableIterator to the start of a block, run DetectParagraphs on 00097 // that block and commit the results to the underlying ROW and BLOCK structs, 00098 // saving the ParagraphModels in models. Caller owns the models. 00099 // We use unicharset during the function to answer questions such as "is the 00100 // first letter of this word upper case?" 00101 void DetectParagraphs(int debug_level, 00102 const MutableIterator *block_start, 00103 GenericVector<ParagraphModel *> *models); 00104 00105 } // namespace 00106 00107 #endif // TESSERACT_CCMAIN_PARAGRAPHS_H_