Tesseract  3.02
tesseract-ocr/ccmain/pageiterator.h
Go to the documentation of this file.
00001 
00002 // File:        pageiterator.h
00003 // Description: Iterator for tesseract page structure that avoids using
00004 //              tesseract internal data structures.
00005 // Author:      Ray Smith
00006 // Created:     Fri Feb 26 11:01:06 PST 2010
00007 //
00008 // (C) Copyright 2010, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
00022 #define TESSERACT_CCMAIN_PAGEITERATOR_H__
00023 
00024 #include "publictypes.h"
00025 #include "platform.h"
00026 
// Forward declarations. This header only names these types through pointers
// (PAGE_RES*, PAGE_RES_IT*, WERD*, C_BLOB_IT*, Pix*), so their full
// definitions are not required here. PBLOB_IT is declared but is not
// referenced anywhere else in this header.
00027 class C_BLOB_IT;
00028 class PBLOB_IT;
00029 class PAGE_RES;
00030 class PAGE_RES_IT;
00031 class WERD;
00032 struct Pix;
00033 
00034 namespace tesseract {
00035 
// Forward declaration: PageIterator only accepts and stores a Tesseract*.
00036 class Tesseract;
00037 
// PageIterator: iterator over the tesseract page structure. Per the file
// header, its purpose is to avoid exposing tesseract internal data structures
// to callers. Several members are virtual, so the class can serve as a
// polymorphic base.
// Only declarations are visible in this header; notes marked NOTE(review) are
// reviewer assumptions that must be confirmed against the implementation.
// TESS_API / TESS_LOCAL are not defined here (presumably they come from
// platform.h and control symbol export -- confirm).
00051 class TESS_API PageIterator {
00052  public:
        // Builds an iterator from a PAGE_RES and a Tesseract instance, plus a
        // scale, a scaled y-resolution and a rectangle given as
        // left/top/width/height. The parameter names mirror the scale_,
        // scaled_yres_ and rect_*_ members declared at the end of the class.
        // NOTE(review): presumably the rectangle is the image rectangle set in
        // the API (see the coordinate-system comment below) -- confirm.
00067   PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
00068                int scale, int scaled_yres,
00069                int rect_left, int rect_top,
00070                int rect_width, int rect_height);
        // Virtual destructor, so a derived iterator can be destroyed through a
        // PageIterator pointer.
00071   virtual ~PageIterator();
00072 
        // User-declared copy constructor and copy assignment.
        // NOTE(review): presumably required because it_ / cblob_it_ are raw
        // pointers that a member-wise copy would alias -- confirm ownership in
        // the implementation.
        // Note that operator= returns a const reference rather than the
        // conventional non-const PageIterator&.
00079   PageIterator(const PageIterator& src);
00080   const PageIterator& operator=(const PageIterator& src);
00081 
        // Compares this iterator's position with a raw PAGE_RES_IT.
        // NOTE(review): presumably true when both refer to the same word --
        // confirm.
00083   bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
00084 
00085   // ============= Moving around within the page ============.
00086 
        // Virtual, non-const repositioning call.
        // NOTE(review): presumably moves the iterator to the start of the page
        // -- confirm.
00091   virtual void Begin();
00092 
        // Virtual, non-const repositioning call.
        // NOTE(review): presumably moves back to the start of the current
        // paragraph -- confirm.
00098   virtual void RestartParagraph();
00099 
        // Const query; non-virtual.
        // NOTE(review): presumably true when the current position lies in the
        // first text line of its paragraph -- confirm.
00104   bool IsWithinFirstTextlineOfParagraph() const;
00105 
        // Virtual, non-const repositioning call.
        // NOTE(review): presumably moves back to the start of the current row
        // (text line) -- confirm.
00111   virtual void RestartRow();
00112 
        // Advances the iterator at the granularity given by level.
        // NOTE(review): presumably returns false when there is no further
        // element to move to -- confirm.
00124   virtual bool Next(PageIteratorLevel level);
00125 
        // Const query on the current position for the given level.
        // NOTE(review): presumably true when the iterator sits on the first
        // sub-element of an element at that level -- confirm.
00139   virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
00140 
        // Const query taking two levels.
        // NOTE(review): presumably true when the current `element` is the last
        // one inside the enclosing `level` -- confirm.
00157   virtual bool IsAtFinalElement(PageIteratorLevel level,
00158                                 PageIteratorLevel element) const;
00159 
        // Compares this iterator with another and returns the result as an int.
        // NOTE(review): the sign convention is not visible here; presumably
        // negative / zero / positive for before / same / after -- confirm.
00166   int Cmp(const PageIterator &other) const;
00167 
00168   // ============= Accessing data ==============.
00169   // Coordinate system:
00170   // Integer coordinates are at the cracks between the pixels.
00171   // The top-left corner of the top-left pixel in the image is at (0,0).
00172   // The bottom-right corner of the bottom-right pixel in the image is at
00173   // (width, height).
00174   // Every bounding box goes from the top-left of the top-left contained
00175   // pixel to the bottom-right of the bottom-right contained pixel, so
00176   // the bounding box of the single top-left pixel in the image is:
00177   // (0,0)->(1,1).
00178   // If an image rectangle has been set in the API, then returned coordinates
00179   // relate to the original (full) image, rather than the rectangle.
00180 
        // Reports the box of the element at `level` through the four
        // out-pointers, using the coordinate convention described above.
        // NOTE(review): the bool result presumably signals whether a box was
        // available; whether null out-pointers are tolerated is not visible
        // here -- confirm.
00190   bool BoundingBox(PageIteratorLevel level,
00191                    int* left, int* top, int* right, int* bottom) const;
        // Same signature as BoundingBox.
        // NOTE(review): presumably yields the box in internal coordinates,
        // before scale_ / rect_* adjustment -- confirm.
00197   bool BoundingBoxInternal(PageIteratorLevel level,
00198                            int* left, int* top, int* right, int* bottom) const;
00199 
        // Const query for the given level.
        // NOTE(review): presumably true when there is nothing at that level at
        // the current position -- confirm.
00201   bool Empty(PageIteratorLevel level) const;
00202 
        // Returns a PolyBlockType (declared outside this header, presumably in
        // publictypes.h) for the current position.
00207   PolyBlockType BlockType() const;
00208 
        // Returns a Pix* for the element at `level`.
        // NOTE(review): ownership of the returned Pix is not expressed in the
        // signature; confirm whether the caller must release it.
00215   Pix* GetBinaryImage(PageIteratorLevel level) const;
00216 
        // Returns a Pix* for the element at `level`, taking a padding amount
        // and writing a left/top position through the out-pointers.
        // NOTE(review): presumably left/top locate the returned image within
        // the original image; ownership of the Pix is not visible -- confirm.
00227   Pix* GetImage(PageIteratorLevel level, int padding,
00228                 int* left, int* top) const;
00229 
        // Reports a baseline as two points (x1,y1)-(x2,y2) through the
        // out-pointers.
        // NOTE(review): the bool result presumably signals whether a baseline
        // was available -- confirm.
00236   bool Baseline(PageIteratorLevel level,
00237                 int* x1, int* y1, int* x2, int* y2) const;
00238 
        // Writes orientation, writing direction, text-line order and a deskew
        // angle through the four out-pointers. Returns nothing, so no failure
        // is reported through the signature.
        // NOTE(review): units of deskew_angle are not visible here -- confirm.
00247   void Orientation(tesseract::Orientation *orientation,
00248                    tesseract::WritingDirection *writing_direction,
00249                    tesseract::TextlineOrder *textline_order,
00250                    float *deskew_angle) const;
00251 
        // Writes paragraph justification, list-item flag, crown flag and
        // first-line indent through the four out-pointers.
        // NOTE(review): units of first_line_indent and the exact meaning of
        // "crown" are not visible here -- confirm.
00280   void ParagraphInfo(tesseract::ParagraphJustification *justification,
00281                      bool *is_list_item,
00282                      bool *is_crown,
00283                      int *first_line_indent) const;
00284 
00285  protected:
        // Protected helper marked TESS_LOCAL.
        // NOTE(review): presumably positions the iterator inside the current
        // word at the given blob offset -- confirm.
00290   TESS_LOCAL void BeginWord(int offset);
00291 
        // State. All pointers are raw, so ownership is not expressed in the
        // types.
        // NOTE(review): page_res_ and tesseract_ share names with the
        // constructor arguments and are presumably borrowed; it_ and cblob_it_
        // are presumably owned (hence the user-declared destructor and copy
        // operations) -- confirm in the implementation.
00293   PAGE_RES* page_res_;
00295   Tesseract* tesseract_;
00300   PAGE_RES_IT* it_;
00305   WERD* word_;
00307   int word_length_;
00309   int blob_index_;
00315   C_BLOB_IT* cblob_it_;
        // NOTE(review): presumably copies of the like-named constructor
        // arguments (scale, scaled_yres, rect_left/top/width/height) --
        // confirm.
00317   int scale_;
00318   int scaled_yres_;
00319   int rect_left_;
00320   int rect_top_;
00321   int rect_width_;
00322   int rect_height_;
00323 };
00324 
00325 }  // namespace tesseract.
00326 
00327 #endif  // TESSERACT_CCMAIN_PAGEITERATOR_H__