Tesseract  3.02
tesseract-ocr/ccmain/resultiterator.h
Go to the documentation of this file.
00001 
00002 // File:        resultiterator.h
00003 // Description: Iterator for tesseract results that is capable of
00004 //              iterating in proper reading order over bidirectional
00005 //              (e.g. mixed Hebrew and English) text.
00006 // Author:      David Eger
00007 // Created:     Fri May 27 13:58:06 PST 2011
00008 //
00009 // (C) Copyright 2011, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
00023 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
00024 
00025 #include "platform.h"
00026 #include "ltrresultiterator.h"
00027 #include "genericvector.h"
00028 
// Forward declarations of types defined outside this header. Of these three,
// only STRING is referenced by the declarations in this file, and only as a
// pointer parameter type, so an incomplete type is sufficient here.
00029 class BLOB_CHOICE_IT;
00030 class WERD_RES;
00031 class STRING;
00032 
00033 namespace tesseract {
00034 
// Forward declaration of tesseract::Tesseract. No declaration visible in this
// file refers to it.
00035 class Tesseract;
00036 
// ResultIterator publicly derives from LTRResultIterator. The header comment
// of this file describes it as an iterator over tesseract results that can
// iterate in proper reading order over bidirectional (e.g. mixed Hebrew and
// English) text. The only constructor declared here is protected, so outside
// code obtains an instance through the public static StartOfParagraph().
00037 class TESS_API ResultIterator : public LTRResultIterator {
00038  public:
        // Static function that takes an LTRResultIterator and returns a pointer
        // to a ResultIterator.
        // NOTE(review): the name suggests the result is positioned at the start
        // of resit's paragraph, and the raw-pointer return suggests the caller
        // owns (and must delete) it -- confirm both against the implementation.
00039   static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
00040 
        // Virtual destructor with an empty body.
00045   virtual ~ResultIterator() {}
00046 
00047   // ============= Moving around within the page ============.
        // NOTE(review): presumably repositions the iterator at its starting
        // point. Declared virtual; the base-class declaration is not visible in
        // this file.
00052   virtual void Begin();
00053 
        // Non-const: may change the iterator's state. Takes the
        // PageIteratorLevel to step at.
        // NOTE(review): presumably advances to the next element at `level` and
        // returns false when there is none -- TODO confirm the exact meaning of
        // the return value.
00066   virtual bool Next(PageIteratorLevel level);
00067 
        // Const query taking a PageIteratorLevel.
        // NOTE(review): presumably true when the current position is the first
        // position of the enclosing element at `level` -- confirm.
00074   virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
00075 
        // Const query taking two PageIteratorLevel values.
        // NOTE(review): presumably true when the current `element` is the last
        // one inside the enclosing `level` -- confirm the roles of the two
        // parameters against the implementation.
00081   virtual bool IsAtFinalElement(PageIteratorLevel level,
00082                                 PageIteratorLevel element) const;
00083 
00084   // ============= Accessing data ==============.
00085 
        // Const accessor returning a raw, non-const char* for the given level.
        // NOTE(review): the name indicates UTF-8 text; who owns and frees the
        // returned buffer is not visible in this header -- confirm before use.
00090   virtual char* GetUTF8Text(PageIteratorLevel level) const;
00091 
        // Non-virtual const query.
        // NOTE(review): presumably reports whether the current paragraph is
        // left-to-right -- confirm.
00096   bool ParagraphIsLtr() const;
00097 
00098   // ============= Exposed only for testing =============.
00099 
        // Static overload of CalculateTextlineOrder operating on a vector of
        // StrongScriptDirection values rather than on an iterator. word_dirs is
        // passed by const reference (read-only); reading_order is a non-const
        // pointer.
        // NOTE(review): reading_order is presumably the output, holding word
        // indices in reading order, possibly interleaved with the k* constants
        // declared below -- TODO confirm.
00122   static void CalculateTextlineOrder(
00123       bool paragraph_is_ltr,
00124       const GenericVector<StrongScriptDirection> &word_dirs,
00125       GenericVectorEqEq<int> *reading_order);
00126 
        // Class-scope integer constants. They are declared here without
        // initializers, so their values come from definitions outside this
        // header.
        // NOTE(review): presumably marker values used by
        // CalculateTextlineOrder -- confirm.
00127   static const int kMinorRunStart;
00128   static const int kMinorRunEnd;
00129   static const int kComplexWord;
00130 
00131  protected:
        // Explicit (no implicit conversion) constructor from an
        // LTRResultIterator, accessible only to this class and derived classes.
        // NOTE(review): TESS_LOCAL is a macro not defined in this file;
        // presumably it restricts symbol visibility -- confirm in platform.h.
00138   TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
00139 
00140  private:
        // Const query.
        // NOTE(review): presumably computes the direction that is cached in
        // current_paragraph_is_ltr_ -- confirm.
00145   bool CurrentParagraphIsLtr() const;
00146 
        // Private const overloads of CalculateTextlineOrder that take an
        // LTRResultIterator (by const reference) instead of a vector of word
        // directions. `indices` is a non-const pointer, presumably the output.
00158   void CalculateTextlineOrder(bool paragraph_is_ltr,
00159                               const LTRResultIterator &resit,
00160                               GenericVectorEqEq<int> *indices) const;
        // Same as above with one extra non-const pointer parameter, ssd.
        // NOTE(review): ssd is presumably filled with the per-word directions
        // found on the text line -- confirm, including whether it may be NULL.
00162   void CalculateTextlineOrder(bool paragraph_is_ltr,
00163                               const LTRResultIterator &resit,
00164                               GenericVector<StrongScriptDirection> *ssd,
00165                               GenericVectorEqEq<int> *indices) const;
00166 
        // Const query returning an int.
        // NOTE(review): presumably the index of the current word counted in
        // left-to-right order within its text line -- confirm.
00171   int LTRWordIndex() const;
00172 
        // Const; takes a non-const pointer to a vector of int.
        // NOTE(review): blob_indices is presumably an output listing blob
        // indices of the current word in reading order -- confirm.
00177   void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
00178 
        // Non-const: may change the iterator's position/state.
00180   void MoveToLogicalStartOfTextline();
00181 
        // Non-const: may change the iterator's position/state.
00186   void MoveToLogicalStartOfWord();
00187 
        // Const position query.
00189   bool IsAtFinalSymbolOfWord() const;
00190 
        // Const position query.
00192   bool IsAtFirstSymbolOfWord() const;
00193 
        // Const helper taking a non-const STRING pointer.
        // NOTE(review): presumably appends to *text rather than overwriting
        // it; what the "suffix marks" are is not visible here -- confirm.
00198   void AppendSuffixMarks(STRING *text) const;
00199 
        // Const helper taking a non-const STRING pointer; presumably appends
        // the current word's UTF-8 text to *text -- confirm.
00201   void AppendUTF8WordText(STRING *text) const;
00202 
        // Unlike the other Append* helpers, this one is non-const, so it may
        // modify the iterator.
        // NOTE(review): the name suggests it advances through the text line
        // while appending to *text -- confirm where the iterator ends up.
00210   void IterateAndAppendUTF8TextlineText(STRING *text);
00211 
        // Const helper taking a non-const STRING pointer; presumably appends
        // the current paragraph's UTF-8 text to *text -- confirm.
00218   void AppendUTF8ParagraphText(STRING *text) const;
00219 
        // Const query taking an int threshold.
        // NOTE(review): presumably true when bidi debug output at min_level is
        // enabled -- confirm where the debug level comes from.
00221   bool BidiDebug(int min_level) const;
00222 
        // State flag. NOTE(review): presumably caches whether the current
        // paragraph is left-to-right -- confirm when it is refreshed.
00223   bool current_paragraph_is_ltr_;
00224 
        // State flag. NOTE(review): presumably set while positioned at the
        // start of a run in the non-dominant direction -- confirm.
00229   bool at_beginning_of_minor_run_;
00230 
        // State flag. NOTE(review): presumably set while iterating inside a
        // run whose direction differs from the paragraph's -- confirm.
00232   bool in_minor_direction_;
00233 };
00234 
00235 }  // namespace tesseract.
00236 
00237 #endif  // TESSERACT_CCMAIN_RESULT_ITERATOR_H__