Tesseract  3.02
tesseract::ResultIterator Class Reference

#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:
tesseract::LTRResultIterator tesseract::PageIterator tesseract::MutableIterator

List of all members.

Public Member Functions

virtual ~ResultIterator ()
virtual void Begin ()
virtual bool Next (PageIteratorLevel level)
virtual bool IsAtBeginningOf (PageIteratorLevel level) const
virtual bool IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const
virtual char * GetUTF8Text (PageIteratorLevel level) const
bool ParagraphIsLtr () const

Static Public Member Functions

static ResultIteratorStartOfParagraph (const LTRResultIterator &resit)
static void CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)

Static Public Attributes

static const int kMinorRunStart = -1
static const int kMinorRunEnd = -2
static const int kComplexWord = -3

Protected Member Functions

TESS_LOCAL ResultIterator (const LTRResultIterator &resit)

Detailed Description

Definition at line 37 of file resultiterator.h.


Constructor & Destructor Documentation

virtual tesseract::ResultIterator::~ResultIterator ( ) [inline, virtual]

ResultIterator is copy constructible! The default copy constructor works just fine for us.

Definition at line 45 of file resultiterator.h.

{}
tesseract::ResultIterator::ResultIterator ( const LTRResultIterator resit) [explicit, protected]

We presume the data associated with the given iterator will outlive us. NB: This is private because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 33 of file resultiterator.cpp.

    : LTRResultIterator(resit) {
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  MoveToLogicalStartOfTextline();
}

Member Function Documentation

void tesseract::ResultIterator::Begin ( ) [virtual]

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 408 of file resultiterator.cpp.

                           {
  LTRResultIterator::Begin();
  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
  in_minor_direction_ = false;
  at_beginning_of_minor_run_ = false;
  MoveToLogicalStartOfTextline();
}
void tesseract::ResultIterator::CalculateTextlineOrder ( bool  paragraph_is_ltr,
const GenericVector< StrongScriptDirection > &  word_dirs,
GenericVectorEqEq< int > *  reading_order 
) [static]

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta marks are passed as negative values: kMinorRunStart Start of minor direction text. kMinorRunEnd End of minor direction text. kComplexWord The next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 } Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 } Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 248 of file resultiterator.cpp.

                                           {
  reading_order->truncate(0);
  if (word_dirs.size() == 0) return;

  // Take all of the runs of minor direction words and insert them
  // in reverse order.
  int minor_direction, major_direction, major_step, start, end;
  if (paragraph_is_ltr) {
    start = 0;
    end = word_dirs.size();
    major_step = 1;
    major_direction = DIR_LEFT_TO_RIGHT;
    minor_direction = DIR_RIGHT_TO_LEFT;
  } else {
    start = word_dirs.size() - 1;
    end = -1;
    major_step = -1;
    major_direction = DIR_RIGHT_TO_LEFT;
    minor_direction = DIR_LEFT_TO_RIGHT;
    // Special rule: if there are neutral words at the right most side
    //   of a line adjacent to a left-to-right word in the middle of the
    //   line, we interpret the end of the line as a single LTR sequence.
    if (word_dirs[start] == DIR_NEUTRAL) {
      int neutral_end = start;
      while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
        neutral_end--;
      }
      if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
        // LTR followed by neutrals.
        // Scan for the beginning of the minor left-to-right run.
        int left = neutral_end;
        for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
          if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
        }
        reading_order->push_back(kMinorRunStart);
        for (int i = left; i < word_dirs.size(); i++) {
          reading_order->push_back(i);
          if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
        }
        reading_order->push_back(kMinorRunEnd);
        start = left - 1;
      }
    }
  }
  for (int i = start; i != end;) {
    if (word_dirs[i] == minor_direction) {
      int j = i;
      while (j != end && word_dirs[j] != major_direction)
        j += major_step;
      if (j == end) j -= major_step;
      while (j != i && word_dirs[j] != minor_direction)
        j -= major_step;
      //  [j..i] is a minor direction run.
      reading_order->push_back(kMinorRunStart);
      for (int k = j; k != i; k -= major_step) {
        reading_order->push_back(k);
      }
      reading_order->push_back(i);
      reading_order->push_back(kMinorRunEnd);
      i = j + major_step;
    } else {
      reading_order->push_back(i);
      if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
      i += major_step;
    }
  }
}
char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel  level) const [virtual]

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Reimplemented from tesseract::LTRResultIterator.

Definition at line 551 of file resultiterator.cpp.

                                                               {
  if (it_->word() == NULL) return NULL;  // Already at the end!
  STRING text;
  switch (level) {
    case RIL_BLOCK:
      {
        ResultIterator pp(*this);
        do {
          pp.AppendUTF8ParagraphText(&text);
        } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
      }
      break;
    case RIL_PARA:
      AppendUTF8ParagraphText(&text);
      break;
    case RIL_TEXTLINE:
      {
        ResultIterator it(*this);
        it.MoveToLogicalStartOfTextline();
        it.IterateAndAppendUTF8TextlineText(&text);
      }
      break;
    case RIL_WORD:
      AppendUTF8WordText(&text);
      break;
    case RIL_SYMBOL:
      {
        bool reading_direction_is_ltr =
          current_paragraph_is_ltr_ ^ in_minor_direction_;
        if (at_beginning_of_minor_run_) {
          text += reading_direction_is_ltr ? kLRM : kRLM;
        }
        text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
        if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
      }
      break;
  }
  int length = text.length() + 1;
  char* result = new char[length];
  strncpy(result, text.string(), length);
  return result;
}
bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel  level) const [virtual]

IsAtBeginningOf() returns whether we're at the logical beginning of the given level. (as opposed to ResultIterator's left-to-right top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h

Reimplemented from tesseract::PageIterator.

Definition at line 491 of file resultiterator.cpp.

                                                                  {
  if (it_->block() == NULL) return false;  // Already at the end!
  if (it_->word() == NULL) return true;  // In an image block.
  if (level == RIL_SYMBOL) return true;  // Always at beginning of a symbol.

  bool at_word_start = IsAtFirstSymbolOfWord();
  if (level == RIL_WORD) return at_word_start;

  ResultIterator line_start(*this);
  // move to the first word in the line...
  line_start.MoveToLogicalStartOfTextline();

  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
  if (level == RIL_TEXTLINE) return at_textline_start;

  // now we move to the left-most word...
  line_start.RestartRow();
  bool at_block_start = at_textline_start &&
      line_start.it_->block() != line_start.it_->prev_block();
  if (level == RIL_BLOCK) return at_block_start;

  bool at_para_start = at_block_start ||
      (at_textline_start &&
       line_start.it_->row()->row->para() !=
           line_start.it_->prev_row()->row->para());
  if (level == RIL_PARA) return at_para_start;

  ASSERT_HOST(false);  // shouldn't happen.
  return false;
}
bool tesseract::ResultIterator::IsAtFinalElement ( PageIteratorLevel  level,
PageIteratorLevel  element 
) const [virtual]

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 527 of file resultiterator.cpp.

                                                                       {
  if (Empty(element)) return true;  // Already at the end!
  // The result is true if we step forward by element and find we are
  // at the the end of the page or at beginning of *all* levels in:
  // [level, element).
  // When there is more than one level difference between element and level,
  // we could for instance move forward one symbol and still be at the first
  // word on a line, so we also have to be at the first symbol in a word.
  ResultIterator next(*this);
  next.Next(element);
  if (next.Empty(element)) return true;  // Reached the end of the page.
  while (element > level) {
    element = static_cast<PageIteratorLevel>(element - 1);
    if (!next.IsAtBeginningOf(element))
      return false;
  }
  return true;
}
bool tesseract::ResultIterator::Next ( PageIteratorLevel  level) [virtual]

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 416 of file resultiterator.cpp.

                                                 {
  if (it_->block() == NULL) return false; // already at end!
  switch (level) {
    case RIL_BLOCK:  // explicit fall-through
    case RIL_PARA:   // explicit fall-through
    case RIL_TEXTLINE:
      if (!PageIterator::Next(level)) return false;
      if (IsWithinFirstTextlineOfParagraph()) {
        // if we've advanced to a new paragraph,
        // recalculate current_paragraph_is_ltr_
        current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
      }
      in_minor_direction_ = false;
      MoveToLogicalStartOfTextline();
      return it_->block() != NULL;
    case RIL_SYMBOL:
    {
      GenericVector<int> blob_order;
      CalculateBlobOrder(&blob_order);
      int next_blob = 0;
      while (next_blob < blob_order.size() &&
             blob_index_ != blob_order[next_blob])
        next_blob++;
      next_blob++;
      if (next_blob < blob_order.size()) {
        // we're in the same word; simply advance one blob.
        BeginWord(blob_order[next_blob]);
        at_beginning_of_minor_run_ = false;
        return true;
      }
      level = RIL_WORD;  // we've fallen through to the next word.
    }
    case RIL_WORD:  // explicit fall-through.
    {
      if (it_->word() == NULL) return Next(RIL_BLOCK);
      GenericVectorEqEq<int> word_indices;
      int this_word_index = LTRWordIndex();
      CalculateTextlineOrder(current_paragraph_is_ltr_,
                             *this,
                             &word_indices);
      int final_real_index = word_indices.size() - 1;
      while (final_real_index > 0 && word_indices[final_real_index] < 0)
        final_real_index--;
      for (int i = 0; i < final_real_index; i++) {
        if (word_indices[i] == this_word_index) {
          int j = i + 1;
          for (; j < final_real_index && word_indices[j] < 0; j++) {
            if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
            if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
          }
          at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
          // awesome, we move to word_indices[j]
          if (BidiDebug(3)) {
            tprintf("Next(RIL_WORD): %d -> %d\n",
                    this_word_index, word_indices[j]);
          }
          PageIterator::RestartRow();
          for (int k = 0; k < word_indices[j]; k++) {
            PageIterator::Next(RIL_WORD);
          }
          MoveToLogicalStartOfWord();
          return true;
        }
      }
      if (BidiDebug(3)) {
        tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
      }
      // we're going off the end of the text line.
      return Next(RIL_TEXTLINE);
    }
  }
  ASSERT_HOST(false);  // shouldn't happen.
  return false;
}
bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 46 of file resultiterator.cpp.

                                          {
  return current_paragraph_is_ltr_;
}
ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator resit) [static]

Definition at line 41 of file resultiterator.cpp.

                                    {
  return new ResultIterator(resit);
}

Member Data Documentation

const int tesseract::ResultIterator::kComplexWord = -3 [static]

Definition at line 129 of file resultiterator.h.

const int tesseract::ResultIterator::kMinorRunEnd = -2 [static]

Definition at line 128 of file resultiterator.h.

Definition at line 127 of file resultiterator.h.


The documentation for this class was generated from the following files: