Tesseract  3.02
tesseract::Dawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::Dawg:
tesseract::SquishedDawg tesseract::Trie

List of all members.

Public Member Functions

DawgType type () const
const STRINGlang () const
PermuterType permuter () const
virtual ~Dawg ()
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg.
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char * > *cb) const
virtual EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
 Returns the edge that corresponds to the letter out of this node.
virtual void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const =0
virtual NODE_REF next_node (EDGE_REF edge_ref) const =0
virtual bool end_of_word (EDGE_REF edge_ref) const =0
virtual UNICHAR_ID edge_letter (EDGE_REF edge_ref) const =0
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
virtual void print_node (NODE_REF node, int max_num_edges) const =0
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const

Static Public Attributes

static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file.
static const UNICHAR_ID kPatternUnicharID = 0

Protected Member Functions

 Dawg ()
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge.
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge.
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge.
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word.
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge.
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg.
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges.
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
void init (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const char * > *cb) const

Protected Attributes

DawgType type_
STRING lang_
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg.
int unicharset_size_
int flag_start_bit_
int next_node_start_bit_
uinT64 next_node_mask_
uinT64 flags_mask_
uinT64 letter_mask_
int debug_level_

Detailed Description

Abstract class (an interface) that declares methods needed by the various tesseract classes to operate on SquishedDawg and Trie objects.

This class initializes all the edge masks (since their usage by SquishedDawg and Trie is identical) and implements simple accessors for each of the fields encoded in an EDGE_RECORD. This class also implements word_in_dawg() and check_for_words() (since they use only the public methods of SquishedDawg and Trie classes that are inherited from the Dawg base class).

Definition at line 122 of file dawg.h.


Constructor & Destructor Documentation

virtual tesseract::Dawg::~Dawg ( ) [inline, virtual]

Definition at line 135 of file dawg.h.

{};
tesseract::Dawg::Dawg ( ) [inline, protected]

Definition at line 191 of file dawg.h.

{}

Member Function Documentation

int tesseract::Dawg::check_for_words ( const char *  filename,
const UNICHARSET unicharset,
bool  enable_wildcard 
) const

Checks the Dawg for the words that are listed in the requested file. Returns the number of words in the given file missing from the Dawg.

Definition at line 69 of file dawg.cpp.

                                                      {
  if (filename == NULL) return 0;

  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  int misses = 0;
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);

  word_file = open_file (filename, "r");

  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if (word.length() > 0 &&
        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!match_words(&word, 0, 0,
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
        tprintf("Missing word: %s\n", string);
        ++misses;
      }
    } else {
      tprintf("Failed to create a valid word from %s\n", string);
    }
  }
  fclose (word_file);
  // Make sure the user sees this with fprintf instead of tprintf.
  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
  return misses;
}
int tesseract::Dawg::direction_from_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Returns the direction flag of this edge.

Definition at line 202 of file dawg.h.

                                                                        {
    return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
      BACKWARD_EDGE : FORWARD_EDGE;
  }
virtual EDGE_REF tesseract::Dawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [pure virtual]

Returns the edge that corresponds to the letter out of this node.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

virtual UNICHAR_ID tesseract::Dawg::edge_letter ( EDGE_REF  edge_ref) const [pure virtual]

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

bool tesseract::Dawg::edge_rec_match ( NODE_REF  next_node,
bool  word_end,
UNICHAR_ID  unichar_id,
NODE_REF  other_next_node,
bool  other_word_end,
UNICHAR_ID  other_unichar_id 
) const [inline, protected]

Returns true if all the values are equal (any value matches next_node if next_node == NO_EDGE, any value matches word_end if word_end is false).

Definition at line 253 of file dawg.h.

                                                                {
    return ((unichar_id == other_unichar_id) &&
            (next_node == NO_EDGE || next_node == other_next_node) &&
            (!word_end || (word_end == other_word_end)));
  }
virtual bool tesseract::Dawg::end_of_word ( EDGE_REF  edge_ref) const [pure virtual]

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

bool tesseract::Dawg::end_of_word_from_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Returns true if this edge marks the end of a word.

Definition at line 207 of file dawg.h.

                                                                           {
    return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
  }
int tesseract::Dawg::given_greater_than_edge_rec ( NODE_REF  next_node,
bool  word_end,
UNICHAR_ID  unichar_id,
const EDGE_RECORD edge_rec 
) const [inline, protected]

Sequentially compares the given values of unichar ID, next node and word end marker with the values in the given EDGE_RECORD. Returns: 1 if at any step the given input value exceeds that of edge_rec (and all the values already checked are the same) 0 if edge_rec_match() returns true -1 otherwise

Definition at line 232 of file dawg.h.

                                                                            {
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
    NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
    bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
    if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
                       curr_word_end, curr_unichar_id)) return 0;
    if (unichar_id > curr_unichar_id) return 1;
    if (unichar_id == curr_unichar_id) {
      if (next_node > curr_next_node) return 1;
      if (next_node == curr_next_node) {
        if (word_end > curr_word_end) return 1;
      }
    }
    return -1;
  }
void tesseract::Dawg::init ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
) [protected]

Sets type_, lang_, perm_, unicharset_size_. Initializes the values of various masks from unicharset_size_.

Definition at line 159 of file dawg.cpp.

                                                                         {
  type_ = type;
  lang_ = lang;
  perm_ = perm;
  ASSERT_HOST(unicharset_size > 0);
  unicharset_size_ = unicharset_size;
  // Set bit masks.
  flag_start_bit_ = ceil(log(static_cast<double>(unicharset_size_)) / log(2.0));
  next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
  letter_mask_ = ~(~0 << flag_start_bit_);
  next_node_mask_ = ~0 << (flag_start_bit_ + NUM_FLAG_BITS);
  flags_mask_ = ~(letter_mask_ | next_node_mask_);

  debug_level_ = debug_level;
}
void tesseract::Dawg::iterate_words ( const UNICHARSET unicharset,
TessCallback1< const char * > *  cb 
) const

Definition at line 101 of file dawg.cpp.

                                                                {
  WERD_CHOICE word(&unicharset);
  iterate_words_rec(word, 0, cb);
}
void tesseract::Dawg::iterate_words_rec ( const WERD_CHOICE word_so_far,
NODE_REF  to_explore,
TessCallback1< const char * > *  cb 
) const [protected]

Definition at line 107 of file dawg.cpp.

                                                                    {
  NodeChildVector children;
  this->unichar_ids_of(to_explore, &children);
  for (int i = 0; i < children.size(); i++) {
    WERD_CHOICE next_word(word_so_far);
    next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
    if (this->end_of_word(children[i].edge_ref)) {
      STRING s;
      next_word.string_and_lengths(&s, NULL);
      cb->Run(s.string());
    }
    NODE_REF next = next_node(children[i].edge_ref);
    if (next != 0) {
      iterate_words_rec(next_word, next, cb);
    }
  }
}
const STRING& tesseract::Dawg::lang ( ) const [inline]

Definition at line 132 of file dawg.h.

{ return lang_; }
bool tesseract::Dawg::marker_flag_from_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Returns the marker flag of this edge.

Definition at line 198 of file dawg.h.

                                                                           {
    return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
  }
bool tesseract::Dawg::match_words ( WERD_CHOICE word,
inT32  index,
NODE_REF  node,
UNICHAR_ID  wildcard 
) const [protected]

Matches all of the words that are represented by this string. If wilcard is set to something other than INVALID_UNICHAR_ID, the *'s in this string are interpreted as wildcards. WERD_CHOICE param is not passed by const so that wildcard searches can modify it and work without having to copy WERD_CHOICEs.

Definition at line 127 of file dawg.cpp.

                                                                 {
  EDGE_REF edge;
  inT32 word_end;

  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
    bool any_matched = false;
    NodeChildVector vec;
    this->unichar_ids_of(node, &vec);
    for (int i = 0; i < vec.size(); ++i) {
      word->set_unichar_id(vec[i].unichar_id, index);
      if (match_words(word, index, node, wildcard))
        any_matched = true;
    }
    word->set_unichar_id(wildcard, index);
    return any_matched;
  } else {
    word_end = index == word->length() - 1;
    edge = edge_char_of(node, word->unichar_id(index), word_end);
    if (edge != NO_EDGE) {  // normal edge in DAWG
      node = next_node(edge);
      if (word_end) {
        if (debug_level_ > 1) word->print("match_words() found: ");
        return true;
      } else if (node != 0) {
        return match_words(word, index+1, node, wildcard);
      }
    }
  }
  return false;
}
virtual NODE_REF tesseract::Dawg::next_node ( EDGE_REF  edge_ref) const [pure virtual]

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

NODE_REF tesseract::Dawg::next_node_from_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Returns the next node visited by following this edge.

Definition at line 194 of file dawg.h.

                                                                             {
    return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
  }
virtual EDGE_REF tesseract::Dawg::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [inline, virtual]

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented in tesseract::Trie.

Definition at line 185 of file dawg.h.

                                                                     {
    return false;
  }
PermuterType tesseract::Dawg::permuter ( ) const [inline]

Definition at line 133 of file dawg.h.

{ return perm_; }
virtual void tesseract::Dawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const [pure virtual]

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

void tesseract::Dawg::set_marker_flag_in_edge_rec ( EDGE_RECORD edge_rec) [inline, protected]

Sets this edge record to be the last one in a sequence of edges.

Definition at line 222 of file dawg.h.

                                                                 {
    *edge_rec |= (MARKER_FLAG << flag_start_bit_);
  }
void tesseract::Dawg::set_next_node_in_edge_rec ( EDGE_RECORD edge_rec,
EDGE_REF  value 
) [inline, protected]

Sets the next node link for this edge in the Dawg.

Definition at line 216 of file dawg.h.

                                             {
    *edge_rec &= (~next_node_mask_);
    *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
  }
DawgType tesseract::Dawg::type ( ) const [inline]

Definition at line 131 of file dawg.h.

{ return type_; }
UNICHAR_ID tesseract::Dawg::unichar_id_from_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Returns UNICHAR_ID recorded in this edge.

Definition at line 211 of file dawg.h.

                                         {
    return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
  }
virtual void tesseract::Dawg::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const [inline, virtual]

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented in tesseract::Trie.

Definition at line 178 of file dawg.h.

                                                                            {};
virtual void tesseract::Dawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec 
) const [pure virtual]

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implemented in tesseract::SquishedDawg, and tesseract::Trie.

bool tesseract::Dawg::word_in_dawg ( const WERD_CHOICE word) const

Returns true if the given word is in the Dawg.

Definition at line 48 of file dawg.cpp.

                                                     {
  if (word.length() == 0) return false;
  NODE_REF node = 0;
  int end_index = word.length() - 1;
  for (int i = 0; i <= end_index; i++) {
    if (debug_level_ > 1) {
      tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
      print_node(node, MAX_NODE_EDGES_DISPLAY);
      tprintf("\n");
    }
    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
    if (edge != NO_EDGE) {
      node = next_node(edge);
      if (node == 0) node = NO_EDGE;
    } else {
      return false;
    }
  }
  return true;
}

Member Data Documentation

Definition at line 298 of file dawg.h.

Definition at line 292 of file dawg.h.

Definition at line 295 of file dawg.h.

Magic number to determine endianness when reading the Dawg from file.

Definition at line 125 of file dawg.h.

A special unichar id that indicates that any appropriate pattern (e.g.dicitonary word, 0-9 digit, etc) can be inserted instead Used for expressing patterns in punctuation and number Dawgs.

Definition at line 129 of file dawg.h.

Definition at line 284 of file dawg.h.

Definition at line 296 of file dawg.h.

Definition at line 294 of file dawg.h.

Definition at line 293 of file dawg.h.

PermuterType tesseract::Dawg::perm_ [protected]

Permuter code that should be used if the word is found in this Dawg.

Definition at line 286 of file dawg.h.

Definition at line 283 of file dawg.h.

Definition at line 291 of file dawg.h.


The documentation for this class was generated from the following files: