Tesseract  3.02
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

List of all members.

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level)
virtual ~Trie ()
void clear ()
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF next_node (EDGE_REF edge_ref) const
bool end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
void print_node (NODE_REF node, int max_num_edges) const
SquishedDawgtrie_to_dawg ()
bool read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
void initialize_patterns (UNICHARSET *unicharset)
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
bool add_word_to_dawg (const WERD_CHOICE &word)

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)

Static Public Attributes

static const int kSaneNumConcreteChars = 4
static const char kAlphaPatternUnicode [] = "\u2000"
static const char kDigitPatternUnicode [] = "\u2001"
static const char kAlphanumPatternUnicode [] = "\u2002"
static const char kPuncPatternUnicode [] = "\u2003"
static const char kLowerPatternUnicode [] = "\u2004"
static const char kUpperPatternUnicode [] = "\u2005"

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void print_edge_rec (const EDGE_RECORD &edge_rec) const
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
void print_all (const char *msg, int max_num_edges)
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
NODE_REF new_dawg_node ()
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes)
void sort_edges (EDGE_VECTOR *edges)
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
UNICHAR_ID character_class_to_pattern (char ch)

Protected Attributes

TRIE_NODES nodes_
uinT64 num_edges_
uinT64 max_num_edges_
uinT64 deref_direction_mask_
uinT64 deref_node_index_mask_
bool initialized_patterns_
UNICHAR_ID alpha_pattern_
UNICHAR_ID digit_pattern_
UNICHAR_ID alphanum_pattern_
UNICHAR_ID punc_pattern_
UNICHAR_ID lower_pattern_
UNICHAR_ID upper_pattern_

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 62 of file trie.h.


Member Enumeration Documentation

Enumerator:
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 64 of file trie.h.


Constructor & Destructor Documentation

tesseract::Trie::Trie ( DawgType  type,
const STRING lang,
PermuterType  perm,
uinT64  max_num_edges,
int  unicharset_size,
int  debug_level 
) [inline]

Definition at line 89 of file trie.h.

                                                                   {
    init(type, lang, perm, unicharset_size, debug_level);
    num_edges_ = 0;
    max_num_edges_ = max_num_edges;
    deref_node_index_mask_ = ~letter_mask_;
    new_dawg_node();  // need to allocate node 0
    initialized_patterns_ = false;
  }
virtual tesseract::Trie::~Trie ( ) [inline, virtual]

Definition at line 98 of file trie.h.


Member Function Documentation

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [protected]

Definition at line 123 of file trie.cpp.

                                                   {
  if (num_edges_ == max_num_edges_) return false;
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
    &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
  int search_index;
  if (node1 == 0) {
    search_index = 0;  // find the index to make the add sorted
    while (search_index < vec->size() &&
           given_greater_than_edge_rec(node2, word_end, unichar_id,
                                       (*vec)[search_index]) == 1) {
      search_index++;
    }
  } else {
    search_index = vec->size();  // add is unsorted, so index does not matter
  }
  EDGE_RECORD edge_rec;
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
  if (search_index < vec->size()) {
    vec->insert(edge_rec, search_index);
  } else {
    vec->push_back(edge_rec);
  }
  if (debug_level_ > 1) {
    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(edge_rec);
    tprintf("\n");
  }
  num_edges_++;
  return true;
}
bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]

Definition at line 332 of file trie.h.

                                                                        {
    return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
                             word_end, unichar_id) &&
            add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
                             word_end, unichar_id));
  }
void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
) [protected]

Definition at line 156 of file trie.cpp.

                                                  {
  EDGE_RECORD *back_edge_ptr;
  EDGE_INDEX back_edge_index;
  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
                           unichar_id, &back_edge_ptr, &back_edge_index));
  if (marker_flag) {
    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
  }
  // Mark both directions as end of word.
  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const GenericVector< bool > *  repetitions 
)

Definition at line 173 of file trie.cpp.

                                                                    {
  if (word.length() <= 0) return false;  // can't add empty words
  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
  // Make sure the word does not contain invalid unchar ids.
  for (int i = 0; i < word.length(); ++i) {
    if (word.unichar_id(i) < 0 ||
        word.unichar_id(i) >= unicharset_size_) return false;
  }

  EDGE_RECORD *edge_ptr;
  NODE_REF last_node = 0;
  NODE_REF the_next_node;
  bool marker_flag = false;
  EDGE_INDEX edge_index;
  int i;
  inT32 still_finding_chars = true;
  inT32 word_end = false;
  bool  add_failed = false;
  bool found;

  if (debug_level_ > 1) word.print("\nAdding word: ");

  UNICHAR_ID unichar_id;
  for (i = 0; i < word.length() - 1; ++i) {
    unichar_id = word.unichar_id(i);
    marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
    if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
    if (still_finding_chars) {
      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
                           unichar_id, &edge_ptr, &edge_index);
      if (found && debug_level_ > 1) {
        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
                edge_index, last_node);
      }
      if (!found) {
        still_finding_chars = false;
      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
        word_end = true;
        still_finding_chars = false;
        remove_edge(last_node, 0, word_end, unichar_id);
      } else {
        if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
        last_node = next_node_from_edge_rec(*edge_ptr);
      }
    }
    if (!still_finding_chars) {
      the_next_node = new_dawg_node();
      if (debug_level_ > 1)
        tprintf("adding node " REFFORMAT "\n", the_next_node);
      if (the_next_node == 0) {
        add_failed = true;
        break;
      }
      if (!add_new_edge(last_node, the_next_node,
                        marker_flag, word_end, unichar_id)) {
        add_failed = true;
        break;
      }
      word_end = false;
      last_node = the_next_node;
    }
  }
  the_next_node = 0;
  unichar_id = word.unichar_id(i);
  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
  if (still_finding_chars &&
      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
                   unichar_id, &edge_ptr, &edge_index)) {
    // An extension of this word already exists in the trie, so we
    // only have to add the ending flags in both directions.
    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
                    marker_flag, unichar_id);
  } else {
    if (!add_failed &&
        !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
      add_failed = true;
  }
  if (add_failed) {
    tprintf("Re-initializing document dictionary...\n");
    clear();
    return false;
  } else {
    return true;
  }
}
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word) [inline]

Definition at line 245 of file trie.h.

                                                 {
    return add_word_to_dawg(word, NULL);
  }
bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec) [inline, protected]

Definition at line 302 of file trie.h.

                                                             {
    NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
    return (node_ref != NO_EDGE &&
            nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
  }
UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch) [protected]

Definition at line 346 of file trie.cpp.

                                                   {
  if (ch == 'c') {
    return alpha_pattern_;
  } else if (ch == 'd') {
    return digit_pattern_;
  } else if (ch == 'n') {
    return alphanum_pattern_;
  } else if (ch == 'p') {
    return punc_pattern_;
  } else if (ch == 'a') {
    return lower_pattern_;
  } else if (ch == 'A') {
    return upper_pattern_;
  } else {
    return INVALID_UNICHAR_ID;
  }
}
void tesseract::Trie::clear ( )

Definition at line 65 of file trie.cpp.

                 {
  nodes_.delete_data_pointers();
  nodes_.clear();
  num_edges_ = 0;
  new_dawg_node();  // Need to allocate node 0.
}
EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const [inline, protected]

Definition at line 267 of file trie.h.

                                                              {
    int edge_index = static_cast<int>(
      (edge_ref & letter_mask_) >> LETTER_START_BIT);
    int node_index = static_cast<int>(
      (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
    TRIE_NODE_RECORD *node_rec = nodes_[node_index];
    return &(node_rec->forward_edges[edge_index]);
  }
EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [inline, virtual]

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 104 of file trie.h.

                                             {
    EDGE_RECORD *edge_ptr;
    EDGE_INDEX edge_index;
    if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
                      &edge_ptr, &edge_index)) return NO_EDGE;
    return make_edge_ref(node_ref, edge_index);
  }
bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const [protected]

Definition at line 72 of file trie.cpp.

                                                                              {
  if (debug_level_ == 3) {
    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
            " direction %d word_end %d unichar_id %d, exploring node:\n",
            node_ref, next_node, direction, word_end, unichar_id);
    if (node_ref != NO_EDGE) {
      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
    }
  }
  if (node_ref == NO_EDGE) return false;
  assert(node_ref < nodes_.size());
  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
    nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
  int vec_size = vec.size();
  if (node_ref == 0) { // binary search
    EDGE_INDEX start = 0;
    EDGE_INDEX end = vec_size - 1;
    EDGE_INDEX k;
    int compare;
    while (start <= end) {
      k = (start + end) >> 1;  // (start + end) / 2
      compare = given_greater_than_edge_rec(next_node, word_end,
                                            unichar_id, vec[k]);
      if (compare == 0) {  // given == vec[k]
        *edge_ptr = &(vec[k]);
        *edge_index = k;
        return true;
      } else if (compare == 1) {  // given > vec[k]
        start = k + 1;
      } else {  // given < vec[k]
        end = k - 1;
      }
    }
  } else {  // linear search
    for (int i = 0; i < vec_size; ++i) {
      EDGE_RECORD &edge_rec = vec[i];
      if (edge_rec_match(next_node, word_end, unichar_id,
                         next_node_from_edge_rec(edge_rec),
                         end_of_word_from_edge_rec(edge_rec),
                         unichar_id_from_edge_rec(edge_rec))) {
        *edge_ptr = &(edge_rec);
        *edge_index = i;
        return true;
      }
    }
  }
  return false;  // not found
}
UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const [inline, virtual]

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 145 of file trie.h.

                                                  {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
    return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
  }
bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
) [protected]

Definition at line 512 of file trie.cpp.

                                                               {
  if (debug_level_ > 1) {
    tprintf("\nCollapsing node %d:\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
    tprintf("Candidate edges: ");
    print_edge_rec(edge1);
    tprintf(", ");
    print_edge_rec(edge2);
    tprintf("\n\n");
  }
  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
  // Translate all edges going to/from next_node2 to go to/from next_node1.
  EDGE_RECORD *edge_ptr = NULL;
  EDGE_INDEX edge_index;
  int i;
  // Remove the backward link in node to next_node2.
  const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0];
  remove_edge_linkage(node, next_node2, BACKWARD_EDGE,
                      end_of_word_from_edge_rec(fwd_edge),
                      unichar_id_from_edge_rec(fwd_edge));
  // Copy all the backward links in next_node2 to node next_node1
  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
                     curr_word_end, curr_unichar_id);
    // Relocate the corresponding forward edge in curr_next_node
    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
                             curr_word_end, curr_unichar_id,
                             &edge_ptr, &edge_index));
    set_next_node_in_edge_rec(edge_ptr, next_node1);
  }
  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
                              next_node2_ptr->backward_edges.size());
  if (debug_level_ > 1) {
    tprintf("removed %d edges from node " REFFORMAT "\n",
            next_node2_num_edges, next_node2);
  }
  next_node2_ptr->forward_edges.clear();
  next_node2_ptr->backward_edges.clear();
  num_edges_ -= next_node2_num_edges;
  return true;
}
bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const [inline, virtual]

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 139 of file trie.h.

                                            {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
    return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
  }
const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy) [static]

Definition at line 60 of file trie.cpp.

                                                                         {
  return RTLReversePolicyNames[reverse_policy];
}
void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]

Sets up this edge record to the requested values.

Definition at line 282 of file trie.h.

                                                                             {
    EDGE_RECORD flags = 0;
    if (repeats) flags |= MARKER_FLAG;
    if (word_end) flags |= WERD_END_FLAG;
    if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
    *edge = ((nxt << next_node_start_bit_) |
             (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
             (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
  }
EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const [inline, protected]

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 276 of file trie.h.

                                                             {
    return ((node_index << flag_start_bit_) |
            (edge_index << LETTER_START_BIT));
  }
NODE_REF tesseract::Trie::new_dawg_node ( ) [protected]

Definition at line 261 of file trie.cpp.

                             {
  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
  if (node == NULL) return 0;  // failed to create new node
  nodes_.push_back(node);
  return nodes_.length() - 1;
}
NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const [inline, virtual]

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 130 of file trie.h.

                                              {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
    return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
  }
virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [inline, virtual]

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 223 of file trie.h.

                                                          {
    if (edge_ref == NO_EDGE) return NO_EDGE;
    EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
    return (marker_flag_from_edge_rec(*edge_rec) &&
            unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
            word_end == end_of_word_from_edge_rec(*edge_rec)) ?
            edge_ref : NO_EDGE;
  }
void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
) [inline, protected]

Definition at line 310 of file trie.h.

                                                     {
    tprintf("\n__________________________\n%s\n", msg);
    for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
    tprintf("__________________________\n");
  }
void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const [inline, protected]

Prints the given EDGE_RECORD.

Definition at line 293 of file trie.h.

                                                                {
    tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
            marker_flag_from_edge_rec(edge_rec) ? "R," : "",
            (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
            end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
            unichar_id_from_edge_rec(edge_rec));
  }
void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const [virtual]

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 648 of file trie.cpp.

                                                            {
  if (node == NO_EDGE) return;  // nothing to print
  TRIE_NODE_RECORD *node_ptr = nodes_[node];
  int num_fwd = node_ptr->forward_edges.size();
  int num_bkw = node_ptr->backward_edges.size();
  EDGE_VECTOR *vec;
  for (int dir = 0; dir < 2; ++dir) {
    if (dir == 0) {
      vec = &(node_ptr->forward_edges);
      tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
    } else {
      vec = &(node_ptr->backward_edges);
      tprintf("\t");
    }
    int i;
    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
         i < max_num_edges; ++i) {
      print_edge_rec((*vec)[i]);
      tprintf(" ");
    }
    if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
    tprintf("\n");
  }
}
bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 364 of file trie.cpp.

                                                           {
  if (!initialized_patterns_) {
    tprintf("please call initialize_patterns() before read_pattern_list()\n");
    return false;
  }

  FILE *pattern_file = open_file (filename, "r");
  if (pattern_file == NULL) {
    tprintf("Error opening pattern file %s\n", filename);
    return false;
  }

  int pattern_count = 0;
  char string[CHARS_PER_LINE];
  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
    chomp_string(string);  // remove newline
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
    WERD_CHOICE word(&unicharset);
    GenericVector<bool> repetitions_vec;
    const char *str_ptr = string;
    int step = unicharset.step(str_ptr);
    bool failed = false;
    while (step > 0) {
      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
      if (step == 1 && *str_ptr == '\\') {
        ++str_ptr;
        if (*str_ptr == '\\') {  // regular '\' unichar that was escaped
          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
        } else {
          if (word.length() < kSaneNumConcreteChars) {
            tprintf("Please provide at least %d concrete characters at the"
                    " beginning of the pattern\n", kSaneNumConcreteChars);
            failed = true;
            break;
          }
          // Parse character class from expression.
          curr_unichar_id = character_class_to_pattern(*str_ptr);
        }
      } else {
        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
      }
      if (curr_unichar_id ==  INVALID_UNICHAR_ID) {
        failed = true;
        break;  // failed to parse this pattern
      }
      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
      repetitions_vec.push_back(false);
      str_ptr += step;
      step = unicharset.step(str_ptr);
      // Check if there is a repetition pattern specified after this unichar.
      if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
        repetitions_vec[repetitions_vec.size()-1] = true;
        str_ptr += 2;
        step = unicharset.step(str_ptr);
      }
    }
    if (failed) {
      tprintf("Invalid user pattern %s\n", string);
      continue;
    }
    // Insert the pattern into the trie.
    if (debug_level_ > 2) {
      tprintf("Inserting expanded user pattern %s\n",
              word.debug_string().string());
    }
    if (!this->word_in_dawg(word)) {
      this->add_word_to_dawg(word, &repetitions_vec);
      if (!this->word_in_dawg(word)) {
        tprintf("Error: failed to insert pattern '%s'\n", string);
      }
    }
    ++pattern_count;
  }
  if (debug_level_) {
    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
  }
  fclose(pattern_file);
  return true;
}
bool tesseract::Trie::read_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 268 of file trie.cpp.

                                                               {
  FILE *word_file;
  char string[CHARS_PER_LINE];
  int  word_count = 0;

  word_file = open_file (filename, "r");

  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
        word.has_rtl_unichar_id()) ||
        reverse_policy == RRP_FORCE_REVERSE) {
      word.reverse_and_mirror_unichar_ids();
    }
    ++word_count;
    if (debug_level_ && word_count % 10000 == 0)
      tprintf("Read %d words so far\n", word_count);
    if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!this->word_in_dawg(word)) {
        this->add_word_to_dawg(word);
        if (!this->word_in_dawg(word)) {
          tprintf("Error: word '%s' not in DAWG after adding it\n", string);
          return false;
        }
      }
    } else if (debug_level_) {
      tprintf("Skipping invalid word %s\n", string);
      if (debug_level_ >= 3) word.print();
    }
  }
  if (debug_level_)
    tprintf("Read %d words total.\n", word_count);
  fclose(word_file);
  return true;
}
bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
const EDGE_VECTOR backward_edges,
NODE_MARKER  reduced_nodes 
) [protected]

Definition at line 563 of file trie.cpp.

                                                            {
  if (debug_level_ > 1)
    tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
  // Compare each of the edge pairs with the given unichar_id.
  bool did_something = false;
  for (int i = edge_index; i < backward_edges.size() - 1; ++i) {
    // Find the first edge that can be eliminated.
    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
    while (i < backward_edges.size() &&
           ((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) ==
            unichar_id) &&
           !can_be_eliminated(backward_edges[i])) ++i;
    if (i == backward_edges.size() || curr_unichar_id != unichar_id) break;
    const EDGE_RECORD &edge_rec = backward_edges[i];
    // Compare it to the rest of the edges with the given unichar_id.
    for (int j = i + 1; j < backward_edges.size(); ++j) {
      const EDGE_RECORD &next_edge_rec = backward_edges[j];
      if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break;
      if (end_of_word_from_edge_rec(next_edge_rec) ==
          end_of_word_from_edge_rec(edge_rec) &&
          can_be_eliminated(next_edge_rec) &&
          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
        reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
        did_something = true;
        --j;  // do not increment j if next_edge_rec was removed
      }
    }
  }
  return did_something;
}
void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
) [protected]

Eliminates any redundant edges from this node in the Trie.

Definition at line 615 of file trie.cpp.

                                                        {
  if (debug_level_ > 1) {
    tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
  }

  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
  if (node != 0) sort_edges(&backward_edges);
  EDGE_INDEX edge_index = 0;
  while (edge_index < backward_edges.size()) {
    UNICHAR_ID unichar_id =
      unichar_id_from_edge_rec(backward_edges[edge_index]);
    while (reduce_lettered_edges(edge_index, unichar_id, node,
                                 backward_edges, reduced_nodes));
    while (++edge_index < backward_edges.size() &&
           unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id);
  }
  reduced_nodes[node] = true;  // mark as reduced

  if (debug_level_ > 1) {
    tprintf("Node " REFFORMAT " after reduction:\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
  }

  for (int i = 0; i < backward_edges.size(); ++i) {
    NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
    if (next_node != 0 && !reduced_nodes[next_node]) {
      reduce_node_input(next_node, reduced_nodes);
    }
  }
}
void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]

Definition at line 357 of file trie.h.

                                                         {
    remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
    remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
  }
void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [protected]

Definition at line 446 of file trie.cpp.

                                                                     {
  EDGE_RECORD *edge_ptr = NULL;
  EDGE_INDEX edge_index = 0;
  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
                           unichar_id, &edge_ptr, &edge_index));
  if (debug_level_ > 1) {
    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(*edge_ptr);
    tprintf("\n");
  }
  if (direction == FORWARD_EDGE) {
    nodes_[node1]->forward_edges.remove(edge_index);
  } else {
    nodes_[node1]->backward_edges.remove(edge_index);
  }
  --num_edges_;
}
void tesseract::Trie::sort_edges ( EDGE_VECTOR edges) [protected]

Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 598 of file trie.cpp.

                                        {
  int num_edges = edges->size();
  if (num_edges <= 1) return;
  for (int i = 0; i < num_edges - 1; ++i) {
    int min = i;
    for (int j = (i + 1); j < num_edges; ++j) {
      if (unichar_id_from_edge_rec((*edges)[j]) <
          unichar_id_from_edge_rec((*edges)[min])) min = j;
    }
    if (i != min) {
      EDGE_RECORD temp = (*edges)[i];
      (*edges)[i] = (*edges)[min];
      (*edges)[min] = temp;
    }
  }
}
SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 465 of file trie.cpp.

                                 {
  if (debug_level_ > 2) {
    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
  this->reduce_node_input(0, reduced_nodes);
  delete[] reduced_nodes;

  if (debug_level_ > 2) {
    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  // Build a translation map from node indices in nodes_ vector to
  // their target indices in EDGE_ARRAY.
  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
  int i, j;
  node_ref_map[0] = 0;
  for (i = 0; i < nodes_.size(); ++i) {
    node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
  }
  int num_forward_edges = node_ref_map[i];

  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
  EDGE_ARRAY edge_array =
    (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
  EDGE_ARRAY edge_array_ptr = edge_array;
  for (i = 0; i < nodes_.size(); ++i) {
    TRIE_NODE_RECORD *node_ptr = nodes_[i];
    int end = node_ptr->forward_edges.size();
    for (j = 0; j < end; ++j) {
      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
      ASSERT_HOST(node_ref < nodes_.size());
      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
                end_of_word_from_edge_rec(edge_rec), unichar_id);
      if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
      ++edge_array_ptr;
    }
  }
  delete[] node_ref_map;

  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
                          perm_, unicharset_size_, debug_level_);
}
void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const [virtual]

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 324 of file trie.cpp.

                                                                        {
  bool is_alpha = unicharset.get_isalpha(unichar_id);
  if (is_alpha) {
    vec->push_back(alpha_pattern_);
    vec->push_back(alphanum_pattern_);
    if (unicharset.get_islower(unichar_id)) {
      vec->push_back(lower_pattern_);
    } else if (unicharset.get_isupper(unichar_id)) {
      vec->push_back(upper_pattern_);
    }
  }
  if (unicharset.get_isdigit(unichar_id)) {
    vec->push_back(digit_pattern_);
    if (!is_alpha) vec->push_back(alphanum_pattern_);
  }
  if (unicharset.get_ispunctuation(unichar_id)) {
    vec->push_back(punc_pattern_);
  }
}
void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec 
) const [inline, virtual]

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 117 of file trie.h.

                                                                 {
    const EDGE_VECTOR &forward_edges =
      nodes_[static_cast<int>(node)]->forward_edges;
    for (int i = 0; i < forward_edges.size(); ++i) {
      vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
                               make_edge_ref(node, i)));
    }
  }

Member Data Documentation

Definition at line 403 of file trie.h.

Definition at line 405 of file trie.h.

Definition at line 398 of file trie.h.

Definition at line 399 of file trie.h.

Definition at line 404 of file trie.h.

Definition at line 402 of file trie.h.

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002" [static]

Definition at line 77 of file trie.h.

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000" [static]

Definition at line 75 of file trie.h.

const char tesseract::Trie::kDigitPatternUnicode = "\u2001" [static]

Definition at line 76 of file trie.h.

const char tesseract::Trie::kLowerPatternUnicode = "\u2004" [static]

Definition at line 79 of file trie.h.

const char tesseract::Trie::kPuncPatternUnicode = "\u2003" [static]

Definition at line 78 of file trie.h.

Definition at line 71 of file trie.h.

const char tesseract::Trie::kUpperPatternUnicode = "\u2005" [static]

Definition at line 80 of file trie.h.

Definition at line 407 of file trie.h.

Definition at line 397 of file trie.h.

Definition at line 395 of file trie.h.

Definition at line 396 of file trie.h.

Definition at line 406 of file trie.h.

Definition at line 408 of file trie.h.


The documentation for this class was generated from the following files: