#include <trie.h>

Inheritance diagram for tesseract::Trie:

Public Types
enum	RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }
Public Member Functions
	Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level)
virtual	~Trie ()
void	clear ()
EDGE_REF	edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
void	unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF	next_node (EDGE_REF edge_ref) const
bool	end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID	edge_letter (EDGE_REF edge_ref) const
void	print_node (NODE_REF node, int max_num_edges) const
SquishedDawg *	trie_to_dawg ()
bool	read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
bool	read_pattern_list (const char *filename, const UNICHARSET &unicharset)
void	initialize_patterns (UNICHARSET *unicharset)
void	unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF	pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
bool	add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
bool	add_word_to_dawg (const WERD_CHOICE &word)
Static Public Member Functions
static const char *	get_reverse_policy_name (RTLReversePolicy reverse_policy)
Static Public Attributes
static const int	kSaneNumConcreteChars = 4
static const char	kAlphaPatternUnicode [] = "\u2000"
static const char	kDigitPatternUnicode [] = "\u2001"
static const char	kAlphanumPatternUnicode [] = "\u2002"
static const char	kPuncPatternUnicode [] = "\u2003"
static const char	kLowerPatternUnicode [] = "\u2004"
static const char	kUpperPatternUnicode [] = "\u2005"
Protected Member Functions
EDGE_RECORD *	deref_edge_ref (EDGE_REF edge_ref) const
EDGE_REF	make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
void	link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void	print_edge_rec (const EDGE_RECORD &edge_rec) const
bool	can_be_eliminated (const EDGE_RECORD &edge_rec)
void	print_all (const char *msg, int max_num_edges)
bool	edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD *edge_ptr, EDGE_INDEX edge_index) const
bool	add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool	add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
void	add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
NODE_REF	new_dawg_node ()
void	remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void	remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool	eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
bool	reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes)
void	sort_edges (EDGE_VECTOR *edges)
void	reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
UNICHAR_ID	character_class_to_pattern (char ch)
Protected Attributes
TRIE_NODES	nodes_
uinT64	num_edges_
uinT64	max_num_edges_
uinT64	deref_direction_mask_
uinT64	deref_node_index_mask_
bool	initialized_patterns_
UNICHAR_ID	alpha_pattern_
UNICHAR_ID	digit_pattern_
UNICHAR_ID	alphanum_pattern_
UNICHAR_ID	punc_pattern_
UNICHAR_ID	lower_pattern_
UNICHAR_ID	upper_pattern_

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 62 of file trie.h.

Member Enumeration Documentation

enum tesseract::Trie::RTLReversePolicy

Enumerator:

RRP_DO_NO_REVERSE
RRP_REVERSE_IF_HAS_RTL
RRP_FORCE_REVERSE

Definition at line 64 of file trie.h.

                        {
    RRP_DO_NO_REVERSE,
    RRP_REVERSE_IF_HAS_RTL,
    RRP_FORCE_REVERSE,
  };

Constructor & Destructor Documentation

tesseract::Trie::Trie	(	DawgType	type,
		const STRING &	lang,
		PermuterType	perm,
		uinT64	max_num_edges,
		int	unicharset_size,
		int	debug_level
	)		`[inline]`

Definition at line 89 of file trie.h.

                                                                   {
    init(type, lang, perm, unicharset_size, debug_level);
    num_edges_ = 0;
    max_num_edges_ = max_num_edges;
    deref_node_index_mask_ = ~letter_mask_;
    new_dawg_node();  // need to allocate node 0
    initialized_patterns_ = false;
  }

virtual tesseract::Trie::~Trie ( ) [inline, virtual]

Definition at line 98 of file trie.h.

{ nodes_.delete_data_pointers(); }

Member Function Documentation

bool tesseract::Trie::add_edge_linkage	(	NODE_REF	node1,
		NODE_REF	node2,
		bool	repeats,
		int	direction,
		bool	word_end,
		UNICHAR_ID	unichar_id
	)		`[protected]`

Definition at line 123 of file trie.cpp.

                                                   {
  if (num_edges_ == max_num_edges_) return false;
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
    &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
  int search_index;
  if (node1 == 0) {
    search_index = 0;  // find the index to make the add sorted
    while (search_index < vec->size() &&
           given_greater_than_edge_rec(node2, word_end, unichar_id,
                                       (*vec)[search_index]) == 1) {
      search_index++;
    }
  } else {
    search_index = vec->size();  // add is unsorted, so index does not matter
  }
  EDGE_RECORD edge_rec;
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
  if (search_index < vec->size()) {
    vec->insert(edge_rec, search_index);
  } else {
    vec->push_back(edge_rec);
  }
  if (debug_level_ > 1) {
    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(edge_rec);
    tprintf("\n");
  }
  num_edges_++;
  return true;
}

bool tesseract::Trie::add_new_edge	(	NODE_REF	node1,
		NODE_REF	node2,
		bool	repeats,
		bool	word_end,
		UNICHAR_ID	unichar_id
	)		`[inline, protected]`

Definition at line 332 of file trie.h.

                                                                        {
    return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
                             word_end, unichar_id) &&
            add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
                             word_end, unichar_id));
  }

void tesseract::Trie::add_word_ending	(	EDGE_RECORD *	edge,
		NODE_REF	the_next_node,
		bool	repeats,
		UNICHAR_ID	unichar_id
	)		`[protected]`

Definition at line 156 of file trie.cpp.

                                                  {
  EDGE_RECORD *back_edge_ptr;
  EDGE_INDEX back_edge_index;
  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
                           unichar_id, &back_edge_ptr, &back_edge_index));
  if (marker_flag) {
    *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
    *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
  }
  // Mark both directions as end of word.
  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}

bool tesseract::Trie::add_word_to_dawg	(	const WERD_CHOICE &	word,
		const GenericVector< bool > *	repetitions
	)

Definition at line 173 of file trie.cpp.

                                                                    {
  if (word.length() <= 0) return false;  // can't add empty words
  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
  // Make sure the word does not contain invalid unchar ids.
  for (int i = 0; i < word.length(); ++i) {
    if (word.unichar_id(i) < 0 ||
        word.unichar_id(i) >= unicharset_size_) return false;
  }

  EDGE_RECORD *edge_ptr;
  NODE_REF last_node = 0;
  NODE_REF the_next_node;
  bool marker_flag = false;
  EDGE_INDEX edge_index;
  int i;
  inT32 still_finding_chars = true;
  inT32 word_end = false;
  bool  add_failed = false;
  bool found;

  if (debug_level_ > 1) word.print("\nAdding word: ");

  UNICHAR_ID unichar_id;
  for (i = 0; i < word.length() - 1; ++i) {
    unichar_id = word.unichar_id(i);
    marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
    if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
    if (still_finding_chars) {
      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
                           unichar_id, &edge_ptr, &edge_index);
      if (found && debug_level_ > 1) {
        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
                edge_index, last_node);
      }
      if (!found) {
        still_finding_chars = false;
      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
        word_end = true;
        still_finding_chars = false;
        remove_edge(last_node, 0, word_end, unichar_id);
      } else {
        if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
        last_node = next_node_from_edge_rec(*edge_ptr);
      }
    }
    if (!still_finding_chars) {
      the_next_node = new_dawg_node();
      if (debug_level_ > 1)
        tprintf("adding node " REFFORMAT "\n", the_next_node);
      if (the_next_node == 0) {
        add_failed = true;
        break;
      }
      if (!add_new_edge(last_node, the_next_node,
                        marker_flag, word_end, unichar_id)) {
        add_failed = true;
        break;
      }
      word_end = false;
      last_node = the_next_node;
    }
  }
  the_next_node = 0;
  unichar_id = word.unichar_id(i);
  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
  if (still_finding_chars &&
      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
                   unichar_id, &edge_ptr, &edge_index)) {
    // An extension of this word already exists in the trie, so we
    // only have to add the ending flags in both directions.
    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
                    marker_flag, unichar_id);
  } else {
    if (!add_failed &&
        !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
      add_failed = true;
  }
  if (add_failed) {
    tprintf("Re-initializing document dictionary...\n");
    clear();
    return false;
  } else {
    return true;
  }
}

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE & word ) [inline]

Definition at line 245 of file trie.h.

                                                 {
    return add_word_to_dawg(word, NULL);
  }

bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD & edge_rec ) [inline, protected]

Definition at line 302 of file trie.h.

                                                             {
    NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
    return (node_ref != NO_EDGE &&
            nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
  }

UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char ch ) [protected]

Definition at line 346 of file trie.cpp.

                                                   {
  if (ch == 'c') {
    return alpha_pattern_;
  } else if (ch == 'd') {
    return digit_pattern_;
  } else if (ch == 'n') {
    return alphanum_pattern_;
  } else if (ch == 'p') {
    return punc_pattern_;
  } else if (ch == 'a') {
    return lower_pattern_;
  } else if (ch == 'A') {
    return upper_pattern_;
  } else {
    return INVALID_UNICHAR_ID;
  }
}

void tesseract::Trie::clear ( )

Definition at line 65 of file trie.cpp.

                 {
  nodes_.delete_data_pointers();
  nodes_.clear();
  num_edges_ = 0;
  new_dawg_node();  // Need to allocate node 0.
}

EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF edge_ref ) const [inline, protected]

Definition at line 267 of file trie.h.

                                                              {
    int edge_index = static_cast<int>(
      (edge_ref & letter_mask_) >> LETTER_START_BIT);
    int node_index = static_cast<int>(
      (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
    TRIE_NODE_RECORD *node_rec = nodes_[node_index];
    return &(node_rec->forward_edges[edge_index]);
  }

EDGE_REF tesseract::Trie::edge_char_of	(	NODE_REF	node_ref,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const `[inline, virtual]`

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 104 of file trie.h.

                                             {
    EDGE_RECORD *edge_ptr;
    EDGE_INDEX edge_index;
    if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
                      &edge_ptr, &edge_index)) return NO_EDGE;
    return make_edge_ref(node_ref, edge_index);
  }

bool tesseract::Trie::edge_char_of	(	NODE_REF	node_ref,
		NODE_REF	next_node,
		int	direction,
		bool	word_end,
		UNICHAR_ID	unichar_id,
		EDGE_RECORD **	edge_ptr,
		EDGE_INDEX *	edge_index
	)		const `[protected]`

Definition at line 72 of file trie.cpp.

                                                                              {
  if (debug_level_ == 3) {
    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
            " direction %d word_end %d unichar_id %d, exploring node:\n",
            node_ref, next_node, direction, word_end, unichar_id);
    if (node_ref != NO_EDGE) {
      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
    }
  }
  if (node_ref == NO_EDGE) return false;
  assert(node_ref < nodes_.size());
  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
    nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
  int vec_size = vec.size();
  if (node_ref == 0) { // binary search
    EDGE_INDEX start = 0;
    EDGE_INDEX end = vec_size - 1;
    EDGE_INDEX k;
    int compare;
    while (start <= end) {
      k = (start + end) >> 1;  // (start + end) / 2
      compare = given_greater_than_edge_rec(next_node, word_end,
                                            unichar_id, vec[k]);
      if (compare == 0) {  // given == vec[k]
        *edge_ptr = &(vec[k]);
        *edge_index = k;
        return true;
      } else if (compare == 1) {  // given > vec[k]
        start = k + 1;
      } else {  // given < vec[k]
        end = k - 1;
      }
    }
  } else {  // linear search
    for (int i = 0; i < vec_size; ++i) {
      EDGE_RECORD &edge_rec = vec[i];
      if (edge_rec_match(next_node, word_end, unichar_id,
                         next_node_from_edge_rec(edge_rec),
                         end_of_word_from_edge_rec(edge_rec),
                         unichar_id_from_edge_rec(edge_rec))) {
        *edge_ptr = &(edge_rec);
        *edge_index = i;
        return true;
      }
    }
  }
  return false;  // not found
}

UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF edge_ref ) const [inline, virtual]

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 145 of file trie.h.

                                                  {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
    return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
  }

bool tesseract::Trie::eliminate_redundant_edges	(	NODE_REF	node,
		const EDGE_RECORD &	edge1,
		const EDGE_RECORD &	edge2
	)		`[protected]`

Definition at line 512 of file trie.cpp.

                                                               {
  if (debug_level_ > 1) {
    tprintf("\nCollapsing node %d:\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
    tprintf("Candidate edges: ");
    print_edge_rec(edge1);
    tprintf(", ");
    print_edge_rec(edge2);
    tprintf("\n\n");
  }
  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
  // Translate all edges going to/from next_node2 to go to/from next_node1.
  EDGE_RECORD *edge_ptr = NULL;
  EDGE_INDEX edge_index;
  int i;
  // Remove the backward link in node to next_node2.
  const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0];
  remove_edge_linkage(node, next_node2, BACKWARD_EDGE,
                      end_of_word_from_edge_rec(fwd_edge),
                      unichar_id_from_edge_rec(fwd_edge));
  // Copy all the backward links in next_node2 to node next_node1
  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
                     curr_word_end, curr_unichar_id);
    // Relocate the corresponding forward edge in curr_next_node
    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
                             curr_word_end, curr_unichar_id,
                             &edge_ptr, &edge_index));
    set_next_node_in_edge_rec(edge_ptr, next_node1);
  }
  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
                              next_node2_ptr->backward_edges.size());
  if (debug_level_ > 1) {
    tprintf("removed %d edges from node " REFFORMAT "\n",
            next_node2_num_edges, next_node2);
  }
  next_node2_ptr->forward_edges.clear();
  next_node2_ptr->backward_edges.clear();
  num_edges_ -= next_node2_num_edges;
  return true;
}

bool tesseract::Trie::end_of_word ( EDGE_REF edge_ref ) const [inline, virtual]

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 139 of file trie.h.

                                            {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
    return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
  }

const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy reverse_policy ) [static]

Definition at line 60 of file trie.cpp.

                                                                         {
  return RTLReversePolicyNames[reverse_policy];
}

void tesseract::Trie::initialize_patterns ( UNICHARSET * unicharset )

Definition at line 307 of file trie.cpp.

                                                     {
  unicharset->unichar_insert(kAlphaPatternUnicode);
  alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
  unicharset->unichar_insert(kDigitPatternUnicode);
  digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
  unicharset->unichar_insert(kAlphanumPatternUnicode);
  alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
  unicharset->unichar_insert(kPuncPatternUnicode);
  punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
  unicharset->unichar_insert(kLowerPatternUnicode);
  lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
  unicharset->unichar_insert(kUpperPatternUnicode);
  upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
  initialized_patterns_ = true;
  unicharset_size_ = unicharset->size();
}

void tesseract::Trie::link_edge	(	EDGE_RECORD *	edge,
		NODE_REF	nxt,
		bool	repeats,
		int	direction,
		bool	word_end,
		UNICHAR_ID	unichar_id
	)		`[inline, protected]`

Sets up this edge record to the requested values.

Definition at line 282 of file trie.h.

                                                                             {
    EDGE_RECORD flags = 0;
    if (repeats) flags |= MARKER_FLAG;
    if (word_end) flags |= WERD_END_FLAG;
    if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
    *edge = ((nxt << next_node_start_bit_) |
             (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
             (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
  }

EDGE_REF tesseract::Trie::make_edge_ref	(	NODE_REF	node_index,
		EDGE_INDEX	edge_index
	)		const `[inline, protected]`

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 276 of file trie.h.

                                                             {
    return ((node_index << flag_start_bit_) |
            (edge_index << LETTER_START_BIT));
  }

NODE_REF tesseract::Trie::new_dawg_node ( ) [protected]

Definition at line 261 of file trie.cpp.

                             {
  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
  if (node == NULL) return 0;  // failed to create new node
  nodes_.push_back(node);
  return nodes_.length() - 1;
}

NODE_REF tesseract::Trie::next_node ( EDGE_REF edge_ref ) const [inline, virtual]

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 130 of file trie.h.

                                              {
    if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
    return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
  }

virtual EDGE_REF tesseract::Trie::pattern_loop_edge	(	EDGE_REF	edge_ref,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const `[inline, virtual]`

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 223 of file trie.h.

                                                          {
    if (edge_ref == NO_EDGE) return NO_EDGE;
    EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
    return (marker_flag_from_edge_rec(*edge_rec) &&
            unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
            word_end == end_of_word_from_edge_rec(*edge_rec)) ?
            edge_ref : NO_EDGE;
  }

void tesseract::Trie::print_all	(	const char *	msg,
		int	max_num_edges
	)		`[inline, protected]`

Definition at line 310 of file trie.h.

                                                     {
    tprintf("\n__________________________\n%s\n", msg);
    for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
    tprintf("__________________________\n");
  }

void tesseract::Trie::print_edge_rec ( const EDGE_RECORD & edge_rec ) const [inline, protected]

Prints the given EDGE_RECORD.

Definition at line 293 of file trie.h.

                                                                {
    tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
            marker_flag_from_edge_rec(edge_rec) ? "R," : "",
            (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
            end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
            unichar_id_from_edge_rec(edge_rec));
  }

void tesseract::Trie::print_node	(	NODE_REF	node,
		int	max_num_edges
	)		const `[virtual]`

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 648 of file trie.cpp.

                                                            {
  if (node == NO_EDGE) return;  // nothing to print
  TRIE_NODE_RECORD *node_ptr = nodes_[node];
  int num_fwd = node_ptr->forward_edges.size();
  int num_bkw = node_ptr->backward_edges.size();
  EDGE_VECTOR *vec;
  for (int dir = 0; dir < 2; ++dir) {
    if (dir == 0) {
      vec = &(node_ptr->forward_edges);
      tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
    } else {
      vec = &(node_ptr->backward_edges);
      tprintf("\t");
    }
    int i;
    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
         i < max_num_edges; ++i) {
      print_edge_rec((*vec)[i]);
      tprintf(" ");
    }
    if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
    tprintf("\n");
  }
}

bool tesseract::Trie::read_pattern_list	(	const char *	filename,
		const UNICHARSET &	unicharset
	)

Definition at line 364 of file trie.cpp.

                                                           {
  if (!initialized_patterns_) {
    tprintf("please call initialize_patterns() before read_pattern_list()\n");
    return false;
  }

  FILE *pattern_file = open_file (filename, "r");
  if (pattern_file == NULL) {
    tprintf("Error opening pattern file %s\n", filename);
    return false;
  }

  int pattern_count = 0;
  char string[CHARS_PER_LINE];
  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
    chomp_string(string);  // remove newline
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
    WERD_CHOICE word(&unicharset);
    GenericVector<bool> repetitions_vec;
    const char *str_ptr = string;
    int step = unicharset.step(str_ptr);
    bool failed = false;
    while (step > 0) {
      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
      if (step == 1 && *str_ptr == '\\') {
        ++str_ptr;
        if (*str_ptr == '\\') {  // regular '\' unichar that was escaped
          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
        } else {
          if (word.length() < kSaneNumConcreteChars) {
            tprintf("Please provide at least %d concrete characters at the"
                    " beginning of the pattern\n", kSaneNumConcreteChars);
            failed = true;
            break;
          }
          // Parse character class from expression.
          curr_unichar_id = character_class_to_pattern(*str_ptr);
        }
      } else {
        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
      }
      if (curr_unichar_id ==  INVALID_UNICHAR_ID) {
        failed = true;
        break;  // failed to parse this pattern
      }
      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
      repetitions_vec.push_back(false);
      str_ptr += step;
      step = unicharset.step(str_ptr);
      // Check if there is a repetition pattern specified after this unichar.
      if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
        repetitions_vec[repetitions_vec.size()-1] = true;
        str_ptr += 2;
        step = unicharset.step(str_ptr);
      }
    }
    if (failed) {
      tprintf("Invalid user pattern %s\n", string);
      continue;
    }
    // Insert the pattern into the trie.
    if (debug_level_ > 2) {
      tprintf("Inserting expanded user pattern %s\n",
              word.debug_string().string());
    }
    if (!this->word_in_dawg(word)) {
      this->add_word_to_dawg(word, &repetitions_vec);
      if (!this->word_in_dawg(word)) {
        tprintf("Error: failed to insert pattern '%s'\n", string);
      }
    }
    ++pattern_count;
  }
  if (debug_level_) {
    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
  }
  fclose(pattern_file);
  return true;
}

bool tesseract::Trie::read_word_list	(	const char *	filename,
		const UNICHARSET &	unicharset,
		Trie::RTLReversePolicy	reverse
	)

Definition at line 268 of file trie.cpp.

                                                               {
  FILE *word_file;
  char string[CHARS_PER_LINE];
  int  word_count = 0;

  word_file = open_file (filename, "r");

  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
        word.has_rtl_unichar_id()) ||
        reverse_policy == RRP_FORCE_REVERSE) {
      word.reverse_and_mirror_unichar_ids();
    }
    ++word_count;
    if (debug_level_ && word_count % 10000 == 0)
      tprintf("Read %d words so far\n", word_count);
    if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!this->word_in_dawg(word)) {
        this->add_word_to_dawg(word);
        if (!this->word_in_dawg(word)) {
          tprintf("Error: word '%s' not in DAWG after adding it\n", string);
          return false;
        }
      }
    } else if (debug_level_) {
      tprintf("Skipping invalid word %s\n", string);
      if (debug_level_ >= 3) word.print();
    }
  }
  if (debug_level_)
    tprintf("Read %d words total.\n", word_count);
  fclose(word_file);
  return true;
}

bool tesseract::Trie::reduce_lettered_edges	(	EDGE_INDEX	edge_index,
		UNICHAR_ID	unichar_id,
		NODE_REF	node,
		const EDGE_VECTOR &	backward_edges,
		NODE_MARKER	reduced_nodes
	)		`[protected]`

Definition at line 563 of file trie.cpp.

                                                            {
  if (debug_level_ > 1)
    tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
  // Compare each of the edge pairs with the given unichar_id.
  bool did_something = false;
  for (int i = edge_index; i < backward_edges.size() - 1; ++i) {
    // Find the first edge that can be eliminated.
    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
    while (i < backward_edges.size() &&
           ((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) ==
            unichar_id) &&
           !can_be_eliminated(backward_edges[i])) ++i;
    if (i == backward_edges.size() || curr_unichar_id != unichar_id) break;
    const EDGE_RECORD &edge_rec = backward_edges[i];
    // Compare it to the rest of the edges with the given unichar_id.
    for (int j = i + 1; j < backward_edges.size(); ++j) {
      const EDGE_RECORD &next_edge_rec = backward_edges[j];
      if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break;
      if (end_of_word_from_edge_rec(next_edge_rec) ==
          end_of_word_from_edge_rec(edge_rec) &&
          can_be_eliminated(next_edge_rec) &&
          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
        reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
        did_something = true;
        --j;  // do not increment j if next_edge_rec was removed
      }
    }
  }
  return did_something;
}

void tesseract::Trie::reduce_node_input	(	NODE_REF	node,
		NODE_MARKER	reduced_nodes
	)		`[protected]`

Eliminates any redundant edges from this node in the Trie.

Definition at line 615 of file trie.cpp.

                                                        {
  if (debug_level_ > 1) {
    tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
  }

  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
  if (node != 0) sort_edges(&backward_edges);
  EDGE_INDEX edge_index = 0;
  while (edge_index < backward_edges.size()) {
    UNICHAR_ID unichar_id =
      unichar_id_from_edge_rec(backward_edges[edge_index]);
    while (reduce_lettered_edges(edge_index, unichar_id, node,
                                 backward_edges, reduced_nodes));
    while (++edge_index < backward_edges.size() &&
           unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id);
  }
  reduced_nodes[node] = true;  // mark as reduced

  if (debug_level_ > 1) {
    tprintf("Node " REFFORMAT " after reduction:\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
  }

  for (int i = 0; i < backward_edges.size(); ++i) {
    NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
    if (next_node != 0 && !reduced_nodes[next_node]) {
      reduce_node_input(next_node, reduced_nodes);
    }
  }
}

void tesseract::Trie::remove_edge	(	NODE_REF	node1,
		NODE_REF	node2,
		bool	word_end,
		UNICHAR_ID	unichar_id
	)		`[inline, protected]`

Definition at line 357 of file trie.h.

                                                         {
    remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
    remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
  }

void tesseract::Trie::remove_edge_linkage	(	NODE_REF	node1,
		NODE_REF	node2,
		int	direction,
		bool	word_end,
		UNICHAR_ID	unichar_id
	)		`[protected]`

Definition at line 446 of file trie.cpp.

                                                                     {
  EDGE_RECORD *edge_ptr = NULL;
  EDGE_INDEX edge_index = 0;
  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
                           unichar_id, &edge_ptr, &edge_index));
  if (debug_level_ > 1) {
    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(*edge_ptr);
    tprintf("\n");
  }
  if (direction == FORWARD_EDGE) {
    nodes_[node1]->forward_edges.remove(edge_index);
  } else {
    nodes_[node1]->backward_edges.remove(edge_index);
  }
  --num_edges_;
}

void tesseract::Trie::sort_edges ( EDGE_VECTOR * edges ) [protected]

Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 598 of file trie.cpp.

                                        {
  int num_edges = edges->size();
  if (num_edges <= 1) return;
  for (int i = 0; i < num_edges - 1; ++i) {
    int min = i;
    for (int j = (i + 1); j < num_edges; ++j) {
      if (unichar_id_from_edge_rec((*edges)[j]) <
          unichar_id_from_edge_rec((*edges)[min])) min = j;
    }
    if (i != min) {
      EDGE_RECORD temp = (*edges)[i];
      (*edges)[i] = (*edges)[min];
      (*edges)[min] = temp;
    }
  }
}

SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 465 of file trie.cpp.

                                 {
  if (debug_level_ > 2) {
    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
  this->reduce_node_input(0, reduced_nodes);
  delete[] reduced_nodes;

  if (debug_level_ > 2) {
    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  // Build a translation map from node indices in nodes_ vector to
  // their target indices in EDGE_ARRAY.
  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
  int i, j;
  node_ref_map[0] = 0;
  for (i = 0; i < nodes_.size(); ++i) {
    node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
  }
  int num_forward_edges = node_ref_map[i];

  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
  EDGE_ARRAY edge_array =
    (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
  EDGE_ARRAY edge_array_ptr = edge_array;
  for (i = 0; i < nodes_.size(); ++i) {
    TRIE_NODE_RECORD *node_ptr = nodes_[i];
    int end = node_ptr->forward_edges.size();
    for (j = 0; j < end; ++j) {
      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
      ASSERT_HOST(node_ref < nodes_.size());
      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
                end_of_word_from_edge_rec(edge_rec), unichar_id);
      if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
      ++edge_array_ptr;
    }
  }
  delete[] node_ref_map;

  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
                          perm_, unicharset_size_, debug_level_);
}

void tesseract::Trie::unichar_id_to_patterns	(	UNICHAR_ID	unichar_id,
		const UNICHARSET &	unicharset,
		GenericVector< UNICHAR_ID > *	vec
	)		const `[virtual]`

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 324 of file trie.cpp.

                                                                        {
  bool is_alpha = unicharset.get_isalpha(unichar_id);
  if (is_alpha) {
    vec->push_back(alpha_pattern_);
    vec->push_back(alphanum_pattern_);
    if (unicharset.get_islower(unichar_id)) {
      vec->push_back(lower_pattern_);
    } else if (unicharset.get_isupper(unichar_id)) {
      vec->push_back(upper_pattern_);
    }
  }
  if (unicharset.get_isdigit(unichar_id)) {
    vec->push_back(digit_pattern_);
    if (!is_alpha) vec->push_back(alphanum_pattern_);
  }
  if (unicharset.get_ispunctuation(unichar_id)) {
    vec->push_back(punc_pattern_);
  }
}

void tesseract::Trie::unichar_ids_of	(	NODE_REF	node,
		NodeChildVector *	vec
	)		const `[inline, virtual]`

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 117 of file trie.h.

                                                                 {
    const EDGE_VECTOR &forward_edges =
      nodes_[static_cast<int>(node)]->forward_edges;
    for (int i = 0; i < forward_edges.size(); ++i) {
      vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
                               make_edge_ref(node, i)));
    }
  }