Tesseract
3.02
|
#include <trie.h>
Public Types | |
enum | RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE } |
Public Member Functions | |
Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level) | |
virtual | ~Trie () |
void | clear () |
EDGE_REF | edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const |
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec) const |
NODE_REF | next_node (EDGE_REF edge_ref) const |
bool | end_of_word (EDGE_REF edge_ref) const |
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const |
void | print_node (NODE_REF node, int max_num_edges) const |
SquishedDawg * | trie_to_dawg () |
bool | read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse) |
bool | read_pattern_list (const char *filename, const UNICHARSET &unicharset) |
void | initialize_patterns (UNICHARSET *unicharset) |
void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const |
virtual EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const |
bool | add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions) |
bool | add_word_to_dawg (const WERD_CHOICE &word) |
Static Public Member Functions | |
static const char * | get_reverse_policy_name (RTLReversePolicy reverse_policy) |
Static Public Attributes | |
static const int | kSaneNumConcreteChars = 4 |
static const char | kAlphaPatternUnicode [] = "\u2000" |
static const char | kDigitPatternUnicode [] = "\u2001" |
static const char | kAlphanumPatternUnicode [] = "\u2002" |
static const char | kPuncPatternUnicode [] = "\u2003" |
static const char | kLowerPatternUnicode [] = "\u2004" |
static const char | kUpperPatternUnicode [] = "\u2005" |
Protected Member Functions | |
EDGE_RECORD * | deref_edge_ref (EDGE_REF edge_ref) const |
EDGE_REF | make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const |
void | link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | print_edge_rec (const EDGE_RECORD &edge_rec) const |
bool | can_be_eliminated (const EDGE_RECORD &edge_rec) |
void | print_all (const char *msg, int max_num_edges) |
bool | edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const |
bool | add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
bool | add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id) |
void | add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id) |
NODE_REF | new_dawg_node () |
void | remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) |
bool | eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2) |
bool | reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes) |
void | sort_edges (EDGE_VECTOR *edges) |
void | reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes) |
UNICHAR_ID | character_class_to_pattern (char ch) |
Protected Attributes | |
TRIE_NODES | nodes_ |
uinT64 | num_edges_ |
uinT64 | max_num_edges_ |
uinT64 | deref_direction_mask_ |
uinT64 | deref_node_index_mask_ |
bool | initialized_patterns_ |
UNICHAR_ID | alpha_pattern_ |
UNICHAR_ID | digit_pattern_ |
UNICHAR_ID | alphanum_pattern_ |
UNICHAR_ID | punc_pattern_ |
UNICHAR_ID | lower_pattern_ |
UNICHAR_ID | upper_pattern_ |
Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.
tesseract::Trie::Trie | ( | DawgType | type, |
const STRING & | lang, | ||
PermuterType | perm, | ||
uinT64 | max_num_edges, | ||
int | unicharset_size, | ||
int | debug_level | ||
) | [inline] |
Definition at line 89 of file trie.h.
{ init(type, lang, perm, unicharset_size, debug_level); num_edges_ = 0; max_num_edges_ = max_num_edges; deref_node_index_mask_ = ~letter_mask_; new_dawg_node(); // need to allocate node 0 initialized_patterns_ = false; }
virtual tesseract::Trie::~Trie | ( | ) | [inline, virtual] |
Definition at line 98 of file trie.h.
{ nodes_.delete_data_pointers(); }
bool tesseract::Trie::add_edge_linkage | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | repeats, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
Definition at line 123 of file trie.cpp.
{ if (num_edges_ == max_num_edges_) return false; EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); int search_index; if (node1 == 0) { search_index = 0; // find the index to make the add sorted while (search_index < vec->size() && given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) { search_index++; } } else { search_index = vec->size(); // add is unsorted, so index does not matter } EDGE_RECORD edge_rec; link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); if (search_index < vec->size()) { vec->insert(edge_rec, search_index); } else { vec->push_back(edge_rec); } if (debug_level_ > 1) { tprintf("new edge in nodes_[" REFFORMAT "]: ", node1); print_edge_rec(edge_rec); tprintf("\n"); } num_edges_++; return true; }
bool tesseract::Trie::add_new_edge | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | repeats, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
Definition at line 332 of file trie.h.
{ return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) && add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id)); }
void tesseract::Trie::add_word_ending | ( | EDGE_RECORD * | edge, |
NODE_REF | the_next_node, | ||
bool | repeats, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
Definition at line 156 of file trie.cpp.
{ EDGE_RECORD *back_edge_ptr; EDGE_INDEX back_edge_index; ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr, &back_edge_index)); if (marker_flag) { *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_); *edge_ptr |= (MARKER_FLAG << flag_start_bit_); } // Mark both directions as end of word. *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_); *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); }
bool tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word, |
const GenericVector< bool > * | repetitions | ||
) |
Definition at line 173 of file trie.cpp.
{ if (word.length() <= 0) return false; // can't add empty words if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length()); // Make sure the word does not contain invalid unchar ids. for (int i = 0; i < word.length(); ++i) { if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) return false; } EDGE_RECORD *edge_ptr; NODE_REF last_node = 0; NODE_REF the_next_node; bool marker_flag = false; EDGE_INDEX edge_index; int i; inT32 still_finding_chars = true; inT32 word_end = false; bool add_failed = false; bool found; if (debug_level_ > 1) word.print("\nAdding word: "); UNICHAR_ID unichar_id; for (i = 0; i < word.length() - 1; ++i) { unichar_id = word.unichar_id(i); marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); if (still_finding_chars) { found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr, &edge_index); if (found && debug_level_ > 1) { tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node); } if (!found) { still_finding_chars = false; } else if (next_node_from_edge_rec(*edge_ptr) == 0) { word_end = true; still_finding_chars = false; remove_edge(last_node, 0, word_end, unichar_id); } else { if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr); last_node = next_node_from_edge_rec(*edge_ptr); } } if (!still_finding_chars) { the_next_node = new_dawg_node(); if (debug_level_ > 1) tprintf("adding node " REFFORMAT "\n", the_next_node); if (the_next_node == 0) { add_failed = true; break; } if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) { add_failed = true; break; } word_end = false; last_node = the_next_node; } } the_next_node = 0; unichar_id = word.unichar_id(i); marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); if (still_finding_chars && edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) { // An extension of this word already exists in the trie, so we // only have to add the ending flags in both directions. add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id); } else { if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) add_failed = true; } if (add_failed) { tprintf("Re-initializing document dictionary...\n"); clear(); return false; } else { return true; } }
bool tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word | ) | [inline] |
Definition at line 245 of file trie.h.
{ return add_word_to_dawg(word, NULL); }
bool tesseract::Trie::can_be_eliminated | ( | const EDGE_RECORD & | edge_rec | ) | [inline, protected] |
UNICHAR_ID tesseract::Trie::character_class_to_pattern | ( | char | ch | ) | [protected] |
Definition at line 346 of file trie.cpp.
{ if (ch == 'c') { return alpha_pattern_; } else if (ch == 'd') { return digit_pattern_; } else if (ch == 'n') { return alphanum_pattern_; } else if (ch == 'p') { return punc_pattern_; } else if (ch == 'a') { return lower_pattern_; } else if (ch == 'A') { return upper_pattern_; } else { return INVALID_UNICHAR_ID; } }
void tesseract::Trie::clear | ( | ) |
Definition at line 65 of file trie.cpp.
{ nodes_.delete_data_pointers(); nodes_.clear(); num_edges_ = 0; new_dawg_node(); // Need to allocate node 0. }
EDGE_RECORD* tesseract::Trie::deref_edge_ref | ( | EDGE_REF | edge_ref | ) | const [inline, protected] |
Definition at line 267 of file trie.h.
{ int edge_index = static_cast<int>( (edge_ref & letter_mask_) >> LETTER_START_BIT); int node_index = static_cast<int>( (edge_ref & deref_node_index_mask_) >> flag_start_bit_); TRIE_NODE_RECORD *node_rec = nodes_[node_index]; return &(node_rec->forward_edges[edge_index]); }
EDGE_REF tesseract::Trie::edge_char_of | ( | NODE_REF | node_ref, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | const [inline, virtual] |
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
Definition at line 104 of file trie.h.
{ EDGE_RECORD *edge_ptr; EDGE_INDEX edge_index; if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr, &edge_index)) return NO_EDGE; return make_edge_ref(node_ref, edge_index); }
bool tesseract::Trie::edge_char_of | ( | NODE_REF | node_ref, |
NODE_REF | next_node, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id, | ||
EDGE_RECORD ** | edge_ptr, | ||
EDGE_INDEX * | edge_index | ||
) | const [protected] |
Definition at line 72 of file trie.cpp.
{ if (debug_level_ == 3) { tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT " direction %d word_end %d unichar_id %d, exploring node:\n", node_ref, next_node, direction, word_end, unichar_id); if (node_ref != NO_EDGE) { print_node(node_ref, nodes_[node_ref]->forward_edges.size()); } } if (node_ref == NO_EDGE) return false; assert(node_ref < nodes_.size()); EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges; int vec_size = vec.size(); if (node_ref == 0) { // binary search EDGE_INDEX start = 0; EDGE_INDEX end = vec_size - 1; EDGE_INDEX k; int compare; while (start <= end) { k = (start + end) >> 1; // (start + end) / 2 compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]); if (compare == 0) { // given == vec[k] *edge_ptr = &(vec[k]); *edge_index = k; return true; } else if (compare == 1) { // given > vec[k] start = k + 1; } else { // given < vec[k] end = k - 1; } } } else { // linear search for (int i = 0; i < vec_size; ++i) { EDGE_RECORD &edge_rec = vec[i]; if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec), end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) { *edge_ptr = &(edge_rec); *edge_index = i; return true; } } } return false; // not found }
UNICHAR_ID tesseract::Trie::edge_letter | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 145 of file trie.h.
{ if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); }
bool tesseract::Trie::eliminate_redundant_edges | ( | NODE_REF | node, |
const EDGE_RECORD & | edge1, | ||
const EDGE_RECORD & | edge2 | ||
) | [protected] |
Definition at line 512 of file trie.cpp.
{ if (debug_level_ > 1) { tprintf("\nCollapsing node %d:\n", node); print_node(node, MAX_NODE_EDGES_DISPLAY); tprintf("Candidate edges: "); print_edge_rec(edge1); tprintf(", "); print_edge_rec(edge2); tprintf("\n\n"); } NODE_REF next_node1 = next_node_from_edge_rec(edge1); NODE_REF next_node2 = next_node_from_edge_rec(edge2); TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2]; // Translate all edges going to/from next_node2 to go to/from next_node1. EDGE_RECORD *edge_ptr = NULL; EDGE_INDEX edge_index; int i; // Remove the backward link in node to next_node2. const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0]; remove_edge_linkage(node, next_node2, BACKWARD_EDGE, end_of_word_from_edge_rec(fwd_edge), unichar_id_from_edge_rec(fwd_edge)); // Copy all the backward links in next_node2 to node next_node1 for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) { const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i]; NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge); UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge); int curr_word_end = end_of_word_from_edge_rec(bkw_edge); bool marker_flag = marker_flag_from_edge_rec(bkw_edge); add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end, curr_unichar_id); // Relocate the corresponding forward edge in curr_next_node ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end, curr_unichar_id, &edge_ptr, &edge_index)); set_next_node_in_edge_rec(edge_ptr, next_node1); } int next_node2_num_edges = (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size()); if (debug_level_ > 1) { tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2); } next_node2_ptr->forward_edges.clear(); next_node2_ptr->backward_edges.clear(); num_edges_ -= next_node2_num_edges; return true; }
bool tesseract::Trie::end_of_word | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
Definition at line 139 of file trie.h.
{ if (edge_ref == NO_EDGE || num_edges_ == 0) return false; return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); }
const char * tesseract::Trie::get_reverse_policy_name | ( | RTLReversePolicy | reverse_policy | ) | [static] |
Definition at line 60 of file trie.cpp.
{ return RTLReversePolicyNames[reverse_policy]; }
void tesseract::Trie::initialize_patterns | ( | UNICHARSET * | unicharset | ) |
Definition at line 307 of file trie.cpp.
{ unicharset->unichar_insert(kAlphaPatternUnicode); alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); unicharset->unichar_insert(kDigitPatternUnicode); digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); unicharset->unichar_insert(kAlphanumPatternUnicode); alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); unicharset->unichar_insert(kPuncPatternUnicode); punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); unicharset->unichar_insert(kLowerPatternUnicode); lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); unicharset->unichar_insert(kUpperPatternUnicode); upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); initialized_patterns_ = true; unicharset_size_ = unicharset->size(); }
void tesseract::Trie::link_edge | ( | EDGE_RECORD * | edge, |
NODE_REF | nxt, | ||
bool | repeats, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
Sets up this edge record to the requested values.
Definition at line 282 of file trie.h.
{ EDGE_RECORD flags = 0; if (repeats) flags |= MARKER_FLAG; if (word_end) flags |= WERD_END_FLAG; if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) | (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT)); }
EDGE_REF tesseract::Trie::make_edge_ref | ( | NODE_REF | node_index, |
EDGE_INDEX | edge_index | ||
) | const [inline, protected] |
Constructs EDGE_REF from the given node_index and edge_index.
Definition at line 276 of file trie.h.
{ return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT)); }
NODE_REF tesseract::Trie::new_dawg_node | ( | ) | [protected] |
Definition at line 261 of file trie.cpp.
{ TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD(); if (node == NULL) return 0; // failed to create new node nodes_.push_back(node); return nodes_.length() - 1; }
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 130 of file trie.h.
{ if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); }
virtual EDGE_REF tesseract::Trie::pattern_loop_edge | ( | EDGE_REF | edge_ref, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | const [inline, virtual] |
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.
Reimplemented from tesseract::Dawg.
Definition at line 223 of file trie.h.
{ if (edge_ref == NO_EDGE) return NO_EDGE; EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref); return (marker_flag_from_edge_rec(*edge_rec) && unichar_id == unichar_id_from_edge_rec(*edge_rec) && word_end == end_of_word_from_edge_rec(*edge_rec)) ? edge_ref : NO_EDGE; }
void tesseract::Trie::print_all | ( | const char * | msg, |
int | max_num_edges | ||
) | [inline, protected] |
void tesseract::Trie::print_edge_rec | ( | const EDGE_RECORD & | edge_rec | ) | const [inline, protected] |
Prints the given EDGE_RECORD.
Definition at line 293 of file trie.h.
{ tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec), marker_flag_from_edge_rec(edge_rec) ? "R," : "", (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec)); }
void tesseract::Trie::print_node | ( | NODE_REF | node, |
int | max_num_edges | ||
) | const [virtual] |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.
Implements tesseract::Dawg.
Definition at line 648 of file trie.cpp.
{ if (node == NO_EDGE) return; // nothing to print TRIE_NODE_RECORD *node_ptr = nodes_[node]; int num_fwd = node_ptr->forward_edges.size(); int num_bkw = node_ptr->backward_edges.size(); EDGE_VECTOR *vec; for (int dir = 0; dir < 2; ++dir) { if (dir == 0) { vec = &(node_ptr->forward_edges); tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw); } else { vec = &(node_ptr->backward_edges); tprintf("\t"); } int i; for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) { print_edge_rec((*vec)[i]); tprintf(" "); } if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("..."); tprintf("\n"); } }
bool tesseract::Trie::read_pattern_list | ( | const char * | filename, |
const UNICHARSET & | unicharset | ||
) |
Definition at line 364 of file trie.cpp.
{ if (!initialized_patterns_) { tprintf("please call initialize_patterns() before read_pattern_list()\n"); return false; } FILE *pattern_file = open_file (filename, "r"); if (pattern_file == NULL) { tprintf("Error opening pattern file %s\n", filename); return false; } int pattern_count = 0; char string[CHARS_PER_LINE]; while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) { chomp_string(string); // remove newline // Parse the pattern and construct a unichar id vector. // Record the number of repetitions of each unichar in the parallel vector. WERD_CHOICE word(&unicharset); GenericVector<bool> repetitions_vec; const char *str_ptr = string; int step = unicharset.step(str_ptr); bool failed = false; while (step > 0) { UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; if (step == 1 && *str_ptr == '\\') { ++str_ptr; if (*str_ptr == '\\') { // regular '\' unichar that was escaped curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); } else { if (word.length() < kSaneNumConcreteChars) { tprintf("Please provide at least %d concrete characters at the" " beginning of the pattern\n", kSaneNumConcreteChars); failed = true; break; } // Parse character class from expression. curr_unichar_id = character_class_to_pattern(*str_ptr); } } else { curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); } if (curr_unichar_id == INVALID_UNICHAR_ID) { failed = true; break; // failed to parse this pattern } word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0); repetitions_vec.push_back(false); str_ptr += step; step = unicharset.step(str_ptr); // Check if there is a repetition pattern specified after this unichar. if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') { repetitions_vec[repetitions_vec.size()-1] = true; str_ptr += 2; step = unicharset.step(str_ptr); } } if (failed) { tprintf("Invalid user pattern %s\n", string); continue; } // Insert the pattern into the trie. if (debug_level_ > 2) { tprintf("Inserting expanded user pattern %s\n", word.debug_string().string()); } if (!this->word_in_dawg(word)) { this->add_word_to_dawg(word, &repetitions_vec); if (!this->word_in_dawg(word)) { tprintf("Error: failed to insert pattern '%s'\n", string); } } ++pattern_count; } if (debug_level_) { tprintf("Read %d valid patterns from %s\n", pattern_count, filename); } fclose(pattern_file); return true; }
bool tesseract::Trie::read_word_list | ( | const char * | filename, |
const UNICHARSET & | unicharset, | ||
Trie::RTLReversePolicy | reverse | ||
) |
Definition at line 268 of file trie.cpp.
{ FILE *word_file; char string[CHARS_PER_LINE]; int word_count = 0; word_file = open_file (filename, "r"); while (fgets(string, CHARS_PER_LINE, word_file) != NULL) { chomp_string(string); // remove newline WERD_CHOICE word(string, unicharset); if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) || reverse_policy == RRP_FORCE_REVERSE) { word.reverse_and_mirror_unichar_ids(); } ++word_count; if (debug_level_ && word_count % 10000 == 0) tprintf("Read %d words so far\n", word_count); if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { if (!this->word_in_dawg(word)) { this->add_word_to_dawg(word); if (!this->word_in_dawg(word)) { tprintf("Error: word '%s' not in DAWG after adding it\n", string); return false; } } } else if (debug_level_) { tprintf("Skipping invalid word %s\n", string); if (debug_level_ >= 3) word.print(); } } if (debug_level_) tprintf("Read %d words total.\n", word_count); fclose(word_file); return true; }
bool tesseract::Trie::reduce_lettered_edges | ( | EDGE_INDEX | edge_index, |
UNICHAR_ID | unichar_id, | ||
NODE_REF | node, | ||
const EDGE_VECTOR & | backward_edges, | ||
NODE_MARKER | reduced_nodes | ||
) | [protected] |
Definition at line 563 of file trie.cpp.
{ if (debug_level_ > 1) tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index); // Compare each of the edge pairs with the given unichar_id. bool did_something = false; for (int i = edge_index; i < backward_edges.size() - 1; ++i) { // Find the first edge that can be eliminated. UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; while (i < backward_edges.size() && ((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) == unichar_id) && !can_be_eliminated(backward_edges[i])) ++i; if (i == backward_edges.size() || curr_unichar_id != unichar_id) break; const EDGE_RECORD &edge_rec = backward_edges[i]; // Compare it to the rest of the edges with the given unichar_id. for (int j = i + 1; j < backward_edges.size(); ++j) { const EDGE_RECORD &next_edge_rec = backward_edges[j]; if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break; if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) && can_be_eliminated(next_edge_rec) && eliminate_redundant_edges(node, edge_rec, next_edge_rec)) { reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0; did_something = true; --j; // do not increment j if next_edge_rec was removed } } } return did_something; }
void tesseract::Trie::reduce_node_input | ( | NODE_REF | node, |
NODE_MARKER | reduced_nodes | ||
) | [protected] |
Eliminates any redundant edges from this node in the Trie.
Definition at line 615 of file trie.cpp.
{ if (debug_level_ > 1) { tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); print_node(node, MAX_NODE_EDGES_DISPLAY); } EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; if (node != 0) sort_edges(&backward_edges); EDGE_INDEX edge_index = 0; while (edge_index < backward_edges.size()) { UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]); while (reduce_lettered_edges(edge_index, unichar_id, node, backward_edges, reduced_nodes)); while (++edge_index < backward_edges.size() && unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id); } reduced_nodes[node] = true; // mark as reduced if (debug_level_ > 1) { tprintf("Node " REFFORMAT " after reduction:\n", node); print_node(node, MAX_NODE_EDGES_DISPLAY); } for (int i = 0; i < backward_edges.size(); ++i) { NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]); if (next_node != 0 && !reduced_nodes[next_node]) { reduce_node_input(next_node, reduced_nodes); } } }
void tesseract::Trie::remove_edge | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
Definition at line 357 of file trie.h.
{ remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); }
void tesseract::Trie::remove_edge_linkage | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
Definition at line 446 of file trie.cpp.
{ EDGE_RECORD *edge_ptr = NULL; EDGE_INDEX edge_index = 0; ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index)); if (debug_level_ > 1) { tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1); print_edge_rec(*edge_ptr); tprintf("\n"); } if (direction == FORWARD_EDGE) { nodes_[node1]->forward_edges.remove(edge_index); } else { nodes_[node1]->backward_edges.remove(edge_index); } --num_edges_; }
void tesseract::Trie::sort_edges | ( | EDGE_VECTOR * | edges | ) | [protected] |
Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.
Definition at line 598 of file trie.cpp.
{ int num_edges = edges->size(); if (num_edges <= 1) return; for (int i = 0; i < num_edges - 1; ++i) { int min = i; for (int j = (i + 1); j < num_edges; ++j) { if (unichar_id_from_edge_rec((*edges)[j]) < unichar_id_from_edge_rec((*edges)[min])) min = j; } if (i != min) { EDGE_RECORD temp = (*edges)[i]; (*edges)[i] = (*edges)[min]; (*edges)[min] = temp; } } }
SquishedDawg * tesseract::Trie::trie_to_dawg | ( | ) |
Definition at line 465 of file trie.cpp.
{ if (debug_level_ > 2) { print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY); } NODE_MARKER reduced_nodes = new bool[nodes_.size()]; for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0; this->reduce_node_input(0, reduced_nodes); delete[] reduced_nodes; if (debug_level_ > 2) { print_all("After reduction:", MAX_NODE_EDGES_DISPLAY); } // Build a translation map from node indices in nodes_ vector to // their target indices in EDGE_ARRAY. NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1]; int i, j; node_ref_map[0] = 0; for (i = 0; i < nodes_.size(); ++i) { node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size(); } int num_forward_edges = node_ref_map[i]; // Convert nodes_ vector into EDGE_ARRAY translating the next node references // in edges using node_ref_map. Empty nodes and backward edges are dropped. EDGE_ARRAY edge_array = (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD)); EDGE_ARRAY edge_array_ptr = edge_array; for (i = 0; i < nodes_.size(); ++i) { TRIE_NODE_RECORD *node_ptr = nodes_[i]; int end = node_ptr->forward_edges.size(); for (j = 0; j < end; ++j) { EDGE_RECORD &edge_rec = node_ptr->forward_edges[j]; NODE_REF node_ref = next_node_from_edge_rec(edge_rec); ASSERT_HOST(node_ref < nodes_.size()); UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec); link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE, end_of_word_from_edge_rec(edge_rec), unichar_id); if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr); ++edge_array_ptr; } } delete[] node_ref_map; return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_, debug_level_); }
void tesseract::Trie::unichar_id_to_patterns | ( | UNICHAR_ID | unichar_id, |
const UNICHARSET & | unicharset, | ||
GenericVector< UNICHAR_ID > * | vec | ||
) | const [virtual] |
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented from tesseract::Dawg.
Definition at line 324 of file trie.cpp.
{ bool is_alpha = unicharset.get_isalpha(unichar_id); if (is_alpha) { vec->push_back(alpha_pattern_); vec->push_back(alphanum_pattern_); if (unicharset.get_islower(unichar_id)) { vec->push_back(lower_pattern_); } else if (unicharset.get_isupper(unichar_id)) { vec->push_back(upper_pattern_); } } if (unicharset.get_isdigit(unichar_id)) { vec->push_back(digit_pattern_); if (!is_alpha) vec->push_back(alphanum_pattern_); } if (unicharset.get_ispunctuation(unichar_id)) { vec->push_back(punc_pattern_); } }
void tesseract::Trie::unichar_ids_of | ( | NODE_REF | node, |
NodeChildVector * | vec | ||
) | const [inline, virtual] |
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
Definition at line 117 of file trie.h.
{ const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges; for (int i = 0; i < forward_edges.size(); ++i) { vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), make_edge_ref(node, i))); } }
UNICHAR_ID tesseract::Trie::alpha_pattern_ [protected] |
UNICHAR_ID tesseract::Trie::alphanum_pattern_ [protected] |
uinT64 tesseract::Trie::deref_direction_mask_ [protected] |
uinT64 tesseract::Trie::deref_node_index_mask_ [protected] |
UNICHAR_ID tesseract::Trie::digit_pattern_ [protected] |
bool tesseract::Trie::initialized_patterns_ [protected] |
const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002" [static] |
const char tesseract::Trie::kAlphaPatternUnicode = "\u2000" [static] |
const char tesseract::Trie::kDigitPatternUnicode = "\u2001" [static] |
const char tesseract::Trie::kLowerPatternUnicode = "\u2004" [static] |
const char tesseract::Trie::kPuncPatternUnicode = "\u2003" [static] |
const int tesseract::Trie::kSaneNumConcreteChars = 4 [static] |
const char tesseract::Trie::kUpperPatternUnicode = "\u2005" [static] |
UNICHAR_ID tesseract::Trie::lower_pattern_ [protected] |
uinT64 tesseract::Trie::max_num_edges_ [protected] |
TRIE_NODES tesseract::Trie::nodes_ [protected] |
uinT64 tesseract::Trie::num_edges_ [protected] |
UNICHAR_ID tesseract::Trie::punc_pattern_ [protected] |
UNICHAR_ID tesseract::Trie::upper_pattern_ [protected] |