Tesseract
3.02
|
#include <dawg.h>
Public Member Functions | |
SquishedDawg (FILE *file, DawgType type, const STRING &lang, PermuterType perm, int debug_level) | |
SquishedDawg (const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level) | |
SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level) | |
~SquishedDawg () | |
int | NumEdges () |
EDGE_REF | edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const |
Returns the edge that corresponds to the letter out of this node. | |
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec) const |
NODE_REF | next_node (EDGE_REF edge) const |
bool | end_of_word (EDGE_REF edge_ref) const |
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. | |
void | print_node (NODE_REF node, int max_num_edges) const |
void | write_squished_dawg (FILE *file) |
Writes the squished/reduced Dawg to a file. | |
void | write_squished_dawg (const char *filename) |
Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words cannot be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).
tesseract::SquishedDawg::SquishedDawg | ( | EDGE_ARRAY | edges, |
int | num_edges, | ||
DawgType | type, | ||
const STRING & | lang, | ||
PermuterType | perm, | ||
int | unicharset_size, | ||
int | debug_level | ||
) | [inline] |
tesseract::SquishedDawg::~SquishedDawg | ( | ) |
EDGE_REF tesseract::SquishedDawg::edge_char_of | ( | NODE_REF | node, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | const [virtual] |
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
Definition at line 183 of file dawg.cpp.
{
  // Looks for an edge leaving `node` that carries `unichar_id` (and, when
  // `word_end` is set, is also flagged as an end of word). Returns that
  // edge's reference, or NO_EDGE when there is no match.
  if (node == 0) {
    // Node 0: binary search over its forward edges.
    EDGE_REF lo = 0;
    EDGE_REF hi = num_forward_edges_in_node0 - 1;
    while (lo <= hi) {
      const EDGE_REF mid = (lo + hi) >> 1;  // (lo + hi) / 2
      const int cmp = given_greater_than_edge_rec(NO_EDGE, word_end,
                                                  unichar_id, edges_[mid]);
      if (cmp == 0) return mid;  // given == edges_[mid]
      if (cmp == 1) {
        lo = mid + 1;  // given > edges_[mid]
      } else {
        hi = mid - 1;  // given < edges_[mid]
      }
    }
    return NO_EDGE;  // not found
  }

  // Any other node: linear scan of its edges up to the one marked last.
  if (node == NO_EDGE || !edge_occupied(node)) return NO_EDGE;

  EDGE_REF candidate = node;
  for (;;) {
    const bool same_letter =
        unichar_id_from_edge_rec(edges_[candidate]) == unichar_id;
    if (same_letter &&
        (!word_end || end_of_word_from_edge_rec(edges_[candidate]))) {
      return candidate;
    }
    if (last_edge(candidate)) break;
    ++candidate;
  }
  return NO_EDGE;  // not found
}
UNICHAR_ID tesseract::SquishedDawg::edge_letter | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 410 of file dawg.h.
{
  // Extract the UNICHAR_ID from the edge record stored at index edge_ref.
  return unichar_id_from_edge_rec(edges_[edge_ref]);
}
bool tesseract::SquishedDawg::end_of_word | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
Definition at line 405 of file dawg.h.
{
  // Read the end-of-word marker from the edge record stored at index edge_ref.
  return end_of_word_from_edge_rec(edges_[edge_ref]);
}
NODE_REF tesseract::SquishedDawg::next_node | ( | EDGE_REF | edge | ) | const [inline, virtual] |
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 399 of file dawg.h.
{
  // Read the next-node reference from the edge record stored at index edge.
  return next_node_from_edge_rec(edges_[edge]);
}
int tesseract::SquishedDawg::NumEdges | ( | ) | [inline] |
void tesseract::SquishedDawg::print_node | ( | NODE_REF | node, |
int | max_num_edges | ||
) | const [virtual] |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.
Implements tesseract::Dawg.
Definition at line 228 of file dawg.cpp.
{
  // Prints, via tprintf, one line per edge of the node that starts at `node`:
  // first the run of edges beginning at `node` (up to the edge marked last),
  // then, if the edge right after that run is an occupied backward edge, the
  // run of backward edges as well.
  //
  // `max_num_edges` limits the output of BOTH runs. The limit is tested after
  // each edge has been printed, against the offset (edge - node), so printing
  // stops once that offset exceeds max_num_edges. When the limit triggers the
  // function returns immediately, without the trailing blank line.
  if (node == NO_EDGE) return;  // nothing to print

  EDGE_REF edge = node;
  // NOTE(review): the three "blank" placeholders below are a single space in
  // this listing; alignment padding may have been lost when the page was
  // extracted -- confirm against dawg.cpp.
  const char *forward_string = "FORWARD";
  const char *backward_string = " ";
  const char *last_string = "LAST";
  const char *not_last_string = " ";
  const char *eow_string = "EOW";
  const char *not_eow_string = " ";
  const char *direction;
  const char *is_last;
  const char *eow;
  UNICHAR_ID unichar_id;

  if (edge_occupied(edge)) {
    do {
      direction = forward_edge(edge) ? forward_string : backward_string;
      is_last = last_edge(edge) ? last_string : not_last_string;
      eow = end_of_word(edge) ? eow_string : not_eow_string;
      unichar_id = edge_letter(edge);
      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
              edge, next_node(edge), unichar_id, direction, is_last, eow);

      if (edge - node > max_num_edges) return;
    } while (!last_edge(edge++));

    if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
      do {
        direction = forward_edge(edge) ? forward_string : backward_string;
        is_last = last_edge(edge) ? last_string : not_last_string;
        eow = end_of_word(edge) ? eow_string : not_eow_string;
        unichar_id = edge_letter(edge);
        tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
                edge, next_node(edge), unichar_id, direction, is_last, eow);

        // Honour the caller-supplied limit here too; this loop previously
        // compared against the global MAX_NODE_EDGES_DISPLAY, ignoring the
        // max_num_edges argument.
        if (edge - node > max_num_edges) return;
      } while (!last_edge(edge++));
    }
  } else {
    tprintf(REFFORMAT " : no edges in this node\n", node);
  }
  tprintf("\n");
}
void tesseract::SquishedDawg::unichar_ids_of | ( | NODE_REF | node, |
NodeChildVector * | vec | ||
) | const [inline, virtual] |
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
Definition at line 388 of file dawg.h.
{
  // Appends to *vec one NodeChild(unichar id, edge reference) for every edge
  // in the run that starts at `node` and ends at the edge marked last.
  // Leaves *vec untouched when `node` is NO_EDGE or its first edge is not
  // occupied.
  EDGE_REF edge = node;

  // Test for NO_EDGE first so the sentinel is never passed to
  // edge_occupied() (presumably an index into edges_ -- the same ordering is
  // used by edge_char_of).
  if (edge == NO_EDGE || !edge_occupied(edge)) return;

  // We don't expect any backward edges to be present when this function is
  // called.
  assert(forward_edge(edge));

  do {
    vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
  } while (!last_edge(edge++));
}
void tesseract::SquishedDawg::write_squished_dawg | ( | FILE * | file | ) |
Writes the squished/reduced Dawg to a file.
Definition at line 369 of file dawg.cpp.
{
  // Serializes this Dawg to `file` in the following order: magic number,
  // unicharset size, number of forward edges, then every forward edge record
  // with its next-node field replaced by the value looked up in the node map.
  if (debug_level_) tprintf("write_squished_dawg\n");

  inT32 node_count = 0;
  NODE_MAP node_map = build_node_map(&node_count);

  // The magic number helps detecting a change in endianness.
  inT16 magic = kDawgMagicNumber;
  fwrite(&magic, sizeof(magic), 1, file);
  fwrite(&unicharset_size_, sizeof(inT32), 1, file);

  // Only forward edges are written, so only they are counted.
  inT32 num_edges = 0;
  for (EDGE_REF i = 0; i < num_edges_; ++i) {
    if (forward_edge(i)) ++num_edges;
  }
  fwrite(&num_edges, sizeof(num_edges), 1, file);  // edge count

  if (debug_level_) {
    tprintf("%d nodes in DAWG\n", node_count);
    tprintf("%d edges in DAWG\n", num_edges);
  }

  EDGE_REF edge = 0;
  while (edge < num_edges_) {
    if (!forward_edge(edge)) {
      ++edge;
      continue;
    }

    // Write the run of forward edges. Each record is temporarily rewritten
    // with the mapped next-node value, copied out, and then restored.
    do {
      const EDGE_REF saved_next = next_node_from_edge_rec(edges_[edge]);
      set_next_node(edge, node_map[saved_next]);
      EDGE_RECORD record = edges_[edge];
      fwrite(&record, sizeof(EDGE_RECORD), 1, file);
      set_next_node(edge, saved_next);
    } while (!last_edge(edge++));

    if (edge >= num_edges_) break;

    // Skip back links; `edge` ends up on the first edge after them.
    if (backward_edge(edge)) {
      while (!last_edge(edge++)) {
      }
    }
  }

  free(node_map);
}
void tesseract::SquishedDawg::write_squished_dawg | ( | const char * | filename | ) | [inline] |