Tesseract  3.02
tesseract::SquishedDawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::SquishedDawg:
tesseract::Dawg

List of all members.

Public Member Functions

 SquishedDawg (FILE *file, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 SquishedDawg (const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 ~SquishedDawg ()
int NumEdges ()
EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
 Returns the edge that corresponds to the letter out of this node.
void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF next_node (EDGE_REF edge) const
bool end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void print_node (NODE_REF node, int max_num_edges) const
void write_squished_dawg (FILE *file)
 Writes the squished/reduced Dawg to a file.
void write_squished_dawg (const char *filename)

Detailed Description

Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words can not be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).

Definition at line 352 of file dawg.h.


Constructor & Destructor Documentation

tesseract::SquishedDawg::SquishedDawg ( FILE *  file,
DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  debug_level 
) [inline]

Definition at line 354 of file dawg.h.

                                                   {
    // Populate this Dawg from the caller's already-open stream. The stream
    // is not closed here; it remains the caller's responsibility.
    read_squished_dawg(file, type, lang, perm, debug_level);
    // Cache the forward-edge count of node 0; edge_char_of() uses it as the
    // upper bound of its binary search over the root node.
    num_forward_edges_in_node0 = num_forward_edges(0);
  }
tesseract::SquishedDawg::SquishedDawg ( const char *  filename,
DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  debug_level 
) [inline]

Definition at line 359 of file dawg.h.

                                                                       {
    // Open the dawg file for binary reading; failure to open is fatal.
    FILE *dawg_fp = fopen(filename, "rb");
    if (!dawg_fp) {
      tprintf("Failed to open dawg file %s\n", filename);
      exit(1);
    }
    // Load the Dawg, then cache the forward-edge count of node 0.
    read_squished_dawg(dawg_fp, type, lang, perm, debug_level);
    num_forward_edges_in_node0 = num_forward_edges(0);
    // The stream was opened by this constructor, so it is closed here.
    fclose(dawg_fp);
  }
tesseract::SquishedDawg::SquishedDawg ( EDGE_ARRAY  edges,
int  num_edges,
DawgType  type,
const STRING &  lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
) [inline]

Definition at line 370 of file dawg.h.

                                                     :
    // Adopt the caller-supplied edge array and its length as-is (no copy is
    // made here). Note that the destructor passes edges_ to memfree().
    edges_(edges), num_edges_(num_edges) {
    init(type, lang, perm, unicharset_size, debug_level);
    // Cache the forward-edge count of node 0; edge_char_of() uses it as the
    // upper bound of its binary search over the root node.
    num_forward_edges_in_node0 = num_forward_edges(0);
    // Dump the whole Dawg only at debug levels above 3.
    if (debug_level > 3) print_all("SquishedDawg:");
  }
tesseract::SquishedDawg::~SquishedDawg ( )

Definition at line 181 of file dawg.cpp.

{ memfree(edges_); }  // releases the edge array held in edges_

Member Function Documentation

EDGE_REF tesseract::SquishedDawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [virtual]

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 183 of file dawg.cpp.

                                                         {
  // Node 0 is searched with a binary search over its forward edges.
  if (node == 0) {
    EDGE_REF lo = 0;
    EDGE_REF hi = num_forward_edges_in_node0 - 1;
    while (lo <= hi) {
      const EDGE_REF mid = (lo + hi) >> 1;  // midpoint of [lo, hi]
      const int cmp = given_greater_than_edge_rec(NO_EDGE, word_end,
                                                  unichar_id, edges_[mid]);
      if (cmp == 0) return mid;  // exact match
      if (cmp == 1) {
        lo = mid + 1;            // wanted record sorts after edges_[mid]
      } else {
        hi = mid - 1;            // wanted record sorts before edges_[mid]
      }
    }
    return NO_EDGE;  // not found
  }

  // Every other node is scanned linearly, one edge at a time, stopping
  // after the edge flagged as the last one of the node.
  if (node == NO_EDGE || !edge_occupied(node)) return NO_EDGE;
  for (EDGE_REF edge = node; ; ++edge) {
    const bool same_letter =
        unichar_id_from_edge_rec(edges_[edge]) == unichar_id;
    // When word_end is requested the edge must also carry the end-of-word
    // flag; otherwise any edge with the right letter qualifies.
    if (same_letter &&
        (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
      return edge;
    }
    if (last_edge(edge)) break;
  }
  return NO_EDGE;  // not found
}
UNICHAR_ID tesseract::SquishedDawg::edge_letter ( EDGE_REF  edge_ref) const [inline, virtual]

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 410 of file dawg.h.

                                                  {
    // Fetch the edge record once, then decode the unichar id stored in it.
    const EDGE_RECORD &record = edges_[edge_ref];
    return unichar_id_from_edge_rec(record);
  }
bool tesseract::SquishedDawg::end_of_word ( EDGE_REF  edge_ref) const [inline, virtual]

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 405 of file dawg.h.

                                            {
    // Fetch the edge record once, then decode its end-of-word flag.
    const EDGE_RECORD &record = edges_[edge_ref];
    return end_of_word_from_edge_rec(record);
  }
NODE_REF tesseract::SquishedDawg::next_node ( EDGE_REF  edge) const [inline, virtual]

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 399 of file dawg.h.

                                          {
    // Fetch the edge record once, then decode the node it leads to.
    const EDGE_RECORD &record = edges_[edge];
    return next_node_from_edge_rec(record);
  }
int tesseract::SquishedDawg::NumEdges ( ) [inline]

Definition at line 380 of file dawg.h.

{ return num_edges_; }  // num_edges_ is the bound used when iterating edges_
void tesseract::SquishedDawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const [virtual]

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 228 of file dawg.cpp.

                                                                    {
  // Prints one line per edge of the given node via tprintf(): first the run
  // of edges that starts at |node|, then, if one follows immediately, the
  // run of backward edges. Printing stops early once more than
  // max_num_edges edges past |node| have been emitted; the check runs after
  // each line is printed, so the cap is approximate rather than exact.
  if (node == NO_EDGE) return;  // nothing to print

  EDGE_REF edge = node;

  // Column labels; the blank alternatives keep the columns aligned.
  const char *forward_string  = "FORWARD";
  const char *backward_string = "       ";
  const char *last_string     = "LAST";
  const char *not_last_string = "    ";
  const char *eow_string      = "EOW";
  const char *not_eow_string  = "   ";

  if (edge_occupied(edge)) {
    // First run: edges starting at |node|, up to the one flagged as last.
    do {
      const char *direction =
        forward_edge(edge) ? forward_string : backward_string;
      const char *is_last = last_edge(edge) ? last_string : not_last_string;
      const char *eow = end_of_word(edge) ? eow_string : not_eow_string;
      const UNICHAR_ID unichar_id = edge_letter(edge);

      tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
              edge, next_node(edge), unichar_id,
              direction, is_last, eow);

      if (edge - node > max_num_edges) return;
    } while (!last_edge(edge++));

    // Second run: backward edges stored directly after the first run.
    if (edge < num_edges_ &&
        edge_occupied(edge) && backward_edge(edge)) {
      do {
        const char *direction =
          forward_edge(edge) ? forward_string : backward_string;
        const char *is_last = last_edge(edge) ? last_string : not_last_string;
        const char *eow = end_of_word(edge) ? eow_string : not_eow_string;
        const UNICHAR_ID unichar_id = edge_letter(edge);

        tprintf(REFFORMAT " : next = " REFFORMAT
                ", unichar_id = %d, %s %s %s\n",
                edge, next_node(edge), unichar_id,
                direction, is_last, eow);

        // Honour the caller's limit here as well. This loop used to compare
        // against the global MAX_NODE_EDGES_DISPLAY, which silently ignored
        // the max_num_edges argument for backward edges.
        if (edge - node > max_num_edges) return;
      } while (!last_edge(edge++));
    }
  }
  else {
    tprintf(REFFORMAT " : no edges in this node\n", node);
  }
  tprintf("\n");
}
void tesseract::SquishedDawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector *  vec 
) const [inline, virtual]

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 388 of file dawg.h.

                                                                 {
    // Appends to |vec| one NodeChild (unichar id plus its EDGE_REF) for each
    // edge of |node|, walking from the first edge to the one flagged last.
    // Nothing is appended when |node| is NO_EDGE or has no occupied edge.
    EDGE_REF edge = node;
    // Test for NO_EDGE before calling edge_occupied(), so that an invalid
    // reference is never handed to it (same order as in edge_char_of()).
    if (edge == NO_EDGE || !edge_occupied(edge)) return;
    // Backward edges are not expected to be present when this function is
    // called.
    assert(forward_edge(edge));
    do {
      vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
    } while (!last_edge(edge++));
  }
void tesseract::SquishedDawg::write_squished_dawg ( FILE *  file)

Writes the squished/reduced Dawg to a file.

Definition at line 369 of file dawg.cpp.

                                                 {
  // Serializes this Dawg to |file|: a 16-bit magic number, the 32-bit
  // unicharset size, the 32-bit count of forward edges, then one EDGE_RECORD
  // per forward edge with its next-node field remapped through the table
  // returned by build_node_map(). The stream is left open for the caller.
  // NOTE(review): the fwrite() results are not checked, so a short write
  // goes unreported.
  EDGE_REF    edge;
  inT32       num_edges;
  inT32       node_count = 0;
  NODE_MAP    node_map;
  EDGE_REF    old_index;
  EDGE_RECORD temp_record;

  if (debug_level_) tprintf("write_squished_dawg\n");

  // node_map is indexed below by an edge's current next-node reference and
  // yields the reference to write out; node_count is filled in as well.
  node_map = build_node_map(&node_count);

  // Write the magic number to help detecting a change in endianness.
  inT16 magic = kDawgMagicNumber;
  fwrite(&magic, sizeof(inT16), 1, file);
  fwrite(&unicharset_size_, sizeof(inT32), 1, file);

  // Count the number of edges in this Dawg.
  // Only forward edges are counted, since only those are written below.
  num_edges = 0;
  for (edge=0; edge < num_edges_; edge++)
    if (forward_edge(edge))
      num_edges++;

  fwrite(&num_edges, sizeof(inT32), 1, file);  // write edge count to file

  if (debug_level_) {
    tprintf("%d nodes in DAWG\n", node_count);
    tprintf("%d edges in DAWG\n", num_edges);
  }

  for (edge = 0; edge < num_edges_; edge++) {
    if (forward_edge(edge)) {  // write forward edges
      do {
        // Temporarily patch the record in place with the remapped next-node
        // value, copy it out for writing, then restore the original value
        // so that the in-memory Dawg is unchanged afterwards.
        old_index = next_node_from_edge_rec(edges_[edge]);
        set_next_node(edge, node_map[old_index]);
        temp_record = edges_[edge];
        fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
        set_next_node(edge, old_index);
      } while (!last_edge(edge++));

      // The do-while leaves |edge| one past the run that was just written.
      if (edge >= num_edges_) break;
      if (backward_edge(edge))  // skip back links
        while (!last_edge(edge++));

      // Step back one slot: the for-loop's edge++ then lands on the first
      // edge that has not been examined yet.
      edge--;
    }
  }
  // Release the remapping table obtained from build_node_map().
  free(node_map);
}
void tesseract::SquishedDawg::write_squished_dawg ( const char *  filename) [inline]

Opens the file with the given filename and writes the squished/reduced Dawg to the file.

Definition at line 423 of file dawg.h.

                                                 {
    // Create or truncate the target in binary mode; failure is fatal.
    FILE *out = fopen(filename, "wb");
    if (!out) {
      tprintf("Error opening %s\n", filename);
      exit(1);
    }
    // Delegate the actual serialization to the FILE* overload.
    write_squished_dawg(out);
    // The stream was opened by this function, so it is closed here.
    fclose(out);
  }

The documentation for this class was generated from the following files: