Tesseract  3.02
tesseract-ocr/dict/dawg.h
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:         dawg.h  (Formerly dawg.h)
00005  * Description:  Definition of a class that represents Directed Accyclic Word
00006  *               Graph (DAWG), functions to build and manipulate the DAWG.
00007  * Author:       Mark Seaman, SW Productivity
00008  * Created:      Fri Oct 16 14:37:00 1987
00009  * Modified:     Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
00010  * Language:     C
00011  * Package:      N/A
00012  * Status:       Reusable Software Component
00013  *
00014  * (c) Copyright 1987, Hewlett-Packard Company.
00015  ** Licensed under the Apache License, Version 2.0 (the "License");
00016  ** you may not use this file except in compliance with the License.
00017  ** You may obtain a copy of the License at
00018  ** http://www.apache.org/licenses/LICENSE-2.0
00019  ** Unless required by applicable law or agreed to in writing, software
00020  ** distributed under the License is distributed on an "AS IS" BASIS,
00021  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00022  ** See the License for the specific language governing permissions and
00023  ** limitations under the License.
00024  *
00025  *********************************************************************************/
00026 
00027 #ifndef DICT_DAWG_H_
00028 #define DICT_DAWG_H_
00029 
00030 /*----------------------------------------------------------------------
00031               I n c l u d e s
00032 ----------------------------------------------------------------------*/
00033 
00034 #include "elst.h"
00035 #include "ratngs.h"
00036 #include "params.h"
00037 #include "tesscallback.h"
00038 
00039 #ifndef __GNUC__
00040 #ifdef _WIN32
00041 #define NO_EDGE                (inT64) 0xffffffffffffffffi64
00042 #endif  /*_WIN32*/
00043 #else
00044 #define NO_EDGE                (inT64) 0xffffffffffffffffll
00045 #endif /*__GNUC__*/
00046 
00047 /*----------------------------------------------------------------------
00048               T y p e s
00049 ----------------------------------------------------------------------*/
00050 class UNICHARSET;
00051 
00052 typedef uinT64 EDGE_RECORD;
00053 typedef EDGE_RECORD *EDGE_ARRAY;
00054 typedef inT64 EDGE_REF;
00055 typedef inT64 NODE_REF;
00056 typedef EDGE_REF *NODE_MAP;
00057 
00058 namespace tesseract {
00059 
00060 struct NodeChild {
00061   UNICHAR_ID unichar_id;
00062   EDGE_REF edge_ref;
00063   NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
00064   NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
00065 };
00066 
00067 typedef GenericVector<NodeChild> NodeChildVector;
00068 typedef GenericVector<int> SuccessorList;
00069 typedef GenericVector<SuccessorList *> SuccessorListsVector;
00070 
00071 enum DawgType {
00072   DAWG_TYPE_PUNCTUATION,
00073   DAWG_TYPE_WORD,
00074   DAWG_TYPE_NUMBER,
00075   DAWG_TYPE_PATTERN,
00076 
00077   DAWG_TYPE_COUNT  // number of enum entries
00078 };
00079 
00080 /*----------------------------------------------------------------------
00081               C o n s t a n t s
00082 ----------------------------------------------------------------------*/
00083 
00084 #define FORWARD_EDGE           (inT32) 0
00085 #define BACKWARD_EDGE          (inT32) 1
00086 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
00087 #define MARKER_FLAG            (inT64) 1
00088 #define DIRECTION_FLAG         (inT64) 2
00089 #define WERD_END_FLAG          (inT64) 4
00090 #define LETTER_START_BIT       0
00091 #define NUM_FLAG_BITS          3
00092 #define REFFORMAT "%lld"
00093 
00094 // Set kBeginningDawgsType[i] to true if a Dawg of
00095 // DawgType i can contain the beginning of a word.
00096 static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
00097 
00098 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
00099   { 0, 1, 1, 0 },  // for DAWG_TYPE_PUNCTUATION
00100   { 1, 0, 0, 0 },  // for DAWG_TYPE_WORD
00101   { 1, 0, 0, 0 },  // for DAWG_TYPE_NUMBER
00102   { 0, 0, 0, 0 },  // for DAWG_TYPE_PATTERN
00103 };
00104 
00105 static const char kWildcard[] = "*";
00106 
00107 
00108 /*----------------------------------------------------------------------
00109               C l a s s e s   a n d   S t r u c t s
00110 ----------------------------------------------------------------------*/
00111 //
00121 //
00122 class Dawg {
00123  public:
00125   static const inT16 kDawgMagicNumber = 42;
00129   static const UNICHAR_ID kPatternUnicharID = 0;
00130 
00131   inline DawgType type() const { return type_; }
00132   inline const STRING &lang() const { return lang_; }
00133   inline PermuterType permuter() const { return perm_; }
00134 
00135   virtual ~Dawg() {};
00136 
00138   bool word_in_dawg(const WERD_CHOICE &word) const;
00139 
00142   int check_for_words(const char *filename,
00143                       const UNICHARSET &unicharset,
00144                       bool enable_wildcard) const;
00145 
00146   // For each word in the Dawg, call the given (permanent) callback with the
00147   // text (UTF-8) version of the word.
00148   void iterate_words(const UNICHARSET &unicharset,
00149                      TessCallback1<const char *> *cb) const;
00150 
00151   // Pure virtual function that should be implemented by the derived classes.
00152 
00154   virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00155                                 bool word_end) const = 0;
00156 
00159   virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
00160 
00163   virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
00164 
00167   virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
00168 
00170   virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
00171 
00174   virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
00175 
00178   virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00179                                       const UNICHARSET &unicharset,
00180                                       GenericVector<UNICHAR_ID> *vec) const {};
00181 
00185   virtual EDGE_REF pattern_loop_edge(
00186       EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
00187     return false;
00188   }
00189 
00190  protected:
00191   Dawg() {}
00192 
00194   inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00195     return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
00196   }
00198   inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00199     return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
00200   }
00202   inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00203     return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
00204       BACKWARD_EDGE : FORWARD_EDGE;
00205   }
00207   inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00208     return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
00209   }
00211   inline UNICHAR_ID unichar_id_from_edge_rec(
00212       const EDGE_RECORD &edge_rec) const {
00213     return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
00214   }
00216   inline void set_next_node_in_edge_rec(
00217       EDGE_RECORD *edge_rec, EDGE_REF value) {
00218     *edge_rec &= (~next_node_mask_);
00219     *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
00220   }
00222   inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
00223     *edge_rec |= (MARKER_FLAG << flag_start_bit_);
00224   }
00232   inline int given_greater_than_edge_rec(NODE_REF next_node,
00233                                          bool word_end,
00234                                          UNICHAR_ID unichar_id,
00235                                          const EDGE_RECORD &edge_rec) const {
00236     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
00237     NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
00238     bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
00239     if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
00240                        curr_word_end, curr_unichar_id)) return 0;
00241     if (unichar_id > curr_unichar_id) return 1;
00242     if (unichar_id == curr_unichar_id) {
00243       if (next_node > curr_next_node) return 1;
00244       if (next_node == curr_next_node) {
00245         if (word_end > curr_word_end) return 1;
00246       }
00247     }
00248     return -1;
00249   }
00253   inline bool edge_rec_match(NODE_REF next_node,
00254                              bool word_end,
00255                              UNICHAR_ID unichar_id,
00256                              NODE_REF other_next_node,
00257                              bool other_word_end,
00258                              UNICHAR_ID other_unichar_id) const {
00259     return ((unichar_id == other_unichar_id) &&
00260             (next_node == NO_EDGE || next_node == other_next_node) &&
00261             (!word_end || (word_end == other_word_end)));
00262   }
00263 
00266   void init(DawgType type, const STRING &lang,
00267             PermuterType perm, int unicharset_size, int debug_level);
00268 
00274   bool match_words(WERD_CHOICE *word, inT32 index,
00275                    NODE_REF node, UNICHAR_ID wildcard) const;
00276 
00277   // Recursively iterate over all words in a dawg (see public iterate_words).
00278   void iterate_words_rec(const WERD_CHOICE &word_so_far,
00279                          NODE_REF to_explore,
00280                          TessCallback1<const char *> *cb) const;
00281 
00282   // Member Variables.
00283   DawgType type_;
00284   STRING lang_;
00286   PermuterType perm_;
00287   // Variables to construct various edge masks. Formerly:
00288   // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
00289   // #define FLAGS_MASK     (inT64) 0x0000000700000000i64
00290   // #define LETTER_MASK    (inT64) 0x00000000ffffffffi64
00291   int unicharset_size_;
00292   int flag_start_bit_;
00293   int next_node_start_bit_;
00294   uinT64 next_node_mask_;
00295   uinT64 flags_mask_;
00296   uinT64 letter_mask_;
00297   // Level of debug statements to print to stdout.
00298   int debug_level_;
00299 };
00300 
00301 //
00304 //
00305 struct DawgInfo {
00306   DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
00307   DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
00308   bool operator==(const DawgInfo &other) {
00309     return (this->dawg_index == other.dawg_index && this->ref == other.ref);
00310   }
00311   int dawg_index;
00312   EDGE_REF ref;
00313 };
00314 class DawgInfoVector : public GenericVector<DawgInfo> {
00315  public:
00317   ~DawgInfoVector() {
00318     if (size_reserved_ > 0) {
00319       delete[] data_;
00320       size_used_ = 0;
00321       size_reserved_ = 0;
00322     }
00323   }
00326   void clear() { size_used_ = 0; }
00330   inline bool add_unique(const DawgInfo &new_info, bool debug,
00331                          const char *debug_msg) {
00332     for (int i = 0; i < size_used_; ++i) {
00333       if (data_[i] == new_info) return false;
00334     }
00335     push_back(new_info);
00336     if (debug) {
00337       tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
00338               new_info.dawg_index, new_info.ref);
00339     }
00340     return true;
00341   }
00342 };
00343 
00344 //
00351 //
00352 class SquishedDawg : public Dawg {
00353  public:
00354   SquishedDawg(FILE *file, DawgType type, const STRING &lang,
00355                PermuterType perm, int debug_level) {
00356     read_squished_dawg(file, type, lang, perm, debug_level);
00357     num_forward_edges_in_node0 = num_forward_edges(0);
00358   }
00359   SquishedDawg(const char* filename, DawgType type,
00360                const STRING &lang, PermuterType perm, int debug_level) {
00361     FILE *file = fopen(filename, "rb");
00362     if (file == NULL) {
00363       tprintf("Failed to open dawg file %s\n", filename);
00364       exit(1);
00365     }
00366     read_squished_dawg(file, type, lang, perm, debug_level);
00367     num_forward_edges_in_node0 = num_forward_edges(0);
00368     fclose(file);
00369   }
00370   SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
00371                const STRING &lang, PermuterType perm,
00372                int unicharset_size, int debug_level) :
00373     edges_(edges), num_edges_(num_edges) {
00374     init(type, lang, perm, unicharset_size, debug_level);
00375     num_forward_edges_in_node0 = num_forward_edges(0);
00376     if (debug_level > 3) print_all("SquishedDawg:");
00377   }
00378   ~SquishedDawg();
00379 
00380   int NumEdges() { return num_edges_; }
00381 
00383   EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00384                         bool word_end) const;
00385 
00388   void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00389     EDGE_REF edge = node;
00390     if (!edge_occupied(edge) || edge == NO_EDGE) return;
00391     assert(forward_edge(edge));  // we don't expect any backward edges to
00392     do {                         // be present when this funciton is called
00393       vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
00394     } while (!last_edge(edge++));
00395   }
00396 
00399   NODE_REF next_node(EDGE_REF edge) const {
00400     return next_node_from_edge_rec((edges_[edge]));
00401   }
00402 
00405   bool end_of_word(EDGE_REF edge_ref) const {
00406     return end_of_word_from_edge_rec((edges_[edge_ref]));
00407   }
00408 
00410   UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00411     return unichar_id_from_edge_rec((edges_[edge_ref]));
00412   }
00413 
00416   void print_node(NODE_REF node, int max_num_edges) const;
00417 
00419   void write_squished_dawg(FILE *file);
00420 
00423   void write_squished_dawg(const char *filename) {
00424     FILE *file = fopen(filename, "wb");
00425     if (file == NULL) {
00426       tprintf("Error opening %s\n", filename);
00427       exit(1);
00428     }
00429     this->write_squished_dawg(file);
00430     fclose(file);
00431   }
00432 
00433  private:
00435   inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
00436     set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
00437   }
00439   inline void set_empty_edge(EDGE_REF edge_ref) {
00440     (edges_[edge_ref] = next_node_mask_);
00441   }
00443   inline void clear_all_edges() {
00444     for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
00445   }
00447   inline void clear_marker_flag(EDGE_REF edge_ref) {
00448      (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
00449   }
00451   inline bool forward_edge(EDGE_REF edge_ref) const {
00452     return (edge_occupied(edge_ref) &&
00453             (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00454   }
00456   inline bool backward_edge(EDGE_REF edge_ref) const {
00457     return (edge_occupied(edge_ref) &&
00458             (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00459   }
00461   inline bool edge_occupied(EDGE_REF edge_ref) const {
00462     return (edges_[edge_ref] != next_node_mask_);
00463   }
00465   inline bool last_edge(EDGE_REF edge_ref) const {
00466     return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
00467   }
00468 
00470   inT32 num_forward_edges(NODE_REF node) const;
00471 
00473   void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
00474                           PermuterType perm, int debug_level);
00475 
00477   void print_edge(EDGE_REF edge) const;
00478 
00480   void print_all(const char* msg) {
00481     tprintf("\n__________________________\n%s\n", msg);
00482     for (int i = 0; i < num_edges_; ++i) print_edge(i);
00483     tprintf("__________________________\n");
00484   }
00486   NODE_MAP build_node_map(inT32 *num_nodes) const;
00487 
00488 
00489   // Member variables.
00490   EDGE_ARRAY edges_;
00491   int num_edges_;
00492   int num_forward_edges_in_node0;
00493 };
00494 
00495 }  // namespace tesseract
00496 
00497 #endif  // DICT_DAWG_H_