Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.h (Formerly dawg.h) 00005 * Description: Definition of a class that represents Directed Accyclic Word 00006 * Graph (DAWG), functions to build and manipulate the DAWG. 00007 * Author: Mark Seaman, SW Productivity 00008 * Created: Fri Oct 16 14:37:00 1987 00009 * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt 00010 * Language: C 00011 * Package: N/A 00012 * Status: Reusable Software Component 00013 * 00014 * (c) Copyright 1987, Hewlett-Packard Company. 00015 ** Licensed under the Apache License, Version 2.0 (the "License"); 00016 ** you may not use this file except in compliance with the License. 00017 ** You may obtain a copy of the License at 00018 ** http://www.apache.org/licenses/LICENSE-2.0 00019 ** Unless required by applicable law or agreed to in writing, software 00020 ** distributed under the License is distributed on an "AS IS" BASIS, 00021 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00022 ** See the License for the specific language governing permissions and 00023 ** limitations under the License. 00024 * 00025 *********************************************************************************/ 00026 00027 #ifndef DICT_DAWG_H_ 00028 #define DICT_DAWG_H_ 00029 00030 /*---------------------------------------------------------------------- 00031 I n c l u d e s 00032 ----------------------------------------------------------------------*/ 00033 00034 #include "elst.h" 00035 #include "ratngs.h" 00036 #include "params.h" 00037 #include "tesscallback.h" 00038 00039 #ifndef __GNUC__ 00040 #ifdef _WIN32 00041 #define NO_EDGE (inT64) 0xffffffffffffffffi64 00042 #endif /*_WIN32*/ 00043 #else 00044 #define NO_EDGE (inT64) 0xffffffffffffffffll 00045 #endif /*__GNUC__*/ 00046 00047 /*---------------------------------------------------------------------- 00048 T y p e s 00049 ----------------------------------------------------------------------*/ 00050 class UNICHARSET; 00051 00052 typedef uinT64 EDGE_RECORD; 00053 typedef EDGE_RECORD *EDGE_ARRAY; 00054 typedef inT64 EDGE_REF; 00055 typedef inT64 NODE_REF; 00056 typedef EDGE_REF *NODE_MAP; 00057 00058 namespace tesseract { 00059 00060 struct NodeChild { 00061 UNICHAR_ID unichar_id; 00062 EDGE_REF edge_ref; 00063 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} 00064 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} 00065 }; 00066 00067 typedef GenericVector<NodeChild> NodeChildVector; 00068 typedef GenericVector<int> SuccessorList; 00069 typedef GenericVector<SuccessorList *> SuccessorListsVector; 00070 00071 enum DawgType { 00072 DAWG_TYPE_PUNCTUATION, 00073 DAWG_TYPE_WORD, 00074 DAWG_TYPE_NUMBER, 00075 DAWG_TYPE_PATTERN, 00076 00077 DAWG_TYPE_COUNT // number of enum entries 00078 }; 00079 00080 /*---------------------------------------------------------------------- 00081 C o n s t a n t s 00082 ----------------------------------------------------------------------*/ 00083 00084 #define FORWARD_EDGE (inT32) 0 00085 #define BACKWARD_EDGE (inT32) 1 00086 #define MAX_NODE_EDGES_DISPLAY (inT64) 100 00087 #define MARKER_FLAG (inT64) 1 00088 #define DIRECTION_FLAG (inT64) 2 00089 #define WERD_END_FLAG (inT64) 4 00090 #define LETTER_START_BIT 0 00091 #define NUM_FLAG_BITS 3 00092 #define REFFORMAT "%lld" 00093 00094 // Set kBeginningDawgsType[i] to true if a Dawg of 00095 // DawgType i can contain the beginning of a word. 00096 static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 }; 00097 00098 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { 00099 { 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION 00100 { 1, 0, 0, 0 }, // for DAWG_TYPE_WORD 00101 { 1, 0, 0, 0 }, // for DAWG_TYPE_NUMBER 00102 { 0, 0, 0, 0 }, // for DAWG_TYPE_PATTERN 00103 }; 00104 00105 static const char kWildcard[] = "*"; 00106 00107 00108 /*---------------------------------------------------------------------- 00109 C l a s s e s a n d S t r u c t s 00110 ----------------------------------------------------------------------*/ 00111 // 00121 // 00122 class Dawg { 00123 public: 00125 static const inT16 kDawgMagicNumber = 42; 00129 static const UNICHAR_ID kPatternUnicharID = 0; 00130 00131 inline DawgType type() const { return type_; } 00132 inline const STRING &lang() const { return lang_; } 00133 inline PermuterType permuter() const { return perm_; } 00134 00135 virtual ~Dawg() {}; 00136 00138 bool word_in_dawg(const WERD_CHOICE &word) const; 00139 00142 int check_for_words(const char *filename, 00143 const UNICHARSET &unicharset, 00144 bool enable_wildcard) const; 00145 00146 // For each word in the Dawg, call the given (permanent) callback with the 00147 // text (UTF-8) version of the word. 00148 void iterate_words(const UNICHARSET &unicharset, 00149 TessCallback1<const char *> *cb) const; 00150 00151 // Pure virtual function that should be implemented by the derived classes. 00152 00154 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00155 bool word_end) const = 0; 00156 00159 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0; 00160 00163 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; 00164 00167 virtual bool end_of_word(EDGE_REF edge_ref) const = 0; 00168 00170 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; 00171 00174 virtual void print_node(NODE_REF node, int max_num_edges) const = 0; 00175 00178 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00179 const UNICHARSET &unicharset, 00180 GenericVector<UNICHAR_ID> *vec) const {}; 00181 00185 virtual EDGE_REF pattern_loop_edge( 00186 EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const { 00187 return false; 00188 } 00189 00190 protected: 00191 Dawg() {} 00192 00194 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00195 return ((edge_rec & next_node_mask_) >> next_node_start_bit_); 00196 } 00198 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00199 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; 00200 } 00202 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00203 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? 00204 BACKWARD_EDGE : FORWARD_EDGE; 00205 } 00207 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00208 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; 00209 } 00211 inline UNICHAR_ID unichar_id_from_edge_rec( 00212 const EDGE_RECORD &edge_rec) const { 00213 return ((edge_rec & letter_mask_) >> LETTER_START_BIT); 00214 } 00216 inline void set_next_node_in_edge_rec( 00217 EDGE_RECORD *edge_rec, EDGE_REF value) { 00218 *edge_rec &= (~next_node_mask_); 00219 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); 00220 } 00222 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { 00223 *edge_rec |= (MARKER_FLAG << flag_start_bit_); 00224 } 00232 inline int given_greater_than_edge_rec(NODE_REF next_node, 00233 bool word_end, 00234 UNICHAR_ID unichar_id, 00235 const EDGE_RECORD &edge_rec) const { 00236 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); 00237 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); 00238 bool curr_word_end = end_of_word_from_edge_rec(edge_rec); 00239 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, 00240 curr_word_end, curr_unichar_id)) return 0; 00241 if (unichar_id > curr_unichar_id) return 1; 00242 if (unichar_id == curr_unichar_id) { 00243 if (next_node > curr_next_node) return 1; 00244 if (next_node == curr_next_node) { 00245 if (word_end > curr_word_end) return 1; 00246 } 00247 } 00248 return -1; 00249 } 00253 inline bool edge_rec_match(NODE_REF next_node, 00254 bool word_end, 00255 UNICHAR_ID unichar_id, 00256 NODE_REF other_next_node, 00257 bool other_word_end, 00258 UNICHAR_ID other_unichar_id) const { 00259 return ((unichar_id == other_unichar_id) && 00260 (next_node == NO_EDGE || next_node == other_next_node) && 00261 (!word_end || (word_end == other_word_end))); 00262 } 00263 00266 void init(DawgType type, const STRING &lang, 00267 PermuterType perm, int unicharset_size, int debug_level); 00268 00274 bool match_words(WERD_CHOICE *word, inT32 index, 00275 NODE_REF node, UNICHAR_ID wildcard) const; 00276 00277 // Recursively iterate over all words in a dawg (see public iterate_words). 00278 void iterate_words_rec(const WERD_CHOICE &word_so_far, 00279 NODE_REF to_explore, 00280 TessCallback1<const char *> *cb) const; 00281 00282 // Member Variables. 00283 DawgType type_; 00284 STRING lang_; 00286 PermuterType perm_; 00287 // Variables to construct various edge masks. Formerly: 00288 // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64 00289 // #define FLAGS_MASK (inT64) 0x0000000700000000i64 00290 // #define LETTER_MASK (inT64) 0x00000000ffffffffi64 00291 int unicharset_size_; 00292 int flag_start_bit_; 00293 int next_node_start_bit_; 00294 uinT64 next_node_mask_; 00295 uinT64 flags_mask_; 00296 uinT64 letter_mask_; 00297 // Level of debug statements to print to stdout. 00298 int debug_level_; 00299 }; 00300 00301 // 00304 // 00305 struct DawgInfo { 00306 DawgInfo() : dawg_index(-1), ref(NO_EDGE) {} 00307 DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {} 00308 bool operator==(const DawgInfo &other) { 00309 return (this->dawg_index == other.dawg_index && this->ref == other.ref); 00310 } 00311 int dawg_index; 00312 EDGE_REF ref; 00313 }; 00314 class DawgInfoVector : public GenericVector<DawgInfo> { 00315 public: 00317 ~DawgInfoVector() { 00318 if (size_reserved_ > 0) { 00319 delete[] data_; 00320 size_used_ = 0; 00321 size_reserved_ = 0; 00322 } 00323 } 00326 void clear() { size_used_ = 0; } 00330 inline bool add_unique(const DawgInfo &new_info, bool debug, 00331 const char *debug_msg) { 00332 for (int i = 0; i < size_used_; ++i) { 00333 if (data_[i] == new_info) return false; 00334 } 00335 push_back(new_info); 00336 if (debug) { 00337 tprintf("%s[%d, " REFFORMAT "]\n", debug_msg, 00338 new_info.dawg_index, new_info.ref); 00339 } 00340 return true; 00341 } 00342 }; 00343 00344 // 00351 // 00352 class SquishedDawg : public Dawg { 00353 public: 00354 SquishedDawg(FILE *file, DawgType type, const STRING &lang, 00355 PermuterType perm, int debug_level) { 00356 read_squished_dawg(file, type, lang, perm, debug_level); 00357 num_forward_edges_in_node0 = num_forward_edges(0); 00358 } 00359 SquishedDawg(const char* filename, DawgType type, 00360 const STRING &lang, PermuterType perm, int debug_level) { 00361 FILE *file = fopen(filename, "rb"); 00362 if (file == NULL) { 00363 tprintf("Failed to open dawg file %s\n", filename); 00364 exit(1); 00365 } 00366 read_squished_dawg(file, type, lang, perm, debug_level); 00367 num_forward_edges_in_node0 = num_forward_edges(0); 00368 fclose(file); 00369 } 00370 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, 00371 const STRING &lang, PermuterType perm, 00372 int unicharset_size, int debug_level) : 00373 edges_(edges), num_edges_(num_edges) { 00374 init(type, lang, perm, unicharset_size, debug_level); 00375 num_forward_edges_in_node0 = num_forward_edges(0); 00376 if (debug_level > 3) print_all("SquishedDawg:"); 00377 } 00378 ~SquishedDawg(); 00379 00380 int NumEdges() { return num_edges_; } 00381 00383 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00384 bool word_end) const; 00385 00388 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { 00389 EDGE_REF edge = node; 00390 if (!edge_occupied(edge) || edge == NO_EDGE) return; 00391 assert(forward_edge(edge)); // we don't expect any backward edges to 00392 do { // be present when this funciton is called 00393 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); 00394 } while (!last_edge(edge++)); 00395 } 00396 00399 NODE_REF next_node(EDGE_REF edge) const { 00400 return next_node_from_edge_rec((edges_[edge])); 00401 } 00402 00405 bool end_of_word(EDGE_REF edge_ref) const { 00406 return end_of_word_from_edge_rec((edges_[edge_ref])); 00407 } 00408 00410 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00411 return unichar_id_from_edge_rec((edges_[edge_ref])); 00412 } 00413 00416 void print_node(NODE_REF node, int max_num_edges) const; 00417 00419 void write_squished_dawg(FILE *file); 00420 00423 void write_squished_dawg(const char *filename) { 00424 FILE *file = fopen(filename, "wb"); 00425 if (file == NULL) { 00426 tprintf("Error opening %s\n", filename); 00427 exit(1); 00428 } 00429 this->write_squished_dawg(file); 00430 fclose(file); 00431 } 00432 00433 private: 00435 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { 00436 set_next_node_in_edge_rec(&(edges_[edge_ref]), value); 00437 } 00439 inline void set_empty_edge(EDGE_REF edge_ref) { 00440 (edges_[edge_ref] = next_node_mask_); 00441 } 00443 inline void clear_all_edges() { 00444 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); 00445 } 00447 inline void clear_marker_flag(EDGE_REF edge_ref) { 00448 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); 00449 } 00451 inline bool forward_edge(EDGE_REF edge_ref) const { 00452 return (edge_occupied(edge_ref) && 00453 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00454 } 00456 inline bool backward_edge(EDGE_REF edge_ref) const { 00457 return (edge_occupied(edge_ref) && 00458 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00459 } 00461 inline bool edge_occupied(EDGE_REF edge_ref) const { 00462 return (edges_[edge_ref] != next_node_mask_); 00463 } 00465 inline bool last_edge(EDGE_REF edge_ref) const { 00466 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; 00467 } 00468 00470 inT32 num_forward_edges(NODE_REF node) const; 00471 00473 void read_squished_dawg(FILE *file, DawgType type, const STRING &lang, 00474 PermuterType perm, int debug_level); 00475 00477 void print_edge(EDGE_REF edge) const; 00478 00480 void print_all(const char* msg) { 00481 tprintf("\n__________________________\n%s\n", msg); 00482 for (int i = 0; i < num_edges_; ++i) print_edge(i); 00483 tprintf("__________________________\n"); 00484 } 00486 NODE_MAP build_node_map(inT32 *num_nodes) const; 00487 00488 00489 // Member variables. 00490 EDGE_ARRAY edges_; 00491 int num_edges_; 00492 int num_forward_edges_in_node0; 00493 }; 00494 00495 } // namespace tesseract 00496 00497 #endif // DICT_DAWG_H_