Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.h (Formerly trie.h) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, SW Productivity 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 #ifndef TRIE_H 00026 #define TRIE_H 00027 00028 #include "dawg.h" 00029 #include "cutil.h" 00030 #include "genericvector.h" 00031 00032 class UNICHARSET; 00033 00034 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed 00035 // max int32, we will need to change GenericVector to use int64 for size 00036 // and address indices. This does not seem to be needed immediately, 00037 // since currently the largest number of edges limit used by tesseract 00038 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. 00039 // There are also int casts below to satisfy the WIN32 compiler that would 00040 // need to be changed. 00041 // It might be cleanest to change the types of most of the Trie/Dawg related 00042 // typedefs to int and restrict the casts to extracting these values from 00043 // the 64 bit EDGE_RECORD. 00044 typedef inT64 EDGE_INDEX; // index of an edge in a given node 00045 typedef bool *NODE_MARKER; 00046 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR; 00047 00048 struct TRIE_NODE_RECORD { 00049 EDGE_VECTOR forward_edges; 00050 EDGE_VECTOR backward_edges; 00051 }; 00052 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES; 00053 00054 namespace tesseract { 00055 00062 class Trie : public Dawg { 00063 public: 00064 enum RTLReversePolicy { 00065 RRP_DO_NO_REVERSE, 00066 RRP_REVERSE_IF_HAS_RTL, 00067 RRP_FORCE_REVERSE, 00068 }; 00069 00070 // Minimum number of concrete characters at the beginning of user patterns. 00071 static const int kSaneNumConcreteChars = 4; 00072 // Various unicode whitespace characters are used to denote unichar patterns, 00073 // (character classifier would never produce these whitespace characters as a 00074 // valid classification). 00075 static const char kAlphaPatternUnicode[]; 00076 static const char kDigitPatternUnicode[]; 00077 static const char kAlphanumPatternUnicode[]; 00078 static const char kPuncPatternUnicode[]; 00079 static const char kLowerPatternUnicode[]; 00080 static const char kUpperPatternUnicode[]; 00081 00082 static const char *get_reverse_policy_name( 00083 RTLReversePolicy reverse_policy); 00084 00085 // max_num_edges argument allows limiting the amount of memory this 00086 // Trie can consume (if a new word insert would cause the Trie to 00087 // contain more edges than max_num_edges, all the edges are cleared 00088 // so that new inserts can proceed). 00089 Trie(DawgType type, const STRING &lang, PermuterType perm, 00090 uinT64 max_num_edges, int unicharset_size, int debug_level) { 00091 init(type, lang, perm, unicharset_size, debug_level); 00092 num_edges_ = 0; 00093 max_num_edges_ = max_num_edges; 00094 deref_node_index_mask_ = ~letter_mask_; 00095 new_dawg_node(); // need to allocate node 0 00096 initialized_patterns_ = false; 00097 } 00098 virtual ~Trie() { nodes_.delete_data_pointers(); } 00099 00100 // Reset the Trie to empty. 00101 void clear(); 00102 00104 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, 00105 bool word_end) const { 00106 EDGE_RECORD *edge_ptr; 00107 EDGE_INDEX edge_index; 00108 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, 00109 &edge_ptr, &edge_index)) return NO_EDGE; 00110 return make_edge_ref(node_ref, edge_index); 00111 } 00112 00117 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { 00118 const EDGE_VECTOR &forward_edges = 00119 nodes_[static_cast<int>(node)]->forward_edges; 00120 for (int i = 0; i < forward_edges.size(); ++i) { 00121 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), 00122 make_edge_ref(node, i))); 00123 } 00124 } 00125 00130 NODE_REF next_node(EDGE_REF edge_ref) const { 00131 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; 00132 return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); 00133 } 00134 00139 bool end_of_word(EDGE_REF edge_ref) const { 00140 if (edge_ref == NO_EDGE || num_edges_ == 0) return false; 00141 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); 00142 } 00143 00145 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00146 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; 00147 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); 00148 } 00149 00150 // Prints the contents of the node indicated by the given NODE_REF. 00151 // At most max_num_edges will be printed. 00152 void print_node(NODE_REF node, int max_num_edges) const; 00153 00154 // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg. 00155 // Eliminates redundant edges and returns the pointer to the SquishedDawg. 00156 // Note: the caller is responsible for deallocating memory associated 00157 // with the returned SquishedDawg pointer. 00158 SquishedDawg *trie_to_dawg(); 00159 00160 // Inserts the list of words from the given file into the Trie. 00161 // If reverse is true, calls WERD_CHOICE::reverse_unichar_ids_if_rtl() 00162 // on each word before inserting it into the Trie. 00163 bool read_word_list(const char *filename, 00164 const UNICHARSET &unicharset, 00165 Trie::RTLReversePolicy reverse); 00166 00167 // Inserts the list of patterns from the given file into the Trie. 00168 // The pattern list file should contain one pattern per line in UTF-8 format. 00169 // 00170 // Each pattern can contain any non-whitespace characters, however only the 00171 // patterns that contain characters from the unicharset of the corresponding 00172 // language will be useful. 00173 // The only meta character is '\'. To be used in a pattern as an ordinary 00174 // string it should be escaped with '\' (e.g. string "C:\Documents" should 00175 // be written in the patterns file as "C:\\Documents"). 00176 // This function supports a very limited regular expression syntax. One can 00177 // express a character, a certain character class and a number of times the 00178 // entity should be repeated in the pattern. 00179 // 00180 // To denote a character class use one of: 00181 // \c - unichar for which UNICHARSET::get_isalpha() is true (character) 00182 // \d - unichar for which UNICHARSET::get_isdigit() is true 00183 // \n - unichar for which UNICHARSET::get_isdigit() and 00184 // UNICHARSET::isalpha() are true 00185 // \p - unichar for which UNICHARSET::get_ispunct() is true 00186 // \a - unichar for which UNICHARSET::get_islower() is true 00187 // \A - unichar for which UNICHARSET::get_isupper() is true 00188 // 00189 // \* could be specified after each character or pattern to indicate that 00190 // the character/pattern can be repeated any number of times before the next 00191 // character/pattern occurs. 00192 // 00193 // Examples: 00194 // 1-8\d\d-GOOG-411 will be expanded to strings: 00195 // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411. 00196 // 00197 // http://www.\n\*.com will be expanded to strings like: 00198 // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com 00199 // 00200 // Note: In choosing which patterns to include please be aware of the fact 00201 // providing very generic patterns will make tesseract run slower. 00202 // For example \n\* at the beginning of the pattern will make Tesseract 00203 // consider all the combinations of proposed character choices for each 00204 // of the segmentations, which will be unacceptably slow. 00205 // Because of potential problems with speed that could be difficult to 00206 // identify, each user pattern has to have at least kSaneNumConcreteChars 00207 // concrete characters from the unicharset at the beginning. 00208 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset); 00209 00210 // Initializes the values of *_pattern_ unichar ids. 00211 // This function should be called before calling read_pattern_list(). 00212 void initialize_patterns(UNICHARSET *unicharset); 00213 00214 // Fills in the given unichar id vector with the unichar ids that represent 00215 // the patterns of the character classes of the given unichar_id. 00216 void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00217 const UNICHARSET &unicharset, 00218 GenericVector<UNICHAR_ID> *vec) const; 00219 00220 // Returns the given EDGE_REF if the EDGE_RECORD that it points to has 00221 // a self loop and the given unichar_id matches the unichar_id stored in the 00222 // EDGE_RECORD, returns NO_EDGE otherwise. 00223 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, 00224 UNICHAR_ID unichar_id, 00225 bool word_end) const { 00226 if (edge_ref == NO_EDGE) return NO_EDGE; 00227 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref); 00228 return (marker_flag_from_edge_rec(*edge_rec) && 00229 unichar_id == unichar_id_from_edge_rec(*edge_rec) && 00230 word_end == end_of_word_from_edge_rec(*edge_rec)) ? 00231 edge_ref : NO_EDGE; 00232 } 00233 00234 // Adds a word to the Trie (creates the necessary nodes and edges). 00235 // 00236 // If repetitions vector is not NULL, each entry in the vector indicates 00237 // whether the unichar id with the corresponding index in the word is allowed 00238 // to repeat an unlimited number of times. For each entry that is true, MARKER 00239 // flag of the corresponding edge created for this unichar id is set to true). 00240 // 00241 // Return true if add succeeded, false otherwise (e.g. when a word contained 00242 // an invalid unichar id or the trie was getting too large and was cleared). 00243 bool add_word_to_dawg(const WERD_CHOICE &word, 00244 const GenericVector<bool> *repetitions); 00245 bool add_word_to_dawg(const WERD_CHOICE &word) { 00246 return add_word_to_dawg(word, NULL); 00247 } 00248 00249 protected: 00250 // The structure of an EDGE_REF for Trie edges is as follows: 00251 // [LETTER_START_BIT, flag_start_bit_): 00252 // edge index in *_edges in a TRIE_NODE_RECORD 00253 // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector) 00254 // 00255 // With this arrangement there are enough bits to represent edge indices 00256 // (each node can have at most unicharset_size_ forward edges and 00257 // the position of flag_start_bit is set to be log2(unicharset_size_)). 00258 // It is also possible to accommodate a maximum number of nodes that is at 00259 // least as large as that of the SquishedDawg representation (in SquishedDawg 00260 // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent 00261 // the next node index). 00262 // 00263 00264 // Returns the pointer to EDGE_RECORD after decoding the location 00265 // of the edge from the information in the given EDGE_REF. 00266 // This function assumes that EDGE_REF holds valid node/edge indices. 00267 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const { 00268 int edge_index = static_cast<int>( 00269 (edge_ref & letter_mask_) >> LETTER_START_BIT); 00270 int node_index = static_cast<int>( 00271 (edge_ref & deref_node_index_mask_) >> flag_start_bit_); 00272 TRIE_NODE_RECORD *node_rec = nodes_[node_index]; 00273 return &(node_rec->forward_edges[edge_index]); 00274 } 00276 inline EDGE_REF make_edge_ref(NODE_REF node_index, 00277 EDGE_INDEX edge_index) const { 00278 return ((node_index << flag_start_bit_) | 00279 (edge_index << LETTER_START_BIT)); 00280 } 00282 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, 00283 int direction, bool word_end, UNICHAR_ID unichar_id) { 00284 EDGE_RECORD flags = 0; 00285 if (repeats) flags |= MARKER_FLAG; 00286 if (word_end) flags |= WERD_END_FLAG; 00287 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; 00288 *edge = ((nxt << next_node_start_bit_) | 00289 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) | 00290 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT)); 00291 } 00293 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const { 00294 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec), 00295 marker_flag_from_edge_rec(edge_rec) ? "R," : "", 00296 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", 00297 end_of_word_from_edge_rec(edge_rec) ? ",E" : "", 00298 unichar_id_from_edge_rec(edge_rec)); 00299 } 00300 // Returns true if the next node in recorded the given EDGE_RECORD 00301 // has exactly one forward edge. 00302 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) { 00303 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00304 return (node_ref != NO_EDGE && 00305 nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1); 00306 } 00307 00308 // Prints the contents of the Trie. 00309 // At most max_num_edges will be printed for each node. 00310 void print_all(const char* msg, int max_num_edges) { 00311 tprintf("\n__________________________\n%s\n", msg); 00312 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges); 00313 tprintf("__________________________\n"); 00314 } 00315 00316 // Finds the edge with the given direction, word_end and unichar_id 00317 // in the node indicated by node_ref. Fills in the pointer to the 00318 // EDGE_RECORD and the index of the edge with the the values 00319 // corresponding to the edge found. Returns true if an edge was found. 00320 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00321 int direction, bool word_end, UNICHAR_ID unichar_id, 00322 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const; 00323 00324 // Adds an single edge linkage between node1 and node2 in the direction 00325 // indicated by direction argument. 00326 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, 00327 int direction, bool word_end, 00328 UNICHAR_ID unichar_id); 00329 00330 // Adds forward edge linkage from node1 to node2 and the corresponding 00331 // backward edge linkage in the other direction. 00332 bool add_new_edge(NODE_REF node1, NODE_REF node2, 00333 bool repeats, bool word_end, UNICHAR_ID unichar_id) { 00334 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, 00335 word_end, unichar_id) && 00336 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, 00337 word_end, unichar_id)); 00338 } 00339 00340 // Sets the word ending flags in an already existing edge pair. 00341 // Returns true on success. 00342 void add_word_ending(EDGE_RECORD *edge, 00343 NODE_REF the_next_node, 00344 bool repeats, 00345 UNICHAR_ID unichar_id); 00346 00347 // Allocates space for a new node in the Trie. 00348 NODE_REF new_dawg_node(); 00349 00350 // Removes a single edge linkage to between node1 and node2 in the 00351 // direction indicated by direction argument. 00352 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00353 bool word_end, UNICHAR_ID unichar_id); 00354 00355 // Removes forward edge linkage from node1 to node2 and the corresponding 00356 // backward edge linkage in the other direction. 00357 void remove_edge(NODE_REF node1, NODE_REF node2, 00358 bool word_end, UNICHAR_ID unichar_id) { 00359 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); 00360 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); 00361 } 00362 00363 // Compares edge1 and edge2 in the given node to see if they point to two 00364 // next nodes that could be collapsed. If they do, performs the reduction 00365 // and returns true. 00366 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, 00367 const EDGE_RECORD &edge2); 00368 00369 // Assuming that edge_index indicates the first edge in a group of edges 00370 // in this node with a particular letter value, looks through these edges 00371 // to see if any of them can be collapsed. If so does it. Returns to the 00372 // caller when all edges with this letter have been reduced. 00373 // Returns true if further reduction is possible with this same letter. 00374 bool reduce_lettered_edges(EDGE_INDEX edge_index, 00375 UNICHAR_ID unichar_id, 00376 NODE_REF node, 00377 const EDGE_VECTOR &backward_edges, 00378 NODE_MARKER reduced_nodes); 00379 00386 void sort_edges(EDGE_VECTOR *edges); 00387 00389 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes); 00390 00391 // Returns the pattern unichar id for the given character class code. 00392 UNICHAR_ID character_class_to_pattern(char ch); 00393 00394 // Member variables 00395 TRIE_NODES nodes_; // vector of nodes in the Trie 00396 uinT64 num_edges_; // sum of all edges (forward and backward) 00397 uinT64 max_num_edges_; // maximum number of edges allowed 00398 uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction 00399 uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index 00400 // Variables for translating character class codes denoted in user patterns 00401 // file to the unichar ids used to represent them in a Trie. 00402 bool initialized_patterns_; 00403 UNICHAR_ID alpha_pattern_; 00404 UNICHAR_ID digit_pattern_; 00405 UNICHAR_ID alphanum_pattern_; 00406 UNICHAR_ID punc_pattern_; 00407 UNICHAR_ID lower_pattern_; 00408 UNICHAR_ID upper_pattern_; 00409 }; 00410 } // namespace tesseract 00411 00412 #endif