Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.c (Formerly dawg.c) 00005 * Description: Use a Directed Accyclic Word Graph 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #ifdef _MSC_VER 00030 #pragma warning(disable:4244) // Conversion warnings 00031 #pragma warning(disable:4800) // int/bool warnings 00032 #endif 00033 #include "dawg.h" 00034 00035 #include "cutil.h" 00036 #include "dict.h" 00037 #include "emalloc.h" 00038 #include "freelist.h" 00039 #include "helpers.h" 00040 #include "strngs.h" 00041 #include "tprintf.h" 00042 00043 /*---------------------------------------------------------------------- 00044 F u n c t i o n s f o r D a w g 00045 ----------------------------------------------------------------------*/ 00046 namespace tesseract { 00047 00048 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { 00049 if (word.length() == 0) return false; 00050 NODE_REF node = 0; 00051 int end_index = word.length() - 1; 00052 for (int i = 0; i <= end_index; i++) { 00053 if (debug_level_ > 1) { 00054 tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node); 00055 print_node(node, MAX_NODE_EDGES_DISPLAY); 00056 tprintf("\n"); 00057 } 00058 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index); 00059 if (edge != NO_EDGE) { 00060 node = next_node(edge); 00061 if (node == 0) node = NO_EDGE; 00062 } else { 00063 return false; 00064 } 00065 } 00066 return true; 00067 } 00068 00069 int Dawg::check_for_words(const char *filename, 00070 const UNICHARSET &unicharset, 00071 bool enable_wildcard) const { 00072 if (filename == NULL) return 0; 00073 00074 FILE *word_file; 00075 char string [CHARS_PER_LINE]; 00076 int misses = 0; 00077 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); 00078 00079 word_file = open_file (filename, "r"); 00080 00081 while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { 00082 chomp_string(string); // remove newline 00083 WERD_CHOICE word(string, unicharset); 00084 if (word.length() > 0 && 00085 !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00086 if (!match_words(&word, 0, 0, 00087 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { 00088 tprintf("Missing word: %s\n", string); 00089 ++misses; 00090 } 00091 } else { 00092 tprintf("Failed to create a valid word from %s\n", string); 00093 } 00094 } 00095 fclose (word_file); 00096 // Make sure the user sees this with fprintf instead of tprintf. 00097 if (debug_level_) tprintf("Number of lost words=%d\n", misses); 00098 return misses; 00099 } 00100 00101 void Dawg::iterate_words(const UNICHARSET &unicharset, 00102 TessCallback1<const char *> *cb) const { 00103 WERD_CHOICE word(&unicharset); 00104 iterate_words_rec(word, 0, cb); 00105 } 00106 00107 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, 00108 NODE_REF to_explore, 00109 TessCallback1<const char *> *cb) const { 00110 NodeChildVector children; 00111 this->unichar_ids_of(to_explore, &children); 00112 for (int i = 0; i < children.size(); i++) { 00113 WERD_CHOICE next_word(word_so_far); 00114 next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); 00115 if (this->end_of_word(children[i].edge_ref)) { 00116 STRING s; 00117 next_word.string_and_lengths(&s, NULL); 00118 cb->Run(s.string()); 00119 } 00120 NODE_REF next = next_node(children[i].edge_ref); 00121 if (next != 0) { 00122 iterate_words_rec(next_word, next, cb); 00123 } 00124 } 00125 } 00126 00127 bool Dawg::match_words(WERD_CHOICE *word, inT32 index, 00128 NODE_REF node, UNICHAR_ID wildcard) const { 00129 EDGE_REF edge; 00130 inT32 word_end; 00131 00132 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { 00133 bool any_matched = false; 00134 NodeChildVector vec; 00135 this->unichar_ids_of(node, &vec); 00136 for (int i = 0; i < vec.size(); ++i) { 00137 word->set_unichar_id(vec[i].unichar_id, index); 00138 if (match_words(word, index, node, wildcard)) 00139 any_matched = true; 00140 } 00141 word->set_unichar_id(wildcard, index); 00142 return any_matched; 00143 } else { 00144 word_end = index == word->length() - 1; 00145 edge = edge_char_of(node, word->unichar_id(index), word_end); 00146 if (edge != NO_EDGE) { // normal edge in DAWG 00147 node = next_node(edge); 00148 if (word_end) { 00149 if (debug_level_ > 1) word->print("match_words() found: "); 00150 return true; 00151 } else if (node != 0) { 00152 return match_words(word, index+1, node, wildcard); 00153 } 00154 } 00155 } 00156 return false; 00157 } 00158 00159 void Dawg::init(DawgType type, const STRING &lang, 00160 PermuterType perm, int unicharset_size, int debug_level) { 00161 type_ = type; 00162 lang_ = lang; 00163 perm_ = perm; 00164 ASSERT_HOST(unicharset_size > 0); 00165 unicharset_size_ = unicharset_size; 00166 // Set bit masks. 00167 flag_start_bit_ = ceil(log(static_cast<double>(unicharset_size_)) / log(2.0)); 00168 next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; 00169 letter_mask_ = ~(~0 << flag_start_bit_); 00170 next_node_mask_ = ~0 << (flag_start_bit_ + NUM_FLAG_BITS); 00171 flags_mask_ = ~(letter_mask_ | next_node_mask_); 00172 00173 debug_level_ = debug_level; 00174 } 00175 00176 00177 /*---------------------------------------------------------------------- 00178 F u n c t i o n s f o r S q u i s h e d D a w g 00179 ----------------------------------------------------------------------*/ 00180 00181 SquishedDawg::~SquishedDawg() { memfree(edges_); } 00182 00183 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, 00184 UNICHAR_ID unichar_id, 00185 bool word_end) const { 00186 EDGE_REF edge = node; 00187 if (node == 0) { // binary search 00188 EDGE_REF start = 0; 00189 EDGE_REF end = num_forward_edges_in_node0 - 1; 00190 int compare; 00191 while (start <= end) { 00192 edge = (start + end) >> 1; // (start + end) / 2 00193 compare = given_greater_than_edge_rec(NO_EDGE, word_end, 00194 unichar_id, edges_[edge]); 00195 if (compare == 0) { // given == vec[k] 00196 return edge; 00197 } else if (compare == 1) { // given > vec[k] 00198 start = edge + 1; 00199 } else { // given < vec[k] 00200 end = edge - 1; 00201 } 00202 } 00203 } else { // linear search 00204 if (edge != NO_EDGE && edge_occupied(edge)) { 00205 do { 00206 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && 00207 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) 00208 return (edge); 00209 } while (!last_edge(edge++)); 00210 } 00211 } 00212 return (NO_EDGE); // not found 00213 } 00214 00215 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const { 00216 EDGE_REF edge = node; 00217 inT32 num = 0; 00218 00219 if (forward_edge (edge)) { 00220 do { 00221 num++; 00222 } while (!last_edge(edge++)); 00223 } 00224 00225 return (num); 00226 } 00227 00228 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { 00229 if (node == NO_EDGE) return; // nothing to print 00230 00231 EDGE_REF edge = node; 00232 const char *forward_string = "FORWARD"; 00233 const char *backward_string = " "; 00234 00235 const char *last_string = "LAST"; 00236 const char *not_last_string = " "; 00237 00238 const char *eow_string = "EOW"; 00239 const char *not_eow_string = " "; 00240 00241 const char *direction; 00242 const char *is_last; 00243 const char *eow; 00244 00245 UNICHAR_ID unichar_id; 00246 00247 if (edge_occupied(edge)) { 00248 do { 00249 direction = 00250 forward_edge(edge) ? forward_string : backward_string; 00251 is_last = last_edge(edge) ? last_string : not_last_string; 00252 eow = end_of_word(edge) ? eow_string : not_eow_string; 00253 00254 unichar_id = edge_letter(edge); 00255 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", 00256 edge, next_node(edge), unichar_id, 00257 direction, is_last, eow); 00258 00259 if (edge - node > max_num_edges) return; 00260 } while (!last_edge(edge++)); 00261 00262 if (edge < num_edges_ && 00263 edge_occupied(edge) && backward_edge(edge)) { 00264 do { 00265 direction = 00266 forward_edge(edge) ? forward_string : backward_string; 00267 is_last = last_edge(edge) ? last_string : not_last_string; 00268 eow = end_of_word(edge) ? eow_string : not_eow_string; 00269 00270 unichar_id = edge_letter(edge); 00271 tprintf(REFFORMAT " : next = " REFFORMAT 00272 ", unichar_id = %d, %s %s %s\n", 00273 edge, next_node(edge), unichar_id, 00274 direction, is_last, eow); 00275 00276 if (edge - node > MAX_NODE_EDGES_DISPLAY) return; 00277 } while (!last_edge(edge++)); 00278 } 00279 } 00280 else { 00281 tprintf(REFFORMAT " : no edges in this node\n", node); 00282 } 00283 tprintf("\n"); 00284 } 00285 00286 void SquishedDawg::print_edge(EDGE_REF edge) const { 00287 if (edge == NO_EDGE) { 00288 tprintf("NO_EDGE\n"); 00289 } else { 00290 tprintf(REFFORMAT " : next = " REFFORMAT 00291 ", unichar_id = '%d', %s %s %s\n", edge, 00292 next_node(edge), edge_letter(edge), 00293 (forward_edge(edge) ? "FORWARD" : " "), 00294 (last_edge(edge) ? "LAST" : " "), 00295 (end_of_word(edge) ? "EOW" : "")); 00296 } 00297 } 00298 00299 void SquishedDawg::read_squished_dawg(FILE *file, 00300 DawgType type, 00301 const STRING &lang, 00302 PermuterType perm, 00303 int debug_level) { 00304 if (debug_level) tprintf("Reading squished dawg\n"); 00305 00306 // Read the magic number and if it does not match kDawgMagicNumber 00307 // set swap to true to indicate that we need to switch endianness. 00308 inT16 magic; 00309 fread(&magic, sizeof(inT16), 1, file); 00310 bool swap = (magic != kDawgMagicNumber); 00311 00312 int unicharset_size; 00313 fread(&unicharset_size, sizeof(inT32), 1, file); 00314 fread(&num_edges_, sizeof(inT32), 1, file); 00315 00316 if (swap) { 00317 unicharset_size = reverse32(unicharset_size); 00318 num_edges_ = reverse32(num_edges_); 00319 } 00320 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty 00321 Dawg::init(type, lang, perm, unicharset_size, debug_level); 00322 00323 edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); 00324 fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file); 00325 EDGE_REF edge; 00326 if (swap) { 00327 for (edge = 0; edge < num_edges_; ++edge) { 00328 edges_[edge] = reverse64(edges_[edge]); 00329 } 00330 } 00331 if (debug_level > 2) { 00332 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", 00333 type_, lang_.string(), perm_, unicharset_size_, num_edges_); 00334 for (edge = 0; edge < num_edges_; ++edge) 00335 print_edge(edge); 00336 } 00337 } 00338 00339 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { 00340 EDGE_REF edge; 00341 NODE_MAP node_map; 00342 inT32 node_counter; 00343 inT32 num_edges; 00344 00345 node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_); 00346 00347 for (edge = 0; edge < num_edges_; edge++) // init all slots 00348 node_map [edge] = -1; 00349 00350 node_counter = num_forward_edges(0); 00351 00352 *num_nodes = 0; 00353 for (edge = 0; edge < num_edges_; edge++) { // search all slots 00354 00355 if (forward_edge(edge)) { 00356 (*num_nodes)++; // count nodes links 00357 node_map[edge] = (edge ? node_counter : 0); 00358 num_edges = num_forward_edges(edge); 00359 if (edge != 0) node_counter += num_edges; 00360 edge += num_edges; 00361 if (edge >= num_edges_) break; 00362 if (backward_edge(edge)) while (!last_edge(edge++)); 00363 edge--; 00364 } 00365 } 00366 return (node_map); 00367 } 00368 00369 void SquishedDawg::write_squished_dawg(FILE *file) { 00370 EDGE_REF edge; 00371 inT32 num_edges; 00372 inT32 node_count = 0; 00373 NODE_MAP node_map; 00374 EDGE_REF old_index; 00375 EDGE_RECORD temp_record; 00376 00377 if (debug_level_) tprintf("write_squished_dawg\n"); 00378 00379 node_map = build_node_map(&node_count); 00380 00381 // Write the magic number to help detecting a change in endianness. 00382 inT16 magic = kDawgMagicNumber; 00383 fwrite(&magic, sizeof(inT16), 1, file); 00384 fwrite(&unicharset_size_, sizeof(inT32), 1, file); 00385 00386 // Count the number of edges in this Dawg. 00387 num_edges = 0; 00388 for (edge=0; edge < num_edges_; edge++) 00389 if (forward_edge(edge)) 00390 num_edges++; 00391 00392 fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file 00393 00394 if (debug_level_) { 00395 tprintf("%d nodes in DAWG\n", node_count); 00396 tprintf("%d edges in DAWG\n", num_edges); 00397 } 00398 00399 for (edge = 0; edge < num_edges_; edge++) { 00400 if (forward_edge(edge)) { // write forward edges 00401 do { 00402 old_index = next_node_from_edge_rec(edges_[edge]); 00403 set_next_node(edge, node_map[old_index]); 00404 temp_record = edges_[edge]; 00405 fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file); 00406 set_next_node(edge, old_index); 00407 } while (!last_edge(edge++)); 00408 00409 if (edge >= num_edges_) break; 00410 if (backward_edge(edge)) // skip back links 00411 while (!last_edge(edge++)); 00412 00413 edge--; 00414 } 00415 } 00416 free(node_map); 00417 } 00418 00419 } // namespace tesseract