Tesseract  3.02
tesseract-ocr/dict/dawg.cpp
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        dawg.c  (Formerly dawg.c)
00005  * Description:  Use a Directed Accyclic Word Graph
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 
00029 #ifdef _MSC_VER
00030 #pragma warning(disable:4244)  // Conversion warnings
00031 #pragma warning(disable:4800)  // int/bool warnings
00032 #endif
00033 #include "dawg.h"
00034 
00035 #include "cutil.h"
00036 #include "dict.h"
00037 #include "emalloc.h"
00038 #include "freelist.h"
00039 #include "helpers.h"
00040 #include "strngs.h"
00041 #include "tprintf.h"
00042 
00043 /*----------------------------------------------------------------------
00044               F u n c t i o n s   f o r   D a w g
00045 ----------------------------------------------------------------------*/
00046 namespace tesseract {
00047 
00048 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
00049   if (word.length() == 0) return false;
00050   NODE_REF node = 0;
00051   int end_index = word.length() - 1;
00052   for (int i = 0; i <= end_index; i++) {
00053     if (debug_level_ > 1) {
00054       tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
00055       print_node(node, MAX_NODE_EDGES_DISPLAY);
00056       tprintf("\n");
00057     }
00058     EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
00059     if (edge != NO_EDGE) {
00060       node = next_node(edge);
00061       if (node == 0) node = NO_EDGE;
00062     } else {
00063       return false;
00064     }
00065   }
00066   return true;
00067 }
00068 
00069 int Dawg::check_for_words(const char *filename,
00070                           const UNICHARSET &unicharset,
00071                           bool enable_wildcard) const {
00072   if (filename == NULL) return 0;
00073 
00074   FILE       *word_file;
00075   char       string [CHARS_PER_LINE];
00076   int misses = 0;
00077   UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
00078 
00079   word_file = open_file (filename, "r");
00080 
00081   while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
00082     chomp_string(string);  // remove newline
00083     WERD_CHOICE word(string, unicharset);
00084     if (word.length() > 0 &&
00085         !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
00086       if (!match_words(&word, 0, 0,
00087                        enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
00088         tprintf("Missing word: %s\n", string);
00089         ++misses;
00090       }
00091     } else {
00092       tprintf("Failed to create a valid word from %s\n", string);
00093     }
00094   }
00095   fclose (word_file);
00096   // Make sure the user sees this with fprintf instead of tprintf.
00097   if (debug_level_) tprintf("Number of lost words=%d\n", misses);
00098   return misses;
00099 }
00100 
00101 void Dawg::iterate_words(const UNICHARSET &unicharset,
00102                          TessCallback1<const char *> *cb) const {
00103   WERD_CHOICE word(&unicharset);
00104   iterate_words_rec(word, 0, cb);
00105 }
00106 
00107 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
00108                              NODE_REF to_explore,
00109                              TessCallback1<const char *> *cb) const {
00110   NodeChildVector children;
00111   this->unichar_ids_of(to_explore, &children);
00112   for (int i = 0; i < children.size(); i++) {
00113     WERD_CHOICE next_word(word_so_far);
00114     next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
00115     if (this->end_of_word(children[i].edge_ref)) {
00116       STRING s;
00117       next_word.string_and_lengths(&s, NULL);
00118       cb->Run(s.string());
00119     }
00120     NODE_REF next = next_node(children[i].edge_ref);
00121     if (next != 0) {
00122       iterate_words_rec(next_word, next, cb);
00123     }
00124   }
00125 }
00126 
00127 bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
00128                        NODE_REF node, UNICHAR_ID wildcard) const {
00129   EDGE_REF edge;
00130   inT32 word_end;
00131 
00132   if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
00133     bool any_matched = false;
00134     NodeChildVector vec;
00135     this->unichar_ids_of(node, &vec);
00136     for (int i = 0; i < vec.size(); ++i) {
00137       word->set_unichar_id(vec[i].unichar_id, index);
00138       if (match_words(word, index, node, wildcard))
00139         any_matched = true;
00140     }
00141     word->set_unichar_id(wildcard, index);
00142     return any_matched;
00143   } else {
00144     word_end = index == word->length() - 1;
00145     edge = edge_char_of(node, word->unichar_id(index), word_end);
00146     if (edge != NO_EDGE) {  // normal edge in DAWG
00147       node = next_node(edge);
00148       if (word_end) {
00149         if (debug_level_ > 1) word->print("match_words() found: ");
00150         return true;
00151       } else if (node != 0) {
00152         return match_words(word, index+1, node, wildcard);
00153       }
00154     }
00155   }
00156   return false;
00157 }
00158 
00159 void Dawg::init(DawgType type, const STRING &lang,
00160                 PermuterType perm, int unicharset_size, int debug_level) {
00161   type_ = type;
00162   lang_ = lang;
00163   perm_ = perm;
00164   ASSERT_HOST(unicharset_size > 0);
00165   unicharset_size_ = unicharset_size;
00166   // Set bit masks.
00167   flag_start_bit_ = ceil(log(static_cast<double>(unicharset_size_)) / log(2.0));
00168   next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS;
00169   letter_mask_ = ~(~0 << flag_start_bit_);
00170   next_node_mask_ = ~0 << (flag_start_bit_ + NUM_FLAG_BITS);
00171   flags_mask_ = ~(letter_mask_ | next_node_mask_);
00172 
00173   debug_level_ = debug_level;
00174 }
00175 
00176 
00177 /*----------------------------------------------------------------------
00178          F u n c t i o n s   f o r   S q u i s h e d    D a w g
00179 ----------------------------------------------------------------------*/
00180 
00181 SquishedDawg::~SquishedDawg() { memfree(edges_); }
00182 
00183 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node,
00184                                     UNICHAR_ID unichar_id,
00185                                     bool word_end) const {
00186   EDGE_REF edge = node;
00187   if (node == 0) {  // binary search
00188     EDGE_REF start = 0;
00189     EDGE_REF end = num_forward_edges_in_node0 - 1;
00190     int compare;
00191     while (start <= end) {
00192       edge = (start + end) >> 1;  // (start + end) / 2
00193       compare = given_greater_than_edge_rec(NO_EDGE, word_end,
00194                                             unichar_id, edges_[edge]);
00195       if (compare == 0) {  // given == vec[k]
00196         return edge;
00197       } else if (compare == 1) {  // given > vec[k]
00198         start = edge + 1;
00199       } else {  // given < vec[k]
00200         end = edge - 1;
00201       }
00202     }
00203   } else {  // linear search
00204     if (edge != NO_EDGE && edge_occupied(edge)) {
00205       do {
00206         if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
00207             (!word_end || end_of_word_from_edge_rec(edges_[edge])))
00208           return (edge);
00209       } while (!last_edge(edge++));
00210     }
00211   }
00212   return (NO_EDGE);  // not found
00213 }
00214 
00215 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const {
00216   EDGE_REF   edge = node;
00217   inT32        num  = 0;
00218 
00219   if (forward_edge (edge)) {
00220     do {
00221       num++;
00222     } while (!last_edge(edge++));
00223   }
00224 
00225   return (num);
00226 }
00227 
00228 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
00229   if (node == NO_EDGE) return;  // nothing to print
00230 
00231   EDGE_REF   edge = node;
00232   const char       *forward_string  = "FORWARD";
00233   const char       *backward_string = "       ";
00234 
00235   const char       *last_string     = "LAST";
00236   const char       *not_last_string = "    ";
00237 
00238   const char       *eow_string      = "EOW";
00239   const char       *not_eow_string  = "   ";
00240 
00241   const char       *direction;
00242   const char       *is_last;
00243   const char       *eow;
00244 
00245   UNICHAR_ID unichar_id;
00246 
00247   if (edge_occupied(edge)) {
00248     do {
00249       direction =
00250         forward_edge(edge) ? forward_string : backward_string;
00251       is_last = last_edge(edge) ? last_string : not_last_string;
00252       eow = end_of_word(edge) ? eow_string : not_eow_string;
00253 
00254       unichar_id = edge_letter(edge);
00255       tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
00256               edge, next_node(edge), unichar_id,
00257               direction, is_last, eow);
00258 
00259       if (edge - node > max_num_edges) return;
00260     } while (!last_edge(edge++));
00261 
00262     if (edge < num_edges_ &&
00263         edge_occupied(edge) && backward_edge(edge)) {
00264       do {
00265         direction =
00266           forward_edge(edge) ? forward_string : backward_string;
00267         is_last = last_edge(edge) ? last_string : not_last_string;
00268         eow = end_of_word(edge) ? eow_string : not_eow_string;
00269 
00270         unichar_id = edge_letter(edge);
00271         tprintf(REFFORMAT " : next = " REFFORMAT
00272                 ", unichar_id = %d, %s %s %s\n",
00273                 edge, next_node(edge), unichar_id,
00274                 direction, is_last, eow);
00275 
00276         if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
00277       } while (!last_edge(edge++));
00278     }
00279   }
00280   else {
00281     tprintf(REFFORMAT " : no edges in this node\n", node);
00282   }
00283   tprintf("\n");
00284 }
00285 
00286 void SquishedDawg::print_edge(EDGE_REF edge) const {
00287   if (edge == NO_EDGE) {
00288     tprintf("NO_EDGE\n");
00289   } else {
00290     tprintf(REFFORMAT " : next = " REFFORMAT
00291             ", unichar_id = '%d', %s %s %s\n", edge,
00292             next_node(edge), edge_letter(edge),
00293             (forward_edge(edge) ? "FORWARD" : "       "),
00294             (last_edge(edge) ? "LAST"    : "    "),
00295             (end_of_word(edge) ? "EOW"     : ""));
00296   }
00297 }
00298 
00299 void SquishedDawg::read_squished_dawg(FILE *file,
00300                                       DawgType type,
00301                                       const STRING &lang,
00302                                       PermuterType perm,
00303                                       int debug_level) {
00304   if (debug_level) tprintf("Reading squished dawg\n");
00305 
00306   // Read the magic number and if it does not match kDawgMagicNumber
00307   // set swap to true to indicate that we need to switch endianness.
00308   inT16 magic;
00309   fread(&magic, sizeof(inT16), 1, file);
00310   bool swap = (magic != kDawgMagicNumber);
00311 
00312   int unicharset_size;
00313   fread(&unicharset_size, sizeof(inT32), 1, file);
00314   fread(&num_edges_, sizeof(inT32), 1, file);
00315 
00316   if (swap) {
00317     unicharset_size = reverse32(unicharset_size);
00318     num_edges_ = reverse32(num_edges_);
00319   }
00320   ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
00321   Dawg::init(type, lang, perm, unicharset_size, debug_level);
00322 
00323   edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
00324   fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
00325   EDGE_REF edge;
00326   if (swap) {
00327     for (edge = 0; edge < num_edges_; ++edge) {
00328       edges_[edge] = reverse64(edges_[edge]);
00329     }
00330   }
00331   if (debug_level > 2) {
00332     tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
00333             type_, lang_.string(), perm_, unicharset_size_, num_edges_);
00334     for (edge = 0; edge < num_edges_; ++edge)
00335       print_edge(edge);
00336   }
00337 }
00338 
00339 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
00340   EDGE_REF   edge;
00341   NODE_MAP   node_map;
00342   inT32       node_counter;
00343   inT32       num_edges;
00344 
00345   node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
00346 
00347   for (edge = 0; edge < num_edges_; edge++)       // init all slots
00348     node_map [edge] = -1;
00349 
00350   node_counter = num_forward_edges(0);
00351 
00352   *num_nodes   = 0;
00353   for (edge = 0; edge < num_edges_; edge++) {     // search all slots
00354 
00355     if (forward_edge(edge)) {
00356       (*num_nodes)++;                          // count nodes links
00357       node_map[edge] = (edge ? node_counter : 0);
00358       num_edges = num_forward_edges(edge);
00359       if (edge != 0) node_counter += num_edges;
00360       edge += num_edges;
00361       if (edge >= num_edges_) break;
00362       if (backward_edge(edge)) while (!last_edge(edge++));
00363       edge--;
00364     }
00365   }
00366   return (node_map);
00367 }
00368 
00369 void SquishedDawg::write_squished_dawg(FILE *file) {
00370   EDGE_REF    edge;
00371   inT32       num_edges;
00372   inT32       node_count = 0;
00373   NODE_MAP    node_map;
00374   EDGE_REF    old_index;
00375   EDGE_RECORD temp_record;
00376 
00377   if (debug_level_) tprintf("write_squished_dawg\n");
00378 
00379   node_map = build_node_map(&node_count);
00380 
00381   // Write the magic number to help detecting a change in endianness.
00382   inT16 magic = kDawgMagicNumber;
00383   fwrite(&magic, sizeof(inT16), 1, file);
00384   fwrite(&unicharset_size_, sizeof(inT32), 1, file);
00385 
00386   // Count the number of edges in this Dawg.
00387   num_edges = 0;
00388   for (edge=0; edge < num_edges_; edge++)
00389     if (forward_edge(edge))
00390       num_edges++;
00391 
00392   fwrite(&num_edges, sizeof(inT32), 1, file);  // write edge count to file
00393 
00394   if (debug_level_) {
00395     tprintf("%d nodes in DAWG\n", node_count);
00396     tprintf("%d edges in DAWG\n", num_edges);
00397   }
00398 
00399   for (edge = 0; edge < num_edges_; edge++) {
00400     if (forward_edge(edge)) {  // write forward edges
00401       do {
00402         old_index = next_node_from_edge_rec(edges_[edge]);
00403         set_next_node(edge, node_map[old_index]);
00404         temp_record = edges_[edge];
00405         fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
00406         set_next_node(edge, old_index);
00407       } while (!last_edge(edge++));
00408 
00409       if (edge >= num_edges_) break;
00410       if (backward_edge(edge))  // skip back links
00411         while (!last_edge(edge++));
00412 
00413       edge--;
00414     }
00415   }
00416   free(node_map);
00417 }
00418 
00419 }  // namespace tesseract