tesseract-doc/wordlist2dawg_8cpp_source.html

00001
00002 // File:        wordlist2dawg.cpp
00003 // Description: Program to generate a DAWG from a word list file
00004 // Author:      Thomas Kielbus
00005 // Created:     Thu May 10 18:11:42 PDT 2007
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019
00020 // Given a file that contains a list of words (one word per line) this program
00021 // generates the corresponding squished DAWG file.
00022
00023 #include <stdio.h>
00024
00025 #include "classify.h"
00026 #include "dawg.h"
00027 #include "dict.h"
00028 #include "emalloc.h"
00029 #include "freelist.h"
00030 #include "helpers.h"
00031 #include "serialis.h"
00032 #include "trie.h"
00033 #include "unicharset.h"
00034
00035 static const int kMaxNumEdges =  30000000;
00036
00037 int main(int argc, char** argv) {
00038   int min_word_length;
00039   int max_word_length;
00040   if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
00041       (argc == 6 && strcmp(argv[1], "-r") == 0) ||
00042       (argc == 7 && strcmp(argv[1], "-l") == 0 &&
00043          sscanf(argv[2], "%d", &min_word_length) == 1 &&
00044          sscanf(argv[3], "%d", &max_word_length) == 1))) {
00045     printf("Usage: %s [-t | -r [reverse policy] |"
00046            " -l min_len max_len] word_list_file"
00047            " dawg_file unicharset_file\n", argv[0]);
00048     return 1;
00049   }
00050   tesseract::Classify *classify = new tesseract::Classify();
00051   int argv_index = 0;
00052   if (argc == 5) ++argv_index;
00053   tesseract::Trie::RTLReversePolicy reverse_policy =
00054       tesseract::Trie::RRP_DO_NO_REVERSE;
00055   if (argc == 6) {
00056     ++argv_index;
00057     int tmp_int;
00058     sscanf(argv[++argv_index], "%d", &tmp_int);
00059     reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);
00060     tprintf("Set reverse_policy to %s\n",
00061             tesseract::Trie::get_reverse_policy_name(reverse_policy));
00062   }
00063   if (argc == 7) argv_index += 3;
00064   const char* wordlist_filename = argv[++argv_index];
00065   const char* dawg_filename = argv[++argv_index];
00066   const char* unicharset_file = argv[++argv_index];
00067   tprintf("Loading unicharset from '%s'\n", unicharset_file);
00068   if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
00069     tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
00070     delete classify;
00071     return 1;
00072   }
00073   const UNICHARSET &unicharset = classify->getDict().getUnicharset();
00074   if (argc == 4 || argc == 6) {
00075     tesseract::Trie trie(
00076         // the first 3 arguments are not used in this case
00077         tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
00078         kMaxNumEdges, unicharset.size(),
00079         classify->getDict().dawg_debug_level);
00080     tprintf("Reading word list from '%s'\n", wordlist_filename);
00081     if (!trie.read_word_list(wordlist_filename, unicharset, reverse_policy)) {
00082       tprintf("Failed to read word list from '%s'\n", wordlist_filename);
00083       exit(1);
00084     }
00085     tprintf("Reducing Trie to SquishedDawg\n");
00086     tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
00087     if (dawg != NULL && dawg->NumEdges() > 0) {
00088       tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
00089       dawg->write_squished_dawg(dawg_filename);
00090     } else {
00091       tprintf("Dawg is empty, skip producing the output file\n");
00092     }
00093     delete dawg;
00094   } else if (argc == 5) {
00095     tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
00096     tesseract::SquishedDawg words(
00097         dawg_filename,
00098         // these 3 arguments are not used in this case
00099         tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
00100         classify->getDict().dawg_debug_level);
00101     tprintf("Checking word list from '%s'\n", wordlist_filename);
00102     words.check_for_words(wordlist_filename, unicharset, true);
00103   } else if (argc == 7) {
00104     // Place words of different lengths in separate Dawgs.
00105     char str[CHARS_PER_LINE];
00106     FILE *word_file = fopen(wordlist_filename, "rb");
00107     if (word_file == NULL) {
00108       tprintf("Failed to open wordlist file %s\n", wordlist_filename);
00109       exit(1);
00110     }
00111     FILE *dawg_file = fopen(dawg_filename, "wb");
00112     if (dawg_file == NULL) {
00113       tprintf("Failed to open dawg output file %s\n", dawg_filename);
00114       exit(1);
00115     }
00116     tprintf("Reading word list from '%s'\n", wordlist_filename);
00117     GenericVector<tesseract::Trie *> trie_vec;
00118     int i;
00119     for (i = min_word_length; i <= max_word_length; ++i) {
00120       trie_vec.push_back(new tesseract::Trie(
00121           // the first 3 arguments are not used in this case
00122           tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
00123           kMaxNumEdges, unicharset.size(),
00124           classify->getDict().dawg_debug_level));
00125     }
00126     while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
00127       chomp_string(str);  // remove newline
00128       int badpos;
00129       if (!unicharset.encodable_string(str, &badpos)) {
00130         tprintf("String '%s' not compatible with unicharset. "
00131                 "Bad chars here: '%s'\n", str, str + badpos);
00132         continue;
00133       }
00134       WERD_CHOICE word(str, unicharset);
00135       if ((reverse_policy == tesseract::Trie::RRP_REVERSE_IF_HAS_RTL &&
00136           word.has_rtl_unichar_id()) ||
00137           reverse_policy == tesseract::Trie::RRP_FORCE_REVERSE) {
00138         word.reverse_and_mirror_unichar_ids();
00139       }
00140       if (word.length() >= min_word_length &&
00141           word.length() <= max_word_length &&
00142           !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
00143         tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
00144         if (!curr_trie->word_in_dawg(word)) {
00145           if (!curr_trie->add_word_to_dawg(word)) {
00146             tprintf("Failed to add the following word to dawg:\n");
00147             word.print();
00148             exit(1);
00149           }
00150           if (classify->getDict().dawg_debug_level > 1) {
00151             tprintf("Added word %s of length %d\n", str, word.length());
00152           }
00153           if (!curr_trie->word_in_dawg(word)) {
00154             tprintf("Error: word '%s' not in DAWG after adding it\n", str);
00155             exit(1);
00156           }
00157         }
00158       }
00159     }
00160     fclose(word_file);
00161     tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
00162     GenericVector<tesseract::SquishedDawg *> dawg_vec;
00163     for (i = 0; i <= max_word_length; ++i) {
00164       dawg_vec.push_back(i < min_word_length ? NULL :
00165                          trie_vec[i-min_word_length]->trie_to_dawg());
00166     }
00167     tesseract::Dict::WriteFixedLengthDawgs(
00168         dawg_vec, max_word_length - min_word_length + 1,
00169         classify->getDict().dawg_debug_level, dawg_file);
00170     fclose(dawg_file);
00171     dawg_vec.delete_data_pointers();
00172     trie_vec.delete_data_pointers();
00173   } else {  // should never get here
00174     tprintf("Invalid command-line options\n");
00175     exit(1);
00176   }
00177   delete classify;
00178   return 0;
00179 }