Tesseract
3.02
|
00001 00002 // File: dawg2wordlist.cpp 00003 // Description: Program to create a word list from a DAWG and unicharset. 00004 // Author: David Eger 00005 // Created: Thu 22 Dec 2011 00006 // 00007 // (C) Copyright 2011, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "dawg.h" 00021 #include "host.h" 00022 #include "tesscallback.h" 00023 #include "trie.h" 00024 #include "unicharset.h" 00025 00026 const int kDictDebugLevel = 1; 00027 00028 tesseract::Dawg *LoadSquishedDawg(const UNICHARSET &unicharset, 00029 const char *filename) { 00030 const int kDictDebugLevel = 1; 00031 FILE *dawg_file = fopen(filename, "rb"); 00032 if (dawg_file == NULL) { 00033 tprintf("Could not open %s for reading.\n", filename); 00034 return NULL; 00035 } 00036 tprintf("Loading word list from %s\n", filename); 00037 tesseract::Dawg *retval = new tesseract::SquishedDawg( 00038 dawg_file, tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, 00039 kDictDebugLevel); 00040 tprintf("Word list loaded.\n"); 00041 fclose(dawg_file); 00042 return retval; 00043 } 00044 00045 class WordOutputter { 00046 public: 00047 WordOutputter(FILE *file) : file_(file) {} 00048 void output_word(const char *word) { fprintf(file_, "%s\n", word); } 00049 private: 00050 FILE *file_; 00051 }; 00052 00053 // returns 0 if successful. 00054 int WriteDawgAsWordlist(const UNICHARSET &unicharset, 00055 const tesseract::Dawg *dawg, 00056 const char *outfile_name) { 00057 FILE *out = fopen(outfile_name, "wb"); 00058 if (out == NULL) { 00059 tprintf("Could not open %s for writing.\n", outfile_name); 00060 return 1; 00061 } 00062 WordOutputter outputter(out); 00063 TessCallback1<const char *> *print_word_cb = 00064 NewPermanentTessCallback(&outputter, &WordOutputter::output_word); 00065 dawg->iterate_words(unicharset, print_word_cb); 00066 delete print_word_cb; 00067 return fclose(out); 00068 } 00069 00070 int main(int argc, char *argv[]) { 00071 if (argc != 4) { 00072 tprintf("Print all the words in a given dawg.\n"); 00073 tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n", 00074 argv[0]); 00075 return 1; 00076 } 00077 const char *unicharset_file = argv[1]; 00078 const char *dawg_file = argv[2]; 00079 const char *wordlist_file = argv[3]; 00080 UNICHARSET unicharset; 00081 if (!unicharset.load_from_file(unicharset_file)) { 00082 tprintf("Error loading unicharset from %s.\n", unicharset_file); 00083 return 1; 00084 } 00085 tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file); 00086 if (dict == NULL) { 00087 tprintf("Error loading dictionary from %s.\n", dawg_file); 00088 return 1; 00089 } 00090 int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file); 00091 delete dict; 00092 return retval; 00093 }