Tesseract  3.02
tesseract-ocr/training/dawg2wordlist.cpp
Go to the documentation of this file.
00001 
00002 // File:        dawg2wordlist.cpp
00003 // Description: Program to create a word list from a DAWG and unicharset.
00004 // Author:      David Eger
00005 // Created:     Thu 22 Dec 2011
00006 //
00007 // (C) Copyright 2011, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "dawg.h"
00021 #include "host.h"
00022 #include "tesscallback.h"
00023 #include "trie.h"
00024 #include "unicharset.h"
00025 
00026 const int kDictDebugLevel = 1;
00027 
00028 tesseract::Dawg *LoadSquishedDawg(const UNICHARSET &unicharset,
00029                                   const char *filename) {
00030   const int kDictDebugLevel = 1;
00031   FILE *dawg_file = fopen(filename, "rb");
00032   if (dawg_file == NULL) {
00033     tprintf("Could not open %s for reading.\n", filename);
00034     return NULL;
00035   }
00036   tprintf("Loading word list from %s\n", filename);
00037   tesseract::Dawg *retval = new tesseract::SquishedDawg(
00038       dawg_file, tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM,
00039       kDictDebugLevel);
00040   tprintf("Word list loaded.\n");
00041   fclose(dawg_file);
00042   return retval;
00043 }
00044 
// Small adapter that writes each word it is given to a FILE stream, one word
// per line. The stream is borrowed: this class neither opens nor closes it.
class WordOutputter {
 public:
  // explicit, so that a FILE* never converts silently into a WordOutputter.
  explicit WordOutputter(FILE *file) : file_(file) {}
  // Writes |word| followed by a newline to the wrapped stream.
  void output_word(const char *word) { fprintf(file_, "%s\n", word); }
 private:
  FILE *file_;  // Not owned.
};
00052 
00053 // returns 0 if successful.
00054 int WriteDawgAsWordlist(const UNICHARSET &unicharset,
00055                         const tesseract::Dawg *dawg,
00056                         const char *outfile_name) {
00057   FILE *out = fopen(outfile_name, "wb");
00058   if (out == NULL) {
00059     tprintf("Could not open %s for writing.\n", outfile_name);
00060     return 1;
00061   }
00062   WordOutputter outputter(out);
00063   TessCallback1<const char *> *print_word_cb =
00064       NewPermanentTessCallback(&outputter, &WordOutputter::output_word);
00065   dawg->iterate_words(unicharset, print_word_cb);
00066   delete print_word_cb;
00067   return fclose(out);
00068 }
00069 
00070 int main(int argc, char *argv[]) {
00071   if (argc != 4) {
00072     tprintf("Print all the words in a given dawg.\n");
00073     tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n",
00074             argv[0]);
00075     return 1;
00076   }
00077   const char *unicharset_file = argv[1];
00078   const char *dawg_file = argv[2];
00079   const char *wordlist_file = argv[3];
00080   UNICHARSET unicharset;
00081   if (!unicharset.load_from_file(unicharset_file)) {
00082     tprintf("Error loading unicharset from %s.\n", unicharset_file);
00083     return 1;
00084   }
00085   tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file);
00086   if (dict == NULL) {
00087     tprintf("Error loading dictionary from %s.\n", dawg_file);
00088     return 1;
00089   }
00090   int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file);
00091   delete dict;
00092   return retval;
00093 }