Tesseract  3.02
tesseract-ocr/training/ambiguous_words.cpp
Go to the documentation of this file.
00001 
00002 // File:        ambiguous_words.cpp
00003 // Description: A program that takes a text file with a list of words as
00004 //              input (one per line) and outputs a file with the words
00005 //              that were found in the dictionary followed by the words
00006 //              that are ambiguous to them.
00007 // Author:      Rika Antonova
00008 // Created:     Fri Oct 21 11:26:43 PDT 2011
00009 //
00010 // (C) Copyright 2011, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 //
00023 
00024 #include <stdio.h>
00025 
00026 #include "baseapi.h"
00027 #include "helpers.h"
00028 #include "strngs.h"
00029 #include "dict.h"
00030 #include "tesseractclass.h"
00031 
00032 int main(int argc, char** argv) {
00033 
00034   // Parse input arguments.
00035   if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
00036     printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
00037            " output_ambiguious_wordlist_file\n", argv[0]);
00038     return 1;
00039   }
00040   int argv_offset = 0;
00041   STRING lang;
00042   if (argc == 6) {
00043     lang = argv[2];
00044     argv_offset = 2;
00045   } else {
00046     lang = "eng";
00047   }
00048   const char *tessdata_dir = argv[++argv_offset];
00049   const char *input_file_str = argv[++argv_offset];
00050   const char *output_file_str = argv[++argv_offset];
00051 
00052   // Initialize Tesseract.
00053   tesseract::TessBaseAPI api;
00054   GenericVector<STRING> vars_vec;
00055   GenericVector<STRING> vars_values;
00056   vars_vec.push_back("output_ambig_words_file");
00057   vars_values.push_back(output_file_str);
00058   api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
00059            NULL, NULL, &vars_vec, &vars_values, false);
00060   tesseract::Dict &dict = api.tesseract()->getDict();
00061   FILE *input_file = fopen(input_file_str, "rb");
00062   if (input_file == NULL) {
00063     tprintf("Failed to open input wordlist file %s\n", input_file_str);
00064     exit(1);
00065   }
00066   char str[CHARS_PER_LINE];
00067 
00068   // Read word list and call Dict::NoDangerousAmbig() for each word
00069   // to record ambiguities in the output file.
00070   while (fgets(str, CHARS_PER_LINE, input_file) != NULL) {
00071     chomp_string(str);  // remove newline
00072     WERD_CHOICE word(str, dict.getUnicharset());
00073     dict.NoDangerousAmbig(&word, NULL, false, NULL, NULL);
00074   }
00075   // Clean up.
00076   fclose(input_file);
00077 }