Tesseract
3.02
|
00001 00002 // File: ambiguous_words.cpp 00003 // Description: A program that takes a text file with a list of words as 00004 // input (one per line) and outputs a file with the words 00005 // that were found in the dictionary followed by the words 00006 // that are ambiguous to them. 00007 // Author: Rika Antonova 00008 // Created: Fri Oct 21 11:26:43 PDT 2011 00009 // 00010 // (C) Copyright 2011, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 // 00023 00024 #include <stdio.h> 00025 00026 #include "baseapi.h" 00027 #include "helpers.h" 00028 #include "strngs.h" 00029 #include "dict.h" 00030 #include "tesseractclass.h" 00031 00032 int main(int argc, char** argv) { 00033 00034 // Parse input arguments. 00035 if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) { 00036 printf("Usage: %s [-l lang] tessdata_dir wordlist_file" 00037 " output_ambiguious_wordlist_file\n", argv[0]); 00038 return 1; 00039 } 00040 int argv_offset = 0; 00041 STRING lang; 00042 if (argc == 6) { 00043 lang = argv[2]; 00044 argv_offset = 2; 00045 } else { 00046 lang = "eng"; 00047 } 00048 const char *tessdata_dir = argv[++argv_offset]; 00049 const char *input_file_str = argv[++argv_offset]; 00050 const char *output_file_str = argv[++argv_offset]; 00051 00052 // Initialize Tesseract. 00053 tesseract::TessBaseAPI api; 00054 GenericVector<STRING> vars_vec; 00055 GenericVector<STRING> vars_values; 00056 vars_vec.push_back("output_ambig_words_file"); 00057 vars_values.push_back(output_file_str); 00058 api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY, 00059 NULL, NULL, &vars_vec, &vars_values, false); 00060 tesseract::Dict &dict = api.tesseract()->getDict(); 00061 FILE *input_file = fopen(input_file_str, "rb"); 00062 if (input_file == NULL) { 00063 tprintf("Failed to open input wordlist file %s\n", input_file_str); 00064 exit(1); 00065 } 00066 char str[CHARS_PER_LINE]; 00067 00068 // Read word list and call Dict::NoDangerousAmbig() for each word 00069 // to record ambiguities in the output file. 00070 while (fgets(str, CHARS_PER_LINE, input_file) != NULL) { 00071 chomp_string(str); // remove newline 00072 WERD_CHOICE word(str, dict.getUnicharset()); 00073 dict.NoDangerousAmbig(&word, NULL, false, NULL, NULL); 00074 } 00075 // Clean up. 00076 fclose(input_file); 00077 }