Tesseract  3.02
tesseract-ocr/training/unicharset_extractor.cpp
Go to the documentation of this file.
00001 
00002 // File:        unicharset_extractor.cpp
00003 // Description: Unicode character/ligature set extractor.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 // Given a list of box files on the command line, this program generates a file
00021 // containing a unicharset, a list of all the characters used by Tesseract.
00022 //
00023 // The file contains the size of the set on the first line, and then one
00024 // unichar per line.
00025 
00026 #include <stdio.h>
00027 /*
00028 ** Include automatically generated configuration file if running autoconf
00029 */
00030 #ifdef HAVE_CONFIG_H
00031 #include "config_auto.h"
00032 #endif
00033 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
00034 #include <wchar.h>
00035 #include <wctype.h>
00036 #define USING_WCTYPE
00037 #endif
00038 #include <locale.h>
00039 
00040 #include "boxread.h"
00041 #include "rect.h"
00042 #include "strngs.h"
00043 #include "tessopt.h"
00044 #include "unichar.h"
00045 #include "unicharset.h"
00046 
// Base name of the output file; main() appends it to the output directory.
00047 static const char* const kUnicharsetFileName = "unicharset";
00048 
00049 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
00050   UNICHAR uch(wc);
00051   char *unichar = uch.utf8_str();
00052   UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
00053   delete[] unichar;
00054   return unichar_id;
00055 }
00056 
00057 // Set character properties using wctype if we have it.
00058 // Contributed by piggy@gmail.com.
00059 // Modified by Ray to use UNICHAR for unicode conversion
00060 // and to check for wctype using autoconf/presence of windows.
00061 void set_properties(UNICHARSET *unicharset, const char* const c_string) {
00062 #ifdef USING_WCTYPE
00063   UNICHAR_ID id;
00064   int wc;
00065 
00066   // Convert the string to a unichar id.
00067   id = unicharset->unichar_to_id(c_string);
00068 
00069   // Set the other_case property to be this unichar id by default.
00070   unicharset->set_other_case(id, id);
00071 
00072   int step = UNICHAR::utf8_step(c_string);
00073   if (step == 0)
00074     return; // Invalid utf-8.
00075 
00076   // Get the next Unicode code point in the string.
00077   UNICHAR ch(c_string, step);
00078   wc = ch.first_uni();
00079 
00080   /* Copy the properties. */
00081   if (iswalpha(wc)) {
00082     unicharset->set_isalpha(id, 1);
00083     if (iswlower(wc)) {
00084       unicharset->set_islower(id, 1);
00085       unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
00086                                                       towupper(wc)));
00087     }
00088     if (iswupper(wc)) {
00089       unicharset->set_isupper(id, 1);
00090       unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
00091                                                       towlower(wc)));
00092     }
00093   }
00094   if (iswdigit(wc))
00095     unicharset->set_isdigit(id, 1);
00096   if(iswpunct(wc))
00097     unicharset->set_ispunctuation(id, 1);
00098 
00099 #endif
00100 }
00101 
00102 int main(int argc, char** argv) {
00103   int option;
00104   const char* output_directory = ".";
00105   STRING unicharset_file_name;
00106   UNICHARSET unicharset;
00107 
00108   setlocale(LC_ALL, "");
00109   // Space character needed to represent NIL classification
00110   unicharset.unichar_insert(" ");
00111 
00112   // Print usage
00113   if (argc <= 1) {
00114     printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
00115     exit(1);
00116 
00117   }
00118 
00119   // Parse arguments
00120   while ((option = tessopt(argc, argv, "D" )) != EOF) {
00121     switch (option) {
00122       case 'D':
00123         output_directory = tessoptarg;
00124         ++tessoptind;
00125         break;
00126     }
00127   }
00128 
00129   // Save file name
00130   unicharset_file_name = output_directory;
00131   unicharset_file_name += "/";
00132   unicharset_file_name += kUnicharsetFileName;
00133 
00134   // Load box files
00135   for (; tessoptind < argc; ++tessoptind) {
00136     printf("Extracting unicharset from %s\n", argv[tessoptind]);
00137 
00138     FILE* box_file = fopen(argv[tessoptind], "rb");
00139     if (box_file == NULL) {
00140       printf("Cannot open box file %s\n", argv[tessoptind]);
00141       return -1;
00142     }
00143 
00144     TBOX box;
00145     STRING unichar_string;
00146     int line_number = 0;
00147     while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
00148       unicharset.unichar_insert(unichar_string.string());
00149       set_properties(&unicharset, unichar_string.string());
00150     }
00151   }
00152 
00153   // Write unicharset file
00154   if (unicharset.save_to_file(unicharset_file_name.string())) {
00155     printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
00156   }
00157   else {
00158     printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
00159     return -1;
00160   }
00161   return 0;
00162 }