Tesseract
3.02
|
// File: unicharset_extractor.cpp
// Description: Unicode character/ligature set extractor.
// Author: Thomas Kielbus
// Created: Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

// Given a list of box files on the command line, this program generates a file
// containing a unicharset, a list of all the characters used by Tesseract.
//
// The file contains the size of the set on the first line, and then one
// unichar per line.
00025 00026 #include <stdio.h> 00027 /* 00028 ** Include automatically generated configuration file if running autoconf 00029 */ 00030 #ifdef HAVE_CONFIG_H 00031 #include "config_auto.h" 00032 #endif 00033 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3) 00034 #include <wchar.h> 00035 #include <wctype.h> 00036 #define USING_WCTYPE 00037 #endif 00038 #include <locale.h> 00039 00040 #include "boxread.h" 00041 #include "rect.h" 00042 #include "strngs.h" 00043 #include "tessopt.h" 00044 #include "unichar.h" 00045 #include "unicharset.h" 00046 00047 static const char* const kUnicharsetFileName = "unicharset"; 00048 00049 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { 00050 UNICHAR uch(wc); 00051 char *unichar = uch.utf8_str(); 00052 UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar); 00053 delete[] unichar; 00054 return unichar_id; 00055 } 00056 00057 // Set character properties using wctype if we have it. 00058 // Contributed by piggy@gmail.com. 00059 // Modified by Ray to use UNICHAR for unicode conversion 00060 // and to check for wctype using autoconf/presence of windows. 00061 void set_properties(UNICHARSET *unicharset, const char* const c_string) { 00062 #ifdef USING_WCTYPE 00063 UNICHAR_ID id; 00064 int wc; 00065 00066 // Convert the string to a unichar id. 00067 id = unicharset->unichar_to_id(c_string); 00068 00069 // Set the other_case property to be this unichar id by default. 00070 unicharset->set_other_case(id, id); 00071 00072 int step = UNICHAR::utf8_step(c_string); 00073 if (step == 0) 00074 return; // Invalid utf-8. 00075 00076 // Get the next Unicode code point in the string. 00077 UNICHAR ch(c_string, step); 00078 wc = ch.first_uni(); 00079 00080 /* Copy the properties. 
*/ 00081 if (iswalpha(wc)) { 00082 unicharset->set_isalpha(id, 1); 00083 if (iswlower(wc)) { 00084 unicharset->set_islower(id, 1); 00085 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00086 towupper(wc))); 00087 } 00088 if (iswupper(wc)) { 00089 unicharset->set_isupper(id, 1); 00090 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00091 towlower(wc))); 00092 } 00093 } 00094 if (iswdigit(wc)) 00095 unicharset->set_isdigit(id, 1); 00096 if(iswpunct(wc)) 00097 unicharset->set_ispunctuation(id, 1); 00098 00099 #endif 00100 } 00101 00102 int main(int argc, char** argv) { 00103 int option; 00104 const char* output_directory = "."; 00105 STRING unicharset_file_name; 00106 UNICHARSET unicharset; 00107 00108 setlocale(LC_ALL, ""); 00109 // Space character needed to represent NIL classification 00110 unicharset.unichar_insert(" "); 00111 00112 // Print usage 00113 if (argc <= 1) { 00114 printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); 00115 exit(1); 00116 00117 } 00118 00119 // Parse arguments 00120 while ((option = tessopt(argc, argv, "D" )) != EOF) { 00121 switch (option) { 00122 case 'D': 00123 output_directory = tessoptarg; 00124 ++tessoptind; 00125 break; 00126 } 00127 } 00128 00129 // Save file name 00130 unicharset_file_name = output_directory; 00131 unicharset_file_name += "/"; 00132 unicharset_file_name += kUnicharsetFileName; 00133 00134 // Load box files 00135 for (; tessoptind < argc; ++tessoptind) { 00136 printf("Extracting unicharset from %s\n", argv[tessoptind]); 00137 00138 FILE* box_file = fopen(argv[tessoptind], "rb"); 00139 if (box_file == NULL) { 00140 printf("Cannot open box file %s\n", argv[tessoptind]); 00141 return -1; 00142 } 00143 00144 TBOX box; 00145 STRING unichar_string; 00146 int line_number = 0; 00147 while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { 00148 unicharset.unichar_insert(unichar_string.string()); 00149 set_properties(&unicharset, unichar_string.string()); 00150 } 00151 } 00152 00153 
// Write unicharset file 00154 if (unicharset.save_to_file(unicharset_file_name.string())) { 00155 printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); 00156 } 00157 else { 00158 printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); 00159 return -1; 00160 } 00161 return 0; 00162 }