Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: tessedit.cpp (Formerly tessedit.c) 00003 * Description: Main program for merge of tess and editor. 00004 * Author: Ray Smith 00005 * Created: Tue Jan 07 15:21:46 GMT 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 //#include <osfcn.h> 00022 //#include <signal.h> 00023 //#include <time.h> 00024 //#include <unistd.h> 00025 #include "tfacep.h" //must be before main.h 00026 //#include "fileerr.h" 00027 #include "stderr.h" 00028 #include "basedir.h" 00029 #include "tessvars.h" 00030 //#include "debgwin.h" 00031 //#include "epapdest.h" 00032 #include "control.h" 00033 #include "imgs.h" 00034 #include "reject.h" 00035 #include "pageres.h" 00036 //#include "gpapdest.h" 00037 #include "nwmain.h" 00038 #include "pgedit.h" 00039 #include "tprintf.h" 00040 //#include "ipeerr.h" 00041 //#include "restart.h" 00042 #include "tessedit.h" 00043 //#include "fontfind.h" 00044 #include "permute.h" 00045 #include "stopper.h" 00046 #include "intmatcher.h" 00047 #include "chop.h" 00048 #include "efio.h" 00049 #include "danerror.h" 00050 #include "globals.h" 00051 #include "tesseractclass.h" 00052 #include "params.h" 00053 00054 #include "notdll.h" //phils nn stuff 00055 00056 #define VARDIR "configs/" /*variables files */ 00057 //config under api 00058 #define API_CONFIG "configs/api_config" 00059 00060 ETEXT_DESC *global_monitor = NULL; // progress monitor 00061 00062 namespace tesseract { 00063 00064 // Read a "config" file containing a set of variable, value pairs. 00065 // Searches the standard places: tessdata/configs, tessdata/tessconfigs 00066 // and also accepts a relative or absolute path name. 00067 void Tesseract::read_config_file(const char *filename, 00068 SetParamConstraint constraint) { 00069 STRING path = datadir; 00070 path += "configs/"; 00071 path += filename; 00072 FILE* fp; 00073 if ((fp = fopen(path.string(), "rb")) != NULL) { 00074 fclose(fp); 00075 } else { 00076 path = datadir; 00077 path += "tessconfigs/"; 00078 path += filename; 00079 if ((fp = fopen(path.string(), "rb")) != NULL) { 00080 fclose(fp); 00081 } else { 00082 path = filename; 00083 } 00084 } 00085 ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); 00086 } 00087 00088 // Returns false if a unicharset file for the specified language was not found 00089 // or was invalid. 00090 // This function initializes TessdataManager. After TessdataManager is 00091 // no longer needed, TessdataManager::End() should be called. 00092 // 00093 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless 00094 // it is OEM_DEFAULT, in which case the value of the variable will be obtained 00095 // from the language-specific config file (stored in [lang].traineddata), from 00096 // the config files specified on the command line or left as the default 00097 // OEM_TESSERACT_ONLY if none of the configs specify this variable. 00098 bool Tesseract::init_tesseract_lang_data( 00099 const char *arg0, const char *textbase, const char *language, 00100 OcrEngineMode oem, char **configs, int configs_size, 00101 const GenericVector<STRING> *vars_vec, 00102 const GenericVector<STRING> *vars_values, 00103 bool set_only_non_debug_params) { 00104 // Set the basename, compute the data directory. 00105 main_setup(arg0, textbase); 00106 00107 // Set the language data path prefix 00108 lang = language != NULL ? language : "eng"; 00109 language_data_path_prefix = datadir; 00110 language_data_path_prefix += lang; 00111 language_data_path_prefix += "."; 00112 00113 // Initialize TessdataManager. 00114 STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; 00115 if (!tessdata_manager.Init(tessdata_path.string(), 00116 tessdata_manager_debug_level)) { 00117 return false; 00118 } 00119 00120 // If a language specific config file (lang.config) exists, load it in. 00121 if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { 00122 ParamUtils::ReadParamsFromFp( 00123 tessdata_manager.GetDataFilePtr(), 00124 tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), 00125 SET_PARAM_CONSTRAINT_NONE, this->params()); 00126 if (tessdata_manager_debug_level) { 00127 tprintf("Loaded language config file\n"); 00128 } 00129 } 00130 00131 SetParamConstraint set_params_constraint = set_only_non_debug_params ? 00132 SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; 00133 // Load tesseract variables from config files. This is done after loading 00134 // language-specific variables from [lang].traineddata file, so that custom 00135 // config files can override values in [lang].traineddata file. 00136 for (int i = 0; i < configs_size; ++i) { 00137 read_config_file(configs[i], set_params_constraint); 00138 } 00139 00140 // Set params specified in vars_vec (done after setting params from config 00141 // files, so that params in vars_vec can override those from files). 00142 if (vars_vec != NULL && vars_values != NULL) { 00143 for (int i = 0; i < vars_vec->size(); ++i) { 00144 if (!ParamUtils::SetParam((*vars_vec)[i].string(), 00145 (*vars_values)[i].string(), 00146 set_params_constraint, this->params())) { 00147 tprintf("Error setting param %s\n", (*vars_vec)[i].string()); 00148 exit(1); 00149 } 00150 } 00151 } 00152 00153 if (((STRING &)tessedit_write_params_to_file).length() > 0) { 00154 FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); 00155 if (params_file != NULL) { 00156 ParamUtils::PrintParams(params_file, this->params()); 00157 fclose(params_file); 00158 if (tessdata_manager_debug_level > 0) { 00159 tprintf("Wrote parameters to %s\n", 00160 tessedit_write_params_to_file.string()); 00161 } 00162 } else { 00163 tprintf("Failed to open %s for writing params.\n", 00164 tessedit_write_params_to_file.string()); 00165 } 00166 } 00167 00168 // Determine which ocr engine(s) should be loaded and used for recognition. 00169 if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); 00170 if (tessdata_manager_debug_level) { 00171 tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", 00172 static_cast<int>(tessedit_ocr_engine_mode)); 00173 } 00174 00175 // If we are only loading the config file (and so not planning on doing any 00176 // recognition) then there's nothing else do here. 00177 if (tessedit_init_config_only) { 00178 if (tessdata_manager_debug_level) { 00179 tprintf("Returning after loading config file\n"); 00180 } 00181 return true; 00182 } 00183 00184 // Load the unicharset 00185 if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || 00186 !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { 00187 return false; 00188 } 00189 if (unicharset.size() > MAX_NUM_CLASSES) { 00190 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); 00191 return false; 00192 } 00193 if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); 00194 right_to_left_ = unicharset.major_right_to_left(); 00195 00196 if (!tessedit_ambigs_training && 00197 tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { 00198 unichar_ambigs.LoadUnicharAmbigs( 00199 tessdata_manager.GetDataFilePtr(), 00200 tessdata_manager.GetEndOffset(TESSDATA_AMBIGS), 00201 ambigs_debug_level, use_ambigs_for_adaption, &unicharset); 00202 if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); 00203 } 00204 00205 // Load Cube objects if necessary. 00206 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 00207 ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); 00208 if (tessdata_manager_debug_level) 00209 tprintf("Loaded Cube w/out combiner\n"); 00210 } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00211 ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); 00212 if (tessdata_manager_debug_level) 00213 tprintf("Loaded Cube with combiner\n"); 00214 } 00215 00216 return true; 00217 } 00218 00219 // Helper returns true if the given string is in the vector of strings. 00220 static bool IsStrInList(const STRING& str, 00221 const GenericVector<STRING>& str_list) { 00222 for (int i = 0; i < str_list.size(); ++i) { 00223 if (str_list[i] == str) 00224 return true; 00225 } 00226 return false; 00227 } 00228 00229 // Parse a string of the form [~]<lang>[+[~]<lang>]*. 00230 // Langs with no prefix get appended to to_load, provided they 00231 // are not in there already. 00232 // Langs with ~ prefix get appended to not_to_load, provided they are not in 00233 // there already. 00234 void Tesseract::ParseLanguageString(const char* lang_str, 00235 GenericVector<STRING>* to_load, 00236 GenericVector<STRING>* not_to_load) { 00237 STRING remains(lang_str); 00238 while (remains.length() > 0) { 00239 // Find the start of the lang code and which vector to add to. 00240 const char* start = remains.string(); 00241 while (*start == '+') 00242 ++start; 00243 GenericVector<STRING>* target = to_load; 00244 if (*start == '~') { 00245 target = not_to_load; 00246 ++start; 00247 } 00248 // Find the index of the end of the lang code in string start. 00249 int end = strlen(start); 00250 const char* plus = strchr(start, '+'); 00251 if (plus != NULL && plus - start < end) 00252 end = plus - start; 00253 STRING lang_code(start); 00254 lang_code.truncate_at(end); 00255 STRING next(start + end); 00256 remains = next; 00257 // Check whether lang_code is already in the target vector and add. 00258 if (!IsStrInList(lang_code, *target)) { 00259 if (tessdata_manager_debug_level) 00260 tprintf("Adding language '%s' to list\n", lang_code.string()); 00261 target->push_back(lang_code); 00262 } 00263 } 00264 } 00265 00266 // Initialize for potentially a set of languages defined by the language 00267 // string and recursively any additional languages required by any language 00268 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 00269 // See init_tesseract_internal for args. 00270 int Tesseract::init_tesseract( 00271 const char *arg0, const char *textbase, const char *language, 00272 OcrEngineMode oem, char **configs, int configs_size, 00273 const GenericVector<STRING> *vars_vec, 00274 const GenericVector<STRING> *vars_values, 00275 bool set_only_non_debug_params) { 00276 GenericVector<STRING> langs_to_load; 00277 GenericVector<STRING> langs_not_to_load; 00278 ParseLanguageString(language, &langs_to_load, &langs_not_to_load); 00279 00280 sub_langs_.delete_data_pointers(); 00281 sub_langs_.clear(); 00282 // Find the first loadable lang and load into this. 00283 // Add any languages that this language requires 00284 bool loaded_primary = false; 00285 // Load the rest into sub_langs_. 00286 for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { 00287 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { 00288 const char *lang_str = langs_to_load[lang_index].string(); 00289 Tesseract *tess_to_init; 00290 if (!loaded_primary) { 00291 tess_to_init = this; 00292 } else { 00293 tess_to_init = new Tesseract; 00294 } 00295 00296 int result = tess_to_init->init_tesseract_internal( 00297 arg0, textbase, lang_str, oem, configs, configs_size, 00298 vars_vec, vars_values, set_only_non_debug_params); 00299 00300 if (!loaded_primary) { 00301 if (result < 0) { 00302 tprintf("Failed loading language '%s'\n", lang_str); 00303 } else { 00304 if (tessdata_manager_debug_level) 00305 tprintf("Loaded language '%s' as main language\n", lang_str); 00306 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00307 &langs_to_load, &langs_not_to_load); 00308 loaded_primary = true; 00309 } 00310 } else { 00311 if (result < 0) { 00312 tprintf("Failed loading language '%s'\n", lang_str); 00313 delete tess_to_init; 00314 } else { 00315 if (tessdata_manager_debug_level) 00316 tprintf("Loaded language '%s' as secondary language\n", lang_str); 00317 sub_langs_.push_back(tess_to_init); 00318 // Add any languages that this language requires 00319 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00320 &langs_to_load, &langs_not_to_load); 00321 } 00322 } 00323 } 00324 } 00325 if (!loaded_primary) { 00326 tprintf("Tesseract couldn't load any languages!\n"); 00327 return -1; // Couldn't load any language! 00328 } 00329 SetupUniversalFontIds(); 00330 return 0; 00331 } 00332 00333 // Common initialization for a single language. 00334 // arg0 is the datapath for the tessdata directory, which could be the 00335 // path of the tessdata directory with no trailing /, or (if tessdata 00336 // lives in the same directory as the executable, the path of the executable, 00337 // hence the name arg0. 00338 // textbase is an optional output file basename (used only for training) 00339 // language is the language code to load. 00340 // oem controls which engine(s) will operate on the image 00341 // configs (argv) is an array of config filenames to load variables from. 00342 // May be NULL. 00343 // configs_size (argc) is the number of elements in configs. 00344 // vars_vec is an optional vector of variables to set. 00345 // vars_values is an optional corresponding vector of values for the variables 00346 // in vars_vec. 00347 // If set_only_init_params is true, then only the initialization variables 00348 // will be set. 00349 int Tesseract::init_tesseract_internal( 00350 const char *arg0, const char *textbase, const char *language, 00351 OcrEngineMode oem, char **configs, int configs_size, 00352 const GenericVector<STRING> *vars_vec, 00353 const GenericVector<STRING> *vars_values, 00354 bool set_only_non_debug_params) { 00355 if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, 00356 configs_size, vars_vec, vars_values, 00357 set_only_non_debug_params)) { 00358 return -1; 00359 } 00360 if (tessedit_init_config_only) { 00361 tessdata_manager.End(); 00362 return 0; 00363 } 00364 // If only Cube will be used, skip loading Tesseract classifier's 00365 // pre-trained templates. 00366 bool init_tesseract_classifier = 00367 (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || 00368 tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); 00369 // If only Cube will be used and if it has its own Unicharset, 00370 // skip initializing permuter and loading Tesseract Dawgs. 00371 bool init_dict = 00372 !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && 00373 tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); 00374 program_editup(textbase, init_tesseract_classifier, init_dict); 00375 tessdata_manager.End(); 00376 return 0; //Normal exit 00377 } 00378 00379 // Helper builds the all_fonts table by adding new fonts from new_fonts. 00380 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts, 00381 UnicityTable<FontInfo>* all_fonts) { 00382 for (int i = 0; i < new_fonts.size(); ++i) { 00383 // UnicityTable uniques as we go. 00384 all_fonts->push_back(new_fonts.get(i)); 00385 } 00386 } 00387 00388 // Helper assigns an id to lang_fonts using the index in all_fonts table. 00389 static void AssignIds(const UnicityTable<FontInfo>& all_fonts, 00390 UnicityTable<FontInfo>* lang_fonts) { 00391 for (int i = 0; i < lang_fonts->size(); ++i) { 00392 int index = all_fonts.get_id(lang_fonts->get(i)); 00393 lang_fonts->get_mutable(i)->universal_id = index; 00394 } 00395 } 00396 00397 // Set the universal_id member of each font to be unique among all 00398 // instances of the same font loaded. 00399 void Tesseract::SetupUniversalFontIds() { 00400 // Note that we can get away with bitwise copying FontInfo in 00401 // all_fonts, as it is a temporary structure and we avoid setting the 00402 // delete callback. 00403 UnicityTable<FontInfo> all_fonts; 00404 all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); 00405 00406 // Create the universal ID table. 00407 CollectFonts(get_fontinfo_table(), &all_fonts); 00408 for (int i = 0; i < sub_langs_.size(); ++i) { 00409 CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); 00410 } 00411 // Assign ids from the table to each font table. 00412 AssignIds(all_fonts, &get_fontinfo_table()); 00413 for (int i = 0; i < sub_langs_.size(); ++i) { 00414 AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); 00415 } 00416 font_table_size_ = all_fonts.size(); 00417 } 00418 00419 // init the LM component 00420 int Tesseract::init_tesseract_lm(const char *arg0, 00421 const char *textbase, 00422 const char *language) { 00423 if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, 00424 NULL, 0, NULL, NULL, false)) 00425 return -1; 00426 getDict().Load(); 00427 tessdata_manager.End(); 00428 return 0; 00429 } 00430 00431 void Tesseract::end_tesseract() { 00432 end_recog(); 00433 } 00434 00435 /* Define command type identifiers */ 00436 00437 enum CMD_EVENTS 00438 { 00439 ACTION_1_CMD_EVENT, 00440 RECOG_WERDS, 00441 RECOG_PSEUDO, 00442 ACTION_2_CMD_EVENT 00443 }; 00444 00445 } // namespace tesseract