Tesseract  3.02
tesseract-ocr/ccmain/tessedit.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tessedit.cpp  (Formerly tessedit.c)
00003  * Description: Main program for merge of tess and editor.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Tue Jan 07 15:21:46 GMT 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 //#include                                                      <osfcn.h>
00022 //#include                                                      <signal.h>
00023 //#include                                                      <time.h>
00024 //#include                                                      <unistd.h>
00025 #include          "tfacep.h"     //must be before main.h
00026 //#include                                                      "fileerr.h"
00027 #include          "stderr.h"
00028 #include          "basedir.h"
00029 #include          "tessvars.h"
00030 //#include                                                      "debgwin.h"
00031 //#include                                      "epapdest.h"
00032 #include          "control.h"
00033 #include          "imgs.h"
00034 #include          "reject.h"
00035 #include          "pageres.h"
00036 //#include                                                      "gpapdest.h"
00037 #include          "nwmain.h"
00038 #include          "pgedit.h"
00039 #include          "tprintf.h"
00040 //#include                                      "ipeerr.h"
00041 //#include                                                      "restart.h"
00042 #include          "tessedit.h"
00043 //#include                                                      "fontfind.h"
00044 #include "permute.h"
00045 #include "stopper.h"
00046 #include "intmatcher.h"
00047 #include "chop.h"
00048 #include "efio.h"
00049 #include "danerror.h"
00050 #include "globals.h"
00051 #include "tesseractclass.h"
00052 #include "params.h"
00053 
00054 #include          "notdll.h"     //phils nn stuff
00055 
00056 #define VARDIR        "configs/" /*variables files */
00057                                  //config under api
00058 #define API_CONFIG      "configs/api_config"
00059 
00060 ETEXT_DESC *global_monitor = NULL;  // progress monitor
00061 
00062 namespace tesseract {
00063 
00064 // Read a "config" file containing a set of variable, value pairs.
00065 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
00066 // and also accepts a relative or absolute path name.
00067 void Tesseract::read_config_file(const char *filename,
00068                                  SetParamConstraint constraint) {
00069   STRING path = datadir;
00070   path += "configs/";
00071   path += filename;
00072   FILE* fp;
00073   if ((fp = fopen(path.string(), "rb")) != NULL) {
00074     fclose(fp);
00075   } else {
00076     path = datadir;
00077     path += "tessconfigs/";
00078     path += filename;
00079     if ((fp = fopen(path.string(), "rb")) != NULL) {
00080       fclose(fp);
00081     } else {
00082       path = filename;
00083     }
00084   }
00085   ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
00086 }
00087 
00088 // Returns false if a unicharset file for the specified language was not found
00089 // or was invalid.
00090 // This function initializes TessdataManager. After TessdataManager is
00091 // no longer needed, TessdataManager::End() should be called.
00092 //
00093 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
00094 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
00095 // from the language-specific config file (stored in [lang].traineddata), from
00096 // the config files specified on the command line or left as the default
00097 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
00098 bool Tesseract::init_tesseract_lang_data(
00099     const char *arg0, const char *textbase, const char *language,
00100     OcrEngineMode oem, char **configs, int configs_size,
00101     const GenericVector<STRING> *vars_vec,
00102     const GenericVector<STRING> *vars_values,
00103     bool set_only_non_debug_params) {
00104   // Set the basename, compute the data directory.
00105   main_setup(arg0, textbase);
00106 
00107   // Set the language data path prefix
00108   lang = language != NULL ? language : "eng";
00109   language_data_path_prefix = datadir;
00110   language_data_path_prefix += lang;
00111   language_data_path_prefix += ".";
00112 
00113   // Initialize TessdataManager.
00114   STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
00115   if (!tessdata_manager.Init(tessdata_path.string(),
00116                              tessdata_manager_debug_level)) {
00117     return false;
00118   }
00119 
00120   // If a language specific config file (lang.config) exists, load it in.
00121   if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
00122     ParamUtils::ReadParamsFromFp(
00123         tessdata_manager.GetDataFilePtr(),
00124         tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
00125         SET_PARAM_CONSTRAINT_NONE, this->params());
00126     if (tessdata_manager_debug_level) {
00127       tprintf("Loaded language config file\n");
00128     }
00129   }
00130 
00131   SetParamConstraint set_params_constraint = set_only_non_debug_params ?
00132       SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
00133   // Load tesseract variables from config files. This is done after loading
00134   // language-specific variables from [lang].traineddata file, so that custom
00135   // config files can override values in [lang].traineddata file.
00136   for (int i = 0; i < configs_size; ++i) {
00137     read_config_file(configs[i], set_params_constraint);
00138   }
00139 
00140   // Set params specified in vars_vec (done after setting params from config
00141   // files, so that params in vars_vec can override those from files).
00142   if (vars_vec != NULL && vars_values != NULL) {
00143     for (int i = 0; i < vars_vec->size(); ++i) {
00144       if (!ParamUtils::SetParam((*vars_vec)[i].string(),
00145                                 (*vars_values)[i].string(),
00146                                 set_params_constraint, this->params())) {
00147         tprintf("Error setting param %s\n", (*vars_vec)[i].string());
00148         exit(1);
00149       }
00150     }
00151   }
00152 
00153   if (((STRING &)tessedit_write_params_to_file).length() > 0) {
00154     FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
00155     if (params_file != NULL) {
00156       ParamUtils::PrintParams(params_file, this->params());
00157       fclose(params_file);
00158       if (tessdata_manager_debug_level > 0) {
00159         tprintf("Wrote parameters to %s\n",
00160                 tessedit_write_params_to_file.string());
00161       }
00162     } else {
00163       tprintf("Failed to open %s for writing params.\n",
00164               tessedit_write_params_to_file.string());
00165     }
00166   }
00167 
00168   // Determine which ocr engine(s) should be loaded and used for recognition.
00169   if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
00170   if (tessdata_manager_debug_level) {
00171     tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
00172             static_cast<int>(tessedit_ocr_engine_mode));
00173   }
00174 
00175   // If we are only loading the config file (and so not planning on doing any
00176   // recognition) then there's nothing else do here.
00177   if (tessedit_init_config_only) {
00178     if (tessdata_manager_debug_level) {
00179       tprintf("Returning after loading config file\n");
00180     }
00181     return true;
00182   }
00183 
00184   // Load the unicharset
00185   if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
00186       !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
00187     return false;
00188   }
00189   if (unicharset.size() > MAX_NUM_CLASSES) {
00190     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
00191     return false;
00192   }
00193   if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
00194   right_to_left_ = unicharset.major_right_to_left();
00195 
00196   if (!tessedit_ambigs_training &&
00197       tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
00198     unichar_ambigs.LoadUnicharAmbigs(
00199         tessdata_manager.GetDataFilePtr(),
00200         tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
00201         ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
00202     if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
00203   }
00204 
00205   // Load Cube objects if necessary.
00206   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
00207     ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
00208     if (tessdata_manager_debug_level)
00209       tprintf("Loaded Cube w/out combiner\n");
00210   } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00211     ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
00212     if (tessdata_manager_debug_level)
00213       tprintf("Loaded Cube with combiner\n");
00214   }
00215 
00216   return true;
00217 }
00218 
00219 // Helper returns true if the given string is in the vector of strings.
00220 static bool IsStrInList(const STRING& str,
00221                         const GenericVector<STRING>& str_list) {
00222   for (int i = 0; i < str_list.size(); ++i) {
00223     if (str_list[i] == str)
00224       return true;
00225   }
00226   return false;
00227 }
00228 
00229 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
00230 // Langs with no prefix get appended to to_load, provided they
00231 // are not in there already.
00232 // Langs with ~ prefix get appended to not_to_load, provided they are not in
00233 // there already.
00234 void Tesseract::ParseLanguageString(const char* lang_str,
00235                                     GenericVector<STRING>* to_load,
00236                                     GenericVector<STRING>* not_to_load) {
00237   STRING remains(lang_str);
00238   while (remains.length() > 0) {
00239     // Find the start of the lang code and which vector to add to.
00240     const char* start = remains.string();
00241     while (*start == '+')
00242       ++start;
00243     GenericVector<STRING>* target = to_load;
00244     if (*start == '~') {
00245       target = not_to_load;
00246       ++start;
00247     }
00248     // Find the index of the end of the lang code in string start.
00249     int end = strlen(start);
00250     const char* plus = strchr(start, '+');
00251     if (plus != NULL && plus - start < end)
00252       end = plus - start;
00253     STRING lang_code(start);
00254     lang_code.truncate_at(end);
00255     STRING next(start + end);
00256     remains = next;
00257     // Check whether lang_code is already in the target vector and add.
00258     if (!IsStrInList(lang_code, *target)) {
00259       if (tessdata_manager_debug_level)
00260         tprintf("Adding language '%s' to list\n", lang_code.string());
00261       target->push_back(lang_code);
00262     }
00263   }
00264 }
00265 
00266 // Initialize for potentially a set of languages defined by the language
00267 // string and recursively any additional languages required by any language
00268 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
00269 // See init_tesseract_internal for args.
00270 int Tesseract::init_tesseract(
00271     const char *arg0, const char *textbase, const char *language,
00272     OcrEngineMode oem, char **configs, int configs_size,
00273     const GenericVector<STRING> *vars_vec,
00274     const GenericVector<STRING> *vars_values,
00275     bool set_only_non_debug_params) {
00276   GenericVector<STRING> langs_to_load;
00277   GenericVector<STRING> langs_not_to_load;
00278   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
00279 
00280   sub_langs_.delete_data_pointers();
00281   sub_langs_.clear();
00282   // Find the first loadable lang and load into this.
00283   // Add any languages that this language requires
00284   bool loaded_primary = false;
00285   // Load the rest into sub_langs_.
00286   for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
00287     if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
00288       const char *lang_str = langs_to_load[lang_index].string();
00289       Tesseract *tess_to_init;
00290       if (!loaded_primary) {
00291         tess_to_init = this;
00292       } else {
00293         tess_to_init = new Tesseract;
00294       }
00295 
00296       int result = tess_to_init->init_tesseract_internal(
00297           arg0, textbase, lang_str, oem, configs, configs_size,
00298           vars_vec, vars_values, set_only_non_debug_params);
00299 
00300       if (!loaded_primary) {
00301         if (result < 0) {
00302           tprintf("Failed loading language '%s'\n", lang_str);
00303         } else {
00304           if (tessdata_manager_debug_level)
00305             tprintf("Loaded language '%s' as main language\n", lang_str);
00306           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00307                               &langs_to_load, &langs_not_to_load);
00308           loaded_primary = true;
00309         }
00310       } else {
00311         if (result < 0) {
00312           tprintf("Failed loading language '%s'\n", lang_str);
00313           delete tess_to_init;
00314         } else {
00315           if (tessdata_manager_debug_level)
00316             tprintf("Loaded language '%s' as secondary language\n", lang_str);
00317           sub_langs_.push_back(tess_to_init);
00318           // Add any languages that this language requires
00319           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00320                               &langs_to_load, &langs_not_to_load);
00321         }
00322       }
00323     }
00324   }
00325   if (!loaded_primary) {
00326     tprintf("Tesseract couldn't load any languages!\n");
00327     return -1;  // Couldn't load any language!
00328   }
00329   SetupUniversalFontIds();
00330   return 0;
00331 }
00332 
00333 // Common initialization for a single language.
00334 // arg0 is the datapath for the tessdata directory, which could be the
00335 // path of the tessdata directory with no trailing /, or (if tessdata
00336 // lives in the same directory as the executable, the path of the executable,
00337 // hence the name arg0.
00338 // textbase is an optional output file basename (used only for training)
00339 // language is the language code to load.
00340 // oem controls which engine(s) will operate on the image
00341 // configs (argv) is an array of config filenames to load variables from.
00342 // May be NULL.
00343 // configs_size (argc) is the number of elements in configs.
00344 // vars_vec is an optional vector of variables to set.
00345 // vars_values is an optional corresponding vector of values for the variables
00346 // in vars_vec.
00347 // If set_only_init_params is true, then only the initialization variables
00348 // will be set.
00349 int Tesseract::init_tesseract_internal(
00350     const char *arg0, const char *textbase, const char *language,
00351     OcrEngineMode oem, char **configs, int configs_size,
00352     const GenericVector<STRING> *vars_vec,
00353     const GenericVector<STRING> *vars_values,
00354     bool set_only_non_debug_params) {
00355   if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
00356                                 configs_size, vars_vec, vars_values,
00357                                 set_only_non_debug_params)) {
00358     return -1;
00359   }
00360   if (tessedit_init_config_only) {
00361     tessdata_manager.End();
00362     return 0;
00363   }
00364   // If only Cube will be used, skip loading Tesseract classifier's
00365   // pre-trained templates.
00366   bool init_tesseract_classifier =
00367     (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
00368      tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
00369   // If only Cube will be used and if it has its own Unicharset,
00370   // skip initializing permuter and loading Tesseract Dawgs.
00371   bool init_dict =
00372     !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
00373       tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
00374   program_editup(textbase, init_tesseract_classifier, init_dict);
00375   tessdata_manager.End();
00376   return 0;                      //Normal exit
00377 }
00378 
00379 // Helper builds the all_fonts table by adding new fonts from new_fonts.
00380 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
00381                          UnicityTable<FontInfo>* all_fonts) {
00382   for (int i = 0; i < new_fonts.size(); ++i) {
00383     // UnicityTable uniques as we go.
00384     all_fonts->push_back(new_fonts.get(i));
00385   }
00386 }
00387 
00388 // Helper assigns an id to lang_fonts using the index in all_fonts table.
00389 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
00390                       UnicityTable<FontInfo>* lang_fonts) {
00391   for (int i = 0; i < lang_fonts->size(); ++i) {
00392     int index = all_fonts.get_id(lang_fonts->get(i));
00393     lang_fonts->get_mutable(i)->universal_id = index;
00394   }
00395 }
00396 
00397 // Set the universal_id member of each font to be unique among all
00398 // instances of the same font loaded.
00399 void Tesseract::SetupUniversalFontIds() {
00400   // Note that we can get away with bitwise copying FontInfo in
00401   // all_fonts, as it is a temporary structure and we avoid setting the
00402   // delete callback.
00403   UnicityTable<FontInfo> all_fonts;
00404   all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
00405 
00406   // Create the universal ID table.
00407   CollectFonts(get_fontinfo_table(), &all_fonts);
00408   for (int i = 0; i < sub_langs_.size(); ++i) {
00409     CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
00410   }
00411   // Assign ids from the table to each font table.
00412   AssignIds(all_fonts, &get_fontinfo_table());
00413   for (int i = 0; i < sub_langs_.size(); ++i) {
00414     AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
00415   }
00416   font_table_size_ = all_fonts.size();
00417 }
00418 
00419 // init the LM component
00420 int Tesseract::init_tesseract_lm(const char *arg0,
00421                    const char *textbase,
00422                    const char *language) {
00423   if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
00424                                 NULL, 0, NULL, NULL, false))
00425     return -1;
00426   getDict().Load();
00427   tessdata_manager.End();
00428   return 0;
00429 }
00430 
00431 void Tesseract::end_tesseract() {
00432   end_recog();
00433 }
00434 
/* Command type identifiers. The values are consecutive, starting at 0. */
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT = 0,
  RECOG_WERDS = 1,
  RECOG_PSEUDO = 2,
  ACTION_2_CMD_EVENT = 3
};
00444 
00445 }  // namespace tesseract