Tesseract
3.02
|
00001 00002 // File: tessdatamanager.h 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00022 00023 #include <stdio.h> 00024 #include "host.h" 00025 #include "tprintf.h" 00026 00027 static const char kTrainedDataSuffix[] = "traineddata"; 00028 00029 // When adding new tessdata types and file suffixes, please make sure to 00030 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. 00031 static const char kLangConfigFileSuffix[] = "config"; 00032 static const char kUnicharsetFileSuffix[] = "unicharset"; 00033 static const char kAmbigsFileSuffix[] = "unicharambigs"; 00034 static const char kBuiltInTemplatesFileSuffix[] = "inttemp"; 00035 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable"; 00036 static const char kNormProtoFileSuffix[] = "normproto"; 00037 static const char kPuncDawgFileSuffix[] = "punc-dawg"; 00038 static const char kSystemDawgFileSuffix[] = "word-dawg"; 00039 static const char kNumberDawgFileSuffix[] = "number-dawg"; 00040 static const char kFreqDawgFileSuffix[] = "freq-dawg"; 00041 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; 00042 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; 00043 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; 00044 static const char kShapeTableFileSuffix[] = "shapetable"; 00045 static const char kBigramDawgFileSuffix[] = "bigram-dawg"; 00046 static const char kUnambigDawgFileSuffix[] = "unambig-dawg"; 00047 static const char kParamsTrainingModelFileSuffix[] = "params-training-model"; 00048 00049 namespace tesseract { 00050 00051 enum TessdataType { 00052 TESSDATA_LANG_CONFIG, // 0 00053 TESSDATA_UNICHARSET, // 1 00054 TESSDATA_AMBIGS, // 2 00055 TESSDATA_INTTEMP, // 3 00056 TESSDATA_PFFMTABLE, // 4 00057 TESSDATA_NORMPROTO, // 5 00058 TESSDATA_PUNC_DAWG, // 6 00059 TESSDATA_SYSTEM_DAWG, // 7 00060 TESSDATA_NUMBER_DAWG, // 8 00061 TESSDATA_FREQ_DAWG, // 9 00062 TESSDATA_FIXED_LENGTH_DAWGS, // 10 00063 TESSDATA_CUBE_UNICHARSET, // 11 00064 TESSDATA_CUBE_SYSTEM_DAWG, // 12 00065 TESSDATA_SHAPE_TABLE, // 13 00066 TESSDATA_BIGRAM_DAWG, // 14 00067 TESSDATA_UNAMBIG_DAWG, // 15 00068 TESSDATA_PARAMS_TRAINING_MODEL, // 16 00069 00070 TESSDATA_NUM_ENTRIES 00071 }; 00072 00077 static const char * const kTessdataFileSuffixes[] = { 00078 kLangConfigFileSuffix, // 0 00079 kUnicharsetFileSuffix, // 1 00080 kAmbigsFileSuffix, // 2 00081 kBuiltInTemplatesFileSuffix, // 3 00082 kBuiltInCutoffsFileSuffix, // 4 00083 kNormProtoFileSuffix, // 5 00084 kPuncDawgFileSuffix, // 6 00085 kSystemDawgFileSuffix, // 7 00086 kNumberDawgFileSuffix, // 8 00087 kFreqDawgFileSuffix, // 9 00088 kFixedLengthDawgsFileSuffix, // 10 00089 kCubeUnicharsetFileSuffix, // 11 00090 kCubeSystemDawgFileSuffix, // 12 00091 kShapeTableFileSuffix, // 13 00092 kBigramDawgFileSuffix, // 14 00093 kUnambigDawgFileSuffix, // 15 00094 kParamsTrainingModelFileSuffix, // 16 00095 }; 00096 00101 static const bool kTessdataFileIsText[] = { 00102 true, // 0 00103 true, // 1 00104 true, // 2 00105 false, // 3 00106 true, // 4 00107 true, // 5 00108 false, // 6 00109 false, // 7 00110 false, // 8 00111 false, // 9 00112 false, // 10 00113 true, // 11 00114 false, // 12 00115 false, // 13 00116 false, // 14 00117 false, // 15 00118 false, // 16 00119 }; 00120 00128 static const int kMaxNumTessdataEntries = 1000; 00129 00130 00131 class TessdataManager { 00132 public: 00133 TessdataManager() { 00134 data_file_ = NULL; 00135 actual_tessdata_num_entries_ = 0; 00136 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00137 offset_table_[i] = -1; 00138 } 00139 } 00140 ~TessdataManager() {} 00141 int DebugLevel() { return debug_level_; } 00142 00147 bool Init(const char *data_file_name, int debug_level); 00148 00150 inline FILE *GetDataFilePtr() const { return data_file_; } 00151 00157 inline bool SeekToStart(TessdataType tessdata_type) { 00158 if (debug_level_) { 00159 tprintf("TessdataManager: seek to offset %lld - start of tessdata" 00160 "type %d (%s))\n", offset_table_[tessdata_type], 00161 tessdata_type, kTessdataFileSuffixes[tessdata_type]); 00162 } 00163 if (offset_table_[tessdata_type] < 0) { 00164 return false; 00165 } else { 00166 ASSERT_HOST(fseek(data_file_, 00167 static_cast<size_t>(offset_table_[tessdata_type]), 00168 SEEK_SET) == 0); 00169 return true; 00170 } 00171 } 00173 inline inT64 GetEndOffset(TessdataType tessdata_type) const { 00174 int index = tessdata_type + 1; 00175 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) { 00176 ++index; // skip tessdata types not present in the combined file 00177 } 00178 if (debug_level_) { 00179 tprintf("TessdataManager: end offset for type %d is %lld\n", 00180 tessdata_type, 00181 (index == actual_tessdata_num_entries_) ? -1 00182 : offset_table_[index]); 00183 } 00184 return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1; 00185 } 00187 inline void End() { 00188 if (data_file_ != NULL) { 00189 fclose(data_file_); 00190 data_file_ = NULL; 00191 } 00192 } 00193 bool swap() const { 00194 return swap_; 00195 } 00196 00198 static void WriteMetadata(inT64 *offset_table, FILE *output_file); 00199 00205 static bool CombineDataFiles(const char *language_data_path_prefix, 00206 const char *output_filename); 00207 00213 bool OverwriteComponents(const char *new_traineddata_filename, 00214 char **component_filenames, 00215 int num_new_components); 00216 00227 bool ExtractToFile(const char *filename); 00228 00234 static void CopyFile(FILE *input_file, FILE *output_file, 00235 bool newline_end, inT64 num_bytes_to_copy); 00236 00245 static bool TessdataTypeFromFileSuffix(const char *suffix, 00246 TessdataType *type, 00247 bool *text_file); 00248 00253 static bool TessdataTypeFromFileName(const char *filename, 00254 TessdataType *type, 00255 bool *text_file); 00256 00257 private: 00258 00263 static FILE *GetFilePtr(const char *language_data_path_prefix, 00264 const char *file_suffix, bool text_file); 00265 00270 inT64 offset_table_[TESSDATA_NUM_ENTRIES]; 00279 inT32 actual_tessdata_num_entries_; 00280 FILE *data_file_; 00281 int debug_level_; 00282 // True if the bytes need swapping. 00283 bool swap_; 00284 }; 00285 00286 00287 } // namespace tesseract 00288 00289 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_