Tesseract
3.02
|
00001 00002 // File: tessdatamanager.cpp 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "tessdatamanager.h" 00025 00026 #include <stdio.h> 00027 00028 #include "serialis.h" 00029 #include "strngs.h" 00030 #include "tprintf.h" 00031 #include "params.h" 00032 00033 namespace tesseract { 00034 00035 bool TessdataManager::Init(const char *data_file_name, int debug_level) { 00036 int i; 00037 debug_level_ = debug_level; 00038 data_file_ = fopen(data_file_name, "rb"); 00039 if (data_file_ == NULL) { 00040 tprintf("Error opening data file %s\n", data_file_name); 00041 tprintf("Please make sure the TESSDATA_PREFIX environment variable is set " 00042 "to the parent directory of your \"tessdata\" directory.\n"); 00043 return false; 00044 } 00045 fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); 00046 swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); 00047 if (swap_) { 00048 actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_); 00049 } 00050 ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES); 00051 fread(offset_table_, sizeof(inT64), 00052 actual_tessdata_num_entries_, data_file_); 00053 if (swap_) { 00054 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { 00055 offset_table_[i] = reverse64(offset_table_[i]); 00056 } 00057 } 00058 if (debug_level_) { 00059 tprintf("TessdataManager loaded %d types of tesseract data files.\n", 00060 actual_tessdata_num_entries_); 00061 for (i = 0; i < actual_tessdata_num_entries_; ++i) { 00062 tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); 00063 } 00064 } 00065 return true; 00066 } 00067 00068 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, 00069 bool newline_end, inT64 num_bytes_to_copy) { 00070 if (num_bytes_to_copy == 0) return; 00071 int buffer_size = 1024; 00072 if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { 00073 buffer_size = num_bytes_to_copy; 00074 } 00075 inT64 num_bytes_copied = 0; 00076 char *chunk = new char[buffer_size]; 00077 int bytes_read; 00078 char last_char = 0x0; 00079 while ((bytes_read = fread(chunk, sizeof(char), 00080 buffer_size, input_file))) { 00081 fwrite(chunk, sizeof(char), bytes_read, output_file); 00082 last_char = chunk[bytes_read-1]; 00083 if (num_bytes_to_copy > 0) { 00084 num_bytes_copied += bytes_read; 00085 if (num_bytes_copied == num_bytes_to_copy) break; 00086 if (num_bytes_copied + buffer_size > num_bytes_to_copy) { 00087 buffer_size = num_bytes_to_copy - num_bytes_copied; 00088 } 00089 } 00090 } 00091 if (newline_end) ASSERT_HOST(last_char == '\n'); 00092 delete[] chunk; 00093 } 00094 00095 void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) { 00096 fseek(output_file, 0, SEEK_SET); 00097 inT32 num_entries = TESSDATA_NUM_ENTRIES; 00098 fwrite(&num_entries, sizeof(inT32), 1, output_file); 00099 fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file); 00100 fclose(output_file); 00101 00102 tprintf("TessdataManager combined tesseract data files.\n"); 00103 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00104 tprintf("Offset for type %d is %lld\n", i, offset_table[i]); 00105 } 00106 } 00107 00108 bool TessdataManager::CombineDataFiles( 00109 const char *language_data_path_prefix, 00110 const char *output_filename) { 00111 int i; 00112 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00113 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; 00114 FILE *output_file = fopen(output_filename, "wb"); 00115 if (output_file == NULL) { 00116 tprintf("Error opening %s for writing\n", output_filename); 00117 return false; 00118 } 00119 // Leave some space for recording the offset_table. 00120 fseek(output_file, 00121 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); 00122 00123 TessdataType type = TESSDATA_NUM_ENTRIES; 00124 bool text_file = false; 00125 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00126 00127 // Load individual tessdata components from files. 00128 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00129 ASSERT_HOST(TessdataTypeFromFileSuffix( 00130 kTessdataFileSuffixes[i], &type, &text_file)); 00131 STRING filename = language_data_path_prefix; 00132 filename += kTessdataFileSuffixes[i]; 00133 file_ptr[i] = fopen(filename.string(), "rb"); 00134 if (file_ptr[i] != NULL) { 00135 offset_table[type] = ftell(output_file); 00136 CopyFile(file_ptr[i], output_file, text_file, -1); 00137 fclose(file_ptr[i]); 00138 } 00139 } 00140 00141 // Make sure that the required components are present. 00142 if (file_ptr[TESSDATA_UNICHARSET] == NULL) { 00143 tprintf("Error opening unicharset file\n"); 00144 fclose(output_file); 00145 return false; 00146 } 00147 if (file_ptr[TESSDATA_INTTEMP] != NULL && 00148 (file_ptr[TESSDATA_PFFMTABLE] == NULL || 00149 file_ptr[TESSDATA_NORMPROTO] == NULL)) { 00150 tprintf("Error opening pffmtable and/or normproto files" 00151 " while inttemp file was present\n"); 00152 fclose(output_file); 00153 return false; 00154 } 00155 00156 WriteMetadata(offset_table, output_file); 00157 return true; 00158 } 00159 00160 bool TessdataManager::OverwriteComponents( 00161 const char *new_traineddata_filename, 00162 char **component_filenames, 00163 int num_new_components) { 00164 int i; 00165 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00166 TessdataType type = TESSDATA_NUM_ENTRIES; 00167 bool text_file = false; 00168 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00169 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00170 offset_table[i] = -1; 00171 file_ptr[i] = NULL; 00172 } 00173 FILE *output_file = fopen(new_traineddata_filename, "wb"); 00174 if (output_file == NULL) { 00175 tprintf("Error opening %s for writing\n", new_traineddata_filename); 00176 return false; 00177 } 00178 00179 // Leave some space for recording the offset_table. 00180 fseek(output_file, 00181 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); 00182 00183 // Open the files with the new components. 00184 for (i = 0; i < num_new_components; ++i) { 00185 TessdataTypeFromFileName(component_filenames[i], &type, &text_file); 00186 file_ptr[type] = fopen(component_filenames[i], "rb"); 00187 } 00188 00189 // Write updated data to the output traineddata file. 00190 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00191 if (file_ptr[i] != NULL) { 00192 // Get the data from the opened component file. 00193 offset_table[i] = ftell(output_file); 00194 CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); 00195 fclose(file_ptr[i]); 00196 } else { 00197 // Get this data component from the loaded data file. 00198 if (SeekToStart(static_cast<TessdataType>(i))) { 00199 offset_table[i] = ftell(output_file); 00200 CopyFile(data_file_, output_file, kTessdataFileIsText[i], 00201 GetEndOffset(static_cast<TessdataType>(i)) - 00202 ftell(data_file_) + 1); 00203 } 00204 } 00205 } 00206 00207 WriteMetadata(offset_table, output_file); 00208 return true; 00209 } 00210 00211 bool TessdataManager::TessdataTypeFromFileSuffix( 00212 const char *suffix, TessdataType *type, bool *text_file) { 00213 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00214 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { 00215 *type = static_cast<TessdataType>(i); 00216 *text_file = kTessdataFileIsText[i]; 00217 return true; 00218 } 00219 } 00220 printf("TessdataManager can't determine which tessdata" 00221 " component is represented by %s\n", suffix); 00222 return false; 00223 } 00224 00225 bool TessdataManager::TessdataTypeFromFileName( 00226 const char *filename, TessdataType *type, bool *text_file) { 00227 // Get the file suffix (extension) 00228 const char *suffix = strrchr(filename, '.'); 00229 if (suffix == NULL || *(++suffix) == '\0') return false; 00230 return TessdataTypeFromFileSuffix(suffix, type, text_file); 00231 } 00232 00233 bool TessdataManager::ExtractToFile(const char *filename) { 00234 TessdataType type = TESSDATA_NUM_ENTRIES; 00235 bool text_file = false; 00236 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( 00237 filename, &type, &text_file)); 00238 if (!SeekToStart(type)) return false; 00239 00240 FILE *output_file = fopen(filename, "wb"); 00241 if (output_file == NULL) { 00242 printf("Error openning %s\n", filename); 00243 exit(1); 00244 } 00245 inT64 begin_offset = ftell(GetDataFilePtr()); 00246 inT64 end_offset = GetEndOffset(type); 00247 tesseract::TessdataManager::CopyFile( 00248 GetDataFilePtr(), output_file, text_file, 00249 end_offset - begin_offset + 1); 00250 fclose(output_file); 00251 return true; 00252 } 00253 00254 } // namespace tesseract