Tesseract  3.02
tesseract-ocr/ccutil/tessdatamanager.cpp
Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.cpp
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include "tessdatamanager.h"
00025 
00026 #include <stdio.h>
00027 
00028 #include "serialis.h"
00029 #include "strngs.h"
00030 #include "tprintf.h"
00031 #include "params.h"
00032 
00033 namespace tesseract {
00034 
00035 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
00036   int i;
00037   debug_level_ = debug_level;
00038   data_file_ = fopen(data_file_name, "rb");
00039   if (data_file_ == NULL) {
00040     tprintf("Error opening data file %s\n", data_file_name);
00041     tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
00042             "to the parent directory of your \"tessdata\" directory.\n");
00043     return false;
00044   }
00045   fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
00046   swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
00047   if (swap_) {
00048     actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
00049   }
00050   ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
00051   fread(offset_table_, sizeof(inT64),
00052         actual_tessdata_num_entries_, data_file_);
00053   if (swap_) {
00054     for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
00055       offset_table_[i] = reverse64(offset_table_[i]);
00056     }
00057   }
00058   if (debug_level_) {
00059     tprintf("TessdataManager loaded %d types of tesseract data files.\n",
00060             actual_tessdata_num_entries_);
00061     for (i = 0; i < actual_tessdata_num_entries_; ++i) {
00062       tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
00063     }
00064   }
00065   return true;
00066 }
00067 
00068 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
00069                                bool newline_end, inT64 num_bytes_to_copy) {
00070   if (num_bytes_to_copy == 0) return;
00071   int buffer_size = 1024;
00072   if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
00073     buffer_size = num_bytes_to_copy;
00074   }
00075   inT64 num_bytes_copied = 0;
00076   char *chunk = new char[buffer_size];
00077   int bytes_read;
00078   char last_char = 0x0;
00079   while ((bytes_read = fread(chunk, sizeof(char),
00080                              buffer_size, input_file))) {
00081     fwrite(chunk, sizeof(char), bytes_read, output_file);
00082     last_char = chunk[bytes_read-1];
00083     if (num_bytes_to_copy > 0) {
00084       num_bytes_copied += bytes_read;
00085       if (num_bytes_copied == num_bytes_to_copy) break;
00086       if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
00087         buffer_size = num_bytes_to_copy - num_bytes_copied;
00088       }
00089     }
00090   }
00091   if (newline_end) ASSERT_HOST(last_char == '\n');
00092   delete[] chunk;
00093 }
00094 
00095 void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) {
00096   fseek(output_file, 0, SEEK_SET);
00097   inT32 num_entries = TESSDATA_NUM_ENTRIES;
00098   fwrite(&num_entries, sizeof(inT32), 1, output_file);
00099   fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
00100   fclose(output_file);
00101 
00102   tprintf("TessdataManager combined tesseract data files.\n");
00103   for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00104     tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
00105   }
00106 }
00107 
00108 bool TessdataManager::CombineDataFiles(
00109     const char *language_data_path_prefix,
00110     const char *output_filename) {
00111   int i;
00112   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00113   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
00114   FILE *output_file = fopen(output_filename, "wb");
00115   if (output_file == NULL) {
00116     tprintf("Error opening %s for writing\n", output_filename);
00117     return false;
00118   }
00119   // Leave some space for recording the offset_table.
00120   fseek(output_file,
00121         sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
00122 
00123   TessdataType type = TESSDATA_NUM_ENTRIES;
00124   bool text_file = false;
00125   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00126 
00127   // Load individual tessdata components from files.
00128   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00129     ASSERT_HOST(TessdataTypeFromFileSuffix(
00130         kTessdataFileSuffixes[i], &type, &text_file));
00131     STRING filename = language_data_path_prefix;
00132     filename += kTessdataFileSuffixes[i];
00133     file_ptr[i] =  fopen(filename.string(), "rb");
00134     if (file_ptr[i] != NULL) {
00135       offset_table[type] = ftell(output_file);
00136       CopyFile(file_ptr[i], output_file, text_file, -1);
00137       fclose(file_ptr[i]);
00138     }
00139   }
00140 
00141   // Make sure that the required components are present.
00142   if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
00143     tprintf("Error opening unicharset file\n");
00144     fclose(output_file);
00145     return false;
00146   }
00147   if (file_ptr[TESSDATA_INTTEMP] != NULL &&
00148       (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
00149        file_ptr[TESSDATA_NORMPROTO] == NULL)) {
00150     tprintf("Error opening pffmtable and/or normproto files"
00151             " while inttemp file was present\n");
00152     fclose(output_file);
00153     return false;
00154   }
00155 
00156   WriteMetadata(offset_table, output_file);
00157   return true;
00158 }
00159 
00160 bool TessdataManager::OverwriteComponents(
00161     const char *new_traineddata_filename,
00162     char **component_filenames,
00163     int num_new_components) {
00164   int i;
00165   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00166   TessdataType type = TESSDATA_NUM_ENTRIES;
00167   bool text_file = false;
00168   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00169   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00170     offset_table[i] = -1;
00171     file_ptr[i] = NULL;
00172   }
00173   FILE *output_file = fopen(new_traineddata_filename, "wb");
00174   if (output_file == NULL) {
00175     tprintf("Error opening %s for writing\n", new_traineddata_filename);
00176     return false;
00177   }
00178 
00179   // Leave some space for recording the offset_table.
00180   fseek(output_file,
00181         sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
00182 
00183   // Open the files with the new components.
00184   for (i = 0; i < num_new_components; ++i) {
00185     TessdataTypeFromFileName(component_filenames[i], &type, &text_file);
00186     file_ptr[type] = fopen(component_filenames[i], "rb");
00187   }
00188 
00189   // Write updated data to the output traineddata file.
00190   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00191     if (file_ptr[i] != NULL) {
00192       // Get the data from the opened component file.
00193       offset_table[i] = ftell(output_file);
00194       CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
00195       fclose(file_ptr[i]);
00196     } else {
00197       // Get this data component from the loaded data file.
00198       if (SeekToStart(static_cast<TessdataType>(i))) {
00199         offset_table[i] = ftell(output_file);
00200         CopyFile(data_file_, output_file, kTessdataFileIsText[i],
00201                  GetEndOffset(static_cast<TessdataType>(i)) -
00202                  ftell(data_file_) + 1);
00203       }
00204     }
00205   }
00206 
00207   WriteMetadata(offset_table, output_file);
00208   return true;
00209 }
00210 
00211 bool TessdataManager::TessdataTypeFromFileSuffix(
00212     const char *suffix, TessdataType *type, bool *text_file) {
00213   for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00214     if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
00215       *type = static_cast<TessdataType>(i);
00216       *text_file = kTessdataFileIsText[i];
00217       return true;
00218     }
00219   }
00220   printf("TessdataManager can't determine which tessdata"
00221          " component is represented by %s\n", suffix);
00222   return false;
00223 }
00224 
00225 bool TessdataManager::TessdataTypeFromFileName(
00226     const char *filename, TessdataType *type, bool *text_file) {
00227   // Get the file suffix (extension)
00228   const char *suffix = strrchr(filename, '.');
00229   if (suffix == NULL || *(++suffix) == '\0') return false;
00230   return TessdataTypeFromFileSuffix(suffix, type, text_file);
00231 }
00232 
00233 bool TessdataManager::ExtractToFile(const char *filename) {
00234   TessdataType type = TESSDATA_NUM_ENTRIES;
00235   bool text_file = false;
00236   ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
00237       filename, &type, &text_file));
00238   if (!SeekToStart(type)) return false;
00239 
00240   FILE *output_file = fopen(filename, "wb");
00241   if (output_file == NULL) {
00242     printf("Error openning %s\n", filename);
00243     exit(1);
00244   }
00245   inT64 begin_offset = ftell(GetDataFilePtr());
00246   inT64 end_offset = GetEndOffset(type);
00247   tesseract::TessdataManager::CopyFile(
00248       GetDataFilePtr(), output_file, text_file,
00249       end_offset - begin_offset + 1);
00250   fclose(output_file);
00251   return true;
00252 }
00253 
00254 }  // namespace tesseract