Tesseract  3.02
tesseract-ocr/ccutil/tessdatamanager.h
Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.h
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00022 
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "tprintf.h"
00026 
// Suffix string for the trained-data file.
// NOTE(review): where this is appended is not visible in this header.
static const char kTrainedDataSuffix[] = "traineddata";

// File-name suffixes of the individual tessdata components. Each constant
// is referenced by kTessdataFileSuffixes at the index of the matching
// TessdataType enumerator.
//
// When adding new tessdata types and file suffixes, please make sure to
// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
static const char kLangConfigFileSuffix[] = "config";
static const char kUnicharsetFileSuffix[] = "unicharset";
static const char kAmbigsFileSuffix[] = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static const char kNormProtoFileSuffix[] = "normproto";
static const char kPuncDawgFileSuffix[] = "punc-dawg";
static const char kSystemDawgFileSuffix[] = "word-dawg";
static const char kNumberDawgFileSuffix[] = "number-dawg";
static const char kFreqDawgFileSuffix[] = "freq-dawg";
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
static const char kShapeTableFileSuffix[] = "shapetable";
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
00048 
00049 namespace tesseract {
00050 
// Identifies one component of the tessdata. The numeric value of each
// enumerator is the index of that component in kTessdataFileSuffixes,
// kTessdataFileIsText and TessdataManager's offset table, so existing
// values must not be reordered.
enum TessdataType {
  TESSDATA_LANG_CONFIG,         // 0
  TESSDATA_UNICHARSET,          // 1
  TESSDATA_AMBIGS,              // 2
  TESSDATA_INTTEMP,             // 3
  TESSDATA_PFFMTABLE,           // 4
  TESSDATA_NORMPROTO,           // 5
  TESSDATA_PUNC_DAWG,           // 6
  TESSDATA_SYSTEM_DAWG,         // 7
  TESSDATA_NUMBER_DAWG,         // 8
  TESSDATA_FREQ_DAWG,           // 9
  TESSDATA_FIXED_LENGTH_DAWGS,  // 10
  TESSDATA_CUBE_UNICHARSET,     // 11
  TESSDATA_CUBE_SYSTEM_DAWG,    // 12
  TESSDATA_SHAPE_TABLE,         // 13
  TESSDATA_BIGRAM_DAWG,         // 14
  TESSDATA_UNAMBIG_DAWG,        // 15
  TESSDATA_PARAMS_TRAINING_MODEL,  // 16

  // Must stay last: it is used as the number of component types (array
  // sizes and loop bounds), not as a component itself.
  TESSDATA_NUM_ENTRIES
};
00072 
00077 static const char * const kTessdataFileSuffixes[] = {
00078   kLangConfigFileSuffix,        // 0
00079   kUnicharsetFileSuffix,        // 1
00080   kAmbigsFileSuffix,            // 2
00081   kBuiltInTemplatesFileSuffix,  // 3
00082   kBuiltInCutoffsFileSuffix,    // 4
00083   kNormProtoFileSuffix,         // 5
00084   kPuncDawgFileSuffix,          // 6
00085   kSystemDawgFileSuffix,        // 7
00086   kNumberDawgFileSuffix,        // 8
00087   kFreqDawgFileSuffix,          // 9
00088   kFixedLengthDawgsFileSuffix,  // 10
00089   kCubeUnicharsetFileSuffix,    // 11
00090   kCubeSystemDawgFileSuffix,    // 12
00091   kShapeTableFileSuffix,        // 13
00092   kBigramDawgFileSuffix,        // 14
00093   kUnambigDawgFileSuffix,       // 15
00094   kParamsTrainingModelFileSuffix,  // 16
00095 };
00096 
00101 static const bool kTessdataFileIsText[] = {
00102   true,                         // 0
00103   true,                         // 1
00104   true,                         // 2
00105   false,                        // 3
00106   true,                         // 4
00107   true,                         // 5
00108   false,                        // 6
00109   false,                        // 7
00110   false,                        // 8
00111   false,                        // 9
00112   false,                        // 10
00113   true,                         // 11
00114   false,                        // 12
00115   false,                        // 13
00116   false,                        // 14
00117   false,                        // 15
00118   false,                        // 16
00119 };
00120 
// Upper limit related to the number of tessdata entries.
// NOTE(review): the code that consults this constant is not part of this
// header; confirm its exact role against the implementation file.
static const int kMaxNumTessdataEntries = 1000;
00129 
00130 
00131 class TessdataManager {
00132  public:
00133   TessdataManager() {
00134     data_file_ = NULL;
00135     actual_tessdata_num_entries_ = 0;
00136     for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00137       offset_table_[i] = -1;
00138     }
00139   }
00140   ~TessdataManager() {}
00141   int DebugLevel() { return debug_level_; }
00142 
00147   bool Init(const char *data_file_name, int debug_level);
00148 
00150   inline FILE *GetDataFilePtr() const { return data_file_; }
00151 
00157   inline bool SeekToStart(TessdataType tessdata_type) {
00158     if (debug_level_) {
00159       tprintf("TessdataManager: seek to offset %lld - start of tessdata"
00160               "type %d (%s))\n", offset_table_[tessdata_type],
00161               tessdata_type, kTessdataFileSuffixes[tessdata_type]);
00162     }
00163     if (offset_table_[tessdata_type] < 0) {
00164       return false;
00165     } else {
00166       ASSERT_HOST(fseek(data_file_,
00167                         static_cast<size_t>(offset_table_[tessdata_type]),
00168                         SEEK_SET) == 0);
00169       return true;
00170     }
00171   }
00173   inline inT64 GetEndOffset(TessdataType tessdata_type) const {
00174     int index = tessdata_type + 1;
00175     while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
00176       ++index;  // skip tessdata types not present in the combined file
00177     }
00178     if (debug_level_) {
00179       tprintf("TessdataManager: end offset for type %d is %lld\n",
00180               tessdata_type,
00181               (index == actual_tessdata_num_entries_) ? -1
00182               : offset_table_[index]);
00183     }
00184     return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
00185   }
00187   inline void End() {
00188     if (data_file_ != NULL) {
00189       fclose(data_file_);
00190       data_file_ = NULL;
00191     }
00192   }
00193   bool swap() const {
00194     return swap_;
00195   }
00196 
00198   static void WriteMetadata(inT64 *offset_table, FILE *output_file);
00199 
00205   static bool CombineDataFiles(const char *language_data_path_prefix,
00206                                const char *output_filename);
00207 
00213   bool OverwriteComponents(const char *new_traineddata_filename,
00214                             char **component_filenames,
00215                             int num_new_components);
00216 
00227   bool ExtractToFile(const char *filename);
00228 
00234   static void CopyFile(FILE *input_file, FILE *output_file,
00235                        bool newline_end, inT64 num_bytes_to_copy);
00236 
00245   static bool TessdataTypeFromFileSuffix(const char *suffix,
00246                                          TessdataType *type,
00247                                          bool *text_file);
00248 
00253   static bool TessdataTypeFromFileName(const char *filename,
00254                                        TessdataType *type,
00255                                        bool *text_file);
00256 
00257  private:
00258 
00263   static FILE *GetFilePtr(const char *language_data_path_prefix,
00264                           const char *file_suffix, bool text_file);
00265 
00270   inT64 offset_table_[TESSDATA_NUM_ENTRIES];
00279   inT32 actual_tessdata_num_entries_;
00280   FILE *data_file_;  
00281   int debug_level_;
00282   // True if the bytes need swapping.
00283   bool swap_;
00284 };
00285 
00286 
00287 }  // namespace tesseract
00288 
00289 #endif  // TESSERACT_CCUTIL_TESSDATAMANAGER_H_