Tesseract
3.02
|
#include <tessdatamanager.h>
Public Member Functions | |
TessdataManager () | |
~TessdataManager () | |
int | DebugLevel () |
bool | Init (const char *data_file_name, int debug_level) |
FILE * | GetDataFilePtr () const |
bool | SeekToStart (TessdataType tessdata_type) |
inT64 | GetEndOffset (TessdataType tessdata_type) const |
void | End () |
bool | swap () const |
bool | OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components) |
bool | ExtractToFile (const char *filename) |
Static Public Member Functions | |
static void | WriteMetadata (inT64 *offset_table, FILE *output_file) |
static bool | CombineDataFiles (const char *language_data_path_prefix, const char *output_filename) |
static void | CopyFile (FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy) |
static bool | TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type, bool *text_file) |
static bool | TessdataTypeFromFileName (const char *filename, TessdataType *type, bool *text_file) |
Definition at line 131 of file tessdatamanager.h.
tesseract::TessdataManager::TessdataManager | ( | ) | [inline] |
Definition at line 133 of file tessdatamanager.h.
{
  // Start in the "nothing loaded" state: no open file handle, zero entries,
  // and every offset marked absent (-1). SeekToStart() treats a negative
  // offset as "component not present".
  data_file_ = NULL;
  actual_tessdata_num_entries_ = 0;
  // Init() assigns these two members; give them defined values here so that
  // DebugLevel(), swap(), SeekToStart() and GetEndOffset() do not read
  // indeterminate values on a manager that has not been initialized yet.
  debug_level_ = 0;
  swap_ = false;
  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    offset_table_[i] = -1;
  }
}
tesseract::TessdataManager::~TessdataManager | ( | ) | [inline] |
Definition at line 140 of file tessdatamanager.h.
// Empty body: the destructor itself performs no cleanup. According to the
// End() description on this page, closing data_file_ is End()'s job.
{}
bool tesseract::TessdataManager::CombineDataFiles | ( | const char * | language_data_path_prefix, |
const char * | output_filename | ||
) | [static] |
Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.
Definition at line 108 of file tessdatamanager.cpp.
{
  // Every component starts out marked as absent (-1) in the offset table.
  inT64 offset_table[TESSDATA_NUM_ENTRIES];
  bool component_found[TESSDATA_NUM_ENTRIES];
  for (int k = 0; k < TESSDATA_NUM_ENTRIES; ++k) offset_table[k] = -1;

  FILE *output_file = fopen(output_filename, "wb");
  if (output_file == NULL) {
    tprintf("Error opening %s for writing\n", output_filename);
    return false;
  }

  // Skip over the header (entry count + offset table); WriteMetadata() fills
  // it in once all component offsets are known.
  fseek(output_file,
        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);

  // Append each component file that exists for this language, remembering
  // where it starts in the combined file.
  for (int k = 0; k < TESSDATA_NUM_ENTRIES; ++k) {
    TessdataType type = TESSDATA_NUM_ENTRIES;
    bool text_file = false;
    ASSERT_HOST(TessdataTypeFromFileSuffix(
        kTessdataFileSuffixes[k], &type, &text_file));
    STRING component_path = language_data_path_prefix;
    component_path += kTessdataFileSuffixes[k];
    FILE *component = fopen(component_path.string(), "rb");
    component_found[k] = (component != NULL);
    if (!component_found[k]) continue;  // optional component, not on disk
    offset_table[type] = ftell(output_file);
    CopyFile(component, output_file, text_file, -1);
    fclose(component);
  }

  // The unicharset is mandatory.
  if (!component_found[TESSDATA_UNICHARSET]) {
    tprintf("Error opening unicharset file\n");
    fclose(output_file);
    return false;
  }
  // inttemp is only accepted together with both pffmtable and normproto.
  const bool has_inttemp_companions =
      component_found[TESSDATA_PFFMTABLE] &&
      component_found[TESSDATA_NORMPROTO];
  if (component_found[TESSDATA_INTTEMP] && !has_inttemp_companions) {
    tprintf("Error opening pffmtable and/or normproto files"
            " while inttemp file was present\n");
    fclose(output_file);
    return false;
  }

  // WriteMetadata() writes the header and also closes output_file.
  WriteMetadata(offset_table, output_file);
  return true;
}
void tesseract::TessdataManager::CopyFile | ( | FILE * | input_file, |
FILE * | output_file, | ||
bool | newline_end, | ||
inT64 | num_bytes_to_copy | ||
) | [static] |
Copies data from the given input file to the output_file provided. If num_bytes_to_copy is >= 0, only num_bytes_to_copy bytes are copied from the input file; otherwise all the data in the input file is copied.
Definition at line 68 of file tessdatamanager.cpp.
{
  // An explicit request for zero bytes is a no-op.
  if (num_bytes_to_copy == 0) return;

  // Transfer in chunks of at most 1024 bytes; use a smaller chunk when the
  // caller asked for fewer bytes than that.
  int chunk_size = 1024;
  if (num_bytes_to_copy > 0 && num_bytes_to_copy < chunk_size) {
    chunk_size = num_bytes_to_copy;
  }
  char *buffer = new char[chunk_size];
  inT64 total_copied = 0;
  char final_byte = 0x0;  // last byte written, checked for text components

  for (;;) {
    int got = fread(buffer, sizeof(char), chunk_size, input_file);
    if (got == 0) break;  // end of input (or read error)
    fwrite(buffer, sizeof(char), got, output_file);
    final_byte = buffer[got - 1];

    // A negative byte count means "copy until the input is exhausted".
    if (num_bytes_to_copy <= 0) continue;

    total_copied += got;
    if (total_copied == num_bytes_to_copy) break;
    // Shrink the next read so it cannot run past the requested byte count.
    if (total_copied + chunk_size > num_bytes_to_copy) {
      chunk_size = num_bytes_to_copy - total_copied;
    }
  }

  // When newline_end is set, the copied data must finish with a newline.
  if (newline_end) ASSERT_HOST(final_byte == '\n');
  delete[] buffer;
}
int tesseract::TessdataManager::DebugLevel | ( | ) | [inline] |
Definition at line 141 of file tessdatamanager.h.
// Returns the debug_level_ member, which Init() sets from its debug_level
// argument.
{ return debug_level_; }
void tesseract::TessdataManager::End | ( | ) | [inline] |
Closes data_file_ (if it was opened by Init()).
Definition at line 187 of file tessdatamanager.h.
bool tesseract::TessdataManager::ExtractToFile | ( | const char * | filename | ) |
Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.
Definition at line 233 of file tessdatamanager.cpp.
{
  // Work out which component the file name refers to. An unrecognized
  // suffix is treated as a fatal caller error.
  TessdataType type = TESSDATA_NUM_ENTRIES;
  bool text_file = false;
  ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
      filename, &type, &text_file));

  // SeekToStart() returns false when the loaded traineddata holds no data of
  // this type; otherwise the data file is now positioned at the component.
  if (!SeekToStart(type)) return false;

  FILE *output_file = fopen(filename, "wb");
  if (output_file == NULL) {
    // Report the failure through the return value, as the other methods of
    // this class do, instead of terminating the whole process from inside a
    // library call.
    tprintf("Error opening %s for writing\n", filename);
    return false;
  }

  inT64 begin_offset = ftell(GetDataFilePtr());
  inT64 end_offset = GetEndOffset(type);
  // For the last component GetEndOffset() returns -1, so the byte count
  // below becomes -begin_offset; CopyFile() copies up to the end of the
  // input when given a negative count.
  tesseract::TessdataManager::CopyFile(
      GetDataFilePtr(), output_file, text_file,
      end_offset - begin_offset + 1);
  fclose(output_file);
  return true;
}
FILE* tesseract::TessdataManager::GetDataFilePtr | ( | ) | const [inline] |
inT64 tesseract::TessdataManager::GetEndOffset | ( | TessdataType | tessdata_type | ) | const [inline] |
Returns the end offset for the given tesseract data file type.
Definition at line 173 of file tessdatamanager.h.
{
  // Find the next component after tessdata_type that is actually present in
  // the combined file; entries with offset -1 are absent and are skipped.
  int next = tessdata_type + 1;
  for (; next < actual_tessdata_num_entries_; ++next) {
    if (offset_table_[next] != -1) break;
  }
  // No later component: the caller gets -1 instead of a real offset.
  const bool no_later_component = (next == actual_tessdata_num_entries_);

  if (debug_level_) {
    tprintf("TessdataManager: end offset for type %d is %lld\n",
            tessdata_type, no_later_component ? -1 : offset_table_[next]);
  }

  if (no_later_component) return -1;
  // The component ends on the byte just before the next one begins.
  return offset_table_[next] - 1;
}
bool tesseract::TessdataManager::Init | ( | const char * | data_file_name, |
int | debug_level | ||
) |
Opens the given data file and reads the offset table. Returns true on success.
Definition at line 35 of file tessdatamanager.cpp.
{
  int i;
  debug_level_ = debug_level;
  data_file_ = fopen(data_file_name, "rb");
  if (data_file_ == NULL) {
    tprintf("Error opening data file %s\n", data_file_name);
    tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
            "to the parent directory of your \"tessdata\" directory.\n");
    return false;
  }

  // The file starts with a 32-bit entry count. A short read means the file
  // is truncated or unreadable; fail and return to the "no file" state that
  // the open-failure path above also leaves behind.
  if (fread(&actual_tessdata_num_entries_, sizeof(inT32), 1,
            data_file_) != 1) {
    tprintf("Error reading the number of entries from %s\n", data_file_name);
    fclose(data_file_);
    data_file_ = NULL;
    actual_tessdata_num_entries_ = 0;
    return false;
  }

  // An implausibly large count is taken to mean the file was written with
  // the opposite byte order, so the count (and later the offsets) are
  // byte-reversed.
  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
  if (swap_) {
    actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
  }
  // The count is used as an element count for fread() into offset_table_.
  // A negative value would convert to a huge unsigned count and overrun the
  // table, so reject it together with counts that are too large.
  ASSERT_HOST(actual_tessdata_num_entries_ >= 0 &&
              actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);

  size_t num_offsets_read = fread(offset_table_, sizeof(inT64),
                                  actual_tessdata_num_entries_, data_file_);
  if (num_offsets_read != static_cast<size_t>(actual_tessdata_num_entries_)) {
    tprintf("Error reading the offset table from %s\n", data_file_name);
    fclose(data_file_);
    data_file_ = NULL;
    // Discard any partially read offsets so no component appears present.
    actual_tessdata_num_entries_ = 0;
    for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
      offset_table_[i] = -1;
    }
    return false;
  }

  if (swap_) {
    for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
      offset_table_[i] = reverse64(offset_table_[i]);
    }
  }
  if (debug_level_) {
    tprintf("TessdataManager loaded %d types of tesseract data files.\n",
            actual_tessdata_num_entries_);
    for (i = 0; i < actual_tessdata_num_entries_; ++i) {
      tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
    }
  }
  return true;
}
bool tesseract::TessdataManager::OverwriteComponents | ( | const char * | new_traineddata_filename, |
char ** | component_filenames, | ||
int | num_new_components | ||
) |
Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.
Definition at line 160 of file tessdatamanager.cpp.
{
  int i;
  inT64 offset_table[TESSDATA_NUM_ENTRIES];
  TessdataType type = TESSDATA_NUM_ENTRIES;
  bool text_file = false;
  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    offset_table[i] = -1;
    file_ptr[i] = NULL;
  }
  FILE *output_file = fopen(new_traineddata_filename, "wb");
  if (output_file == NULL) {
    tprintf("Error opening %s for writing\n", new_traineddata_filename);
    return false;
  }

  // Leave some space for recording the offset_table.
  fseek(output_file,
        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);

  // Open the files with the new components.
  for (i = 0; i < num_new_components; ++i) {
    // Skip names whose component type cannot be determined. Without this
    // check `type` could still be TESSDATA_NUM_ENTRIES (or a stale value from
    // an earlier iteration) and the store below would write outside file_ptr
    // or clobber the wrong slot. TessdataTypeFromFileSuffix() has already
    // printed a diagnostic for an unrecognized suffix.
    if (!TessdataTypeFromFileName(component_filenames[i],
                                  &type, &text_file)) {
      continue;
    }
    // If the same component type is listed twice the later file wins; close
    // the earlier handle so it is not leaked.
    if (file_ptr[type] != NULL) fclose(file_ptr[type]);
    file_ptr[type] = fopen(component_filenames[i], "rb");
    if (file_ptr[type] == NULL) {
      // The loop below then falls back to the component from the loaded
      // data file; tell the user the replacement was not applied.
      tprintf("Error opening %s; keeping the existing component\n",
              component_filenames[i]);
    }
  }

  // Write updated data to the output traineddata file.
  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (file_ptr[i] != NULL) {
      // Get the data from the opened component file.
      offset_table[i] = ftell(output_file);
      CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
      fclose(file_ptr[i]);
    } else {
      // Get this data component from the loaded data file.
      if (SeekToStart(static_cast<TessdataType>(i))) {
        offset_table[i] = ftell(output_file);
        // For the last component GetEndOffset() returns -1, which makes the
        // byte count negative; CopyFile() then copies to the end of the file.
        CopyFile(data_file_, output_file, kTessdataFileIsText[i],
                 GetEndOffset(static_cast<TessdataType>(i)) -
                 ftell(data_file_) + 1);
      }
    }
  }

  // WriteMetadata() writes the header and also closes output_file.
  WriteMetadata(offset_table, output_file);
  return true;
}
bool tesseract::TessdataManager::SeekToStart | ( | TessdataType | tessdata_type | ) | [inline] |
Returns false if there is no data of the given type. Otherwise does a seek on the data_file_ to position the pointer at the start of the data of the given type.
Definition at line 157 of file tessdatamanager.h.
{
  if (debug_level_) {
    // The two adjacent literals are concatenated by the compiler; the second
    // one starts with a space so the message reads "tessdata type", and the
    // parentheses around the suffix are balanced.
    tprintf("TessdataManager: seek to offset %lld - start of tessdata"
            " type %d (%s)\n", offset_table_[tessdata_type],
            tessdata_type, kTessdataFileSuffixes[tessdata_type]);
  }
  // A negative offset marks a component that is not present in the file.
  if (offset_table_[tessdata_type] < 0) {
    return false;
  }
  // Position the data file at the first byte of the component; a failed seek
  // is treated as fatal.
  ASSERT_HOST(fseek(data_file_,
                    static_cast<size_t>(offset_table_[tessdata_type]),
                    SEEK_SET) == 0);
  return true;
}
bool tesseract::TessdataManager::swap | ( | ) | const [inline] |
Definition at line 193 of file tessdatamanager.h.
{
// Reports the value of swap_. Init() sets swap_ to true when the entry count
// read from the data file exceeds kMaxNumTessdataEntries, and in that case
// byte-reverses the count and every entry of the offset table.
return swap_;
}
bool tesseract::TessdataManager::TessdataTypeFromFileName | ( | const char * | filename, |
TessdataType * | type, | ||
bool * | text_file | ||
) | [static] |
Tries to determine the tessdata component type from the file suffix (extension) of filename; returns true on success.
Definition at line 225 of file tessdatamanager.cpp.
{
  // The component suffix is whatever follows the last '.' in the name.
  const char *last_dot = strrchr(filename, '.');
  if (last_dot == NULL) return false;  // no extension at all

  const char *extension = last_dot + 1;
  // A name that ends in '.' carries no suffix to look up.
  if (*extension == '\0') return false;

  return TessdataTypeFromFileSuffix(extension, type, text_file);
}
bool tesseract::TessdataManager::TessdataTypeFromFileSuffix | ( | const char * | suffix, |
TessdataType * | type, | ||
bool * | text_file | ||
) | [static] |
Fills type with TessdataType of the tessdata component represented by the given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. Sets *text_file to true if the component is in text format (e.g. unicharset, unichar ambigs, config, etc).
Definition at line 211 of file tessdatamanager.cpp.
{
  // Scan the table of known suffixes for an exact match.
  int index = 0;
  while (index < TESSDATA_NUM_ENTRIES &&
         strcmp(kTessdataFileSuffixes[index], suffix) != 0) {
    ++index;
  }

  // Ran off the end of the table: the suffix is unknown. The outputs are
  // left untouched in that case.
  if (index == TESSDATA_NUM_ENTRIES) {
    printf("TessdataManager can't determine which tessdata"
           " component is represented by %s\n", suffix);
    return false;
  }

  // The table position doubles as the TessdataType value.
  *type = static_cast<TessdataType>(index);
  *text_file = kTessdataFileIsText[index];
  return true;
}
void tesseract::TessdataManager::WriteMetadata | ( | inT64 * | offset_table, |
FILE * | output_file | ||
) | [static] |
Writes the number of entries and the given offset table to output_file.
Definition at line 95 of file tessdatamanager.cpp.
{
  // The header sits at the very start of the file: a 32-bit entry count
  // followed by one 64-bit offset per component type.
  fseek(output_file, 0, SEEK_SET);
  const inT32 entry_count = TESSDATA_NUM_ENTRIES;
  fwrite(&entry_count, sizeof(entry_count), 1, output_file);
  fwrite(offset_table, sizeof(offset_table[0]), TESSDATA_NUM_ENTRIES,
         output_file);
  // This function takes over closing the output file from its caller.
  fclose(output_file);

  // Report the offsets that were written.
  tprintf("TessdataManager combined tesseract data files.\n");
  int entry = 0;
  while (entry < TESSDATA_NUM_ENTRIES) {
    tprintf("Offset for type %d is %lld\n", entry, offset_table[entry]);
    ++entry;
  }
}