Tesseract  3.02
tesseract-ocr/cube/char_set.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        char_set.cpp
00003  * Description: Implementation of a Character Set Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <string>
00021 
00022 #include "char_set.h"
00023 #include "cube_utils.h"
00024 #include "tessdatamanager.h"
00025 
00026 namespace tesseract {
00027 
00028 CharSet::CharSet() {
00029   class_cnt_ = 0;
00030   class_strings_ = NULL;
00031   unicharset_map_ = NULL;
00032   init_ = false;
00033 
00034   // init hash table
00035   memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
00036 }
00037 
00038 CharSet::~CharSet() {
00039   if (class_strings_ != NULL) {
00040     for (int cls = 0; cls < class_cnt_; cls++) {
00041       if (class_strings_[cls] != NULL) {
00042         delete class_strings_[cls];
00043       }
00044     }
00045     delete []class_strings_;
00046     class_strings_ = NULL;
00047   }
00048   delete []unicharset_map_;
00049 }
00050 
00051 // Creates CharSet object by reading the unicharset from the
00052 // TessDatamanager, and mapping Cube's unicharset to Tesseract's if
00053 // they differ.
00054 CharSet *CharSet::Create(TessdataManager *tessdata_manager,
00055                          UNICHARSET *tess_unicharset) {
00056   CharSet *char_set = new CharSet();
00057   if (char_set == NULL) {
00058     return NULL;
00059   }
00060 
00061   // First look for Cube's unicharset; if not there, use tesseract's
00062   bool cube_unicharset_exists;
00063   if (!(cube_unicharset_exists =
00064         tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) &&
00065       !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
00066     fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
00067             "either cube or tesseract unicharset\n");
00068     return false;
00069   }
00070   FILE *charset_fp = tessdata_manager->GetDataFilePtr();
00071   if (!charset_fp) {
00072     fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
00073             "a unicharset\n");
00074     return false;
00075   }
00076 
00077   // If we found a cube unicharset separate from tesseract's, load it and
00078   // map its unichars to tesseract's; if only one unicharset exists,
00079   // just load it.
00080   bool loaded;
00081   if (cube_unicharset_exists) {
00082     char_set->cube_unicharset_.load_from_file(charset_fp);
00083     loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
00084     loaded = loaded && char_set->LoadSupportedCharList(
00085         tessdata_manager->GetDataFilePtr(), tess_unicharset);
00086     char_set->unicharset_ = &char_set->cube_unicharset_;
00087   } else {
00088     loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
00089     char_set->unicharset_ = tess_unicharset;
00090   }
00091   if (!loaded) {
00092     delete char_set;
00093     return false;
00094   }
00095 
00096   char_set->init_ = true;
00097   return char_set;
00098 }
00099 
00100 // Load the list of supported chars from the given data file pointer.
00101 bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) {
00102   if (init_)
00103     return true;
00104 
00105   char str_line[256];
00106   // init hash table
00107   memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
00108   // read the char count
00109   if (fgets(str_line, sizeof(str_line), fp) == NULL) {
00110     fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
00111             "read char count.\n");
00112     return false;
00113   }
00114   class_cnt_ = atoi(str_line);
00115   if  (class_cnt_ < 2) {
00116     fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid "
00117             "class count: %d\n", class_cnt_);
00118     return false;
00119   }
00120   // memory for class strings
00121   class_strings_ = new string_32*[class_cnt_];
00122   if (class_strings_ == NULL) {
00123     fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
00124             "allocate memory for class strings.\n");
00125     return false;
00126   }
00127   // memory for unicharset map
00128   if (tess_unicharset) {
00129     unicharset_map_ = new int[class_cnt_];
00130     if (unicharset_map_ == NULL) {
00131       fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
00132               "allocate memory for unicharset map.\n");
00133       return false;
00134     }
00135   }
00136 
00137   // Read in character strings and add to hash table
00138   for (int class_id = 0; class_id < class_cnt_; class_id++) {
00139     // Read the class string
00140     if (fgets(str_line, sizeof(str_line), fp) == NULL) {
00141       fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): "
00142               "could not read class string with class_id=%d.\n", class_id);
00143       return false;
00144     }
00145     // Terminate at space if any
00146     char *p = strchr(str_line, ' ');
00147     if (p != NULL)
00148       *p = '\0';
00149     // Convert to UTF32 and store
00150     string_32 str32;
00151     // Convert NULL to a space
00152     if (strcmp(str_line, "NULL") == 0) {
00153       strcpy(str_line, " ");
00154     }
00155     CubeUtils::UTF8ToUTF32(str_line, &str32);
00156     class_strings_[class_id] = new string_32(str32);
00157     if (class_strings_[class_id] == NULL) {
00158       fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): could not "
00159               "allocate memory for class string with class_id=%d.\n", class_id);
00160       return false;
00161     }
00162 
00163     // Add to hash-table
00164     int hash_val = Hash(reinterpret_cast<const char_32 *>(str32.c_str()));
00165     if (hash_bin_size_[hash_val] >= kMaxHashSize) {
00166       fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash "
00167               "table is full.\n");
00168       return false;
00169     }
00170     hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id;
00171 
00172     if (tess_unicharset != NULL) {
00173       // Add class id to unicharset map
00174       UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line);
00175       if (tess_id == INVALID_UNICHAR_ID) {
00176         tess_unicharset->unichar_insert(str_line);
00177         tess_id = tess_unicharset->unichar_to_id(str_line);
00178       }
00179       ASSERT_HOST(tess_id != INVALID_UNICHAR_ID);
00180       unicharset_map_[class_id] = tess_id;
00181     }
00182   }
00183   return true;
00184 }
00185 
00186 }  // tesseract