Tesseract  3.02
tesseract-ocr/cube/cube_utils.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.cpp
00003  * Description: Implementation of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <math.h>
00021 #include <string>
00022 #include <vector>
00023 #include "cube_utils.h"
00024 #include "char_set.h"
00025 #include "unichar.h"
00026 
00027 namespace tesseract {
00028 CubeUtils::CubeUtils() {
00029 }
00030 
00031 CubeUtils::~CubeUtils() {
00032 }
00033 
00034 // convert a prob to a cost (-ve log prob)
00035 int CubeUtils::Prob2Cost(double prob_val) {
00036   if (prob_val < MIN_PROB)   {
00037     return MIN_PROB_COST;
00038   }
00039   return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
00040 }
00041 
00042 // converts a cost to probability
00043 double CubeUtils::Cost2Prob(int cost) {
00044   return exp(-cost / PROB2COST_SCALE);
00045 }
00046 
00047 // computes the length of a NULL terminated char_32 string
00048 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
00049   if (char_32_ptr == NULL) {
00050     return 0;
00051   }
00052   int len = -1;
00053   while (char_32_ptr[++len]);
00054   return len;
00055 }
00056 
00057 // compares two char_32 strings
00058 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
00059   const char_32 *pch1 = str1;
00060   const char_32 *pch2 = str2;
00061 
00062   for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
00063     if ((*pch1) != (*pch2)) {
00064       return (*pch1) - (*pch2);
00065     }
00066   }
00067 
00068   if ((*pch1) == 0) {
00069     if ((*pch2) == 0) {
00070       return 0;
00071     } else {
00072       return -1;
00073     }
00074   } else {
00075     return 1;
00076   }
00077 }
00078 
00079 // Duplicates a 32-bit char buffer
00080 char_32 *CubeUtils::StrDup(const char_32 *str32) {
00081   int len = StrLen(str32);
00082   char_32 *new_str = new char_32[len + 1];
00083   if (new_str == NULL) {
00084     return NULL;
00085   }
00086   memcpy(new_str, str32, len * sizeof(*str32));
00087   new_str[len] = 0;
00088   return new_str;
00089 }
00090 
00091 // creates a raw buffer from the specified location of the image
00092 unsigned char *CubeUtils::GetImageData(IMAGE *img, int left,
00093                                        int top, int wid, int hgt) {
00094   // skip invalid dimensions
00095   if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
00096       (left + wid) > img->get_xsize() ||
00097       (top + hgt) > img->get_ysize()) {
00098     return NULL;
00099   }
00100 
00101   // copy the char img to a temp buffer
00102   unsigned char *temp_buff = new unsigned char[wid * hgt];
00103   if (temp_buff == NULL) {
00104     return NULL;
00105   }
00106 
00107   IMAGELINE line;
00108   line.init(wid);
00109 
00110   for (int y = 0, off = 0; y < hgt ; y++) {
00111     img->get_line(left, img->get_ysize() - 1 - y - top, wid, &line, 0);
00112     for (int x = 0; x < wid; x++, off++) {
00113       temp_buff[off] = line.pixels[x] ? 255 : 0;
00114     }
00115   }
00116 
00117   return temp_buff;
00118 }
00119 
00120 // creates a char samp from a specified portion of the image
00121 CharSamp *CubeUtils::CharSampleFromImg(IMAGE *img,
00122                                        int left, int top,
00123                                        int wid, int hgt) {
00124   // get the raw img data from the image
00125   unsigned char *temp_buff = GetImageData(img, left, top, wid, hgt);
00126   if (temp_buff == NULL) {
00127     return NULL;
00128   }
00129 
00130   // create a char samp from temp buffer
00131   CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
00132   // clean up temp buffer
00133   delete []temp_buff;
00134   return char_samp;
00135 }
00136 
00137 // creates a char samp from a specified portion of the image
00138 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
00139                                        int wid, int hgt) {
00140   // get the raw img data from the image
00141   unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
00142   if (temp_buff == NULL) {
00143     return NULL;
00144   }
00145 
00146   // create a char samp from temp buffer
00147   CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
00148 
00149   // clean up temp buffer
00150   delete []temp_buff;
00151   return char_samp;
00152 }
00153 
00154 // create a B/W image from a char_sample
00155 IMAGE *CubeUtils::ImageFromCharSample(CharSamp *char_samp) {
00156   // parameter check
00157   if (char_samp == NULL) {
00158     return NULL;
00159   }
00160 
00161   // get the raw data
00162   int stride = char_samp->Stride(),
00163     wid = char_samp->Width(),
00164     hgt = char_samp->Height();
00165 
00166   unsigned char  *buff = char_samp->RawData();
00167   if (buff == NULL) {
00168     return NULL;
00169   }
00170 
00171   // create a new image object
00172   IMAGE *img = new IMAGE();
00173   if (img == NULL) {
00174     return NULL;
00175   }
00176 
00177   // create a blank B/W image
00178   if (img->create(wid, hgt, 1) == -1) {
00179     delete img;
00180     return NULL;
00181   }
00182 
00183   // copy the contents
00184   IMAGELINE line;
00185   line.init(wid);
00186 
00187   for (int y = 0, off = 0; y < hgt ; y++, off += stride) {
00188     for (int x = 0; x < wid; x++) {
00189       line.pixels[x] = (buff[off + x] == 0) ? 0 : 1;
00190     }
00191 
00192     img->fast_put_line(0, hgt - 1 - y, wid, &line);
00193   }
00194 
00195   return img;
00196 }
00197 
00198 // create a B/W image from a char_sample
00199 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) {
00200   // parameter check
00201   if (char_samp == NULL) {
00202     return NULL;
00203   }
00204 
00205   // get the raw data
00206   int stride = char_samp->Stride();
00207   int wid = char_samp->Width();
00208   int hgt = char_samp->Height();
00209 
00210   Pix *pix = pixCreate(wid, hgt, 1);
00211   if (pix == NULL) {
00212     return NULL;
00213   }
00214 
00215   // copy the contents
00216   unsigned char *line = char_samp->RawData();
00217   for (int y = 0; y < hgt ; y++, line += stride) {
00218     for (int x = 0; x < wid; x++) {
00219       if (line[x] != 0) {
00220         pixSetPixel(pix, x, y, 0);
00221       } else {
00222         pixSetPixel(pix, x, y, 255);
00223       }
00224     }
00225   }
00226 
00227   return pix;
00228 }
00229 
00230 // creates a raw buffer from the specified location of the pix
00231 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
00232                                        int wid, int hgt) {
00233   // skip invalid dimensions
00234   if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
00235       (left + wid) > pix->w || (top + hgt) > pix->h ||
00236       pix->d != 1) {
00237     return NULL;
00238   }
00239 
00240   // copy the char img to a temp buffer
00241   unsigned char *temp_buff = new unsigned char[wid * hgt];
00242   if (temp_buff == NULL) {
00243     return NULL;
00244   }
00245 
00246   l_int32 w;
00247   l_int32 h;
00248   l_int32 d;
00249   l_int32 wpl;
00250   l_uint32 *line;
00251   l_uint32 *data;
00252 
00253   pixGetDimensions(pix, &w, &h, &d);
00254   wpl = pixGetWpl(pix);
00255   data = pixGetData(pix);
00256   line = data + (top * wpl);
00257 
00258   for (int y = 0, off = 0; y < hgt ; y++) {
00259     for (int x = 0; x < wid; x++, off++) {
00260       temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
00261     }
00262     line += wpl;
00263   }
00264   return temp_buff;
00265 }
00266 
00267 // read file contents to a string
00268 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
00269   str->clear();
00270   FILE *fp = fopen(file_name.c_str(), "rb");
00271   if (fp == NULL) {
00272     return false;
00273   }
00274 
00275   // get the size of the size
00276   fseek(fp, 0, SEEK_END);
00277   int file_size = ftell(fp);
00278   if (file_size < 1) {
00279     fclose(fp);
00280     return false;
00281   }
00282   // adjust string size
00283   str->reserve(file_size);
00284   // read the contents
00285   rewind(fp);
00286   char *buff = new char[file_size];
00287   if (buff == NULL) {
00288     fclose(fp);
00289     return false;
00290   }
00291   int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
00292   if (read_bytes == file_size) {
00293     str->append(buff, file_size);
00294   }
00295   delete []buff;
00296   fclose(fp);
00297   return (read_bytes == file_size);
00298 }
00299 
00300 // splits a string into vectors based on specified delimiters
00301 void CubeUtils::SplitStringUsing(const string &str,
00302                                  const string &delims,
00303                                  vector<string> *str_vec) {
00304   // Optimize the common case where delims is a single character.
00305   if (delims[0] != '\0' && delims[1] == '\0') {
00306     char c = delims[0];
00307     const char* p = str.data();
00308     const char* end = p + str.size();
00309     while (p != end) {
00310       if (*p == c) {
00311         ++p;
00312       } else {
00313         const char* start = p;
00314         while (++p != end && *p != c);
00315         str_vec->push_back(string(start, p - start));
00316       }
00317     }
00318     return;
00319   }
00320 
00321   string::size_type begin_index, end_index;
00322   begin_index = str.find_first_not_of(delims);
00323   while (begin_index != string::npos) {
00324     end_index = str.find_first_of(delims, begin_index);
00325     if (end_index == string::npos) {
00326       str_vec->push_back(str.substr(begin_index));
00327       return;
00328     }
00329     str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
00330     begin_index = str.find_first_not_of(delims, end_index);
00331   }
00332 }
00333 
00334 // UTF-8 to UTF-32 convesion functions
00335 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
00336   str32->clear();
00337   int len = strlen(utf8_str);
00338   int step = 0;
00339   for (int ch = 0; ch < len; ch += step) {
00340     step = UNICHAR::utf8_step(utf8_str + ch);
00341     if (step > 0) {
00342       UNICHAR uni_ch(utf8_str + ch, step);
00343       (*str32) += uni_ch.first_uni();
00344     }
00345   }
00346 }
00347 
00348 // UTF-8 to UTF-32 convesion functions
00349 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
00350   str->clear();
00351   for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++)  {
00352     UNICHAR uni_ch((*ch_32));
00353     char *utf8 = uni_ch.utf8_str();
00354     if (utf8 != NULL) {
00355       (*str) += utf8;
00356       delete []utf8;
00357     }
00358   }
00359 }
00360 
00361 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
00362   bool all_one_case = true;
00363   bool capitalized;
00364   bool prev_upper;
00365   bool prev_lower;
00366   bool first_upper;
00367   bool first_lower;
00368   bool cur_upper;
00369   bool cur_lower;
00370 
00371   string str8;
00372   if (!char_set) {
00373     // If cube char_set is missing, use C-locale-dependent functions
00374     // on UTF8 characters to determine case properties.
00375     first_upper = isupper(str32[0]);
00376     first_lower = islower(str32[0]);
00377     if (first_upper)
00378       capitalized = true;
00379     prev_upper = first_upper;
00380     prev_lower = islower(str32[0]);
00381     for (int c = 1; str32[c] != 0; ++c) {
00382       cur_upper = isupper(str32[c]);
00383       cur_lower = islower(str32[c]);
00384       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00385         all_one_case = false;
00386       if (cur_upper)
00387         capitalized = false;
00388       prev_upper = cur_upper;
00389       prev_lower = cur_lower;
00390     }
00391   } else {
00392     UNICHARSET *unicharset = char_set->InternalUnicharset();
00393     // Use UNICHARSET functions to determine case properties
00394     first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
00395     first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
00396     if (first_upper)
00397       capitalized = true;
00398     prev_upper = first_upper;
00399     prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
00400 
00401     for (int c = 1; c < StrLen(str32); ++c) {
00402       cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
00403       cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
00404       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00405         all_one_case = false;
00406       if (cur_upper)
00407         capitalized = false;
00408       prev_upper = cur_upper;
00409       prev_lower = cur_lower;
00410     }
00411   }
00412   return all_one_case || capitalized;
00413 }
00414 
00415 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
00416   if (!char_set) {
00417     return NULL;
00418   }
00419   UNICHARSET *unicharset = char_set->InternalUnicharset();
00420   int len = StrLen(str32);
00421   char_32 *lower = new char_32[len + 1];
00422   if (!lower)
00423     return NULL;
00424   for (int i = 0; i < len; ++i) {
00425     char_32 ch = str32[i];
00426     if (ch == INVALID_UNICHAR_ID) {
00427       delete [] lower;
00428       return NULL;
00429     }
00430     // convert upper-case characters to lower-case
00431     if (unicharset->get_isupper(char_set->ClassID(ch))) {
00432       UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
00433       const char_32 *str32_lower = char_set->ClassString(uid_lower);
00434       // expect lower-case version of character to be a single character
00435       if (!str32_lower || StrLen(str32_lower) != 1) {
00436         delete [] lower;
00437         return NULL;
00438       }
00439       lower[i] = str32_lower[0];
00440     } else {
00441       lower[i] = ch;
00442     }
00443   }
00444   lower[len] = 0;
00445   return lower;
00446 }
00447 
00448 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
00449   if (!char_set) {
00450     return NULL;
00451   }
00452   UNICHARSET *unicharset = char_set->InternalUnicharset();
00453   int len = StrLen(str32);
00454   char_32 *upper = new char_32[len + 1];
00455   if (!upper)
00456     return NULL;
00457   for (int i = 0; i < len; ++i) {
00458     char_32 ch = str32[i];
00459     if (ch == INVALID_UNICHAR_ID) {
00460       delete [] upper;
00461       return NULL;
00462     }
00463     // convert lower-case characters to upper-case
00464     if (unicharset->get_islower(char_set->ClassID(ch))) {
00465       UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
00466       const char_32 *str32_upper = char_set->ClassString(uid_upper);
00467       // expect upper-case version of character to be a single character
00468       if (!str32_upper || StrLen(str32_upper) != 1) {
00469         delete [] upper;
00470         return NULL;
00471       }
00472       upper[i] = str32_upper[0];
00473     } else {
00474       upper[i] = ch;
00475     }
00476   }
00477   upper[len] = 0;
00478   return upper;
00479 }
00480 }  // namespace tesseract