Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: cube_utils.cpp 00003 * Description: Implementation of the Cube Utilities Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <math.h> 00021 #include <string> 00022 #include <vector> 00023 #include "cube_utils.h" 00024 #include "char_set.h" 00025 #include "unichar.h" 00026 00027 namespace tesseract { 00028 CubeUtils::CubeUtils() { 00029 } 00030 00031 CubeUtils::~CubeUtils() { 00032 } 00033 00034 // convert a prob to a cost (-ve log prob) 00035 int CubeUtils::Prob2Cost(double prob_val) { 00036 if (prob_val < MIN_PROB) { 00037 return MIN_PROB_COST; 00038 } 00039 return static_cast<int>(-log(prob_val) * PROB2COST_SCALE); 00040 } 00041 00042 // converts a cost to probability 00043 double CubeUtils::Cost2Prob(int cost) { 00044 return exp(-cost / PROB2COST_SCALE); 00045 } 00046 00047 // computes the length of a NULL terminated char_32 string 00048 int CubeUtils::StrLen(const char_32 *char_32_ptr) { 00049 if (char_32_ptr == NULL) { 00050 return 0; 00051 } 00052 int len = -1; 00053 while (char_32_ptr[++len]); 00054 return len; 00055 } 00056 00057 // compares two char_32 strings 00058 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) { 00059 const char_32 *pch1 = str1; 00060 const char_32 *pch2 = str2; 00061 00062 for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) { 00063 if ((*pch1) != (*pch2)) { 00064 return (*pch1) - (*pch2); 00065 } 00066 } 00067 00068 if ((*pch1) == 0) { 00069 if ((*pch2) == 0) { 00070 return 0; 00071 } else { 00072 return -1; 00073 } 00074 } else { 00075 return 1; 00076 } 00077 } 00078 00079 // Duplicates a 32-bit char buffer 00080 char_32 *CubeUtils::StrDup(const char_32 *str32) { 00081 int len = StrLen(str32); 00082 char_32 *new_str = new char_32[len + 1]; 00083 if (new_str == NULL) { 00084 return NULL; 00085 } 00086 memcpy(new_str, str32, len * sizeof(*str32)); 00087 new_str[len] = 0; 00088 return new_str; 00089 } 00090 00091 // creates a raw buffer from the specified location of the image 00092 unsigned char *CubeUtils::GetImageData(IMAGE *img, int left, 00093 int top, int wid, int hgt) { 00094 // skip invalid dimensions 00095 if (left < 0 || top < 0 || wid < 0 || hgt < 0 || 00096 (left + wid) > img->get_xsize() || 00097 (top + hgt) > img->get_ysize()) { 00098 return NULL; 00099 } 00100 00101 // copy the char img to a temp buffer 00102 unsigned char *temp_buff = new unsigned char[wid * hgt]; 00103 if (temp_buff == NULL) { 00104 return NULL; 00105 } 00106 00107 IMAGELINE line; 00108 line.init(wid); 00109 00110 for (int y = 0, off = 0; y < hgt ; y++) { 00111 img->get_line(left, img->get_ysize() - 1 - y - top, wid, &line, 0); 00112 for (int x = 0; x < wid; x++, off++) { 00113 temp_buff[off] = line.pixels[x] ? 255 : 0; 00114 } 00115 } 00116 00117 return temp_buff; 00118 } 00119 00120 // creates a char samp from a specified portion of the image 00121 CharSamp *CubeUtils::CharSampleFromImg(IMAGE *img, 00122 int left, int top, 00123 int wid, int hgt) { 00124 // get the raw img data from the image 00125 unsigned char *temp_buff = GetImageData(img, left, top, wid, hgt); 00126 if (temp_buff == NULL) { 00127 return NULL; 00128 } 00129 00130 // create a char samp from temp buffer 00131 CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff); 00132 // clean up temp buffer 00133 delete []temp_buff; 00134 return char_samp; 00135 } 00136 00137 // creates a char samp from a specified portion of the image 00138 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top, 00139 int wid, int hgt) { 00140 // get the raw img data from the image 00141 unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt); 00142 if (temp_buff == NULL) { 00143 return NULL; 00144 } 00145 00146 // create a char samp from temp buffer 00147 CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff); 00148 00149 // clean up temp buffer 00150 delete []temp_buff; 00151 return char_samp; 00152 } 00153 00154 // create a B/W image from a char_sample 00155 IMAGE *CubeUtils::ImageFromCharSample(CharSamp *char_samp) { 00156 // parameter check 00157 if (char_samp == NULL) { 00158 return NULL; 00159 } 00160 00161 // get the raw data 00162 int stride = char_samp->Stride(), 00163 wid = char_samp->Width(), 00164 hgt = char_samp->Height(); 00165 00166 unsigned char *buff = char_samp->RawData(); 00167 if (buff == NULL) { 00168 return NULL; 00169 } 00170 00171 // create a new image object 00172 IMAGE *img = new IMAGE(); 00173 if (img == NULL) { 00174 return NULL; 00175 } 00176 00177 // create a blank B/W image 00178 if (img->create(wid, hgt, 1) == -1) { 00179 delete img; 00180 return NULL; 00181 } 00182 00183 // copy the contents 00184 IMAGELINE line; 00185 line.init(wid); 00186 00187 for (int y = 0, off = 0; y < hgt ; y++, off += stride) { 00188 for (int x = 0; x < wid; x++) { 00189 line.pixels[x] = (buff[off + x] == 0) ? 0 : 1; 00190 } 00191 00192 img->fast_put_line(0, hgt - 1 - y, wid, &line); 00193 } 00194 00195 return img; 00196 } 00197 00198 // create a B/W image from a char_sample 00199 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) { 00200 // parameter check 00201 if (char_samp == NULL) { 00202 return NULL; 00203 } 00204 00205 // get the raw data 00206 int stride = char_samp->Stride(); 00207 int wid = char_samp->Width(); 00208 int hgt = char_samp->Height(); 00209 00210 Pix *pix = pixCreate(wid, hgt, 1); 00211 if (pix == NULL) { 00212 return NULL; 00213 } 00214 00215 // copy the contents 00216 unsigned char *line = char_samp->RawData(); 00217 for (int y = 0; y < hgt ; y++, line += stride) { 00218 for (int x = 0; x < wid; x++) { 00219 if (line[x] != 0) { 00220 pixSetPixel(pix, x, y, 0); 00221 } else { 00222 pixSetPixel(pix, x, y, 255); 00223 } 00224 } 00225 } 00226 00227 return pix; 00228 } 00229 00230 // creates a raw buffer from the specified location of the pix 00231 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top, 00232 int wid, int hgt) { 00233 // skip invalid dimensions 00234 if (left < 0 || top < 0 || wid < 0 || hgt < 0 || 00235 (left + wid) > pix->w || (top + hgt) > pix->h || 00236 pix->d != 1) { 00237 return NULL; 00238 } 00239 00240 // copy the char img to a temp buffer 00241 unsigned char *temp_buff = new unsigned char[wid * hgt]; 00242 if (temp_buff == NULL) { 00243 return NULL; 00244 } 00245 00246 l_int32 w; 00247 l_int32 h; 00248 l_int32 d; 00249 l_int32 wpl; 00250 l_uint32 *line; 00251 l_uint32 *data; 00252 00253 pixGetDimensions(pix, &w, &h, &d); 00254 wpl = pixGetWpl(pix); 00255 data = pixGetData(pix); 00256 line = data + (top * wpl); 00257 00258 for (int y = 0, off = 0; y < hgt ; y++) { 00259 for (int x = 0; x < wid; x++, off++) { 00260 temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255; 00261 } 00262 line += wpl; 00263 } 00264 return temp_buff; 00265 } 00266 00267 // read file contents to a string 00268 bool CubeUtils::ReadFileToString(const string &file_name, string *str) { 00269 str->clear(); 00270 FILE *fp = fopen(file_name.c_str(), "rb"); 00271 if (fp == NULL) { 00272 return false; 00273 } 00274 00275 // get the size of the size 00276 fseek(fp, 0, SEEK_END); 00277 int file_size = ftell(fp); 00278 if (file_size < 1) { 00279 fclose(fp); 00280 return false; 00281 } 00282 // adjust string size 00283 str->reserve(file_size); 00284 // read the contents 00285 rewind(fp); 00286 char *buff = new char[file_size]; 00287 if (buff == NULL) { 00288 fclose(fp); 00289 return false; 00290 } 00291 int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp); 00292 if (read_bytes == file_size) { 00293 str->append(buff, file_size); 00294 } 00295 delete []buff; 00296 fclose(fp); 00297 return (read_bytes == file_size); 00298 } 00299 00300 // splits a string into vectors based on specified delimiters 00301 void CubeUtils::SplitStringUsing(const string &str, 00302 const string &delims, 00303 vector<string> *str_vec) { 00304 // Optimize the common case where delims is a single character. 00305 if (delims[0] != '\0' && delims[1] == '\0') { 00306 char c = delims[0]; 00307 const char* p = str.data(); 00308 const char* end = p + str.size(); 00309 while (p != end) { 00310 if (*p == c) { 00311 ++p; 00312 } else { 00313 const char* start = p; 00314 while (++p != end && *p != c); 00315 str_vec->push_back(string(start, p - start)); 00316 } 00317 } 00318 return; 00319 } 00320 00321 string::size_type begin_index, end_index; 00322 begin_index = str.find_first_not_of(delims); 00323 while (begin_index != string::npos) { 00324 end_index = str.find_first_of(delims, begin_index); 00325 if (end_index == string::npos) { 00326 str_vec->push_back(str.substr(begin_index)); 00327 return; 00328 } 00329 str_vec->push_back(str.substr(begin_index, (end_index - begin_index))); 00330 begin_index = str.find_first_not_of(delims, end_index); 00331 } 00332 } 00333 00334 // UTF-8 to UTF-32 convesion functions 00335 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) { 00336 str32->clear(); 00337 int len = strlen(utf8_str); 00338 int step = 0; 00339 for (int ch = 0; ch < len; ch += step) { 00340 step = UNICHAR::utf8_step(utf8_str + ch); 00341 if (step > 0) { 00342 UNICHAR uni_ch(utf8_str + ch, step); 00343 (*str32) += uni_ch.first_uni(); 00344 } 00345 } 00346 } 00347 00348 // UTF-8 to UTF-32 convesion functions 00349 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) { 00350 str->clear(); 00351 for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) { 00352 UNICHAR uni_ch((*ch_32)); 00353 char *utf8 = uni_ch.utf8_str(); 00354 if (utf8 != NULL) { 00355 (*str) += utf8; 00356 delete []utf8; 00357 } 00358 } 00359 } 00360 00361 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { 00362 bool all_one_case = true; 00363 bool capitalized; 00364 bool prev_upper; 00365 bool prev_lower; 00366 bool first_upper; 00367 bool first_lower; 00368 bool cur_upper; 00369 bool cur_lower; 00370 00371 string str8; 00372 if (!char_set) { 00373 // If cube char_set is missing, use C-locale-dependent functions 00374 // on UTF8 characters to determine case properties. 00375 first_upper = isupper(str32[0]); 00376 first_lower = islower(str32[0]); 00377 if (first_upper) 00378 capitalized = true; 00379 prev_upper = first_upper; 00380 prev_lower = islower(str32[0]); 00381 for (int c = 1; str32[c] != 0; ++c) { 00382 cur_upper = isupper(str32[c]); 00383 cur_lower = islower(str32[c]); 00384 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00385 all_one_case = false; 00386 if (cur_upper) 00387 capitalized = false; 00388 prev_upper = cur_upper; 00389 prev_lower = cur_lower; 00390 } 00391 } else { 00392 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00393 // Use UNICHARSET functions to determine case properties 00394 first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); 00395 first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); 00396 if (first_upper) 00397 capitalized = true; 00398 prev_upper = first_upper; 00399 prev_lower = unicharset->get_islower(char_set->ClassID(str32[0])); 00400 00401 for (int c = 1; c < StrLen(str32); ++c) { 00402 cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); 00403 cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); 00404 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00405 all_one_case = false; 00406 if (cur_upper) 00407 capitalized = false; 00408 prev_upper = cur_upper; 00409 prev_lower = cur_lower; 00410 } 00411 } 00412 return all_one_case || capitalized; 00413 } 00414 00415 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) { 00416 if (!char_set) { 00417 return NULL; 00418 } 00419 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00420 int len = StrLen(str32); 00421 char_32 *lower = new char_32[len + 1]; 00422 if (!lower) 00423 return NULL; 00424 for (int i = 0; i < len; ++i) { 00425 char_32 ch = str32[i]; 00426 if (ch == INVALID_UNICHAR_ID) { 00427 delete [] lower; 00428 return NULL; 00429 } 00430 // convert upper-case characters to lower-case 00431 if (unicharset->get_isupper(char_set->ClassID(ch))) { 00432 UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch)); 00433 const char_32 *str32_lower = char_set->ClassString(uid_lower); 00434 // expect lower-case version of character to be a single character 00435 if (!str32_lower || StrLen(str32_lower) != 1) { 00436 delete [] lower; 00437 return NULL; 00438 } 00439 lower[i] = str32_lower[0]; 00440 } else { 00441 lower[i] = ch; 00442 } 00443 } 00444 lower[len] = 0; 00445 return lower; 00446 } 00447 00448 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) { 00449 if (!char_set) { 00450 return NULL; 00451 } 00452 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00453 int len = StrLen(str32); 00454 char_32 *upper = new char_32[len + 1]; 00455 if (!upper) 00456 return NULL; 00457 for (int i = 0; i < len; ++i) { 00458 char_32 ch = str32[i]; 00459 if (ch == INVALID_UNICHAR_ID) { 00460 delete [] upper; 00461 return NULL; 00462 } 00463 // convert lower-case characters to upper-case 00464 if (unicharset->get_islower(char_set->ClassID(ch))) { 00465 UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch)); 00466 const char_32 *str32_upper = char_set->ClassString(uid_upper); 00467 // expect upper-case version of character to be a single character 00468 if (!str32_upper || StrLen(str32_upper) != 1) { 00469 delete [] upper; 00470 return NULL; 00471 } 00472 upper[i] = str32_upper[0]; 00473 } else { 00474 upper[i] = ch; 00475 } 00476 } 00477 upper[len] = 0; 00478 return upper; 00479 } 00480 } // namespace tesseract