Tesseract  3.02
tesseract-ocr/cube/cube_utils.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.h
00003  * Description: Declaration of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  *(C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0(the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The CubeUtils class provides miscellaneous utility and helper functions
00021 // to the rest of the Cube Engine
00022 
00023 #ifndef CUBE_UTILS_H
00024 #define CUBE_UTILS_H
00025 
00026 #include <vector>
00027 #include <string>
00028 
00029 #include "allheaders.h"
00030 #include "const.h"
00031 #include "char_set.h"
00032 #include "char_samp.h"
00033 #include "img.h"
00034 
00035 namespace tesseract {
00036 class CubeUtils {  // every member function declared below is static
00037  public:
00038   CubeUtils();
00039   ~CubeUtils();
00040 
00041   // Converts a probability value to a cost by taking the -log() of the
00042   // probability value to a known base
00043   static int Prob2Cost(double prob_val);
00044   // Converts a cost to a probability by computing exp(-normalized cost)
00045   static double Cost2Prob(int cost);
00046   // Computes the length of a 32-bit char buffer
00047   static int StrLen(const char_32 *str);
00048   // Compares two 32-bit char buffers
00049   static int StrCmp(const char_32 *str1, const char_32 *str2);  // NOTE(review): result convention not visible here
00050   // Duplicates a 32-bit char buffer
00051   static char_32 *StrDup(const char_32 *str);  // NOTE(review): presumably the caller frees the copy - confirm
00052   // Creates a CharSamp from an IMAGE and a bounding box
00053   static CharSamp *CharSampleFromImg(IMAGE *img,
00054                                      int left, int top, int wid, int hgt);
00055   // Creates a CharSamp from a Pix and a bounding box
00056   static CharSamp *CharSampleFromPix(Pix *pix,
00057                                      int left, int top, int wid, int hgt);
00058   // Creates an IMAGE from a CharSamp
00059   static IMAGE *ImageFromCharSample(CharSamp *char_samp);
00060   // Creates a Pix from a CharSamp
00061   static Pix *PixFromCharSample(CharSamp *char_samp);
00062   // Reads the contents of a file into a string
00063   static bool ReadFileToString(const string &file_name, string *str);  // NOTE(review): bool presumably signals success - confirm
00064   // Splits a string into a vector of strings using any of the given delimiters
00065   static void SplitStringUsing(const string &str, const string &delims,
00066                                vector<string> *str_vec);
00067   // UTF-8 to UTF-32 (and back) conversion functions
00068   static void UTF8ToUTF32(const char *utf8_str, string_32 *str32);
00069   static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
00070   // Returns true if the input word is either 1) all-one-case, or 2)
00071   // first character upper-case, and remaining characters lower-case.
00072   // If char_set is not NULL, uses tesseract's unicharset functions
00073   // to determine case properties. Otherwise, uses C-locale-dependent
00074   // functions, which may be unreliable on non-ASCII characters.
00075   static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set);
00076   // Returns a char_32 pointer to the lower-case-transformed version of
00077   // the input string, or NULL on error. If char_set is NULL, returns NULL.
00078   // The returned array must be freed by the caller.
00079   static char_32 *ToLower(const char_32 *str32, CharSet *char_set);
00080   // Returns a char_32 pointer to the upper-case-transformed version of
00081   // the input string, or NULL on error. If char_set is NULL, returns NULL.
00082   // The returned array must be freed by the caller.
00083   static char_32 *ToUpper(const char_32 *str32, CharSet *char_set);
00084  private:  // helper overloads, one taking an IMAGE and one taking a Pix
00085   static unsigned char *GetImageData(IMAGE *img,
00086                                      int left, int top, int wid, int hgt);  // NOTE(review): buffer ownership not visible here
00087   static unsigned char *GetImageData(Pix *pix,
00088                                      int left, int top, int wid, int hgt);  // NOTE(review): buffer ownership not visible here
00089 };
00090 }  // namespace tesseract
00091 #endif  // CUBE_UTILS_H