Tesseract  3.02
tesseract-ocr/ccutil/strngs.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        strngs.h  (Formerly strings.h)
00003  * Description: STRING class definition.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Fri Feb 15 09:15:01 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef           STRNGS_H
00021 #define           STRNGS_H
00022 
00023 #include          <string.h>
00024 #include          "platform.h"
00025 #include          "memry.h"
00026 #include          "serialis.h"
00027 
00028 // STRING_IS_PROTECTED means that string[index] = X is invalid
00029 // because you have to go through the STRING interface to modify it.
00030 // This allows the string to ensure internal integrity and maintain
00031 // its own string length. Unfortunately this is not possible because
00032 // STRINGs are used as direct-manipulation data buffers for things
00033 // like length arrays and many places cast away the const on string()
00034 // to mutate the string. Turning this off means that internally we
00035 // cannot assume we know the strlen.
00036 #define STRING_IS_PROTECTED  0
00037 
00038 template <typename T> class GenericVector;  // forward declaration: STRING::split() only takes a pointer to it
00039 
00040 class TESS_API STRING  // string class whose header and characters share the single data_ buffer
00041 {
00042   public:
00043     STRING();
00044     STRING(const STRING &string);  // copy constructor
00045     STRING(const char *string);  // construct from a C string
00046     ~STRING ();
00047 
00048     // Writes to the given file. Returns false in case of error.
00049     bool Serialize(FILE* fp) const;
00050     // Reads from the given file. Returns false in case of error.
00051     // If swap is true, assumes a big/little-endian swap is needed.
00052     bool DeSerialize(bool swap, FILE* fp);
00053 
00054     BOOL8 contains(const char c) const;  // NOTE(review): presumably tests whether c occurs in the string - confirm
00055     inT32 length() const;  // NOTE(review): presumably the strlen, excluding the '\0' - confirm
00056     inT32 size() const { return length(); }  // synonym for length()
00057     const char *string() const;  // const view of the chars; InvariantOk() below allows for it being NULL
00058 
00059 #if STRING_IS_PROTECTED
00060     const char &operator[] (inT32 index) const;  // read-only element access
00061     // len is number of chars in s to insert starting at index in this string
00062     void insert_range(inT32 index, const char*s, int len);
00063     void erase_range(inT32 index, int len);
00064 #else
00065     char &operator[] (inT32 index) const;  // hands out a mutable char reference even on a const STRING
00066 #endif
00067     void split(const char c, GenericVector<STRING> *splited);  // NOTE(review): presumably splits on c into *splited - confirm
00068     void truncate_at(inT32 index);  // NOTE(review): presumably cuts the string at index - confirm
00069 
00070     BOOL8 operator== (const STRING & string) const;
00071     BOOL8 operator!= (const STRING & string) const;
00072     BOOL8 operator!= (const char *string) const;
00073 
00074     STRING & operator= (const char *string);
00075     STRING & operator= (const STRING & string);
00076 
00077     STRING operator+ (const STRING & string) const;
00078     STRING operator+ (const char ch) const;
00079 
00080     STRING & operator+= (const char *string);
00081     STRING & operator+= (const STRING & string);
00082     STRING & operator+= (const char ch);
00083 
00084     // Appends the given string and int (as a %d) to this.
00085     // += cannot be used for ints as there is a char += operator that would
00086     // be ambiguous, and ints usually need a string before or between them
00087     // anyway.
00088     void add_str_int(const char* str, int number);
00089 
00090     // ensure capacity but keep pointer encapsulated (ensure_cstr's result is discarded)
00091     inline void ensure(inT32 min_capacity) { ensure_cstr(min_capacity); }
00092 
00093   private:
00094     typedef struct STRING_HEADER {  // bookkeeping kept at the start of the data_ storage
00095       // How much space was allocated in the string buffer for char data.
00096       int capacity_;
00097 
00098       // used_ is how much of the capacity is currently being used,
00099       // including a '\0' terminator.
00100       //
00101       // If used_ is 0 then string is NULL (not even the '\0')
00102       // else if used_ > 0 then it is strlen() + 1 (because it includes '\0')
00103       // else strlen is >= 0 (not NULL) but needs to be computed.
00104       //      this condition is set when encapsulation is violated because
00105       //      an API returned a mutable string.
00106       //
00107       // capacity_ - used_ = excess capacity that the string can grow
00108       //                     without reallocating
00109       mutable int used_;  // mutable so that const members (see FixHeader) can update it
00110     } STRING_HEADER;
00111 
00112     // To preserve the behavior of the old serialization, we only have space
00113     // for one pointer in this structure. So we are embedding a data structure
00114     // at the start of the storage that will hold additional state variables,
00115     // then storing the actual string contents immediately after.
00116     STRING_HEADER* data_;
00117 
00118     // returns the header part of the storage
00119     inline STRING_HEADER* GetHeader() {
00120       return data_;
00121     }
00122     inline const STRING_HEADER* GetHeader() const {
00123       return data_;
00124     }
00125 
00126     // returns the string data part of storage
00127     inline char* GetCStr() {
00128       return ((char *)data_) + sizeof(STRING_HEADER);  // chars begin right after the header
00129     };
00130 
00131     inline const char* GetCStr() const {
00132       return ((const char *)data_) + sizeof(STRING_HEADER);  // const overload, same offset
00133     };
00134     inline bool InvariantOk() const {  // checks used_ against string(); always true when STRING_IS_PROTECTED is 0
00135 #if STRING_IS_PROTECTED
00136       return (GetHeader()->used_ == 0) ?
00137         (string() == NULL) : (GetHeader()->used_ == (strlen(string()) + 1));  // NOTE(review): compares int used_ with a size_t value
00138 #else
00139       return true;
00140 #endif
00141     }
00142 
00143     // Ensure string has requested capacity as optimization
00144     // to avoid unnecessary reallocations.
00145     // The return value is a cstr buffer with at least requested capacity
00146     char* ensure_cstr(inT32 min_capacity);
00147 
00148     void FixHeader() const;  // make used_ non-negative, even if const
00149 
00150     char* AllocData(int used, int capacity);  // NOTE(review): presumably allocates the data_ storage and returns its char part - confirm
00151     void DiscardData();  // NOTE(review): presumably releases the data_ storage - confirm
00152 };
00153 #endif