Tesseract  3.02
tesseract-ocr/ccutil/strngs.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        strngs.c  (Formerly strings.c)
00003  * Description: STRING class functions.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Fri Feb 15 09:13:30 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include          "mfcpch.h"     // Precompiled headers
00021 #include          "helpers.h"
00022 #include          "tprintf.h"
00023 #include          "strngs.h"
00024 #include          "genericvector.h"
00025 
00026 #include <assert.h>
00027 // Size of buffer needed to host the decimal representation of the maximum
00028 // possible length of an int (in 64 bits, being -<20 digits>).
00029 const int kMaxIntSize = 22;
00030 
00031 /**********************************************************************
00032  * STRING_HEADER provides metadata about the allocated buffer,
00033  * including total capacity and how much used (strlen with '\0').
00034  *
00035  * The implementation hides this header at the start of the data
00036  * buffer and appends the string on the end to keep sizeof(STRING)
00037  * unchanged from earlier versions so serialization is not affected.
00038  *
00039  * The collection of MACROS provide different implementations depending
00040  * on whether the string keeps track of its strlen or not so that this
00041  * feature can be added in later when consumers don't modify the string
00042  **********************************************************************/
00043 
00044 // Smallest string to allocate by default
00045 const int kMinCapacity = 16;
00046 
00047 char* STRING::AllocData(int used, int capacity) {
00048   data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
00049 
00050   // header is the metadata for this memory block
00051   STRING_HEADER* header = GetHeader();
00052   header->capacity_ = capacity;
00053   header->used_ = used;
00054   return GetCStr();
00055 }
00056 
00057 void STRING::DiscardData() {
00058   free_string((char *)data_);
00059 }
00060 
00061 // This is a private method; ensure FixHeader is called (or used_ is well defined)
00062 // beforehand
00063 char* STRING::ensure_cstr(inT32 min_capacity) {
00064   STRING_HEADER* orig_header = GetHeader();
00065   if (min_capacity <= orig_header->capacity_)
00066     return ((char *)this->data_) + sizeof(STRING_HEADER);
00067 
00068   // if we are going to grow bigger, than double our existing
00069   // size, but if that still is not big enough then keep the
00070   // requested capacity
00071   if (min_capacity < 2 * orig_header->capacity_)
00072     min_capacity = 2 * orig_header->capacity_;
00073 
00074   int alloc = sizeof(STRING_HEADER) + min_capacity;
00075   STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
00076 
00077   memcpy(&new_header[1], GetCStr(), orig_header->used_);
00078   new_header->capacity_ = min_capacity;
00079   new_header->used_ = orig_header->used_;
00080 
00081   // free old memory, then rebind to new memory
00082   DiscardData();
00083   data_ = new_header;
00084 
00085   assert(InvariantOk());
00086   return ((char *)data_) + sizeof(STRING_HEADER);
00087 }
00088 
00089 // This is const, but is modifying a mutable field
00090 // this way it can be used on const or non-const instances.
00091 void STRING::FixHeader() const {
00092   const STRING_HEADER* header = GetHeader();
00093   if (header->used_ < 0)
00094     header->used_ = strlen(GetCStr()) + 1;
00095 }
00096 
00097 
00098 STRING::STRING() {
00099   // 0 indicates old NULL -- it doesnt even have '\0'
00100   AllocData(0, kMinCapacity);
00101 }
00102 
00103 STRING::STRING(const STRING& str) {
00104   str.FixHeader();
00105   const STRING_HEADER* str_header  = str.GetHeader();
00106   int   str_used  = str_header->used_;
00107   char *this_cstr = AllocData(str_used, str_used);
00108   memcpy(this_cstr, str.GetCStr(), str_used);
00109   assert(InvariantOk());
00110 }
00111 
00112 STRING::STRING(const char* cstr) {
00113   if (cstr == NULL) {
00114     AllocData(0, 0);
00115   } else {
00116     int len = strlen(cstr) + 1;
00117     char* this_cstr = AllocData(len, len);
00118     memcpy(this_cstr, cstr, len);
00119   }
00120   assert(InvariantOk());
00121 }
00122 
00123 STRING::~STRING() {
00124   DiscardData();
00125 }
00126 
00127 // Writes to the given file. Returns false in case of error.
00128 bool STRING::Serialize(FILE* fp) const {
00129   inT32 len = length();
00130   if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
00131   if (fwrite(GetCStr(), 1, len, fp) != len) return false;
00132   return true;
00133 }
00134 // Reads from the given file. Returns false in case of error.
00135 // If swap is true, assumes a big/little-endian swap is needed.
00136 bool STRING::DeSerialize(bool swap, FILE* fp) {
00137   inT32 len;
00138   if (fread(&len, sizeof(len), 1, fp) != 1) return false;
00139   if (swap)
00140     ReverseN(&len, sizeof(len));
00141   truncate_at(len);
00142   if (fread(GetCStr(), 1, len, fp) != len) return false;
00143   return true;
00144 }
00145 
00146 BOOL8 STRING::contains(const char c) const {
00147   return (c != '\0') && (strchr (GetCStr(), c) != NULL);
00148 }
00149 
00150 inT32 STRING::length() const {
00151   FixHeader();
00152   return GetHeader()->used_ - 1;
00153 }
00154 
00155 const char* STRING::string() const {
00156   const STRING_HEADER* header = GetHeader();
00157   if (header->used_ == 0)
00158     return NULL;
00159 
00160   // mark header length unreliable because tesseract might
00161   // cast away the const and mutate the string directly.
00162   header->used_ = -1;
00163   return GetCStr();
00164 }
00165 
00166 /******
00167  * The STRING_IS_PROTECTED interface adds additional support to migrate
00168  * code that needs to modify the STRING in ways not otherwise supported
00169  * without violating encapsulation.
00170  *
00171  * Also makes the [] operator return a const so it is immutable
00172  */
00173 #if STRING_IS_PROTECTED
00174 const char& STRING::operator[](inT32 index) const {
00175   return GetCStr()[index];
00176 }
00177 
00178 void STRING::insert_range(inT32 index, const char* str, int len) {
00179   // if index is outside current range, then also grow size of string
00180   // to accmodate the requested range.
00181   STRING_HEADER* this_header = GetHeader();
00182   int used = this_header->used_;
00183   if (index > used)
00184     used = index;
00185 
00186   char* this_cstr = ensure_cstr(used + len + 1);
00187   if (index < used) {
00188     // move existing string from index to '\0' inclusive.
00189     memmove(this_cstr + index + len,
00190            this_cstr + index,
00191            this_header->used_ - index);
00192   } else if (len > 0) {
00193     // We are going to overwrite previous null terminator, so write the new one.
00194     this_cstr[this_header->used_ + len - 1] = '\0';
00195 
00196     // If the old header did not have the terminator,
00197     // then we need to account for it now that we've added it.
00198     // Otherwise it was already accounted for; we just moved it.
00199     if (this_header->used_ == 0)
00200       ++this_header->used_;
00201   }
00202 
00203   // Write new string to index.
00204   // The string is already terminated from the conditions above.
00205   memcpy(this_cstr + index, str, len);
00206   this_header->used_ += len;
00207 
00208   assert(InvariantOk());
00209 }
00210 
00211 void STRING::erase_range(inT32 index, int len) {
00212   char* this_cstr = GetCStr();
00213   STRING_HEADER* this_header = GetHeader();
00214 
00215   memcpy(this_cstr+index, this_cstr+index+len,
00216          this_header->used_ - index - len);
00217   this_header->used_ -= len;
00218   assert(InvariantOk());
00219 }
00220 
00221 #else
00222 void STRING::truncate_at(inT32 index) {
00223   char* this_cstr = ensure_cstr(index + 1);
00224   this_cstr[index] = '\0';
00225   GetHeader()->used_ = index + 1;
00226   assert(InvariantOk());
00227 }
00228 
00229 char& STRING::operator[](inT32 index) const {
00230   // Code is casting away this const and mutating the string,
00231   // so mark used_ as -1 to flag it unreliable.
00232   GetHeader()->used_ = -1;
00233   return ((char *)GetCStr())[index];
00234 }
00235 #endif
00236 
00237 void STRING::split(const char c, GenericVector<STRING> *splited) {
00238   int start_index = 0;
00239   for (int i = 0; i < length(); i++) {
00240     if ((*this)[i] == c) {
00241       if (i != start_index) {
00242         (*this)[i] = '\0';
00243         STRING tmp = GetCStr() + start_index;
00244         splited->push_back(tmp);
00245         (*this)[i] = c;
00246       }
00247       start_index = i + 1;
00248     }
00249   }
00250 
00251   if (length() != start_index) {
00252     STRING tmp = GetCStr() + start_index;
00253     splited->push_back(tmp);
00254   }
00255 }
00256 
00257 BOOL8 STRING::operator==(const STRING& str) const {
00258   FixHeader();
00259   str.FixHeader();
00260   const STRING_HEADER* str_header = str.GetHeader();
00261   const STRING_HEADER* this_header = GetHeader();
00262   int this_used = this_header->used_;
00263   int str_used  = str_header->used_;
00264 
00265   return (this_used == str_used)
00266           && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
00267 }
00268 
00269 BOOL8 STRING::operator!=(const STRING& str) const {
00270   FixHeader();
00271   str.FixHeader();
00272   const STRING_HEADER* str_header = str.GetHeader();
00273   const STRING_HEADER* this_header = GetHeader();
00274   int this_used = this_header->used_;
00275   int str_used  = str_header->used_;
00276 
00277   return (this_used != str_used)
00278          || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
00279 }
00280 
00281 BOOL8 STRING::operator!=(const char* cstr) const {
00282   FixHeader();
00283   const STRING_HEADER* this_header = GetHeader();
00284 
00285   if (cstr == NULL)
00286     return this_header->used_ > 1;  // either '\0' or NULL
00287   else {
00288     inT32 length = strlen(cstr) + 1;
00289     return (this_header->used_ != length)
00290             || (memcmp(GetCStr(), cstr, length) != 0);
00291   }
00292 }
00293 
00294 STRING& STRING::operator=(const STRING& str) {
00295   str.FixHeader();
00296   const STRING_HEADER* str_header = str.GetHeader();
00297   int   str_used = str_header->used_;
00298 
00299   GetHeader()->used_ = 0;  // clear since ensure doesnt need to copy data
00300   char* this_cstr = ensure_cstr(str_used);
00301   STRING_HEADER* this_header = GetHeader();
00302 
00303   memcpy(this_cstr, str.GetCStr(), str_used);
00304   this_header->used_ = str_used;
00305 
00306   assert(InvariantOk());
00307   return *this;
00308 }
00309 
00310 STRING & STRING::operator+=(const STRING& str) {
00311   FixHeader();
00312   str.FixHeader();
00313   const STRING_HEADER* str_header = str.GetHeader();
00314   const char* str_cstr = str.GetCStr();
00315   int  str_used  = str_header->used_;
00316   int  this_used = GetHeader()->used_;
00317   char* this_cstr = ensure_cstr(this_used + str_used);
00318 
00319   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00320 
00321   if (this_used > 1) {
00322     memcpy(this_cstr + this_used - 1, str_cstr, str_used);
00323     this_header->used_ += str_used - 1;  // overwrite '\0'
00324   } else {
00325     memcpy(this_cstr, str_cstr, str_used);
00326     this_header->used_ = str_used;
00327   }
00328 
00329   assert(InvariantOk());
00330   return *this;
00331 }
00332 
00333 void STRING::add_str_int(const char* str, int number) {
00334   if (str != NULL)
00335     *this += str;
00336   // Allow space for the maximum possible length of inT64.
00337   char num_buffer[kMaxIntSize];
00338   snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
00339   num_buffer[kMaxIntSize - 1] = '\0';
00340   *this += num_buffer;
00341 }
00342 
00343 STRING & STRING::operator=(const char* cstr) {
00344   STRING_HEADER* this_header = GetHeader();
00345   if (cstr) {
00346     int len = strlen(cstr) + 1;
00347 
00348     this_header->used_ = 0;  // dont bother copying data if need to realloc
00349     char* this_cstr = ensure_cstr(len);
00350     this_header = GetHeader();  // for realloc
00351     memcpy(this_cstr, cstr, len);
00352     this_header->used_ = len;
00353   }
00354   else {
00355     // Reallocate to zero capacity buffer, consistent with the corresponding
00356     // copy constructor.
00357     DiscardData();
00358     AllocData(0, 0);
00359   }
00360 
00361   assert(InvariantOk());
00362   return *this;
00363 }
00364 
00365 
00366 STRING STRING::operator+(const STRING& str) const {
00367   STRING result(*this);
00368   result += str;
00369 
00370   assert(InvariantOk());
00371   return result;
00372 }
00373 
00374 
00375 STRING STRING::operator+(const char ch) const {
00376   STRING result;
00377   FixHeader();
00378   const STRING_HEADER* this_header = GetHeader();
00379   int this_used = this_header->used_;
00380   char* result_cstr = result.ensure_cstr(this_used + 1);
00381   STRING_HEADER* result_header = result.GetHeader();
00382   int result_used = result_header->used_;
00383 
00384   // copies '\0' but we'll overwrite that
00385   memcpy(result_cstr, GetCStr(), this_used);
00386   result_cstr[result_used] = ch;      // overwrite old '\0'
00387   result_cstr[result_used + 1] = '\0';  // append on '\0'
00388   ++result_header->used_;
00389 
00390   assert(InvariantOk());
00391   return result;
00392 }
00393 
00394 
00395 STRING&  STRING::operator+=(const char *str) {
00396   if (!str || !*str)  // empty string has no effect
00397     return *this;
00398 
00399   FixHeader();
00400   int len = strlen(str) + 1;
00401   int this_used = GetHeader()->used_;
00402   char* this_cstr = ensure_cstr(this_used + len);
00403   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00404 
00405   // if we had non-empty string then append overwriting old '\0'
00406   // otherwise replace
00407   if (this_used > 0) {
00408     memcpy(this_cstr + this_used - 1, str, len);
00409     this_header->used_ += len - 1;
00410   } else {
00411     memcpy(this_cstr, str, len);
00412     this_header->used_ = len;
00413   }
00414 
00415   assert(InvariantOk());
00416   return *this;
00417 }
00418 
00419 
00420 STRING& STRING::operator+=(const char ch) {
00421   if (ch == '\0')
00422     return *this;
00423 
00424   FixHeader();
00425   int   this_used = GetHeader()->used_;
00426   char* this_cstr = ensure_cstr(this_used + 1);
00427   STRING_HEADER* this_header = GetHeader();
00428 
00429   if (this_used > 0)
00430     --this_used; // undo old empty null if there was one
00431 
00432   this_cstr[this_used++] = ch;   // append ch to end
00433   this_cstr[this_used++] = '\0'; // append '\0' after ch
00434   this_header->used_ = this_used;
00435 
00436   assert(InvariantOk());
00437   return *this;
00438 }