Tesseract  3.02
tesseract-ocr/ccutil/scanutils.cpp
Go to the documentation of this file.
00001 // Copyright 2006 Google Inc.
00002 // All Rights Reserved.
00003 // Author: renn
00004 //
00005 // The fscanf, vfscanf and creat functions are implemented so that their
00006 // functionality is mostly like their stdio counterparts. However, currently
00007 // these functions do not use any buffering, making them rather slow.
00008 // File streams are thus processed one character at a time.
00009 // Although the implementations of the scanf functions do lack a few minor
00010 // features, they should be sufficient for their use in tesseract.
00011 //
00012 // Licensed under the Apache License, Version 2.0 (the "License");
00013 // you may not use this file except in compliance with the License.
00014 // You may obtain a copy of the License at
00015 // http://www.apache.org/licenses/LICENSE-2.0
00016 // Unless required by applicable law or agreed to in writing, software
00017 // distributed under the License is distributed on an "AS IS" BASIS,
00018 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00019 // See the License for the specific language governing permissions and
00020 // limitations under the License.
00021 
00022 #ifdef EMBEDDED
00023 
00024 #include <ctype.h>
00025 #include <stdarg.h>
00026 #include <stddef.h>
00027 #include <inttypes.h>
00028 #include <string.h>
00029 #include <limits.h>
00030 #include <stdio.h>
00031 #include <sys/types.h>
00032 #include <sys/stat.h>
00033 #include <fcntl.h>
00034 
00035 #include "scanutils.h"
00036 #include "tprintf.h"
00037 
// Bit flags collected while a single conversion specification is parsed.
enum Flags {
  FL_SPLAT  = 1 << 0,   // '*' seen: parse the field but do not store it
  FL_INV    = 1 << 1,   // %[ set is inverted ('^')
  FL_WIDTH  = 1 << 2,   // An explicit field width was given
  FL_MINUS  = 1 << 3    // Value is negative
};
00044 
// Integer "rank" of a conversion; each 'h' lowers it by one and each 'l'
// raises it by one, starting from RANK_INT.
enum Ranks {
  RANK_CHAR     = -2,
  RANK_SHORT    = -1,
  RANK_INT      = 0,
  RANK_LONG     = 1,
  RANK_LONGLONG = 2,
  RANK_PTR      = INT_MAX   // Sentinel rank reserved for pointer conversions
};
00053 
00054 const enum Ranks kMinRank = RANK_CHAR;
00055 const enum Ranks kMaxRank = RANK_LONGLONG;
00056 
00057 const enum Ranks kIntMaxRank = RANK_LONGLONG;
00058 const enum Ranks kSizeTRank = RANK_LONG;
00059 const enum Ranks kPtrDiffRank = RANK_LONG;
00060 
// Reason why format processing stopped early.
enum Bail {
  BAIL_NONE = 0,    // Still running, nothing went wrong
  BAIL_EOF  = 1,    // Input ran out
  BAIL_ERR  = 2     // Input did not match the format
};
00066 
00067 // Helper functions ------------------------------------------------------------
// Number of bits in a long (and therefore in one word of a bitmap).
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
00071 
// Consumes whitespace from s and returns the first non-whitespace character
// (or EOF). That character is pushed back so the next read sees it again.
static inline int
SkipSpace(FILE *s)
{
  int next = fgetc(s);
  while (isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s);  // Leave the terminating character in the stream
  return next;
}
00080 
00081 static inline void
00082 SetBit(unsigned long *bitmap, unsigned int bit)
00083 {
00084   bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
00085 }
00086 
00087 static inline int
00088 TestBit(unsigned long *bitmap, unsigned int bit)
00089 {
00090   return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
00091 }
00092 
// Maps a character to its digit value: '0'-'9' give 0-9 and letters of either
// case give 10-35. Any other input yields -1.
static inline int DigitValue(int ch)
{
  if ('0' <= ch && ch <= '9')
    return ch - '0';
  if ('a' <= ch && ch <= 'z')
    return 10 + (ch - 'a');
  if ('A' <= ch && ch <= 'Z')
    return 10 + (ch - 'A');
  return -1;
}
00105 
00106 // IO (re-)implementations -----------------------------------------------------
00107 uintmax_t streamtoumax(FILE* s, int base)
00108 {
00109   int minus = 0;
00110   uintmax_t v = 0;
00111   int d, c = 0;
00112 
00113   for (c = fgetc(s);
00114     isspace(static_cast<unsigned char>(c)) && (c != EOF);
00115     c = fgetc(s))
00116 
00117   // Single optional + or -
00118   if (c == '-' || c == '+') {
00119     minus = (c == '-');
00120     c = fgetc(s);
00121   }
00122 
00123   // Assign correct base
00124   if (base == 0) {
00125     if (c == '0') {
00126       c = fgetc(s);
00127       if (c == 'x' || c == 'X') {
00128         base = 16;
00129         c = fgetc(s);
00130       } else {
00131         base = 8;
00132       }
00133     }
00134   } else if (base == 16) {
00135     if (c == '0') {
00136       c = fgetc(s);
00137       if (c == 'x' && c == 'X') c = fgetc(s);
00138     }
00139   }
00140 
00141   // Actual number parsing
00142   for (; (c != EOF) && (d = DigitValue(c)) >= 0 && d < base; c = fgetc(s))
00143     v = v*base + d;
00144 
00145   ungetc(c, s);
00146   return minus ? -v : v;
00147 }
00148 
00149 double streamtofloat(FILE* s)
00150 {
00151   int minus = 0;
00152   int v = 0;
00153   int d, c = 0;
00154   int k = 1;
00155   int w = 0;
00156 
00157   for (c = fgetc(s);
00158     isspace(static_cast<unsigned char>(c)) && (c != EOF);
00159     c = fgetc(s));
00160 
00161   // Single optional + or -
00162   if (c == '-' || c == '+') {
00163     minus = (c == '-');
00164     c = fgetc(s);
00165   }
00166 
00167   // Actual number parsing
00168   for (; (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s))
00169     v = v*10 + d;
00170   if (c == '.') {
00171     for (c = fgetc(s); (c != EOF) && (d = DigitValue(c)) >= 0; c = fgetc(s)) {
00172       w = w*10 + d;
00173       k *= 10;
00174     }
00175   } else if (c == 'e' || c == 'E')
00176     tprintf("WARNING: Scientific Notation not supported!");
00177 
00178   ungetc(c, s);
00179   double f  = static_cast<double>(v)
00180             + static_cast<double>(w) / static_cast<double>(k);
00181 
00182   return minus ? -f : f;
00183 }
00184 
00185 double strtofloat(const char* s)
00186 {
00187   int minus = 0;
00188   int v = 0;
00189   int d;
00190   int k = 1;
00191   int w = 0;
00192 
00193   while(*s && isspace(static_cast<unsigned char>(*s))) s++;
00194 
00195   // Single optional + or -
00196   if (*s == '-' || *s == '+') {
00197     minus = (*s == '-');
00198     s++;
00199   }
00200 
00201   // Actual number parsing
00202   for (; *s && (d = DigitValue(*s)) >= 0; s++)
00203     v = v*10 + d;
00204   if (*s == '.') {
00205     for (++s; *s && (d = DigitValue(*s)) >= 0; s++) {
00206       w = w*10 + d;
00207       k *= 10;
00208     }
00209   } else if (*s == 'e' || *s == 'E')
00210     tprintf("WARNING: Scientific Notation not supported!");
00211 
00212   double f  = static_cast<double>(v)
00213             + static_cast<double>(w) / static_cast<double>(k);
00214 
00215   return minus ? -f : f;
00216 }
00217 
// Variadic front end: packs the trailing arguments into a va_list and
// forwards everything to vfscanf, returning its result unchanged.
int fscanf(FILE* stream, const char *format, ...)
{
  va_list args;
  va_start(args, format);
  const int result = vfscanf(stream, format, args);
  va_end(args);
  return result;
}
00229 
00230 int vfscanf(FILE* stream, const char *format, va_list ap)
00231 {
00232   const char *p = format;
00233   char ch;
00234   int q = 0;
00235   uintmax_t val = 0;
00236   int rank = RANK_INT;    // Default rank
00237   unsigned int width = UINT_MAX;
00238   int base;
00239   int flags = 0;
00240   enum {
00241     ST_NORMAL,        // Ground state
00242     ST_FLAGS,         // Special flags
00243     ST_WIDTH,         // Field width
00244     ST_MODIFIERS,     // Length or conversion modifiers
00245     ST_MATCH_INIT,    // Initial state of %[ sequence
00246     ST_MATCH,         // Main state of %[ sequence
00247     ST_MATCH_RANGE,   // After - in a %[ sequence
00248   } state = ST_NORMAL;
00249   char *sarg = NULL;    // %s %c or %[ string argument
00250   enum Bail bail = BAIL_NONE;
00251   int sign;
00252   int converted = 0;    // Successful conversions
00253   unsigned long matchmap[((1 << CHAR_BIT)+(LongBit()-1))/LongBit()];
00254   int matchinv = 0;   // Is match map inverted?
00255   unsigned char range_start = 0;
00256   off_t start_off = ftell(stream);
00257 
00258   // Skip leading spaces
00259   SkipSpace(stream);
00260 
00261   while ((ch = *p++) && !bail) {
00262     switch (state) {
00263       case ST_NORMAL:
00264         if (ch == '%') {
00265           state = ST_FLAGS;
00266           flags = 0; rank = RANK_INT; width = UINT_MAX;
00267         } else if (isspace(static_cast<unsigned char>(ch))) {
00268           SkipSpace(stream);
00269         } else {
00270           if (fgetc(stream) != ch)
00271             bail = BAIL_ERR;  // Match failure
00272         }
00273         break;
00274 
00275       case ST_FLAGS:
00276         switch (ch) {
00277           case '*':
00278             flags |= FL_SPLAT;
00279           break;
00280 
00281           case '0' ... '9':
00282             width = (ch-'0');
00283             state = ST_WIDTH;
00284             flags |= FL_WIDTH;
00285           break;
00286 
00287           default:
00288             state = ST_MODIFIERS;
00289             p--;      // Process this character again
00290           break;
00291         }
00292       break;
00293 
00294       case ST_WIDTH:
00295         if (ch >= '0' && ch <= '9') {
00296           width = width*10+(ch-'0');
00297         } else {
00298           state = ST_MODIFIERS;
00299           p--;      // Process this character again
00300         }
00301       break;
00302 
00303       case ST_MODIFIERS:
00304         switch (ch) {
00305           // Length modifiers - nonterminal sequences
00306           case 'h':
00307             rank--;     // Shorter rank
00308           break;
00309           case 'l':
00310             rank++;     // Longer rank
00311           break;
00312           case 'j':
00313             rank = kIntMaxRank;
00314           break;
00315           case 'z':
00316             rank = kSizeTRank;
00317           break;
00318           case 't':
00319             rank = kPtrDiffRank;
00320           break;
00321           case 'L':
00322           case 'q':
00323             rank = RANK_LONGLONG; // long double/long long
00324           break;
00325 
00326           default:
00327             // Output modifiers - terminal sequences
00328             state = ST_NORMAL;  // Next state will be normal
00329             if (rank < kMinRank)  // Canonicalize rank
00330               rank = kMinRank;
00331             else if (rank > kMaxRank)
00332               rank = kMaxRank;
00333 
00334           switch (ch) {
00335             case 'P':   // Upper case pointer
00336             case 'p':   // Pointer
00337               rank = RANK_PTR;
00338               base = 0; sign = 0;
00339             goto scan_int;
00340 
00341             case 'i':   // Base-independent integer
00342               base = 0; sign = 1;
00343             goto scan_int;
00344 
00345             case 'd':   // Decimal integer
00346               base = 10; sign = 1;
00347             goto scan_int;
00348 
00349             case 'o':   // Octal integer
00350               base = 8; sign = 0;
00351             goto scan_int;
00352 
00353             case 'u':   // Unsigned decimal integer
00354               base = 10; sign = 0;
00355             goto scan_int;
00356 
00357             case 'x':   // Hexadecimal integer
00358             case 'X':
00359               base = 16; sign = 0;
00360             goto scan_int;
00361 
00362             case 'n':   // Number of characters consumed
00363               val = ftell(stream) - start_off;
00364             goto set_integer;
00365 
00366             scan_int:
00367               q = SkipSpace(stream);
00368               if ( q <= 0 ) {
00369                 bail = BAIL_EOF;
00370                 break;
00371               }
00372               val = streamtoumax(stream, base);
00373               converted++;
00374               // fall through
00375 
00376             set_integer:
00377               if (!(flags & FL_SPLAT)) {
00378                 switch(rank) {
00379                   case RANK_CHAR:
00380                     *va_arg(ap, unsigned char *)
00381                       = static_cast<unsigned char>(val);
00382                   break;
00383                   case RANK_SHORT:
00384                     *va_arg(ap, unsigned short *)
00385                       = static_cast<unsigned short>(val);
00386                   break;
00387                   case RANK_INT:
00388                     *va_arg(ap, unsigned int *)
00389                       = static_cast<unsigned int>(val);
00390                   break;
00391                   case RANK_LONG:
00392                     *va_arg(ap, unsigned long *)
00393                       = static_cast<unsigned long>(val);
00394                   break;
00395                   case RANK_LONGLONG:
00396                     *va_arg(ap, unsigned long long *)
00397                       = static_cast<unsigned long long>(val);
00398                   break;
00399                   case RANK_PTR:
00400                     *va_arg(ap, void **)
00401                       = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
00402                   break;
00403                 }
00404               }
00405             break;
00406 
00407             case 'f':   // Preliminary float value parsing
00408             case 'g':
00409             case 'G':
00410             case 'e':
00411             case 'E':
00412               q = SkipSpace(stream);
00413               if (q <= 0) {
00414                 bail = BAIL_EOF;
00415                 break;
00416               }
00417 
00418               {
00419               double fval = streamtofloat(stream);
00420               switch(rank) {
00421                 case RANK_INT:
00422                   *va_arg(ap, float *) = static_cast<float>(fval);
00423                 break;
00424                 case RANK_LONG:
00425                   *va_arg(ap, double *) = static_cast<double>(fval);
00426                 break;
00427               }
00428               converted++;
00429               }
00430             break;
00431 
00432             case 'c':               // Character
00433               width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
00434               sarg = va_arg(ap, char *);
00435               while (width--) {
00436                 if ((q = fgetc(stream)) <= 0) {
00437                   bail = BAIL_EOF;
00438                   break;
00439                 }
00440                 *sarg++ = q;
00441               }
00442               if (!bail)
00443                 converted++;
00444             break;
00445 
00446             case 's':               // String
00447             {
00448               char *sp;
00449               sp = sarg = va_arg(ap, char *);
00450               while (width--) {
00451                 q = fgetc(stream);
00452                 if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
00453                   ungetc(q, stream);
00454                   break;
00455                 }
00456                 *sp++ = q;
00457               }
00458               if (sarg != sp) {
00459                 *sp = '\0'; // Terminate output
00460                 converted++;
00461               } else {
00462                 bail = BAIL_EOF;
00463               }
00464             }
00465             break;
00466 
00467             case '[':   // Character range
00468               sarg = va_arg(ap, char *);
00469               state = ST_MATCH_INIT;
00470               matchinv = 0;
00471               memset(matchmap, 0, sizeof matchmap);
00472             break;
00473 
00474             case '%':   // %% sequence
00475               if (fgetc(stream) != '%' )
00476                 bail = BAIL_ERR;
00477             break;
00478 
00479             default:    // Anything else
00480               bail = BAIL_ERR;  // Unknown sequence
00481             break;
00482           }
00483         }
00484       break;
00485 
00486       case ST_MATCH_INIT:   // Initial state for %[ match
00487         if (ch == '^' && !(flags & FL_INV)) {
00488           matchinv = 1;
00489         } else {
00490           SetBit(matchmap, static_cast<unsigned char>(ch));
00491           state = ST_MATCH;
00492         }
00493       break;
00494 
00495       case ST_MATCH:    // Main state for %[ match
00496         if (ch == ']') {
00497           goto match_run;
00498         } else if (ch == '-') {
00499           range_start = static_cast<unsigned char>(ch);
00500           state = ST_MATCH_RANGE;
00501         } else {
00502           SetBit(matchmap, static_cast<unsigned char>(ch));
00503         }
00504       break;
00505 
00506       case ST_MATCH_RANGE:    // %[ match after -
00507         if (ch == ']') {
00508           SetBit(matchmap, static_cast<unsigned char>('-'));
00509           goto match_run;
00510         } else {
00511           int i;
00512           for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
00513           SetBit(matchmap, i);
00514           state = ST_MATCH;
00515         }
00516       break;
00517 
00518       match_run:      // Match expression finished
00519         char* oarg = sarg;
00520         while (width) {
00521           q = fgetc(stream);
00522           unsigned char qc = static_cast<unsigned char>(q);
00523           if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
00524             ungetc(q, stream);
00525             break;
00526           }
00527           *sarg++ = q;
00528         }
00529         if (oarg != sarg) {
00530           *sarg = '\0';
00531           converted++;
00532         } else {
00533           bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
00534         }
00535       break;
00536     }
00537   }
00538 
00539   if (bail == BAIL_EOF && !converted)
00540     converted = -1;   // Return EOF (-1)
00541 
00542   return converted;
00543 }
00544 
// Creates pathname (or truncates it if it already exists) and opens it
// write-only with the given permission bits; returns whatever open() returns.
int creat(const char *pathname, mode_t mode)
{
  const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
  return open(pathname, open_flags, mode);
}
00549 
00550 #endif  // EMBEDDED