Tesseract  3.02
tesseract-ocr/ccutil/unichar.cpp
Go to the documentation of this file.
00001 
00002 // File:        unichar.cpp
00003 // Description: Unicode character/ligature class.
00004 // Author:      Ray Smith
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "unichar.h"
00021 
00022 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
00023 
00024 // Construct from a utf8 string. If len<0 then the string is null terminated.
00025 // If the string is too long to fit in the UNICHAR then it takes only what
00026 // will fit. Checks for illegal input and stops at an illegal sequence.
00027 // The resulting UNICHAR may be empty.
00028 UNICHAR::UNICHAR(const char* utf8_str, int len) {
00029   int total_len = 0;
00030   int step = 0;
00031   if (len < 0) {
00032     for (len = 0; utf8_str[len] != 0 && len < UNICHAR_LEN; ++len);
00033   }
00034   for (total_len = 0; total_len < len; total_len += step) {
00035     step = utf8_step(utf8_str + total_len);
00036     if (total_len + step > UNICHAR_LEN)
00037       break;  // Too long.
00038     if (step == 0)
00039       break;  // Illegal first byte.
00040     int i;
00041     for (i = 1; i < step; ++i)
00042       if ((utf8_str[total_len + i] & 0xc0) != 0x80)
00043         break;
00044     if (i < step)
00045       break;  // Illegal surrogate
00046   }
00047   memcpy(chars, utf8_str, total_len);
00048   if (total_len < UNICHAR_LEN) {
00049     chars[UNICHAR_LEN - 1] = total_len;
00050     while (total_len < UNICHAR_LEN - 1)
00051       chars[total_len++] = 0;
00052   }
00053 }
00054 
00055 // Construct from a single UCS4 character. Illegal values are ignored,
00056 // resulting in an empty UNICHAR.
00057 UNICHAR::UNICHAR(int unicode) {
00058   const int bytemask = 0xBF;
00059   const int bytemark = 0x80;
00060 
00061   if (unicode < 0x80) {
00062     chars[UNICHAR_LEN - 1] = 1;
00063     chars[2] = 0;
00064     chars[1] = 0;
00065     chars[0] = static_cast<char>(unicode);
00066   } else if (unicode < 0x800) {
00067     chars[UNICHAR_LEN - 1] = 2;
00068     chars[2] = 0;
00069     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00070     unicode >>= 6;
00071     chars[0] = static_cast<char>(unicode | 0xc0);
00072   } else if (unicode < 0x10000) {
00073     chars[UNICHAR_LEN - 1] = 3;
00074     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00075     unicode >>= 6;
00076     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00077     unicode >>= 6;
00078     chars[0] = static_cast<char>(unicode | 0xe0);
00079   } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
00080     chars[UNICHAR_LEN - 1] = 4;
00081     chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
00082     unicode >>= 6;
00083     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00084     unicode >>= 6;
00085     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00086     unicode >>= 6;
00087     chars[0] = static_cast<char>(unicode | 0xf0);
00088   } else {
00089     memset(chars, 0, UNICHAR_LEN);
00090   }
00091 }
00092 
00093 // Get the first character as UCS-4.
00094 int UNICHAR::first_uni() const {
00095   static const int utf8_offsets[5] = {
00096     0, 0, 0x3080, 0xE2080, 0x3C82080
00097   };
00098   int uni = 0;
00099   int len = utf8_step(chars);
00100   const char* src = chars;
00101 
00102   switch (len) {
00103   default:
00104     break;
00105   case 4:
00106     uni += static_cast<unsigned char>(*src++);
00107     uni <<= 6;
00108   case 3:
00109     uni += static_cast<unsigned char>(*src++);
00110     uni <<= 6;
00111   case 2:
00112     uni += static_cast<unsigned char>(*src++);
00113     uni <<= 6;
00114   case 1:
00115     uni += static_cast<unsigned char>(*src++);
00116   }
00117   uni -= utf8_offsets[len];
00118   return uni;
00119 }
00120 
00121 // Get a terminated UTF8 string: Must delete[] it after use.
00122 char* UNICHAR::utf8_str() const {
00123   int len = utf8_len();
00124   char* str = new char[len + 1];
00125   memcpy(str, chars, len);
00126   str[len] = 0;
00127   return str;
00128 }
00129 
00130 // Get the number of bytes in the first character of the given utf8 string.
00131 int UNICHAR::utf8_step(const char* utf8_str) {
00132   static const char utf8_bytes[256] = {
00133     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00134     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00135     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00136     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00137     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00138     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00139     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00140     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
00141   };
00142 
00143   return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
00144 }