Tesseract
3.02
|
#include <unichar.h>
Public Member Functions | |
UNICHAR () | |
UNICHAR (const char *utf8_str, int len) | |
UNICHAR (int unicode) | |
int | first_uni () const |
int | utf8_len () const |
const char * | utf8 () const |
char * | utf8_str () const |
Static Public Member Functions | |
static int | utf8_step (const char *utf8_str) |
UNICHAR::UNICHAR | ( | ) | [inline] |
Definition at line 44 of file unichar.h.
{
  // Default construction: every byte of the buffer, including the final
  // slot, is cleared to zero.
  for (int i = 0; i < UNICHAR_LEN; ++i) {
    chars[i] = 0;
  }
}
UNICHAR::UNICHAR | ( | const char * | utf8_str, |
int | len | ||
) |
Definition at line 28 of file unichar.cpp.
{
  // Builds the buffer from the leading run of well-formed UTF-8 sequences in
  // utf8_str.
  //   utf8_str: source bytes.
  //   len:      number of bytes available in utf8_str; a negative value means
  //             "nul-terminated", in which case the length is measured here,
  //             capped at UNICHAR_LEN.
  // Copying stops at the first sequence that is illegal, that would not fit
  // in UNICHAR_LEN bytes, or that would extend beyond len.
  if (len < 0) {
    // Test the bound before dereferencing so that no byte beyond the cap is
    // ever examined.
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
    }
  }

  int total_len = 0;
  while (total_len < len) {
    const int step = utf8_step(utf8_str + total_len);
    if (step == 0)
      break;  // Illegal first byte.
    if (total_len + step > UNICHAR_LEN)
      break;  // Too long.
    if (total_len + step > len)
      break;  // Sequence is cut off by the end of the input: do not read or
              // copy bytes the caller did not provide.
    int i;
    for (i = 1; i < step; ++i) {
      if ((utf8_str[total_len + i] & 0xc0) != 0x80)
        break;
    }
    if (i < step)
      break;  // Illegal continuation byte.
    total_len += step;
  }

  memcpy(chars, utf8_str, total_len);
  if (total_len < UNICHAR_LEN) {
    // Spare room: record the byte count in the final slot and zero every
    // unused byte in between. When the data fills the whole buffer the final
    // slot holds data instead.
    chars[UNICHAR_LEN - 1] = total_len;
    while (total_len < UNICHAR_LEN - 1)
      chars[total_len++] = 0;
  }
}
UNICHAR::UNICHAR | ( | int | unicode | ) | [explicit] |
Definition at line 57 of file unichar.cpp.
{
  // Encodes the code point `unicode` as UTF-8 into the buffer.
  //
  // The buffer is cleared first so that every byte not written below is
  // zero, the same state the other constructors leave unused bytes in.
  // A value that is negative or above UNI_MAX_LEGAL_UTF32 leaves the buffer
  // all-zero.
  memset(chars, 0, UNICHAR_LEN);
  if (unicode < 0 || unicode > UNI_MAX_LEGAL_UTF32)
    return;

  const int bytemask = 0xBF;  // Clears bit 6 of a continuation byte.
  const int bytemark = 0x80;  // Sets bit 7 of a continuation byte.

  // Choose the sequence length and the marker bits of the lead byte.
  int num_bytes;
  int lead_mark;
  if (unicode < 0x80) {
    num_bytes = 1;
    lead_mark = 0x00;
  } else if (unicode < 0x800) {
    num_bytes = 2;
    lead_mark = 0xc0;
  } else if (unicode < 0x10000) {
    num_bytes = 3;
    lead_mark = 0xe0;
  } else {
    num_bytes = 4;
    lead_mark = 0xf0;
  }

  // The final slot of the buffer records the encoded length.
  chars[UNICHAR_LEN - 1] = num_bytes;

  // Emit continuation bytes from last to first, consuming six payload bits
  // each time; whatever remains belongs in the lead byte.
  for (int i = num_bytes - 1; i > 0; --i) {
    chars[i] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
  }
  chars[0] = static_cast<char>(unicode | lead_mark);
}
int UNICHAR::first_uni | ( | ) | const |
Definition at line 94 of file unichar.cpp.
{
  // Decodes the first UTF-8 sequence in the buffer and returns its value.
  //
  // The raw bytes are folded together six bits at a time, marker bits and
  // all; utf8_offsets[n] is the total contributed by those marker bits for
  // an n-byte sequence and is subtracted at the end. When utf8_step reports
  // 0 for the first byte no bytes are folded and the result is 0.
  static const int utf8_offsets[5] = {
    0, 0, 0x3080, 0xE2080, 0x3C82080
  };
  const int num_bytes = utf8_step(chars);
  int code = 0;
  for (int i = 0; i < num_bytes; ++i) {
    code = (code << 6) + static_cast<unsigned char>(chars[i]);
  }
  return code - utf8_offsets[num_bytes];
}
const char* UNICHAR::utf8 | ( | ) | const [inline] |
int UNICHAR::utf8_len | ( | ) | const [inline] |
Definition at line 62 of file unichar.h.
{
  // Returns the number of bytes held in the buffer. The final slot is read
  // as a length; any value outside [0, UNICHAR_LEN) is taken to mean that
  // the buffer is completely full.
  const int stored = chars[UNICHAR_LEN - 1];
  if (stored < 0 || stored >= UNICHAR_LEN) {
    return UNICHAR_LEN;
  }
  return stored;
}
int UNICHAR::utf8_step | ( | const char * | utf8_str | ) | [static] |
Definition at line 131 of file unichar.cpp.
{
  // Returns the length in bytes of the UTF-8 sequence introduced by the
  // first byte of utf8_str, or 0 when that byte cannot start a sequence.
  // Only the first byte is examined.
  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
  if (lead < 0x80) return 1;  // 0x00-0x7F: single byte.
  if (lead < 0xC0) return 0;  // 0x80-0xBF: continuation byte, not a start.
  if (lead < 0xE0) return 2;  // 0xC0-0xDF: two-byte lead.
  if (lead < 0xF0) return 3;  // 0xE0-0xEF: three-byte lead.
  if (lead < 0xF8) return 4;  // 0xF0-0xF7: four-byte lead.
  return 0;                   // 0xF8-0xFF: not a valid lead byte.
}
char * UNICHAR::utf8_str | ( | ) | const |
Definition at line 122 of file unichar.cpp.
{
  // Returns a freshly allocated, nul-terminated copy of the stored bytes.
  // The array is created with new[], so the caller must release it with
  // delete[].
  const int num_bytes = utf8_len();
  char* copy = new char[num_bytes + 1];
  copy[num_bytes] = '\0';
  memcpy(copy, chars, num_bytes);
  return copy;
}