|
Tesseract
3.02
|
#include <unichar.h>
Public Member Functions | |
| UNICHAR () | |
| UNICHAR (const char *utf8_str, int len) | |
| UNICHAR (int unicode) | |
| int | first_uni () const |
| int | utf8_len () const |
| const char * | utf8 () const |
| char * | utf8_str () const |
Static Public Member Functions | |
| static int | utf8_step (const char *utf8_str) |
UNICHAR::UNICHAR() [inline]
Definition at line 44 of file unichar.h.
{
  // Default state: every byte of the buffer is zero, including the final
  // byte that utf8_len() reads as the stored length.
  memset(&chars[0], '\0', UNICHAR_LEN * sizeof(chars[0]));
}
UNICHAR::UNICHAR(const char *utf8_str, int len)
Definition at line 28 of file unichar.cpp.
{
  // Fills the fixed-size buffer from the leading bytes of utf8_str.
  //   utf8_str: source bytes, expected to be UTF-8.
  //   len:      number of bytes to consider. A negative value means "scan
  //             for the terminating NUL", examining at most UNICHAR_LEN
  //             bytes.
  // Only whole characters are copied. Copying stops before the first
  // character that has an illegal lead byte, has a bad continuation byte,
  // would not fit in UNICHAR_LEN bytes, or would extend past len.
  if (len < 0) {
    // Check the bound before dereferencing so the byte at index
    // UNICHAR_LEN is never read.
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
    }
  }
  int total_len = 0;
  while (total_len < len) {
    const int step = utf8_step(utf8_str + total_len);
    if (step == 0)
      break;  // Illegal first byte.
    if (total_len + step > UNICHAR_LEN)
      break;  // Too long.
    if (total_len + step > len)
      break;  // Character is cut off by len; never read or copy past len.
    int i;
    for (i = 1; i < step; ++i) {
      if ((utf8_str[total_len + i] & 0xc0) != 0x80)
        break;
    }
    if (i < step)
      break;  // Illegal continuation byte.
    total_len += step;
  }
  memcpy(chars, utf8_str, total_len);
  if (total_len < UNICHAR_LEN) {
    // The last byte of the buffer stores the length; everything between the
    // copied data and that byte is zeroed. When exactly UNICHAR_LEN bytes
    // were copied there is no room for a length byte and none is written.
    chars[UNICHAR_LEN - 1] = static_cast<char>(total_len);
    while (total_len < UNICHAR_LEN - 1)
      chars[total_len++] = 0;
  }
}
UNICHAR::UNICHAR(int unicode) [explicit]
Definition at line 57 of file unichar.cpp.
{
  // Encodes a single code point as UTF-8 into the buffer.
  //   unicode: the code point to encode. Values below zero or above
  //            UNI_MAX_LEGAL_UTF32 leave the buffer all zero.
  const int bytemask = 0xBF;  // Clears bit 6 of a continuation byte.
  const int bytemark = 0x80;  // Sets bit 7 of a continuation byte.

  // Zero the whole buffer first so that the bytes between the encoded
  // sequence and the length byte are never left uninitialised.
  memset(chars, 0, UNICHAR_LEN);

  // A negative value used to fall into the one-byte branch and was stored
  // as a raw (invalid) byte; treat it like any other out-of-range value.
  if (unicode < 0 || unicode > UNI_MAX_LEGAL_UTF32)
    return;

  // In every branch the length byte is written BEFORE the data bytes, so
  // that if the two ever share an index the data byte is the one kept.
  // Continuation bytes are produced last-to-first, shifting six bits off
  // the code point after each one.
  if (unicode < 0x80) {
    chars[UNICHAR_LEN - 1] = 1;
    chars[0] = static_cast<char>(unicode);
  } else if (unicode < 0x800) {
    chars[UNICHAR_LEN - 1] = 2;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xc0);
  } else if (unicode < 0x10000) {
    chars[UNICHAR_LEN - 1] = 3;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xe0);
  } else {
    chars[UNICHAR_LEN - 1] = 4;
    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xf0);
  }
}
int UNICHAR::first_uni() const
Definition at line 94 of file unichar.cpp.
{
  // Decodes the first character held in the buffer and returns its value.
  // Each byte is added raw, marker bits included, into a base-64 positional
  // accumulator; the markers are then removed in one go by subtracting a
  // constant chosen by sequence length. Index 0 (illegal lead byte) and
  // index 1 (single byte) need no correction.
  static const int marker_bias[5] = {
    0, 0, 0x3080, 0xE2080, 0x3C82080
  };
  const int num_bytes = utf8_step(chars);
  int code = 0;
  for (int i = 0; i < num_bytes; ++i) {
    code = (code << 6) + static_cast<unsigned char>(chars[i]);
  }
  return code - marker_bias[num_bytes];
}
const char* UNICHAR::utf8() const [inline]
int UNICHAR::utf8_len() const [inline]
Definition at line 62 of file unichar.h.
{
  // The final byte of the buffer is read as the stored length. A value
  // outside [0, UNICHAR_LEN) is not a usable length and is reported as
  // UNICHAR_LEN instead.
  const int stored = chars[UNICHAR_LEN - 1];
  if (stored < 0 || stored >= UNICHAR_LEN) {
    return UNICHAR_LEN;
  }
  return stored;
}
int UNICHAR::utf8_step(const char *utf8_str) [static]
Definition at line 131 of file unichar.cpp.
{
  // Returns the byte length of the UTF-8 sequence introduced by the first
  // byte of utf8_str, or 0 if that byte cannot start a sequence. Only the
  // first byte is examined. The table is indexed by the byte value, sixteen
  // entries per row.
  static const char step_for_lead_byte[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x00-0x0F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x10-0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x20-0x2F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x30-0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x40-0x4F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x50-0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x60-0x6F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  // 0x70-0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x80-0x8F: continuation bytes
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0x90-0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xA0-0xAF
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 0xB0-0xBF
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  // 0xC0-0xCF
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  // 0xD0-0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  // 0xE0-0xEF
    4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0   // 0xF0-0xF7, then 0xF8-0xFF illegal
  };
  // Go through unsigned char so bytes >= 0x80 index the upper half of the
  // table rather than becoming a negative offset.
  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
  return step_for_lead_byte[lead];
}
char * UNICHAR::utf8_str() const
Definition at line 122 of file unichar.cpp.
{
  // Returns a freshly allocated, NUL-terminated copy of the first
  // utf8_len() bytes of the buffer. The array comes from new[], so the
  // caller is responsible for releasing it with delete[].
  const int num_bytes = utf8_len();
  char* copy = new char[num_bytes + 1];
  copy[num_bytes] = '\0';
  memcpy(copy, chars, num_bytes);
  return copy;
}