#include <unichar.h>

Public Member Functions
	UNICHAR ()
	UNICHAR (const char *utf8_str, int len)
	UNICHAR (int unicode)
int	first_uni () const
int	utf8_len () const
const char *	utf8 () const
char *	utf8_str () const
Static Public Member Functions
static int	utf8_step (const char *utf8_str)

Detailed Description

Definition at line 42 of file unichar.h.

Constructor & Destructor Documentation

UNICHAR::UNICHAR ( ) [inline]

Definition at line 44 of file unichar.h.

            {
    // Default constructor: zero all UNICHAR_LEN bytes of the buffer. That
    // includes the final byte, which utf8_len() reads as the stored length,
    // so the new object reports a length of 0.
    memset(chars, 0, UNICHAR_LEN);
  }

UNICHAR::UNICHAR ( const char * utf8_str, int len )

Definition at line 28 of file unichar.cpp.

                                              {
  // Builds a UNICHAR from the leading bytes of utf8_str.
  //   utf8_str: UTF-8 encoded bytes; must be null-terminated when len < 0.
  //   len:      number of valid bytes in utf8_str, or a negative value to
  //             have the length measured here (stopping at the terminator
  //             or after UNICHAR_LEN bytes, whichever comes first).
  // Only whole, well-formed sequences that fit both within len and within
  // UNICHAR_LEN bytes are kept. Copying stops at the first illegal lead
  // byte, bad continuation byte, or sequence that would not fit.
  if (len < 0) {
    // Test the bound before dereferencing so that no byte beyond the first
    // UNICHAR_LEN is ever examined.
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
  }
  int total_len = 0;
  while (total_len < len) {
    const int step = utf8_step(utf8_str + total_len);
    if (step == 0)
      break;  // Illegal first byte.
    if (total_len + step > UNICHAR_LEN)
      break;  // Too long.
    if (total_len + step > len)
      break;  // Sequence is cut short by len; never read or copy past it.
    int i;
    for (i = 1; i < step; ++i)
      if ((utf8_str[total_len + i] & 0xc0) != 0x80)
        break;
    if (i < step)
      break;  // Illegal continuation byte.
    total_len += step;
  }
  memcpy(chars, utf8_str, total_len);
  if (total_len < UNICHAR_LEN) {
    // The last byte of the buffer stores the length whenever there is room
    // for it; the unused bytes in between are cleared. When all UNICHAR_LEN
    // bytes hold data, utf8_len() reports UNICHAR_LEN because the last byte
    // is then not a valid length.
    chars[UNICHAR_LEN - 1] = total_len;
    while (total_len < UNICHAR_LEN - 1)
      chars[total_len++] = 0;
  }
}

UNICHAR::UNICHAR ( int unicode ) [explicit]

Definition at line 57 of file unichar.cpp.

                            {
  // Builds a UNICHAR holding the UTF-8 encoding of a single code point.
  //   unicode: the code point to encode. Negative values and values above
  //            UNI_MAX_LEGAL_UTF32 are illegal and give the all-zero
  //            (empty) state.
  const int bytemask = 0xBF;
  const int bytemark = 0x80;

  // Start from the empty state so that every byte not written below is
  // defined, whatever the value of UNICHAR_LEN.
  memset(chars, 0, UNICHAR_LEN);

  // Choose the sequence length and the marker bits of the lead byte.
  int num_bytes;
  int lead_mark;
  if (unicode < 0) {
    return;  // Illegal: would otherwise be truncated to a single byte.
  } else if (unicode < 0x80) {
    num_bytes = 1;
    lead_mark = 0;
  } else if (unicode < 0x800) {
    num_bytes = 2;
    lead_mark = 0xc0;
  } else if (unicode < 0x10000) {
    num_bytes = 3;
    lead_mark = 0xe0;
  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
    num_bytes = 4;
    lead_mark = 0xf0;
  } else {
    return;  // Illegal: beyond the largest legal code point.
  }

  // Emit the continuation bytes from last to first, six payload bits each
  // (top bit set, next bit cleared), then the lead byte with what remains.
  for (int i = num_bytes - 1; i > 0; --i) {
    chars[i] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
  }
  chars[0] = static_cast<char>(unicode | lead_mark);

  // The last byte of the buffer stores the length, but only when the
  // encoded bytes leave it free.
  if (num_bytes < UNICHAR_LEN)
    chars[UNICHAR_LEN - 1] = num_bytes;
}

Member Function Documentation

int UNICHAR::first_uni ( ) const

Definition at line 94 of file unichar.cpp.

                             {
  // Decodes the first UTF-8 sequence stored in chars and returns its code
  // point. A lead byte that utf8_step() rejects (step 0) yields 0.
  //
  // The bytes are accumulated whole, marker bits included, shifting six
  // bits per byte. The table holds, per sequence length, the sum of those
  // marker bits, which is subtracted at the end to leave only the payload.
  static const int marker_sums[5] = {
    0, 0, 0x3080, 0xE2080, 0x3C82080
  };
  const int num_bytes = utf8_step(chars);
  int code = 0;

  if (num_bytes >= 1 && num_bytes <= 4) {
    for (int i = 0; i < num_bytes; ++i) {
      code <<= 6;
      code += static_cast<unsigned char>(chars[i]);
    }
  }
  code -= marker_sums[num_bytes];
  return code;
}

const char* UNICHAR::utf8 ( ) const [inline]

Definition at line 68 of file unichar.h.

                           {
    // Returns a pointer to the internal byte buffer itself; no copy is made.
    // The buffer is not guaranteed to be null-terminated (its last byte may
    // hold the length or a data byte), so use utf8_len() for the number of
    // valid bytes.
    return chars;
  }

int UNICHAR::utf8_len ( ) const [inline]

Definition at line 62 of file unichar.h.

                       {
    // Returns the number of UTF-8 bytes held in chars. The final byte of the
    // buffer carries the length; a value outside [0, UNICHAR_LEN) means the
    // buffer is completely full, so UNICHAR_LEN is returned instead.
    const int stored = chars[UNICHAR_LEN - 1];
    if (stored < 0 || stored >= UNICHAR_LEN)
      return UNICHAR_LEN;
    return stored;
  }

int UNICHAR::utf8_step ( const char * utf8_str ) [static]

Definition at line 131 of file unichar.cpp.

                                           {
  // Returns the length in bytes of the UTF-8 sequence introduced by the
  // first byte of utf8_str, or 0 if that byte cannot start a sequence.
  // Only the first byte is read; continuation bytes are not checked here.
  //
  // The table is indexed by the lead byte value:
  //   0x00-0x7F -> 1   (single-byte characters)
  //   0x80-0xBF -> 0   (continuation bytes, not legal as a first byte)
  //   0xC0-0xDF -> 2
  //   0xE0-0xEF -> 3
  //   0xF0-0xF7 -> 4
  //   0xF8-0xFF -> 0   (not legal as a first byte)
  static const char utf8_bytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
  };

  // Cast through unsigned char so that bytes >= 0x80 index the upper half
  // of the table rather than producing a negative index.
  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
}

char * UNICHAR::utf8_str ( ) const

Definition at line 122 of file unichar.cpp.

                              {
  // Returns a freshly allocated, null-terminated copy of the UTF-8 bytes.
  // The array comes from new[], so the caller owns it and must release it
  // with delete[].
  const int byte_count = utf8_len();
  char* copy = new char[byte_count + 1];
  copy[byte_count] = 0;
  memcpy(copy, chars, byte_count);
  return copy;
}

The documentation for this class was generated from the following files:

tesseract-ocr/ccutil/unichar.h
tesseract-ocr/ccutil/unichar.cpp

Public Member Functions

Static Public Member Functions

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation