Tesseract  3.02
tesseract-ocr/ccutil/unicodes.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        unicodes.cpp
00003  * Description: Unicode related machinery
00004  * Author:      David Eger
00005  * Created:     Wed Jun 15 16:37:50 PST 2011
00006  *
00007  * (C) Copyright 2011, Google, Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "unicodes.h"
00021 #include "host.h"  // for NULL
00022 
00023 namespace tesseract {
00024 
// Unicode separator and bidirectional control characters as C strings.
// The \u escapes produce UTF-8 bytes (shown in the trailing comments) when
// the compiler encodes narrow string literals as UTF-8.
const char *kUTF8LineSeparator = "\u2028";       // "\xe2\x80\xa8"
const char *kUTF8ParagraphSeparator = "\u2029";  // "\xe2\x80\xa9"
const char *kLRM = "\u200E";  // Left-to-Right Mark          "\xe2\x80\x8e"
const char *kRLM = "\u200F";  // Right-to-Left Mark          "\xe2\x80\x8f"
// U+202B is Right-to-Left Embedding. U+202A, which was used here before, is
// Left-to-Right Embedding and forces the opposite direction.
const char *kRLE = "\u202B";  // Right-to-Left Embedding     "\xe2\x80\xab"
const char *kPDF = "\u202C";  // Pop Directional Formatting  "\xe2\x80\xac"
00031 
// NULL-terminated table of strings, each holding one hyphen-like or
// dash-like character: the ASCII hyphen-minus first, then Unicode look-alikes.
const char *kHyphenLikeUTF8[] = {
  // ASCII.
  "-",       // U+002D hyphen-minus

  // Script-specific word hyphen.
  "\u05BE",  // U+05BE Hebrew punctuation maqaf

  // General Punctuation hyphens and dashes.
  "\u2010",  // U+2010 hyphen
  "\u2011",  // U+2011 non-breaking hyphen
  "\u2012",  // U+2012 figure dash (same width as a digit)
  "\u2013",  // U+2013 en dash
  "\u2014",  // U+2014 em dash
  "\u2015",  // U+2015 horizontal bar

  // Mathematical operator.
  "\u2212",  // U+2212 minus sign

  // Small and fullwidth compatibility forms.
  "\uFE58",  // U+FE58 small em dash
  "\uFE63",  // U+FE63 small hyphen-minus
  "\uFF0D",  // U+FF0D fullwidth hyphen-minus

  NULL,      // sentinel: marks the end of the table
};
00047 
// NULL-terminated table of strings, each holding one apostrophe-like
// character: the two ASCII forms first, then Unicode look-alikes.
const char *kApostropheLikeUTF8[] = {
  // ASCII.
  "'",       // U+0027 apostrophe
  "`",       // U+0060 grave accent (backtick)

  // General Punctuation.
  "\u2018",  // U+2018 left (opening) single quotation mark
  "\u2019",  // U+2019 right (closing) single quotation mark
  "\u2032",  // U+2032 prime (mathematical prime mark)

  NULL,      // sentinel: marks the end of the table
};
00056 
00057 }  // namespace