Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: unicodes.h 00003 * Description: Unicode related machinery 00004 * Author: David Eger 00005 * Created: Wed Jun 15 16:37:50 PST 2011 00006 * 00007 * (C) Copyright 2011, Google, Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "unicodes.h" 00021 #include "host.h" // for NULL 00022 00023 namespace tesseract { 00024 00025 const char *kUTF8LineSeparator = "\u2028"; // "\xe2\x80\xa8"; 00026 const char *kUTF8ParagraphSeparator = "\u2029"; // "\xe2\x80\xa9"; 00027 const char *kLRM = "\u200E"; // Left-to-Right Mark 00028 const char *kRLM = "\u200F"; // Right-to-Left Mark 00029 const char *kRLE = "\u202A"; // Right-to-Left Embedding 00030 const char *kPDF = "\u202C"; // Pop Directional Formatting 00031 00032 const char *kHyphenLikeUTF8[] = { 00033 "-", // ASCII hyphen-minus 00034 "\u05BE", // word hyphen in hybrew 00035 "\u2010", // hyphen 00036 "\u2011", // non-breaking hyphen 00037 "\u2012", // a hyphen the same width as digits 00038 "\u2013", // en dash 00039 "\u2014", // em dash 00040 "\u2015", // horizontal bar 00041 "\u2212", // arithmetic minus sign 00042 "\uFE58", // small em dash 00043 "\uFE63", // small hyphen-minus 00044 "\uFF0D", // fullwidth hyphen-minus 00045 NULL, // end of our list 00046 }; 00047 00048 const char *kApostropheLikeUTF8[] = { 00049 "'", // ASCII apostrophe 00050 "`", // ASCII backtick 00051 "\u2018", // opening single quote 00052 "\u2019", // closing single quote 00053 "\u2032", // mathematical prime mark 00054 NULL, // end of our list. 00055 }; 00056 00057 } // namespace