Tesseract
3.02
|
00001 00002 // File: unicharmap.cpp 00003 // Description: Unicode character/ligature to integer id class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include <assert.h> 00021 #include "unichar.h" 00022 #include "host.h" 00023 #include "unicharmap.h" 00024 00025 UNICHARMAP::UNICHARMAP() : 00026 nodes(0) { 00027 } 00028 00029 UNICHARMAP::~UNICHARMAP() { 00030 if (nodes != 0) 00031 delete[] nodes; 00032 } 00033 00034 // Search the given unichar representation in the tree. Each character in the 00035 // string is interpreted as an index in an array of nodes. 00036 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const { 00037 const char* current_char = unichar_repr; 00038 UNICHARMAP_NODE* current_nodes = nodes; 00039 00040 assert(*unichar_repr != '\0'); 00041 00042 do { 00043 if (*(current_char + 1) == '\0') 00044 return current_nodes[static_cast<unsigned char>(*current_char)].id; 00045 current_nodes = 00046 current_nodes[static_cast<unsigned char>(*current_char)].children; 00047 ++current_char; 00048 } while (true); 00049 } 00050 00051 // Search the given unichar representation in the tree, using length characters 00052 // from it maximum. Each character in the string is interpreted as an index in 00053 // an array of nodes. 00054 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr, 00055 int length) const { 00056 const char* current_char = unichar_repr; 00057 UNICHARMAP_NODE* current_nodes = nodes; 00058 00059 assert(*unichar_repr != '\0'); 00060 assert(length > 0 && length <= UNICHAR_LEN); 00061 00062 do { 00063 if (length == 1 || *(current_char + 1) == '\0') 00064 return current_nodes[static_cast<unsigned char>(*current_char)].id; 00065 current_nodes = 00066 current_nodes[static_cast<unsigned char>(*current_char)].children; 00067 ++current_char; 00068 --length; 00069 } while (true); 00070 } 00071 00072 // Search the given unichar representation in the tree, creating the possibly 00073 // missing nodes. Once the right place has been found, insert the given id and 00074 // update the inserted flag to keep track of the insert. Each character in the 00075 // string is interpreted as an index in an array of nodes. 00076 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) { 00077 const char* current_char = unichar_repr; 00078 UNICHARMAP_NODE** current_nodes_pointer = &nodes; 00079 00080 assert(*unichar_repr != '\0'); 00081 assert(id >= 0); 00082 00083 do { 00084 if (*current_nodes_pointer == 0) 00085 *current_nodes_pointer = new UNICHARMAP_NODE[256]; 00086 if (*(current_char + 1) == '\0') { 00087 (*current_nodes_pointer) 00088 [static_cast<unsigned char>(*current_char)].id = id; 00089 return; 00090 } 00091 current_nodes_pointer = 00092 &((*current_nodes_pointer) 00093 [static_cast<unsigned char>(*current_char)].children); 00094 ++current_char; 00095 } while (true); 00096 } 00097 00098 // Search the given unichar representation in the tree. Each character in the 00099 // string is interpreted as an index in an array of nodes. Stop once the tree 00100 // does not have anymore nodes or once we found the right unichar_repr. 00101 bool UNICHARMAP::contains(const char* const unichar_repr) const { 00102 const char* current_char = unichar_repr; 00103 UNICHARMAP_NODE* current_nodes = nodes; 00104 00105 assert(*unichar_repr != '\0'); 00106 00107 while (current_nodes != 0 && *(current_char + 1) != '\0') { 00108 current_nodes = 00109 current_nodes[static_cast<unsigned char>(*current_char)].children; 00110 ++current_char; 00111 } 00112 return current_nodes != 0 && *(current_char + 1) == '\0' && 00113 current_nodes[static_cast<unsigned char>(*current_char)].id >= 0; 00114 } 00115 00116 // Search the given unichar representation in the tree, using length characters 00117 // from it maximum. Each character in the string is interpreted as an index in 00118 // an array of nodes. Stop once the tree does not have anymore nodes or once we 00119 // found the right unichar_repr. 00120 bool UNICHARMAP::contains(const char* const unichar_repr, 00121 int length) const { 00122 const char* current_char = unichar_repr; 00123 UNICHARMAP_NODE* current_nodes = nodes; 00124 00125 assert(*unichar_repr != '\0'); 00126 assert(length > 0 && length <= UNICHAR_LEN); 00127 00128 while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) { 00129 current_nodes = 00130 current_nodes[static_cast<unsigned char>(*current_char)].children; 00131 --length; 00132 ++current_char; 00133 } 00134 return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') && 00135 current_nodes[static_cast<unsigned char>(*current_char)].id >= 0; 00136 } 00137 00138 // Return the minimum number of characters that must be used from this string 00139 // to obtain a match in the UNICHARMAP. 00140 int UNICHARMAP::minmatch(const char* const unichar_repr) const { 00141 const char* current_char = unichar_repr; 00142 UNICHARMAP_NODE* current_nodes = nodes; 00143 00144 while (current_nodes != NULL && *current_char != '\0') { 00145 if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) 00146 return current_char + 1 - unichar_repr; 00147 current_nodes = 00148 current_nodes[static_cast<unsigned char>(*current_char)].children; 00149 ++current_char; 00150 } 00151 return 0; 00152 } 00153 00154 void UNICHARMAP::clear() { 00155 if (nodes != 0) 00156 { 00157 delete[] nodes; 00158 nodes = 0; 00159 } 00160 } 00161 00162 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : 00163 children(0), 00164 id(-1) { 00165 } 00166 00167 // Recursively delete the children 00168 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { 00169 if (children != 0) { 00170 delete[] children; 00171 } 00172 }