Tesseract  3.02
tesseract-ocr/ccutil/unicharmap.cpp
Go to the documentation of this file.
00001 
00002 // File:        unicharmap.cpp
00003 // Description: Unicode character/ligature to integer id class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include <assert.h>
00021 #include "unichar.h"
00022 #include "host.h"
00023 #include "unicharmap.h"
00024 
00025 UNICHARMAP::UNICHARMAP() :
00026 nodes(0) {
00027 }
00028 
00029 UNICHARMAP::~UNICHARMAP() {
00030   if (nodes != 0)
00031     delete[] nodes;
00032 }
00033 
00034 // Search the given unichar representation in the tree. Each character in the
00035 // string is interpreted as an index in an array of nodes.
00036 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
00037   const char* current_char = unichar_repr;
00038   UNICHARMAP_NODE* current_nodes = nodes;
00039 
00040   assert(*unichar_repr != '\0');
00041 
00042   do {
00043     if (*(current_char + 1) == '\0')
00044       return current_nodes[static_cast<unsigned char>(*current_char)].id;
00045     current_nodes =
00046         current_nodes[static_cast<unsigned char>(*current_char)].children;
00047     ++current_char;
00048   } while (true);
00049 }
00050 
00051 // Search the given unichar representation in the tree, using length characters
00052 // from it maximum. Each character in the string is interpreted as an index in
00053 // an array of nodes.
00054 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
00055                                      int length) const {
00056   const char* current_char = unichar_repr;
00057   UNICHARMAP_NODE* current_nodes = nodes;
00058 
00059   assert(*unichar_repr != '\0');
00060   assert(length > 0 && length <= UNICHAR_LEN);
00061 
00062   do {
00063     if (length == 1 || *(current_char + 1) == '\0')
00064       return current_nodes[static_cast<unsigned char>(*current_char)].id;
00065     current_nodes =
00066         current_nodes[static_cast<unsigned char>(*current_char)].children;
00067     ++current_char;
00068     --length;
00069   } while (true);
00070 }
00071 
00072 // Search the given unichar representation in the tree, creating the possibly
00073 // missing nodes. Once the right place has been found, insert the given id and
00074 // update the inserted flag to keep track of the insert. Each character in the
00075 // string is interpreted as an index in an array of nodes.
00076 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
00077   const char* current_char = unichar_repr;
00078   UNICHARMAP_NODE** current_nodes_pointer = &nodes;
00079 
00080   assert(*unichar_repr != '\0');
00081   assert(id >= 0);
00082 
00083   do {
00084     if (*current_nodes_pointer == 0)
00085       *current_nodes_pointer = new UNICHARMAP_NODE[256];
00086     if (*(current_char + 1) == '\0') {
00087       (*current_nodes_pointer)
00088           [static_cast<unsigned char>(*current_char)].id = id;
00089       return;
00090     }
00091     current_nodes_pointer =
00092         &((*current_nodes_pointer)
00093           [static_cast<unsigned char>(*current_char)].children);
00094     ++current_char;
00095   } while (true);
00096 }
00097 
00098 // Search the given unichar representation in the tree. Each character in the
00099 // string is interpreted as an index in an array of nodes. Stop once the tree
00100 // does not have anymore nodes or once we found the right unichar_repr.
00101 bool UNICHARMAP::contains(const char* const unichar_repr) const {
00102   const char* current_char = unichar_repr;
00103   UNICHARMAP_NODE* current_nodes = nodes;
00104 
00105   assert(*unichar_repr != '\0');
00106 
00107   while (current_nodes != 0 && *(current_char + 1) != '\0') {
00108     current_nodes =
00109         current_nodes[static_cast<unsigned char>(*current_char)].children;
00110     ++current_char;
00111   }
00112   return current_nodes != 0 && *(current_char + 1) == '\0' &&
00113       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
00114 }
00115 
00116 // Search the given unichar representation in the tree, using length characters
00117 // from it maximum. Each character in the string is interpreted as an index in
00118 // an array of nodes. Stop once the tree does not have anymore nodes or once we
00119 // found the right unichar_repr.
00120 bool UNICHARMAP::contains(const char* const unichar_repr,
00121                           int length) const {
00122   const char* current_char = unichar_repr;
00123   UNICHARMAP_NODE* current_nodes = nodes;
00124 
00125   assert(*unichar_repr != '\0');
00126   assert(length > 0 && length <= UNICHAR_LEN);
00127 
00128   while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
00129     current_nodes =
00130         current_nodes[static_cast<unsigned char>(*current_char)].children;
00131     --length;
00132     ++current_char;
00133   }
00134   return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
00135       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
00136 }
00137 
00138 // Return the minimum number of characters that must be used from this string
00139 // to obtain a match in the UNICHARMAP.
00140 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
00141   const char* current_char = unichar_repr;
00142   UNICHARMAP_NODE* current_nodes = nodes;
00143 
00144   while (current_nodes != NULL && *current_char != '\0') {
00145     if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
00146       return current_char + 1 - unichar_repr;
00147     current_nodes =
00148         current_nodes[static_cast<unsigned char>(*current_char)].children;
00149     ++current_char;
00150   }
00151   return 0;
00152 }
00153 
00154 void UNICHARMAP::clear() {
00155   if (nodes != 0)
00156   {
00157     delete[] nodes;
00158     nodes = 0;
00159   }
00160 }
00161 
00162 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
00163 children(0),
00164 id(-1) {
00165 }
00166 
00167 // Recursively delete the children
00168 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
00169   if (children != 0) {
00170     delete[] children;
00171   }
00172 }