Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.c (Formerly trie.c) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 #ifdef _MSC_VER 00029 #pragma warning(disable:4244) // Conversion warnings 00030 #pragma warning(disable:4800) // int/bool warnings 00031 #endif 00032 #include "trie.h" 00033 00034 #include "callcpp.h" 00035 #include "dawg.h" 00036 #include "dict.h" 00037 #include "freelist.h" 00038 #include "genericvector.h" 00039 #include "helpers.h" 00040 00041 namespace tesseract { 00042 00043 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE"; 00044 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"; 00045 const char kForceReverse[] = "RRP_FORCE_REVERSE"; 00046 00047 const char * const RTLReversePolicyNames[] = { 00048 kDoNotReverse, 00049 kReverseIfHasRTL, 00050 kForceReverse 00051 }; 00052 00053 const char Trie::kAlphaPatternUnicode[] = "\u2000"; 00054 const char Trie::kDigitPatternUnicode[] = "\u2001"; 00055 const char Trie::kAlphanumPatternUnicode[] = "\u2002"; 00056 const char Trie::kPuncPatternUnicode[] = "\u2003"; 00057 const char Trie::kLowerPatternUnicode[] = "\u2004"; 00058 const char Trie::kUpperPatternUnicode[] = "\u2005"; 00059 00060 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { 00061 return RTLReversePolicyNames[reverse_policy]; 00062 } 00063 00064 // Reset the Trie to empty. 00065 void Trie::clear() { 00066 nodes_.delete_data_pointers(); 00067 nodes_.clear(); 00068 num_edges_ = 0; 00069 new_dawg_node(); // Need to allocate node 0. 00070 } 00071 00072 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00073 int direction, bool word_end, UNICHAR_ID unichar_id, 00074 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const { 00075 if (debug_level_ == 3) { 00076 tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT 00077 " direction %d word_end %d unichar_id %d, exploring node:\n", 00078 node_ref, next_node, direction, word_end, unichar_id); 00079 if (node_ref != NO_EDGE) { 00080 print_node(node_ref, nodes_[node_ref]->forward_edges.size()); 00081 } 00082 } 00083 if (node_ref == NO_EDGE) return false; 00084 assert(node_ref < nodes_.size()); 00085 EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? 00086 nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges; 00087 int vec_size = vec.size(); 00088 if (node_ref == 0) { // binary search 00089 EDGE_INDEX start = 0; 00090 EDGE_INDEX end = vec_size - 1; 00091 EDGE_INDEX k; 00092 int compare; 00093 while (start <= end) { 00094 k = (start + end) >> 1; // (start + end) / 2 00095 compare = given_greater_than_edge_rec(next_node, word_end, 00096 unichar_id, vec[k]); 00097 if (compare == 0) { // given == vec[k] 00098 *edge_ptr = &(vec[k]); 00099 *edge_index = k; 00100 return true; 00101 } else if (compare == 1) { // given > vec[k] 00102 start = k + 1; 00103 } else { // given < vec[k] 00104 end = k - 1; 00105 } 00106 } 00107 } else { // linear search 00108 for (int i = 0; i < vec_size; ++i) { 00109 EDGE_RECORD &edge_rec = vec[i]; 00110 if (edge_rec_match(next_node, word_end, unichar_id, 00111 next_node_from_edge_rec(edge_rec), 00112 end_of_word_from_edge_rec(edge_rec), 00113 unichar_id_from_edge_rec(edge_rec))) { 00114 *edge_ptr = &(edge_rec); 00115 *edge_index = i; 00116 return true; 00117 } 00118 } 00119 } 00120 return false; // not found 00121 } 00122 00123 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, 00124 int direction, bool word_end, 00125 UNICHAR_ID unichar_id) { 00126 if (num_edges_ == max_num_edges_) return false; 00127 EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? 00128 &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); 00129 int search_index; 00130 if (node1 == 0) { 00131 search_index = 0; // find the index to make the add sorted 00132 while (search_index < vec->size() && 00133 given_greater_than_edge_rec(node2, word_end, unichar_id, 00134 (*vec)[search_index]) == 1) { 00135 search_index++; 00136 } 00137 } else { 00138 search_index = vec->size(); // add is unsorted, so index does not matter 00139 } 00140 EDGE_RECORD edge_rec; 00141 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); 00142 if (search_index < vec->size()) { 00143 vec->insert(edge_rec, search_index); 00144 } else { 00145 vec->push_back(edge_rec); 00146 } 00147 if (debug_level_ > 1) { 00148 tprintf("new edge in nodes_[" REFFORMAT "]: ", node1); 00149 print_edge_rec(edge_rec); 00150 tprintf("\n"); 00151 } 00152 num_edges_++; 00153 return true; 00154 } 00155 00156 void Trie::add_word_ending(EDGE_RECORD *edge_ptr, 00157 NODE_REF the_next_node, 00158 bool marker_flag, 00159 UNICHAR_ID unichar_id) { 00160 EDGE_RECORD *back_edge_ptr; 00161 EDGE_INDEX back_edge_index; 00162 ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, 00163 unichar_id, &back_edge_ptr, &back_edge_index)); 00164 if (marker_flag) { 00165 *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00166 *edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00167 } 00168 // Mark both directions as end of word. 00169 *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00170 *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00171 } 00172 00173 bool Trie::add_word_to_dawg(const WERD_CHOICE &word, 00174 const GenericVector<bool> *repetitions) { 00175 if (word.length() <= 0) return false; // can't add empty words 00176 if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length()); 00177 // Make sure the word does not contain invalid unchar ids. 00178 for (int i = 0; i < word.length(); ++i) { 00179 if (word.unichar_id(i) < 0 || 00180 word.unichar_id(i) >= unicharset_size_) return false; 00181 } 00182 00183 EDGE_RECORD *edge_ptr; 00184 NODE_REF last_node = 0; 00185 NODE_REF the_next_node; 00186 bool marker_flag = false; 00187 EDGE_INDEX edge_index; 00188 int i; 00189 inT32 still_finding_chars = true; 00190 inT32 word_end = false; 00191 bool add_failed = false; 00192 bool found; 00193 00194 if (debug_level_ > 1) word.print("\nAdding word: "); 00195 00196 UNICHAR_ID unichar_id; 00197 for (i = 0; i < word.length() - 1; ++i) { 00198 unichar_id = word.unichar_id(i); 00199 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00200 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00201 if (still_finding_chars) { 00202 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, 00203 unichar_id, &edge_ptr, &edge_index); 00204 if (found && debug_level_ > 1) { 00205 tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", 00206 edge_index, last_node); 00207 } 00208 if (!found) { 00209 still_finding_chars = false; 00210 } else if (next_node_from_edge_rec(*edge_ptr) == 0) { 00211 word_end = true; 00212 still_finding_chars = false; 00213 remove_edge(last_node, 0, word_end, unichar_id); 00214 } else { 00215 if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr); 00216 last_node = next_node_from_edge_rec(*edge_ptr); 00217 } 00218 } 00219 if (!still_finding_chars) { 00220 the_next_node = new_dawg_node(); 00221 if (debug_level_ > 1) 00222 tprintf("adding node " REFFORMAT "\n", the_next_node); 00223 if (the_next_node == 0) { 00224 add_failed = true; 00225 break; 00226 } 00227 if (!add_new_edge(last_node, the_next_node, 00228 marker_flag, word_end, unichar_id)) { 00229 add_failed = true; 00230 break; 00231 } 00232 word_end = false; 00233 last_node = the_next_node; 00234 } 00235 } 00236 the_next_node = 0; 00237 unichar_id = word.unichar_id(i); 00238 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00239 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00240 if (still_finding_chars && 00241 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, 00242 unichar_id, &edge_ptr, &edge_index)) { 00243 // An extension of this word already exists in the trie, so we 00244 // only have to add the ending flags in both directions. 00245 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), 00246 marker_flag, unichar_id); 00247 } else { 00248 if (!add_failed && 00249 !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) 00250 add_failed = true; 00251 } 00252 if (add_failed) { 00253 tprintf("Re-initializing document dictionary...\n"); 00254 clear(); 00255 return false; 00256 } else { 00257 return true; 00258 } 00259 } 00260 00261 NODE_REF Trie::new_dawg_node() { 00262 TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD(); 00263 if (node == NULL) return 0; // failed to create new node 00264 nodes_.push_back(node); 00265 return nodes_.length() - 1; 00266 } 00267 00268 bool Trie::read_word_list(const char *filename, 00269 const UNICHARSET &unicharset, 00270 Trie::RTLReversePolicy reverse_policy) { 00271 FILE *word_file; 00272 char string[CHARS_PER_LINE]; 00273 int word_count = 0; 00274 00275 word_file = open_file (filename, "r"); 00276 00277 while (fgets(string, CHARS_PER_LINE, word_file) != NULL) { 00278 chomp_string(string); // remove newline 00279 WERD_CHOICE word(string, unicharset); 00280 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && 00281 word.has_rtl_unichar_id()) || 00282 reverse_policy == RRP_FORCE_REVERSE) { 00283 word.reverse_and_mirror_unichar_ids(); 00284 } 00285 ++word_count; 00286 if (debug_level_ && word_count % 10000 == 0) 00287 tprintf("Read %d words so far\n", word_count); 00288 if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00289 if (!this->word_in_dawg(word)) { 00290 this->add_word_to_dawg(word); 00291 if (!this->word_in_dawg(word)) { 00292 tprintf("Error: word '%s' not in DAWG after adding it\n", string); 00293 return false; 00294 } 00295 } 00296 } else if (debug_level_) { 00297 tprintf("Skipping invalid word %s\n", string); 00298 if (debug_level_ >= 3) word.print(); 00299 } 00300 } 00301 if (debug_level_) 00302 tprintf("Read %d words total.\n", word_count); 00303 fclose(word_file); 00304 return true; 00305 } 00306 00307 void Trie::initialize_patterns(UNICHARSET *unicharset) { 00308 unicharset->unichar_insert(kAlphaPatternUnicode); 00309 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); 00310 unicharset->unichar_insert(kDigitPatternUnicode); 00311 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); 00312 unicharset->unichar_insert(kAlphanumPatternUnicode); 00313 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); 00314 unicharset->unichar_insert(kPuncPatternUnicode); 00315 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); 00316 unicharset->unichar_insert(kLowerPatternUnicode); 00317 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); 00318 unicharset->unichar_insert(kUpperPatternUnicode); 00319 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); 00320 initialized_patterns_ = true; 00321 unicharset_size_ = unicharset->size(); 00322 } 00323 00324 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, 00325 const UNICHARSET &unicharset, 00326 GenericVector<UNICHAR_ID> *vec) const { 00327 bool is_alpha = unicharset.get_isalpha(unichar_id); 00328 if (is_alpha) { 00329 vec->push_back(alpha_pattern_); 00330 vec->push_back(alphanum_pattern_); 00331 if (unicharset.get_islower(unichar_id)) { 00332 vec->push_back(lower_pattern_); 00333 } else if (unicharset.get_isupper(unichar_id)) { 00334 vec->push_back(upper_pattern_); 00335 } 00336 } 00337 if (unicharset.get_isdigit(unichar_id)) { 00338 vec->push_back(digit_pattern_); 00339 if (!is_alpha) vec->push_back(alphanum_pattern_); 00340 } 00341 if (unicharset.get_ispunctuation(unichar_id)) { 00342 vec->push_back(punc_pattern_); 00343 } 00344 } 00345 00346 UNICHAR_ID Trie::character_class_to_pattern(char ch) { 00347 if (ch == 'c') { 00348 return alpha_pattern_; 00349 } else if (ch == 'd') { 00350 return digit_pattern_; 00351 } else if (ch == 'n') { 00352 return alphanum_pattern_; 00353 } else if (ch == 'p') { 00354 return punc_pattern_; 00355 } else if (ch == 'a') { 00356 return lower_pattern_; 00357 } else if (ch == 'A') { 00358 return upper_pattern_; 00359 } else { 00360 return INVALID_UNICHAR_ID; 00361 } 00362 } 00363 00364 bool Trie::read_pattern_list(const char *filename, 00365 const UNICHARSET &unicharset) { 00366 if (!initialized_patterns_) { 00367 tprintf("please call initialize_patterns() before read_pattern_list()\n"); 00368 return false; 00369 } 00370 00371 FILE *pattern_file = open_file (filename, "r"); 00372 if (pattern_file == NULL) { 00373 tprintf("Error opening pattern file %s\n", filename); 00374 return false; 00375 } 00376 00377 int pattern_count = 0; 00378 char string[CHARS_PER_LINE]; 00379 while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) { 00380 chomp_string(string); // remove newline 00381 // Parse the pattern and construct a unichar id vector. 00382 // Record the number of repetitions of each unichar in the parallel vector. 00383 WERD_CHOICE word(&unicharset); 00384 GenericVector<bool> repetitions_vec; 00385 const char *str_ptr = string; 00386 int step = unicharset.step(str_ptr); 00387 bool failed = false; 00388 while (step > 0) { 00389 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00390 if (step == 1 && *str_ptr == '\\') { 00391 ++str_ptr; 00392 if (*str_ptr == '\\') { // regular '\' unichar that was escaped 00393 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00394 } else { 00395 if (word.length() < kSaneNumConcreteChars) { 00396 tprintf("Please provide at least %d concrete characters at the" 00397 " beginning of the pattern\n", kSaneNumConcreteChars); 00398 failed = true; 00399 break; 00400 } 00401 // Parse character class from expression. 00402 curr_unichar_id = character_class_to_pattern(*str_ptr); 00403 } 00404 } else { 00405 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00406 } 00407 if (curr_unichar_id == INVALID_UNICHAR_ID) { 00408 failed = true; 00409 break; // failed to parse this pattern 00410 } 00411 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0); 00412 repetitions_vec.push_back(false); 00413 str_ptr += step; 00414 step = unicharset.step(str_ptr); 00415 // Check if there is a repetition pattern specified after this unichar. 00416 if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') { 00417 repetitions_vec[repetitions_vec.size()-1] = true; 00418 str_ptr += 2; 00419 step = unicharset.step(str_ptr); 00420 } 00421 } 00422 if (failed) { 00423 tprintf("Invalid user pattern %s\n", string); 00424 continue; 00425 } 00426 // Insert the pattern into the trie. 00427 if (debug_level_ > 2) { 00428 tprintf("Inserting expanded user pattern %s\n", 00429 word.debug_string().string()); 00430 } 00431 if (!this->word_in_dawg(word)) { 00432 this->add_word_to_dawg(word, &repetitions_vec); 00433 if (!this->word_in_dawg(word)) { 00434 tprintf("Error: failed to insert pattern '%s'\n", string); 00435 } 00436 } 00437 ++pattern_count; 00438 } 00439 if (debug_level_) { 00440 tprintf("Read %d valid patterns from %s\n", pattern_count, filename); 00441 } 00442 fclose(pattern_file); 00443 return true; 00444 } 00445 00446 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00447 bool word_end, UNICHAR_ID unichar_id) { 00448 EDGE_RECORD *edge_ptr = NULL; 00449 EDGE_INDEX edge_index = 0; 00450 ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, 00451 unichar_id, &edge_ptr, &edge_index)); 00452 if (debug_level_ > 1) { 00453 tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1); 00454 print_edge_rec(*edge_ptr); 00455 tprintf("\n"); 00456 } 00457 if (direction == FORWARD_EDGE) { 00458 nodes_[node1]->forward_edges.remove(edge_index); 00459 } else { 00460 nodes_[node1]->backward_edges.remove(edge_index); 00461 } 00462 --num_edges_; 00463 } 00464 00465 SquishedDawg *Trie::trie_to_dawg() { 00466 if (debug_level_ > 2) { 00467 print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY); 00468 } 00469 NODE_MARKER reduced_nodes = new bool[nodes_.size()]; 00470 for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0; 00471 this->reduce_node_input(0, reduced_nodes); 00472 delete[] reduced_nodes; 00473 00474 if (debug_level_ > 2) { 00475 print_all("After reduction:", MAX_NODE_EDGES_DISPLAY); 00476 } 00477 // Build a translation map from node indices in nodes_ vector to 00478 // their target indices in EDGE_ARRAY. 00479 NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1]; 00480 int i, j; 00481 node_ref_map[0] = 0; 00482 for (i = 0; i < nodes_.size(); ++i) { 00483 node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size(); 00484 } 00485 int num_forward_edges = node_ref_map[i]; 00486 00487 // Convert nodes_ vector into EDGE_ARRAY translating the next node references 00488 // in edges using node_ref_map. Empty nodes and backward edges are dropped. 00489 EDGE_ARRAY edge_array = 00490 (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD)); 00491 EDGE_ARRAY edge_array_ptr = edge_array; 00492 for (i = 0; i < nodes_.size(); ++i) { 00493 TRIE_NODE_RECORD *node_ptr = nodes_[i]; 00494 int end = node_ptr->forward_edges.size(); 00495 for (j = 0; j < end; ++j) { 00496 EDGE_RECORD &edge_rec = node_ptr->forward_edges[j]; 00497 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00498 ASSERT_HOST(node_ref < nodes_.size()); 00499 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec); 00500 link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE, 00501 end_of_word_from_edge_rec(edge_rec), unichar_id); 00502 if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr); 00503 ++edge_array_ptr; 00504 } 00505 } 00506 delete[] node_ref_map; 00507 00508 return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, 00509 perm_, unicharset_size_, debug_level_); 00510 } 00511 00512 bool Trie::eliminate_redundant_edges(NODE_REF node, 00513 const EDGE_RECORD &edge1, 00514 const EDGE_RECORD &edge2) { 00515 if (debug_level_ > 1) { 00516 tprintf("\nCollapsing node %d:\n", node); 00517 print_node(node, MAX_NODE_EDGES_DISPLAY); 00518 tprintf("Candidate edges: "); 00519 print_edge_rec(edge1); 00520 tprintf(", "); 00521 print_edge_rec(edge2); 00522 tprintf("\n\n"); 00523 } 00524 NODE_REF next_node1 = next_node_from_edge_rec(edge1); 00525 NODE_REF next_node2 = next_node_from_edge_rec(edge2); 00526 TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2]; 00527 // Translate all edges going to/from next_node2 to go to/from next_node1. 00528 EDGE_RECORD *edge_ptr = NULL; 00529 EDGE_INDEX edge_index; 00530 int i; 00531 // Remove the backward link in node to next_node2. 00532 const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0]; 00533 remove_edge_linkage(node, next_node2, BACKWARD_EDGE, 00534 end_of_word_from_edge_rec(fwd_edge), 00535 unichar_id_from_edge_rec(fwd_edge)); 00536 // Copy all the backward links in next_node2 to node next_node1 00537 for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) { 00538 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i]; 00539 NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge); 00540 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge); 00541 int curr_word_end = end_of_word_from_edge_rec(bkw_edge); 00542 bool marker_flag = marker_flag_from_edge_rec(bkw_edge); 00543 add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, 00544 curr_word_end, curr_unichar_id); 00545 // Relocate the corresponding forward edge in curr_next_node 00546 ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, 00547 curr_word_end, curr_unichar_id, 00548 &edge_ptr, &edge_index)); 00549 set_next_node_in_edge_rec(edge_ptr, next_node1); 00550 } 00551 int next_node2_num_edges = (next_node2_ptr->forward_edges.size() + 00552 next_node2_ptr->backward_edges.size()); 00553 if (debug_level_ > 1) { 00554 tprintf("removed %d edges from node " REFFORMAT "\n", 00555 next_node2_num_edges, next_node2); 00556 } 00557 next_node2_ptr->forward_edges.clear(); 00558 next_node2_ptr->backward_edges.clear(); 00559 num_edges_ -= next_node2_num_edges; 00560 return true; 00561 } 00562 00563 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, 00564 UNICHAR_ID unichar_id, 00565 NODE_REF node, 00566 const EDGE_VECTOR &backward_edges, 00567 NODE_MARKER reduced_nodes) { 00568 if (debug_level_ > 1) 00569 tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index); 00570 // Compare each of the edge pairs with the given unichar_id. 00571 bool did_something = false; 00572 for (int i = edge_index; i < backward_edges.size() - 1; ++i) { 00573 // Find the first edge that can be eliminated. 00574 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00575 while (i < backward_edges.size() && 00576 ((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) == 00577 unichar_id) && 00578 !can_be_eliminated(backward_edges[i])) ++i; 00579 if (i == backward_edges.size() || curr_unichar_id != unichar_id) break; 00580 const EDGE_RECORD &edge_rec = backward_edges[i]; 00581 // Compare it to the rest of the edges with the given unichar_id. 00582 for (int j = i + 1; j < backward_edges.size(); ++j) { 00583 const EDGE_RECORD &next_edge_rec = backward_edges[j]; 00584 if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break; 00585 if (end_of_word_from_edge_rec(next_edge_rec) == 00586 end_of_word_from_edge_rec(edge_rec) && 00587 can_be_eliminated(next_edge_rec) && 00588 eliminate_redundant_edges(node, edge_rec, next_edge_rec)) { 00589 reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0; 00590 did_something = true; 00591 --j; // do not increment j if next_edge_rec was removed 00592 } 00593 } 00594 } 00595 return did_something; 00596 } 00597 00598 void Trie::sort_edges(EDGE_VECTOR *edges) { 00599 int num_edges = edges->size(); 00600 if (num_edges <= 1) return; 00601 for (int i = 0; i < num_edges - 1; ++i) { 00602 int min = i; 00603 for (int j = (i + 1); j < num_edges; ++j) { 00604 if (unichar_id_from_edge_rec((*edges)[j]) < 00605 unichar_id_from_edge_rec((*edges)[min])) min = j; 00606 } 00607 if (i != min) { 00608 EDGE_RECORD temp = (*edges)[i]; 00609 (*edges)[i] = (*edges)[min]; 00610 (*edges)[min] = temp; 00611 } 00612 } 00613 } 00614 00615 void Trie::reduce_node_input(NODE_REF node, 00616 NODE_MARKER reduced_nodes) { 00617 if (debug_level_ > 1) { 00618 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); 00619 print_node(node, MAX_NODE_EDGES_DISPLAY); 00620 } 00621 00622 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; 00623 if (node != 0) sort_edges(&backward_edges); 00624 EDGE_INDEX edge_index = 0; 00625 while (edge_index < backward_edges.size()) { 00626 UNICHAR_ID unichar_id = 00627 unichar_id_from_edge_rec(backward_edges[edge_index]); 00628 while (reduce_lettered_edges(edge_index, unichar_id, node, 00629 backward_edges, reduced_nodes)); 00630 while (++edge_index < backward_edges.size() && 00631 unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id); 00632 } 00633 reduced_nodes[node] = true; // mark as reduced 00634 00635 if (debug_level_ > 1) { 00636 tprintf("Node " REFFORMAT " after reduction:\n", node); 00637 print_node(node, MAX_NODE_EDGES_DISPLAY); 00638 } 00639 00640 for (int i = 0; i < backward_edges.size(); ++i) { 00641 NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]); 00642 if (next_node != 0 && !reduced_nodes[next_node]) { 00643 reduce_node_input(next_node, reduced_nodes); 00644 } 00645 } 00646 } 00647 00648 void Trie::print_node(NODE_REF node, int max_num_edges) const { 00649 if (node == NO_EDGE) return; // nothing to print 00650 TRIE_NODE_RECORD *node_ptr = nodes_[node]; 00651 int num_fwd = node_ptr->forward_edges.size(); 00652 int num_bkw = node_ptr->backward_edges.size(); 00653 EDGE_VECTOR *vec; 00654 for (int dir = 0; dir < 2; ++dir) { 00655 if (dir == 0) { 00656 vec = &(node_ptr->forward_edges); 00657 tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw); 00658 } else { 00659 vec = &(node_ptr->backward_edges); 00660 tprintf("\t"); 00661 } 00662 int i; 00663 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && 00664 i < max_num_edges; ++i) { 00665 print_edge_rec((*vec)[i]); 00666 tprintf(" "); 00667 } 00668 if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("..."); 00669 tprintf("\n"); 00670 } 00671 } 00672 } // namespace tesseract