Tesseract  3.02
tesseract-ocr/dict/trie.cpp
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        trie.c  (Formerly trie.c)
00005  * Description:  Functions to build a trie data structure.
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 #ifdef _MSC_VER
00029 #pragma warning(disable:4244)  // Conversion warnings
00030 #pragma warning(disable:4800)  // int/bool warnings
00031 #endif
00032 #include "trie.h"
00033 
00034 #include "callcpp.h"
00035 #include "dawg.h"
00036 #include "dict.h"
00037 #include "freelist.h"
00038 #include "genericvector.h"
00039 #include "helpers.h"
00040 
00041 namespace tesseract {
00042 
00043 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
00044 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
00045 const char kForceReverse[] = "RRP_FORCE_REVERSE";
00046 
00047 const char * const RTLReversePolicyNames[] = {
00048   kDoNotReverse,
00049   kReverseIfHasRTL,
00050   kForceReverse
00051 };
00052 
00053 const char Trie::kAlphaPatternUnicode[] = "\u2000";
00054 const char Trie::kDigitPatternUnicode[] = "\u2001";
00055 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
00056 const char Trie::kPuncPatternUnicode[] = "\u2003";
00057 const char Trie::kLowerPatternUnicode[] = "\u2004";
00058 const char Trie::kUpperPatternUnicode[] = "\u2005";
00059 
00060 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
00061   return RTLReversePolicyNames[reverse_policy];
00062 }
00063 
00064 // Reset the Trie to empty.
00065 void Trie::clear() {
00066   nodes_.delete_data_pointers();
00067   nodes_.clear();
00068   num_edges_ = 0;
00069   new_dawg_node();  // Need to allocate node 0.
00070 }
00071 
00072 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node,
00073                         int direction, bool word_end, UNICHAR_ID unichar_id,
00074                         EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
00075   if (debug_level_ == 3) {
00076     tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
00077             " direction %d word_end %d unichar_id %d, exploring node:\n",
00078             node_ref, next_node, direction, word_end, unichar_id);
00079     if (node_ref != NO_EDGE) {
00080       print_node(node_ref, nodes_[node_ref]->forward_edges.size());
00081     }
00082   }
00083   if (node_ref == NO_EDGE) return false;
00084   assert(node_ref < nodes_.size());
00085   EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
00086     nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
00087   int vec_size = vec.size();
00088   if (node_ref == 0) { // binary search
00089     EDGE_INDEX start = 0;
00090     EDGE_INDEX end = vec_size - 1;
00091     EDGE_INDEX k;
00092     int compare;
00093     while (start <= end) {
00094       k = (start + end) >> 1;  // (start + end) / 2
00095       compare = given_greater_than_edge_rec(next_node, word_end,
00096                                             unichar_id, vec[k]);
00097       if (compare == 0) {  // given == vec[k]
00098         *edge_ptr = &(vec[k]);
00099         *edge_index = k;
00100         return true;
00101       } else if (compare == 1) {  // given > vec[k]
00102         start = k + 1;
00103       } else {  // given < vec[k]
00104         end = k - 1;
00105       }
00106     }
00107   } else {  // linear search
00108     for (int i = 0; i < vec_size; ++i) {
00109       EDGE_RECORD &edge_rec = vec[i];
00110       if (edge_rec_match(next_node, word_end, unichar_id,
00111                          next_node_from_edge_rec(edge_rec),
00112                          end_of_word_from_edge_rec(edge_rec),
00113                          unichar_id_from_edge_rec(edge_rec))) {
00114         *edge_ptr = &(edge_rec);
00115         *edge_index = i;
00116         return true;
00117       }
00118     }
00119   }
00120   return false;  // not found
00121 }
00122 
00123 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
00124                             int direction, bool word_end,
00125                             UNICHAR_ID unichar_id) {
00126   if (num_edges_ == max_num_edges_) return false;
00127   EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
00128     &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
00129   int search_index;
00130   if (node1 == 0) {
00131     search_index = 0;  // find the index to make the add sorted
00132     while (search_index < vec->size() &&
00133            given_greater_than_edge_rec(node2, word_end, unichar_id,
00134                                        (*vec)[search_index]) == 1) {
00135       search_index++;
00136     }
00137   } else {
00138     search_index = vec->size();  // add is unsorted, so index does not matter
00139   }
00140   EDGE_RECORD edge_rec;
00141   link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
00142   if (search_index < vec->size()) {
00143     vec->insert(edge_rec, search_index);
00144   } else {
00145     vec->push_back(edge_rec);
00146   }
00147   if (debug_level_ > 1) {
00148     tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
00149     print_edge_rec(edge_rec);
00150     tprintf("\n");
00151   }
00152   num_edges_++;
00153   return true;
00154 }
00155 
00156 void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
00157                            NODE_REF the_next_node,
00158                            bool marker_flag,
00159                            UNICHAR_ID unichar_id) {
00160   EDGE_RECORD *back_edge_ptr;
00161   EDGE_INDEX back_edge_index;
00162   ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
00163                            unichar_id, &back_edge_ptr, &back_edge_index));
00164   if (marker_flag) {
00165     *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
00166     *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
00167   }
00168   // Mark both directions as end of word.
00169   *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
00170   *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
00171 }
00172 
00173 bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
00174                             const GenericVector<bool> *repetitions) {
00175   if (word.length() <= 0) return false;  // can't add empty words
00176   if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
00177   // Make sure the word does not contain invalid unchar ids.
00178   for (int i = 0; i < word.length(); ++i) {
00179     if (word.unichar_id(i) < 0 ||
00180         word.unichar_id(i) >= unicharset_size_) return false;
00181   }
00182 
00183   EDGE_RECORD *edge_ptr;
00184   NODE_REF last_node = 0;
00185   NODE_REF the_next_node;
00186   bool marker_flag = false;
00187   EDGE_INDEX edge_index;
00188   int i;
00189   inT32 still_finding_chars = true;
00190   inT32 word_end = false;
00191   bool  add_failed = false;
00192   bool found;
00193 
00194   if (debug_level_ > 1) word.print("\nAdding word: ");
00195 
00196   UNICHAR_ID unichar_id;
00197   for (i = 0; i < word.length() - 1; ++i) {
00198     unichar_id = word.unichar_id(i);
00199     marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
00200     if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
00201     if (still_finding_chars) {
00202       found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
00203                            unichar_id, &edge_ptr, &edge_index);
00204       if (found && debug_level_ > 1) {
00205         tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
00206                 edge_index, last_node);
00207       }
00208       if (!found) {
00209         still_finding_chars = false;
00210       } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
00211         word_end = true;
00212         still_finding_chars = false;
00213         remove_edge(last_node, 0, word_end, unichar_id);
00214       } else {
00215         if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
00216         last_node = next_node_from_edge_rec(*edge_ptr);
00217       }
00218     }
00219     if (!still_finding_chars) {
00220       the_next_node = new_dawg_node();
00221       if (debug_level_ > 1)
00222         tprintf("adding node " REFFORMAT "\n", the_next_node);
00223       if (the_next_node == 0) {
00224         add_failed = true;
00225         break;
00226       }
00227       if (!add_new_edge(last_node, the_next_node,
00228                         marker_flag, word_end, unichar_id)) {
00229         add_failed = true;
00230         break;
00231       }
00232       word_end = false;
00233       last_node = the_next_node;
00234     }
00235   }
00236   the_next_node = 0;
00237   unichar_id = word.unichar_id(i);
00238   marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
00239   if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
00240   if (still_finding_chars &&
00241       edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
00242                    unichar_id, &edge_ptr, &edge_index)) {
00243     // An extension of this word already exists in the trie, so we
00244     // only have to add the ending flags in both directions.
00245     add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
00246                     marker_flag, unichar_id);
00247   } else {
00248     if (!add_failed &&
00249         !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
00250       add_failed = true;
00251   }
00252   if (add_failed) {
00253     tprintf("Re-initializing document dictionary...\n");
00254     clear();
00255     return false;
00256   } else {
00257     return true;
00258   }
00259 }
00260 
00261 NODE_REF Trie::new_dawg_node() {
00262   TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
00263   if (node == NULL) return 0;  // failed to create new node
00264   nodes_.push_back(node);
00265   return nodes_.length() - 1;
00266 }
00267 
00268 bool Trie::read_word_list(const char *filename,
00269                           const UNICHARSET &unicharset,
00270                           Trie::RTLReversePolicy reverse_policy) {
00271   FILE *word_file;
00272   char string[CHARS_PER_LINE];
00273   int  word_count = 0;
00274 
00275   word_file = open_file (filename, "r");
00276 
00277   while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
00278     chomp_string(string);  // remove newline
00279     WERD_CHOICE word(string, unicharset);
00280     if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
00281         word.has_rtl_unichar_id()) ||
00282         reverse_policy == RRP_FORCE_REVERSE) {
00283       word.reverse_and_mirror_unichar_ids();
00284     }
00285     ++word_count;
00286     if (debug_level_ && word_count % 10000 == 0)
00287       tprintf("Read %d words so far\n", word_count);
00288     if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
00289       if (!this->word_in_dawg(word)) {
00290         this->add_word_to_dawg(word);
00291         if (!this->word_in_dawg(word)) {
00292           tprintf("Error: word '%s' not in DAWG after adding it\n", string);
00293           return false;
00294         }
00295       }
00296     } else if (debug_level_) {
00297       tprintf("Skipping invalid word %s\n", string);
00298       if (debug_level_ >= 3) word.print();
00299     }
00300   }
00301   if (debug_level_)
00302     tprintf("Read %d words total.\n", word_count);
00303   fclose(word_file);
00304   return true;
00305 }
00306 
00307 void Trie::initialize_patterns(UNICHARSET *unicharset) {
00308   unicharset->unichar_insert(kAlphaPatternUnicode);
00309   alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
00310   unicharset->unichar_insert(kDigitPatternUnicode);
00311   digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
00312   unicharset->unichar_insert(kAlphanumPatternUnicode);
00313   alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
00314   unicharset->unichar_insert(kPuncPatternUnicode);
00315   punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
00316   unicharset->unichar_insert(kLowerPatternUnicode);
00317   lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
00318   unicharset->unichar_insert(kUpperPatternUnicode);
00319   upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
00320   initialized_patterns_ = true;
00321   unicharset_size_ = unicharset->size();
00322 }
00323 
00324 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
00325                                   const UNICHARSET &unicharset,
00326                                   GenericVector<UNICHAR_ID> *vec) const {
00327   bool is_alpha = unicharset.get_isalpha(unichar_id);
00328   if (is_alpha) {
00329     vec->push_back(alpha_pattern_);
00330     vec->push_back(alphanum_pattern_);
00331     if (unicharset.get_islower(unichar_id)) {
00332       vec->push_back(lower_pattern_);
00333     } else if (unicharset.get_isupper(unichar_id)) {
00334       vec->push_back(upper_pattern_);
00335     }
00336   }
00337   if (unicharset.get_isdigit(unichar_id)) {
00338     vec->push_back(digit_pattern_);
00339     if (!is_alpha) vec->push_back(alphanum_pattern_);
00340   }
00341   if (unicharset.get_ispunctuation(unichar_id)) {
00342     vec->push_back(punc_pattern_);
00343   }
00344 }
00345 
00346 UNICHAR_ID Trie::character_class_to_pattern(char ch) {
00347   if (ch == 'c') {
00348     return alpha_pattern_;
00349   } else if (ch == 'd') {
00350     return digit_pattern_;
00351   } else if (ch == 'n') {
00352     return alphanum_pattern_;
00353   } else if (ch == 'p') {
00354     return punc_pattern_;
00355   } else if (ch == 'a') {
00356     return lower_pattern_;
00357   } else if (ch == 'A') {
00358     return upper_pattern_;
00359   } else {
00360     return INVALID_UNICHAR_ID;
00361   }
00362 }
00363 
00364 bool Trie::read_pattern_list(const char *filename,
00365                              const UNICHARSET &unicharset) {
00366   if (!initialized_patterns_) {
00367     tprintf("please call initialize_patterns() before read_pattern_list()\n");
00368     return false;
00369   }
00370 
00371   FILE *pattern_file = open_file (filename, "r");
00372   if (pattern_file == NULL) {
00373     tprintf("Error opening pattern file %s\n", filename);
00374     return false;
00375   }
00376 
00377   int pattern_count = 0;
00378   char string[CHARS_PER_LINE];
00379   while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
00380     chomp_string(string);  // remove newline
00381     // Parse the pattern and construct a unichar id vector.
00382     // Record the number of repetitions of each unichar in the parallel vector.
00383     WERD_CHOICE word(&unicharset);
00384     GenericVector<bool> repetitions_vec;
00385     const char *str_ptr = string;
00386     int step = unicharset.step(str_ptr);
00387     bool failed = false;
00388     while (step > 0) {
00389       UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
00390       if (step == 1 && *str_ptr == '\\') {
00391         ++str_ptr;
00392         if (*str_ptr == '\\') {  // regular '\' unichar that was escaped
00393           curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
00394         } else {
00395           if (word.length() < kSaneNumConcreteChars) {
00396             tprintf("Please provide at least %d concrete characters at the"
00397                     " beginning of the pattern\n", kSaneNumConcreteChars);
00398             failed = true;
00399             break;
00400           }
00401           // Parse character class from expression.
00402           curr_unichar_id = character_class_to_pattern(*str_ptr);
00403         }
00404       } else {
00405         curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
00406       }
00407       if (curr_unichar_id ==  INVALID_UNICHAR_ID) {
00408         failed = true;
00409         break;  // failed to parse this pattern
00410       }
00411       word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
00412       repetitions_vec.push_back(false);
00413       str_ptr += step;
00414       step = unicharset.step(str_ptr);
00415       // Check if there is a repetition pattern specified after this unichar.
00416       if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
00417         repetitions_vec[repetitions_vec.size()-1] = true;
00418         str_ptr += 2;
00419         step = unicharset.step(str_ptr);
00420       }
00421     }
00422     if (failed) {
00423       tprintf("Invalid user pattern %s\n", string);
00424       continue;
00425     }
00426     // Insert the pattern into the trie.
00427     if (debug_level_ > 2) {
00428       tprintf("Inserting expanded user pattern %s\n",
00429               word.debug_string().string());
00430     }
00431     if (!this->word_in_dawg(word)) {
00432       this->add_word_to_dawg(word, &repetitions_vec);
00433       if (!this->word_in_dawg(word)) {
00434         tprintf("Error: failed to insert pattern '%s'\n", string);
00435       }
00436     }
00437     ++pattern_count;
00438   }
00439   if (debug_level_) {
00440     tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
00441   }
00442   fclose(pattern_file);
00443   return true;
00444 }
00445 
00446 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00447                                bool word_end, UNICHAR_ID unichar_id) {
00448   EDGE_RECORD *edge_ptr = NULL;
00449   EDGE_INDEX edge_index = 0;
00450   ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
00451                            unichar_id, &edge_ptr, &edge_index));
00452   if (debug_level_ > 1) {
00453     tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
00454     print_edge_rec(*edge_ptr);
00455     tprintf("\n");
00456   }
00457   if (direction == FORWARD_EDGE) {
00458     nodes_[node1]->forward_edges.remove(edge_index);
00459   } else {
00460     nodes_[node1]->backward_edges.remove(edge_index);
00461   }
00462   --num_edges_;
00463 }
00464 
00465 SquishedDawg *Trie::trie_to_dawg() {
00466   if (debug_level_ > 2) {
00467     print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
00468   }
00469   NODE_MARKER reduced_nodes = new bool[nodes_.size()];
00470   for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
00471   this->reduce_node_input(0, reduced_nodes);
00472   delete[] reduced_nodes;
00473 
00474   if (debug_level_ > 2) {
00475     print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
00476   }
00477   // Build a translation map from node indices in nodes_ vector to
00478   // their target indices in EDGE_ARRAY.
00479   NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
00480   int i, j;
00481   node_ref_map[0] = 0;
00482   for (i = 0; i < nodes_.size(); ++i) {
00483     node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
00484   }
00485   int num_forward_edges = node_ref_map[i];
00486 
00487   // Convert nodes_ vector into EDGE_ARRAY translating the next node references
00488   // in edges using node_ref_map. Empty nodes and backward edges are dropped.
00489   EDGE_ARRAY edge_array =
00490     (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
00491   EDGE_ARRAY edge_array_ptr = edge_array;
00492   for (i = 0; i < nodes_.size(); ++i) {
00493     TRIE_NODE_RECORD *node_ptr = nodes_[i];
00494     int end = node_ptr->forward_edges.size();
00495     for (j = 0; j < end; ++j) {
00496       EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
00497       NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
00498       ASSERT_HOST(node_ref < nodes_.size());
00499       UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
00500       link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
00501                 end_of_word_from_edge_rec(edge_rec), unichar_id);
00502       if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
00503       ++edge_array_ptr;
00504     }
00505   }
00506   delete[] node_ref_map;
00507 
00508   return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
00509                           perm_, unicharset_size_, debug_level_);
00510 }
00511 
00512 bool Trie::eliminate_redundant_edges(NODE_REF node,
00513                                      const EDGE_RECORD &edge1,
00514                                      const EDGE_RECORD &edge2) {
00515   if (debug_level_ > 1) {
00516     tprintf("\nCollapsing node %d:\n", node);
00517     print_node(node, MAX_NODE_EDGES_DISPLAY);
00518     tprintf("Candidate edges: ");
00519     print_edge_rec(edge1);
00520     tprintf(", ");
00521     print_edge_rec(edge2);
00522     tprintf("\n\n");
00523   }
00524   NODE_REF next_node1 = next_node_from_edge_rec(edge1);
00525   NODE_REF next_node2 = next_node_from_edge_rec(edge2);
00526   TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
00527   // Translate all edges going to/from next_node2 to go to/from next_node1.
00528   EDGE_RECORD *edge_ptr = NULL;
00529   EDGE_INDEX edge_index;
00530   int i;
00531   // Remove the backward link in node to next_node2.
00532   const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0];
00533   remove_edge_linkage(node, next_node2, BACKWARD_EDGE,
00534                       end_of_word_from_edge_rec(fwd_edge),
00535                       unichar_id_from_edge_rec(fwd_edge));
00536   // Copy all the backward links in next_node2 to node next_node1
00537   for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
00538     const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
00539     NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
00540     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
00541     int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
00542     bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
00543     add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
00544                      curr_word_end, curr_unichar_id);
00545     // Relocate the corresponding forward edge in curr_next_node
00546     ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
00547                              curr_word_end, curr_unichar_id,
00548                              &edge_ptr, &edge_index));
00549     set_next_node_in_edge_rec(edge_ptr, next_node1);
00550   }
00551   int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
00552                               next_node2_ptr->backward_edges.size());
00553   if (debug_level_ > 1) {
00554     tprintf("removed %d edges from node " REFFORMAT "\n",
00555             next_node2_num_edges, next_node2);
00556   }
00557   next_node2_ptr->forward_edges.clear();
00558   next_node2_ptr->backward_edges.clear();
00559   num_edges_ -= next_node2_num_edges;
00560   return true;
00561 }
00562 
00563 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index,
00564                                  UNICHAR_ID unichar_id,
00565                                  NODE_REF node,
00566                                  const EDGE_VECTOR &backward_edges,
00567                                  NODE_MARKER reduced_nodes) {
00568   if (debug_level_ > 1)
00569     tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
00570   // Compare each of the edge pairs with the given unichar_id.
00571   bool did_something = false;
00572   for (int i = edge_index; i < backward_edges.size() - 1; ++i) {
00573     // Find the first edge that can be eliminated.
00574     UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
00575     while (i < backward_edges.size() &&
00576            ((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) ==
00577             unichar_id) &&
00578            !can_be_eliminated(backward_edges[i])) ++i;
00579     if (i == backward_edges.size() || curr_unichar_id != unichar_id) break;
00580     const EDGE_RECORD &edge_rec = backward_edges[i];
00581     // Compare it to the rest of the edges with the given unichar_id.
00582     for (int j = i + 1; j < backward_edges.size(); ++j) {
00583       const EDGE_RECORD &next_edge_rec = backward_edges[j];
00584       if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break;
00585       if (end_of_word_from_edge_rec(next_edge_rec) ==
00586           end_of_word_from_edge_rec(edge_rec) &&
00587           can_be_eliminated(next_edge_rec) &&
00588           eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
00589         reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
00590         did_something = true;
00591         --j;  // do not increment j if next_edge_rec was removed
00592       }
00593     }
00594   }
00595   return did_something;
00596 }
00597 
00598 void Trie::sort_edges(EDGE_VECTOR *edges) {
00599   int num_edges = edges->size();
00600   if (num_edges <= 1) return;
00601   for (int i = 0; i < num_edges - 1; ++i) {
00602     int min = i;
00603     for (int j = (i + 1); j < num_edges; ++j) {
00604       if (unichar_id_from_edge_rec((*edges)[j]) <
00605           unichar_id_from_edge_rec((*edges)[min])) min = j;
00606     }
00607     if (i != min) {
00608       EDGE_RECORD temp = (*edges)[i];
00609       (*edges)[i] = (*edges)[min];
00610       (*edges)[min] = temp;
00611     }
00612   }
00613 }
00614 
00615 void Trie::reduce_node_input(NODE_REF node,
00616                              NODE_MARKER reduced_nodes) {
00617   if (debug_level_ > 1) {
00618     tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
00619     print_node(node, MAX_NODE_EDGES_DISPLAY);
00620   }
00621 
00622   EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
00623   if (node != 0) sort_edges(&backward_edges);
00624   EDGE_INDEX edge_index = 0;
00625   while (edge_index < backward_edges.size()) {
00626     UNICHAR_ID unichar_id =
00627       unichar_id_from_edge_rec(backward_edges[edge_index]);
00628     while (reduce_lettered_edges(edge_index, unichar_id, node,
00629                                  backward_edges, reduced_nodes));
00630     while (++edge_index < backward_edges.size() &&
00631            unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id);
00632   }
00633   reduced_nodes[node] = true;  // mark as reduced
00634 
00635   if (debug_level_ > 1) {
00636     tprintf("Node " REFFORMAT " after reduction:\n", node);
00637     print_node(node, MAX_NODE_EDGES_DISPLAY);
00638   }
00639 
00640   for (int i = 0; i < backward_edges.size(); ++i) {
00641     NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
00642     if (next_node != 0 && !reduced_nodes[next_node]) {
00643       reduce_node_input(next_node, reduced_nodes);
00644     }
00645   }
00646 }
00647 
00648 void Trie::print_node(NODE_REF node, int max_num_edges) const {
00649   if (node == NO_EDGE) return;  // nothing to print
00650   TRIE_NODE_RECORD *node_ptr = nodes_[node];
00651   int num_fwd = node_ptr->forward_edges.size();
00652   int num_bkw = node_ptr->backward_edges.size();
00653   EDGE_VECTOR *vec;
00654   for (int dir = 0; dir < 2; ++dir) {
00655     if (dir == 0) {
00656       vec = &(node_ptr->forward_edges);
00657       tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
00658     } else {
00659       vec = &(node_ptr->backward_edges);
00660       tprintf("\t");
00661     }
00662     int i;
00663     for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
00664          i < max_num_edges; ++i) {
00665       print_edge_rec((*vec)[i]);
00666       tprintf(" ");
00667     }
00668     if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
00669     tprintf("\n");
00670   }
00671 }
00672 }  // namespace tesseract