Tesseract  3.02
tesseract-ocr/cube/search_column.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        search_column.cpp
00003  * Description: Implementation of the Beam Search Column Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "search_column.h"
00021 #include <stdlib.h>
00022 
00023 namespace tesseract {
00024 
00025 SearchColumn::SearchColumn(int col_idx, int max_node) {
00026   col_idx_ = col_idx;
00027   node_cnt_ = 0;
00028   node_array_ = NULL;
00029   max_node_cnt_ = max_node;
00030   node_hash_table_ = NULL;
00031   init_ = false;
00032   min_cost_ = INT_MAX;
00033   max_cost_ = 0;
00034 }
00035 
00036 // Cleanup data
00037 void SearchColumn::Cleanup() {
00038   if (node_array_ != NULL) {
00039     for (int node_idx = 0; node_idx < node_cnt_; node_idx++) {
00040       if (node_array_[node_idx] != NULL) {
00041         delete node_array_[node_idx];
00042       }
00043     }
00044 
00045     delete []node_array_;
00046     node_array_ = NULL;
00047   }
00048   FreeHashTable();
00049   init_ = false;
00050 }
00051 
00052 SearchColumn::~SearchColumn() {
00053   Cleanup();
00054 }
00055 
00056 // Initializations
00057 bool SearchColumn::Init() {
00058   if (init_ == true) {
00059     return true;
00060   }
00061 
00062   // create hash table
00063   if (node_hash_table_ == NULL) {
00064     node_hash_table_ = new SearchNodeHashTable();
00065     if (node_hash_table_ == NULL) {
00066       return false;
00067     }
00068   }
00069 
00070   init_ = true;
00071 
00072   return true;
00073 }
00074 
00075 // Prune the nodes if necessary. Pruning is done such that a max
00076 // number of nodes is kept, i.e., the beam width
00077 void SearchColumn::Prune() {
00078   // no need to prune
00079   if (node_cnt_ <= max_node_cnt_) {
00080     return;
00081   }
00082 
00083   // compute the cost histogram
00084   memset(score_bins_, 0, sizeof(score_bins_));
00085   int cost_range = max_cost_ - min_cost_ + 1;
00086   for (int node_idx = 0; node_idx < node_cnt_; node_idx++) {
00087     int cost_bin = static_cast<int>(
00088         ((node_array_[node_idx]->BestCost() - min_cost_) *
00089          kScoreBins) / static_cast<double>(cost_range));
00090     if (cost_bin >= kScoreBins) {
00091       cost_bin = kScoreBins - 1;
00092     }
00093     score_bins_[cost_bin]++;
00094   }
00095 
00096   // determine the pruning cost by scanning the cost histogram from
00097   // least to greatest cost bins and finding the cost at which the
00098   // max number of nodes is exceeded
00099   int pruning_cost = 0;
00100   int new_node_cnt = 0;
00101   for (int cost_bin = 0; cost_bin < kScoreBins; cost_bin++) {
00102     if (new_node_cnt > 0 &&
00103         (new_node_cnt + score_bins_[cost_bin]) > max_node_cnt_) {
00104       pruning_cost = min_cost_ + ((cost_bin * cost_range) / kScoreBins);
00105       break;
00106     }
00107     new_node_cnt += score_bins_[cost_bin];
00108   }
00109 
00110   // prune out all the nodes above this cost
00111   for (int node_idx = new_node_cnt = 0; node_idx < node_cnt_; node_idx++) {
00112     // prune this node out
00113     if (node_array_[node_idx]->BestCost() > pruning_cost ||
00114         new_node_cnt > max_node_cnt_) {
00115       delete node_array_[node_idx];
00116     } else {
00117       // keep it
00118       node_array_[new_node_cnt++] = node_array_[node_idx];
00119     }
00120   }
00121   node_cnt_ = new_node_cnt;
00122 }
00123 
00124 // sort all nodes
00125 void SearchColumn::Sort() {
00126   if (node_cnt_ > 0 && node_array_ != NULL) {
00127     qsort(node_array_, node_cnt_, sizeof(*node_array_),
00128           SearchNode::SearchNodeComparer);
00129   }
00130 }
00131 
00132 // add a new node
00133 SearchNode *SearchColumn::AddNode(LangModEdge *edge, int reco_cost,
00134                                   SearchNode *parent_node,
00135                                   CubeRecoContext *cntxt) {
00136   // init if necessary
00137   if (init_ == false && Init() == false) {
00138     return NULL;
00139   }
00140 
00141   // find out if we have an node with the same edge
00142   // look in the hash table
00143   SearchNode *new_node = node_hash_table_->Lookup(edge, parent_node);
00144   // node does not exist
00145   if (new_node == NULL) {
00146     new_node = new SearchNode(cntxt, parent_node, reco_cost, edge, col_idx_);
00147     if (new_node == NULL) {
00148       return NULL;
00149     }
00150 
00151     // if the max node count has already been reached, check if the cost of
00152     // the new node exceeds the max cost. This indicates that it will be pruned
00153     // and so there is no point adding it
00154     if (node_cnt_ >= max_node_cnt_ && new_node->BestCost() > max_cost_) {
00155       delete new_node;
00156       return NULL;
00157     }
00158 
00159     // expand the node buffer if necc
00160     if ((node_cnt_ % kNodeAllocChunk) == 0) {
00161       // alloc a new buff
00162       SearchNode **new_node_buff =
00163           new SearchNode *[node_cnt_ + kNodeAllocChunk];
00164       if (new_node_buff == NULL) {
00165         delete new_node;
00166         return NULL;
00167       }
00168 
00169       // free existing after copying contents
00170       if (node_array_ != NULL) {
00171         memcpy(new_node_buff, node_array_, node_cnt_ * sizeof(*new_node_buff));
00172         delete []node_array_;
00173       }
00174 
00175       node_array_ = new_node_buff;
00176     }
00177 
00178     // add the node to the hash table only if it is non-OOD edge
00179     // because the langmod state is not unique
00180     if (edge->IsOOD() == false) {
00181       if (!node_hash_table_->Insert(edge, new_node)) {
00182         printf("Hash table full!!!");
00183         delete new_node;
00184         return NULL;
00185       }
00186     }
00187 
00188     node_array_[node_cnt_++] = new_node;
00189 
00190   } else {
00191     // node exists before
00192     // if no update occurred, return NULL
00193     if (new_node->UpdateParent(parent_node, reco_cost, edge) == false) {
00194       new_node = NULL;
00195     }
00196 
00197     // free the edge
00198     if (edge != NULL) {
00199       delete edge;
00200     }
00201   }
00202 
00203   // update Min and Max Costs
00204   if (new_node != NULL) {
00205     if (min_cost_ > new_node->BestCost()) {
00206       min_cost_ = new_node->BestCost();
00207     }
00208 
00209     if (max_cost_ < new_node->BestCost()) {
00210       max_cost_ = new_node->BestCost();
00211     }
00212   }
00213 
00214   return new_node;
00215 }
00216 
00217 SearchNode *SearchColumn::BestNode() {
00218   SearchNode *best_node = NULL;
00219 
00220   for (int node_idx = 0; node_idx < node_cnt_; node_idx++) {
00221     if (best_node == NULL ||
00222         best_node->BestCost() > node_array_[node_idx]->BestCost()) {
00223       best_node = node_array_[node_idx];
00224     }
00225   }
00226 
00227   return best_node;
00228 }
00229 }  // namespace tesseract