Tesseract  3.02
tesseract-ocr/wordrec/wordrec.cpp
Go to the documentation of this file.
00001 
00002 // File:        wordrec.cpp
00003 // Description: wordrec class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #include "wordrec.h"
00020 
00021 #include "language_model.h"
00022 #include "params.h"
00023 
00024 
00025 namespace tesseract {
00026 Wordrec::Wordrec() :
00027   // control parameters
00028   BOOL_MEMBER(merge_fragments_in_matrix, TRUE,
00029               "Merge the fragments in the ratings matrix and delete them"
00030               " after merging", params()),
00031   BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
00032               params()),
00033   BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
00034               params()),
00035   BOOL_MEMBER(force_word_assoc, FALSE,
00036               "force associator to run regardless of what enable_assoc is."
00037               "This is used for CJK where component grouping is necessary.",
00038               CCUtil::params()),
00039   INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
00040              CCUtil::params()),
00041   double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
00042                 params()),
00043   BOOL_MEMBER(fragments_guide_chopper, FALSE,
00044               "Use information from fragments to guide chopping process",
00045               params()),
00046   INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
00047              params()),
00048   double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
00049                 params()),
00050   INT_MEMBER(chop_debug, 0, "Chop debug",
00051              params()),
00052   BOOL_MEMBER(chop_enable, 1, "Chop enable",
00053               params()),
00054   BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
00055             params()),
00056   INT_MEMBER(chop_split_length, 10000, "Split Length",
00057              params()),
00058   INT_MEMBER(chop_same_distance, 2, "Same distance",
00059              params()),
00060   INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
00061              params()),
00062   INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
00063              params()),
00064   INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
00065              params()),
00066   double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
00067                 params()),
00068   double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
00069                 params()),
00070   double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
00071                 params()),
00072   double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
00073                 params()),
00074   double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
00075                 params()),
00076   double_MEMBER(chop_ok_split, 100.0, "OK split limit",
00077                 params()),
00078   double_MEMBER(chop_good_split, 50.0, "Good split limit",
00079                 params()),
00080   INT_MEMBER(chop_x_y_weight, 3, "X / Y  length weight",
00081              params()),
00082   INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
00083              params()),
00084   BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE,
00085               "include fixed-pitch heuristics in char segmentation",
00086               params()),
00087   BOOL_MEMBER(use_new_state_cost, FALSE,
00088               "use new state cost heuristics for segmentation state evaluation",
00089               params()),
00090   double_MEMBER(heuristic_segcost_rating_base, 1.25,
00091                 "base factor for adding segmentation cost into word rating."
00092                 "It's a multiplying factor, the larger the value above 1, "
00093                 "the bigger the effect of segmentation cost.",
00094                 params()),
00095   double_MEMBER(heuristic_weight_rating, 1.0,
00096                 "weight associated with char rating in combined cost of state",
00097                 params()),
00098   double_MEMBER(heuristic_weight_width, 1000.0,
00099                 "weight associated with width evidence in combined cost of"
00100                 " state", params()),
00101   double_MEMBER(heuristic_weight_seamcut, 0.0,
00102                 "weight associated with seam cut in combined cost of state",
00103                 params()),
00104   double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
00105                 "max char width-to-height ratio allowed in segmentation",
00106                 params()),
00107   INT_MEMBER(wordrec_debug_level, 0,
00108              "Debug level for wordrec", params()),
00109   BOOL_MEMBER(wordrec_debug_blamer, false,
00110               "Print blamer debug messages", params()),
00111   BOOL_MEMBER(wordrec_run_blamer, false,
00112               "Try to set the blame for errors", params()),
00113   BOOL_MEMBER(enable_new_segsearch, true,
00114                    "Enable new segmentation search path.", params()),
00115   INT_MEMBER(segsearch_debug_level, 0,
00116              "SegSearch debug level", params()),
00117   INT_MEMBER(segsearch_max_pain_points, 2000,
00118              "Maximum number of pain points stored in the queue",
00119              params()),
00120   INT_MEMBER(segsearch_max_futile_classifications, 10,
00121              "Maximum number of pain point classifications per word that"
00122              "did not result in finding a better word choice.",
00123              params()),
00124   double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
00125                 "Maximum character width-to-height ratio", params()),
00126   double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
00127                 "Maximum character width-to-height ratio for"
00128                 " fixed-pitch fonts",
00129                 params()),
00130   BOOL_MEMBER(save_alt_choices, false,
00131               "Save alternative paths found during chopping"
00132               " and segmentation search",
00133               params()) {
00134   prev_word_best_choice_ = NULL;
00135   language_model_ = new LanguageModel(&get_fontinfo_table(),
00136                                       &(getDict()));
00137   pass2_seg_states = 0;
00138   num_joints = 0;
00139   num_pushed = 0;
00140   num_popped = 0;
00141   fill_lattice_ = NULL;
00142 }
00143 
00144 Wordrec::~Wordrec() {
00145   delete language_model_;
00146 }
00147 
00148 void Wordrec::CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from,
00149                               BLOB_CHOICE_LIST_VECTOR *to) {
00150   to->delete_data_pointers();
00151   to->clear();
00152   for (int i = 0; i < from.size(); ++i) {
00153     BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
00154     cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy);
00155     to->push_back(cc_list);
00156   }
00157 }
00158 
00159 bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set,
00160                               const WERD_CHOICE *choice,
00161                               const GenericVector<STRING> &truth_text) {
00162   if (choice == NULL) return false;
00163   int i;
00164   STRING truth_str;
00165   for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i];
00166   STRING normed_choice_str;
00167   for (i = 0; i < choice->length(); ++i) {
00168     normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i));
00169   }
00170   return (truth_str == normed_choice_str);
00171 }
00172 
00173 void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
00174   ASSERT_HOST(word->alt_choices.empty());
00175   ASSERT_HOST(word->alt_states.empty());
00176   LIST list_it;
00177   iterate_list(list_it, best_choices) {
00178     VIABLE_CHOICE choice =
00179         reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
00180     CHAR_CHOICE *char_choice = &(choice->Blob[0]);
00181     WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
00182     word->alt_states.push_back(GenericVector<int>(choice->Length));
00183     GenericVector<int> &alt_state = word->alt_states.back();
00184     for (int i = 0; i < choice->Length; char_choice++, i++) {
00185       alt_choice->append_unichar_id_space_allocated(
00186           char_choice->Class, 1, 0, 0);
00187       alt_state.push_back(char_choice->NumChunks);
00188     }
00189     alt_choice->set_rating(choice->Rating);
00190     alt_choice->set_certainty(choice->Certainty);
00191     word->alt_choices.push_back(alt_choice);
00192     if (wordrec_debug_level > 0) {
00193       tprintf("SaveAltChoices: %s %g\n",
00194               alt_choice->unichar_string().string(), alt_choice->rating());
00195     }
00196   }
00197 }
00198 
00199 }  // namespace tesseract