Tesseract
3.02
|
00001 00002 // File: wordrec.cpp 00003 // Description: wordrec class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include "wordrec.h" 00020 00021 #include "language_model.h" 00022 #include "params.h" 00023 00024 00025 namespace tesseract { 00026 Wordrec::Wordrec() : 00027 // control parameters 00028 BOOL_MEMBER(merge_fragments_in_matrix, TRUE, 00029 "Merge the fragments in the ratings matrix and delete them" 00030 " after merging", params()), 00031 BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information", 00032 params()), 00033 BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable", 00034 params()), 00035 BOOL_MEMBER(force_word_assoc, FALSE, 00036 "force associator to run regardless of what enable_assoc is." 00037 "This is used for CJK where component grouping is necessary.", 00038 CCUtil::params()), 00039 INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states", 00040 CCUtil::params()), 00041 double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state", 00042 params()), 00043 BOOL_MEMBER(fragments_guide_chopper, FALSE, 00044 "Use information from fragments to guide chopping process", 00045 params()), 00046 INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped", 00047 params()), 00048 double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit", 00049 params()), 00050 INT_MEMBER(chop_debug, 0, "Chop debug", 00051 params()), 00052 BOOL_MEMBER(chop_enable, 1, "Chop enable", 00053 params()), 00054 BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep", 00055 params()), 00056 INT_MEMBER(chop_split_length, 10000, "Split Length", 00057 params()), 00058 INT_MEMBER(chop_same_distance, 2, "Same distance", 00059 params()), 00060 INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline", 00061 params()), 00062 INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend", 00063 params()), 00064 INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area", 00065 params()), 00066 double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment", 00067 params()), 00068 double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment", 00069 params()), 00070 double_MEMBER(chop_center_knob, 0.15, "Split center adjustment", 00071 params()), 00072 double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment", 00073 params()), 00074 double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment", 00075 params()), 00076 double_MEMBER(chop_ok_split, 100.0, "OK split limit", 00077 params()), 00078 double_MEMBER(chop_good_split, 50.0, "Good split limit", 00079 params()), 00080 INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight", 00081 params()), 00082 INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug", 00083 params()), 00084 BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE, 00085 "include fixed-pitch heuristics in char segmentation", 00086 params()), 00087 BOOL_MEMBER(use_new_state_cost, FALSE, 00088 "use new state cost heuristics for segmentation state evaluation", 00089 params()), 00090 double_MEMBER(heuristic_segcost_rating_base, 1.25, 00091 "base factor for adding segmentation cost into word rating." 00092 "It's a multiplying factor, the larger the value above 1, " 00093 "the bigger the effect of segmentation cost.", 00094 params()), 00095 double_MEMBER(heuristic_weight_rating, 1.0, 00096 "weight associated with char rating in combined cost of state", 00097 params()), 00098 double_MEMBER(heuristic_weight_width, 1000.0, 00099 "weight associated with width evidence in combined cost of" 00100 " state", params()), 00101 double_MEMBER(heuristic_weight_seamcut, 0.0, 00102 "weight associated with seam cut in combined cost of state", 00103 params()), 00104 double_MEMBER(heuristic_max_char_wh_ratio, 2.0, 00105 "max char width-to-height ratio allowed in segmentation", 00106 params()), 00107 INT_MEMBER(wordrec_debug_level, 0, 00108 "Debug level for wordrec", params()), 00109 BOOL_MEMBER(wordrec_debug_blamer, false, 00110 "Print blamer debug messages", params()), 00111 BOOL_MEMBER(wordrec_run_blamer, false, 00112 "Try to set the blame for errors", params()), 00113 BOOL_MEMBER(enable_new_segsearch, true, 00114 "Enable new segmentation search path.", params()), 00115 INT_MEMBER(segsearch_debug_level, 0, 00116 "SegSearch debug level", params()), 00117 INT_MEMBER(segsearch_max_pain_points, 2000, 00118 "Maximum number of pain points stored in the queue", 00119 params()), 00120 INT_MEMBER(segsearch_max_futile_classifications, 10, 00121 "Maximum number of pain point classifications per word that" 00122 "did not result in finding a better word choice.", 00123 params()), 00124 double_MEMBER(segsearch_max_char_wh_ratio, 2.0, 00125 "Maximum character width-to-height ratio", params()), 00126 double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, 00127 "Maximum character width-to-height ratio for" 00128 " fixed-pitch fonts", 00129 params()), 00130 BOOL_MEMBER(save_alt_choices, false, 00131 "Save alternative paths found during chopping" 00132 " and segmentation search", 00133 params()) { 00134 prev_word_best_choice_ = NULL; 00135 language_model_ = new LanguageModel(&get_fontinfo_table(), 00136 &(getDict())); 00137 pass2_seg_states = 0; 00138 num_joints = 0; 00139 num_pushed = 0; 00140 num_popped = 0; 00141 fill_lattice_ = NULL; 00142 } 00143 00144 Wordrec::~Wordrec() { 00145 delete language_model_; 00146 } 00147 00148 void Wordrec::CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from, 00149 BLOB_CHOICE_LIST_VECTOR *to) { 00150 to->delete_data_pointers(); 00151 to->clear(); 00152 for (int i = 0; i < from.size(); ++i) { 00153 BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST(); 00154 cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy); 00155 to->push_back(cc_list); 00156 } 00157 } 00158 00159 bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set, 00160 const WERD_CHOICE *choice, 00161 const GenericVector<STRING> &truth_text) { 00162 if (choice == NULL) return false; 00163 int i; 00164 STRING truth_str; 00165 for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; 00166 STRING normed_choice_str; 00167 for (i = 0; i < choice->length(); ++i) { 00168 normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); 00169 } 00170 return (truth_str == normed_choice_str); 00171 } 00172 00173 void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) { 00174 ASSERT_HOST(word->alt_choices.empty()); 00175 ASSERT_HOST(word->alt_states.empty()); 00176 LIST list_it; 00177 iterate_list(list_it, best_choices) { 00178 VIABLE_CHOICE choice = 00179 reinterpret_cast<VIABLE_CHOICE>(first_node(list_it)); 00180 CHAR_CHOICE *char_choice = &(choice->Blob[0]); 00181 WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length); 00182 word->alt_states.push_back(GenericVector<int>(choice->Length)); 00183 GenericVector<int> &alt_state = word->alt_states.back(); 00184 for (int i = 0; i < choice->Length; char_choice++, i++) { 00185 alt_choice->append_unichar_id_space_allocated( 00186 char_choice->Class, 1, 0, 0); 00187 alt_state.push_back(char_choice->NumChunks); 00188 } 00189 alt_choice->set_rating(choice->Rating); 00190 alt_choice->set_certainty(choice->Certainty); 00191 word->alt_choices.push_back(alt_choice); 00192 if (wordrec_debug_level > 0) { 00193 tprintf("SaveAltChoices: %s %g\n", 00194 alt_choice->unichar_string().string(), alt_choice->rating()); 00195 } 00196 } 00197 } 00198 00199 } // namespace tesseract