Tesseract 3.02 — source listing: wordrec/heuristic.cpp
/* -*-C-*-
 ********************************************************************************
 *
 * File:        heuristic.c  (Formerly heuristic.c)
 * Description:
 * Author:      Mark Seaman, OCR Technology
 * Created:     Fri Oct 16 14:37:00 1987
 * Modified:    Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
 * Language:    C
 * Package:     N/A
 * Status:      Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include <math.h>

// Note: "heuristic.h" is an empty file and deleted
#include "associate.h"
#include "bestfirst.h"
#include "seam.h"
#include "baseline.h"
#include "freelist.h"
#include "measure.h"
#include "ratngs.h"
#include "wordrec.h"

/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/
#define BAD_RATING   1000.0     /* No valid blob */


namespace tesseract {

/*----------------------------------------------------------------------
// Some static helpers used only in this file
----------------------------------------------------------------------*/

// Return a character width record corresponding to the character
// width that will be generated in this segmentation state.
// The calling function needs to memfree the WIDTH_RECORD when finished.
// This is the same as the original function, only cosmetic changes,
// except instead of passing chunks back to be freed, it deallocates
// them internally.
00058 WIDTH_RECORD *Wordrec::state_char_widths(WIDTH_RECORD *chunk_widths, 00059 STATE *state, 00060 int num_joints) { 00061 SEARCH_STATE chunks = bin_to_chunks(state, num_joints); 00062 int num_chars = chunks[0] + 1; 00063 00064 // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a 00065 // struct { num_chars, widths[2*n+1]; } 00066 WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2); 00067 char_widths->num_chars = num_chars; 00068 00069 int first_blob = 0; 00070 int last_blob; 00071 for (int i = 1; i <= num_chars; i++) { 00072 last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i]; 00073 00074 char_widths->widths[2*i-2] = 00075 AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob); 00076 if (i <= chunks[0]) { 00077 char_widths->widths[2*i-1] = 00078 AssociateUtils::GetChunksGap(chunk_widths, last_blob); 00079 } 00080 00081 if (segment_adjust_debug > 3) 00082 tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n", 00083 i-1, first_blob, last_blob, chunks[i], 00084 char_widths->widths[2*i-2], char_widths->widths[2*i-1], 00085 chunk_widths->widths[2*last_blob+1]); 00086 first_blob = last_blob + 1; 00087 } 00088 00089 memfree(chunks); 00090 return char_widths; 00091 } 00092 00093 // Computes the variance of the char widths normalized to the given height 00094 // TODO(dsl): Do this in a later stage and use char choice info to skip 00095 // punctuations. 
00096 FLOAT32 Wordrec::get_width_variance(WIDTH_RECORD *wrec, float norm_height) { 00097 MEASUREMENT ws; 00098 new_measurement(ws); 00099 for (int x = 0; x < wrec->num_chars; x++) { 00100 FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height; 00101 if (x == wrec->num_chars - 1 && wh_ratio > 0.3) 00102 continue; // exclude trailing punctuation from stats 00103 ADD_SAMPLE(ws, wh_ratio); 00104 } 00105 if (segment_adjust_debug > 2) 00106 tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws)); 00107 return VARIANCE(ws); 00108 } 00109 00110 // Computes the variance of char positioning (width + spacing) wrt norm_height 00111 FLOAT32 Wordrec::get_gap_variance(WIDTH_RECORD *wrec, float norm_height) { 00112 MEASUREMENT ws; 00113 new_measurement(ws); 00114 for (int x = 0; x < wrec->num_chars - 1; x++) { 00115 FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[ 2*x + 1]) 00116 * 1.0 / norm_height; 00117 ADD_SAMPLE(ws, gap_ratio); 00118 } 00119 if (segment_adjust_debug > 2) 00120 tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws)); 00121 return VARIANCE(ws); 00122 } 00123 00124 00125 /*---------------------------------------------------------------------- 00126 Below are the new state prioritization functions, which incorporates 00127 segmentation cost and width distribution for fixed pitch languages. 00128 They are included as methods in class Wordrec to obtain more context. 00129 ----------------------------------------------------------------------*/ 00130 00131 /********************************************************************** 00132 * Returns the cost associated with the segmentation state. 00133 * 00134 * The number of states should be the same as the number of seams. 00135 * If state[i] is 1, then seams[i] is present; otherwise, it is hidden. 00136 * This function returns the sum of priority for active seams. 00137 * TODO(dsl): To keep this clean, this function will just return the 00138 * sum of raw "priority" as seam cost. 
The normalization of this score 00139 * relative to other costs will be done in bestfirst.cpp evaluate_seg(). 00140 **********************************************************************/ 00141 00142 FLOAT32 Wordrec::seamcut_priority(SEAMS seams, 00143 STATE *state, 00144 int num_joints) { 00145 int x; 00146 unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32)) 00147 : (1 << (num_joints - 1)); 00148 float seam_cost = 0.0f; 00149 for (x = num_joints - 1; x >= 0; x--) { 00150 int i = num_joints - 1 - x; 00151 uinT32 value = (x < 32) ? state->part2 : state->part1; 00152 bool state_on = value & mask; 00153 if (state_on) { 00154 SEAM* seam = (SEAM *) array_value(seams, i); 00155 seam_cost += seam->priority; 00156 } 00157 if (mask == 1) 00158 mask = 1 << 31; 00159 else 00160 mask >>= 1; 00161 } 00162 if (segment_adjust_debug > 2) 00163 tprintf("seam_cost: %f\n", seam_cost); 00164 return seam_cost; 00165 } 00166 00167 /********************************************************************** 00168 * rating_priority 00169 * 00170 * Assign a segmentation priority based on the ratings of the blobs 00171 * (in that segmentation) that have been classified. The average 00172 * "goodness" (i.e. rating / weight) for each blob is used to indicate 00173 * the segmentation priority. This is the original. 
00174 **********************************************************************/ 00175 FLOAT32 Wordrec::rating_priority(CHUNKS_RECORD *chunks_record, 00176 STATE *state, 00177 int num_joints) { 00178 BLOB_CHOICE_LIST *blob_choices; 00179 BLOB_CHOICE_IT blob_choice_it; 00180 inT16 first_chunk = 0; 00181 inT16 last_chunk; 00182 inT16 ratings = 0; 00183 inT16 weights = 0; 00184 00185 PIECES_STATE blob_chunks; 00186 bin_to_pieces(state, num_joints, blob_chunks); 00187 00188 for (int x = 0; blob_chunks[x]; x++) { 00189 last_chunk = first_chunk + blob_chunks[x]; 00190 00191 blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1); 00192 if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) { 00193 blob_choice_it.set_to_list(blob_choices); 00194 ratings += (inT16) blob_choice_it.data()->rating(); 00195 for (int y = first_chunk; y < last_chunk; y++) { 00196 weights += (inT16) (chunks_record->weights[y]); 00197 } 00198 } 00199 first_chunk = last_chunk; 00200 } 00201 if (weights <= 0) 00202 weights = 1; 00203 FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) / 00204 static_cast<FLOAT32>(weights); 00205 if (segment_adjust_debug > 2) 00206 tprintf("rating_cost: r%f / w%f = %f\n", ratings, weights, rating_cost); 00207 return rating_cost; 00208 } 00209 00210 /********************************************************************** 00211 * width_priority 00212 * 00213 * Return a priority value for this word segmentation based on the 00214 * character widths present in the new segmentation. 00215 * For variable-pitch fonts, this should do the same thing as before: 00216 * ie. penalize only on really wide squatting blobs. 00217 * For fixed-pitch fonts, this will include a measure of char & gap 00218 * width consistency. 00219 * TODO(dsl): generalize this to use a PDF estimate for proportional and 00220 * fixed pitch mode. 
00221 **********************************************************************/ 00222 FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record, 00223 STATE *state, 00224 int num_joints) { 00225 FLOAT32 penalty = 0.0; 00226 WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths, 00227 state, num_joints); 00228 // When baseline_enable==True, which is the current default for Tesseract, 00229 // a fixed value of 128 (BASELINE_SCALE) is always used. 00230 FLOAT32 normalizing_height = BASELINE_SCALE; 00231 if (assume_fixed_pitch_char_segment) { 00232 // For fixed pitch language like CJK, we use the full text height as the 00233 // normalizing factor so we are not dependent on xheight calculation. 00234 // In the normalized coord. xheight * scale == BASELINE_SCALE(128), 00235 // so add proportionally scaled ascender zone to get full text height. 00236 const DENORM& denorm = chunks_record->word_res->denorm; 00237 normalizing_height = denorm.y_scale() * 00238 (denorm.row()->x_height() + denorm.row()->ascenders()); 00239 if (segment_adjust_debug > 1) 00240 tprintf("WidthPriority: %f %f normalizing height = %f\n", 00241 denorm.row()->x_height(), denorm.row()->ascenders(), 00242 normalizing_height); 00243 // Impose additional segmentation penalties if blob widths or gaps 00244 // distribution don't fit a fixed-pitch model. 00245 FLOAT32 width_var = get_width_variance(width_rec, normalizing_height); 00246 FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height); 00247 penalty += width_var; 00248 penalty += gap_var; 00249 } 00250 00251 for (int x = 0; x < width_rec->num_chars; x++) { 00252 FLOAT32 squat = width_rec->widths[2*x]; 00253 FLOAT32 gap = (x < width_rec->num_chars-1) ? 
width_rec->widths[2*x+1] : 0; 00254 squat /= normalizing_height; 00255 gap /= normalizing_height; 00256 if (assume_fixed_pitch_char_segment) { 00257 penalty += AssociateUtils::FixedPitchWidthCost( 00258 squat, 0.0f, x == 0 || x == width_rec->num_chars -1, 00259 heuristic_max_char_wh_ratio); 00260 penalty += AssociateUtils::FixedPitchGapCost( 00261 gap, x == width_rec->num_chars - 1); 00262 if (width_rec->num_chars == 1 && 00263 squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) { 00264 penalty += 10; 00265 } 00266 } else { 00267 // Original equation when 00268 // heuristic_max_char_ratio == AssociateUtils::kMaxSquat 00269 if (squat > heuristic_max_char_wh_ratio) 00270 penalty += squat - heuristic_max_char_wh_ratio; 00271 } 00272 } 00273 00274 free_widths(width_rec); 00275 return (penalty); 00276 } 00277 00278 00279 /********************************************************************** 00280 * prioritize_state 00281 * 00282 * Create a priority for this state. It represents the urgency of 00283 * checking this state. The larger the priority value, the worse the 00284 * state is (lower priority). The "value" of this priority should be 00285 * somewhat consistent with the final word rating. 00286 * The question is how to normalize the different scores, and adjust 00287 * the relative importance among them. 00288 **********************************************************************/ 00289 FLOAT32 Wordrec::prioritize_state(CHUNKS_RECORD *chunks_record, 00290 SEARCH_RECORD *the_search) { 00291 FLOAT32 shape_cost; 00292 FLOAT32 width_cost; 00293 FLOAT32 seam_cost; 00294 00295 shape_cost = rating_priority(chunks_record, 00296 the_search->this_state, 00297 the_search->num_joints); 00298 00299 width_cost = width_priority(chunks_record, 00300 the_search->this_state, 00301 the_search->num_joints); 00302 00303 // The rating_priority is the same as the original, and the width_priority 00304 // is the same as before if assume_fixed_pitch_char_segment == FALSE. 
00305 // So this would return the original state priority. 00306 if (!use_new_state_cost) 00307 return width_cost * 1000 + shape_cost; 00308 00309 seam_cost = seamcut_priority(chunks_record->splits, 00310 the_search->this_state, 00311 the_search->num_joints); 00312 00313 // TODO(dsl): how do we normalize the scores for these separate evidence? 00314 // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001; 00315 FLOAT32 total_cost = shape_cost * heuristic_weight_rating + 00316 width_cost * heuristic_weight_width + 00317 seam_cost * heuristic_weight_seamcut; 00318 00319 // We don't have an adjustment model for variable pitch segmentation cost 00320 // into word rating 00321 if (assume_fixed_pitch_char_segment) { 00322 float seg_bias = 1.0; 00323 if (width_cost < 1) seg_bias *= 0.85; 00324 if (width_cost > 3) 00325 seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0); 00326 if (seam_cost > 10) 00327 seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0)); 00328 if (shape_cost > 5) 00329 seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0); 00330 if (segment_adjust_debug) { 00331 tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n", 00332 total_cost, seg_bias, shape_cost, width_cost, seam_cost); 00333 } 00334 the_search->segcost_bias = seg_bias; 00335 } else { 00336 the_search->segcost_bias = 0; 00337 } 00338 00339 return total_cost; 00340 } 00341 00342 } // namespace tesseract