Tesseract 3.02 — source listing: wordrec/heuristic.cpp
/* -*-C-*-
 ********************************************************************************
 *
 * File:        heuristic.c  (Formerly heuristic.c)
 * Description:
 * Author:      Mark Seaman, OCR Technology
 * Created:     Fri Oct 16 14:37:00 1987
 * Modified:    Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
 * Language:    C
 * Package:     N/A
 * Status:      Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include <math.h>

// Note: "heuristic.h" is an empty file and deleted
#include "associate.h"
#include "bestfirst.h"
#include "seam.h"
#include "baseline.h"
#include "freelist.h"
#include "measure.h"
#include "ratngs.h"
#include "wordrec.h"

/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/
#define BAD_RATING   1000.0     /* No valid blob */


namespace tesseract {

/*----------------------------------------------------------------------
// Some static helpers used only in this file
----------------------------------------------------------------------*/

// Return a character width record corresponding to the character
// width that will be generated in this segmentation state.
// The calling function needs to memfree the WIDTH_RECORD when finished.
// This is the same as the original function, only cosmetic changes,
// except instead of passing chunks back to be freed, it deallocates
// them internally.
00058 WIDTH_RECORD *Wordrec::state_char_widths(WIDTH_RECORD *chunk_widths, 00059 STATE *state, 00060 int num_joints) { 00061 SEARCH_STATE chunks = bin_to_chunks(state, num_joints); 00062 int num_chars = chunks[0] + 1; 00063 00064 // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a 00065 // struct { num_chars, widths[2*n+1]; } 00066 WIDTH_RECORD *char_widths = (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2); 00067 char_widths->num_chars = num_chars; 00068 00069 int first_blob = 0; 00070 int last_blob; 00071 for (int i = 1; i <= num_chars; i++) { 00072 last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i]; 00073 00074 char_widths->widths[2*i-2] = 00075 AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob); 00076 if (i <= chunks[0]) { 00077 char_widths->widths[2*i-1] = 00078 AssociateUtils::GetChunksGap(chunk_widths, last_blob); 00079 } 00080 00081 if (segment_adjust_debug > 3) 00082 tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n", 00083 i-1, first_blob, last_blob, chunks[i], 00084 char_widths->widths[2*i-2], char_widths->widths[2*i-1], 00085 chunk_widths->widths[2*last_blob+1]); 00086 first_blob = last_blob + 1; 00087 } 00088 00089 memfree(chunks); 00090 return char_widths; 00091 } 00092 00093 // Computes the variance of the char widths normalized to the given height 00094 // TODO(dsl): Do this in a later stage and use char choice info to skip 00095 // punctuations. 
00096 FLOAT32 Wordrec::get_width_variance(WIDTH_RECORD *wrec, float norm_height) { 00097 MEASUREMENT ws; 00098 new_measurement(ws); 00099 for (int x = 0; x < wrec->num_chars; x++) { 00100 FLOAT32 wh_ratio = wrec->widths[2 * x] * 1.0f / norm_height; 00101 if (x == wrec->num_chars - 1 && wh_ratio > 0.3) 00102 continue; // exclude trailing punctuation from stats 00103 ADD_SAMPLE(ws, wh_ratio); 00104 } 00105 if (segment_adjust_debug > 2) 00106 tprintf("Width Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws)); 00107 return VARIANCE(ws); 00108 } 00109 00110 // Computes the variance of char positioning (width + spacing) wrt norm_height 00111 FLOAT32 Wordrec::get_gap_variance(WIDTH_RECORD *wrec, float norm_height) { 00112 MEASUREMENT ws; 00113 new_measurement(ws); 00114 for (int x = 0; x < wrec->num_chars - 1; x++) { 00115 FLOAT32 gap_ratio = (wrec->widths[2 * x] + wrec->widths[ 2*x + 1]) 00116 * 1.0 / norm_height; 00117 ADD_SAMPLE(ws, gap_ratio); 00118 } 00119 if (segment_adjust_debug > 2) 00120 tprintf("Gap Mean=%g Var=%g\n", MEAN(ws), VARIANCE(ws)); 00121 return VARIANCE(ws); 00122 } 00123 00124 00125 /*---------------------------------------------------------------------- 00126 Below are the new state prioritization functions, which incorporates 00127 segmentation cost and width distribution for fixed pitch languages. 00128 They are included as methods in class Wordrec to obtain more context. 00129 ----------------------------------------------------------------------*/ 00130 00131 /********************************************************************** 00132 * Returns the cost associated with the segmentation state. 00133 * 00134 * The number of states should be the same as the number of seams. 00135 * If state[i] is 1, then seams[i] is present; otherwise, it is hidden. 00136 * This function returns the sum of priority for active seams. 00137 * TODO(dsl): To keep this clean, this function will just return the 00138 * sum of raw "priority" as seam cost. 
The normalization of this score 00139 * relative to other costs will be done in bestfirst.cpp evaluate_seg(). 00140 **********************************************************************/ 00141 00142 FLOAT32 Wordrec::seamcut_priority(SEAMS seams, 00143 STATE *state, 00144 int num_joints) { 00145 int x; 00146 unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32)) 00147 : (1 << (num_joints - 1)); 00148 float seam_cost = 0.0f; 00149 for (x = num_joints - 1; x >= 0; x--) { 00150 int i = num_joints - 1 - x; 00151 uinT32 value = (x < 32) ? state->part2 : state->part1; 00152 bool state_on = value & mask; 00153 if (state_on) { 00154 SEAM* seam = (SEAM *) array_value(seams, i); 00155 seam_cost += seam->priority; 00156 } 00157 if (mask == 1) 00158 mask = 1 << 31; 00159 else 00160 mask >>= 1; 00161 } 00162 if (segment_adjust_debug > 2) 00163 tprintf("seam_cost: %f\n", seam_cost); 00164 return seam_cost; 00165 } 00166 00167 /********************************************************************** 00168 * rating_priority 00169 * 00170 * Assign a segmentation priority based on the ratings of the blobs 00171 * (in that segmentation) that have been classified. The average 00172 * "goodness" (i.e. rating / weight) for each blob is used to indicate 00173 * the segmentation priority. This is the original. 
00174 **********************************************************************/ 00175 FLOAT32 Wordrec::rating_priority(CHUNKS_RECORD *chunks_record, 00176 STATE *state, 00177 int num_joints) { 00178 BLOB_CHOICE_LIST *blob_choices; 00179 BLOB_CHOICE_IT blob_choice_it; 00180 inT16 first_chunk = 0; 00181 inT16 last_chunk; 00182 inT16 ratings = 0; 00183 inT16 weights = 0; 00184 00185 PIECES_STATE blob_chunks; 00186 bin_to_pieces(state, num_joints, blob_chunks); 00187 00188 for (int x = 0; blob_chunks[x]; x++) { 00189 last_chunk = first_chunk + blob_chunks[x]; 00190 00191 blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1); 00192 if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) { 00193 blob_choice_it.set_to_list(blob_choices); 00194 ratings += (inT16) blob_choice_it.data()->rating(); 00195 for (int y = first_chunk; y < last_chunk; y++) { 00196 weights += (inT16) (chunks_record->weights[y]); 00197 } 00198 } 00199 first_chunk = last_chunk; 00200 } 00201 if (weights <= 0) 00202 weights = 1; 00203 FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) / 00204 static_cast<FLOAT32>(weights); 00205 if (segment_adjust_debug > 2) 00206 tprintf("rating_cost: r%f / w%f = %f\n", ratings, weights, rating_cost); 00207 return rating_cost; 00208 } 00209 00210 /********************************************************************** 00211 * width_priority 00212 * 00213 * Return a priority value for this word segmentation based on the 00214 * character widths present in the new segmentation. 00215 * For variable-pitch fonts, this should do the same thing as before: 00216 * ie. penalize only on really wide squatting blobs. 00217 * For fixed-pitch fonts, this will include a measure of char & gap 00218 * width consistency. 00219 * TODO(dsl): generalize this to use a PDF estimate for proportional and 00220 * fixed pitch mode. 
00221 **********************************************************************/ 00222 FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record, 00223 STATE *state, 00224 int num_joints) { 00225 FLOAT32 penalty = 0.0; 00226 WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths, 00227 state, num_joints); 00228 // When baseline_enable==True, which is the current default for Tesseract, 00229 // a fixed value of 128 (BASELINE_SCALE) is always used. 00230 FLOAT32 normalizing_height = BASELINE_SCALE; 00231 if (assume_fixed_pitch_char_segment) { 00232 // For fixed pitch language like CJK, we use the full text height as the 00233 // normalizing factor so we are not dependent on xheight calculation. 00234 // In the normalized coord. xheight * scale == BASELINE_SCALE(128), 00235 // so add proportionally scaled ascender zone to get full text height. 00236 const DENORM& denorm = chunks_record->word_res->denorm; 00237 normalizing_height = denorm.y_scale() * 00238 (denorm.row()->x_height() + denorm.row()->ascenders()); 00239 if (segment_adjust_debug > 1) 00240 tprintf("WidthPriority: %f %f normalizing height = %f\n", 00241 denorm.row()->x_height(), denorm.row()->ascenders(), 00242 normalizing_height); 00243 // Impose additional segmentation penalties if blob widths or gaps 00244 // distribution don't fit a fixed-pitch model. 00245 FLOAT32 width_var = get_width_variance(width_rec, normalizing_height); 00246 FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height); 00247 penalty += width_var; 00248 penalty += gap_var; 00249 } 00250 00251 for (int x = 0; x < width_rec->num_chars; x++) { 00252 FLOAT32 squat = width_rec->widths[2*x]; 00253 FLOAT32 gap = (x < width_rec->num_chars-1) ? 
width_rec->widths[2*x+1] : 0; 00254 squat /= normalizing_height; 00255 gap /= normalizing_height; 00256 if (assume_fixed_pitch_char_segment) { 00257 penalty += AssociateUtils::FixedPitchWidthCost( 00258 squat, 0.0f, x == 0 || x == width_rec->num_chars -1, 00259 heuristic_max_char_wh_ratio); 00260 penalty += AssociateUtils::FixedPitchGapCost( 00261 gap, x == width_rec->num_chars - 1); 00262 if (width_rec->num_chars == 1 && 00263 squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) { 00264 penalty += 10; 00265 } 00266 } else { 00267 // Original equation when 00268 // heuristic_max_char_ratio == AssociateUtils::kMaxSquat 00269 if (squat > heuristic_max_char_wh_ratio) 00270 penalty += squat - heuristic_max_char_wh_ratio; 00271 } 00272 } 00273 00274 free_widths(width_rec); 00275 return (penalty); 00276 } 00277 00278 00279 /********************************************************************** 00280 * prioritize_state 00281 * 00282 * Create a priority for this state. It represents the urgency of 00283 * checking this state. The larger the priority value, the worse the 00284 * state is (lower priority). The "value" of this priority should be 00285 * somewhat consistent with the final word rating. 00286 * The question is how to normalize the different scores, and adjust 00287 * the relative importance among them. 00288 **********************************************************************/ 00289 FLOAT32 Wordrec::prioritize_state(CHUNKS_RECORD *chunks_record, 00290 SEARCH_RECORD *the_search) { 00291 FLOAT32 shape_cost; 00292 FLOAT32 width_cost; 00293 FLOAT32 seam_cost; 00294 00295 shape_cost = rating_priority(chunks_record, 00296 the_search->this_state, 00297 the_search->num_joints); 00298 00299 width_cost = width_priority(chunks_record, 00300 the_search->this_state, 00301 the_search->num_joints); 00302 00303 // The rating_priority is the same as the original, and the width_priority 00304 // is the same as before if assume_fixed_pitch_char_segment == FALSE. 
00305 // So this would return the original state priority. 00306 if (!use_new_state_cost) 00307 return width_cost * 1000 + shape_cost; 00308 00309 seam_cost = seamcut_priority(chunks_record->splits, 00310 the_search->this_state, 00311 the_search->num_joints); 00312 00313 // TODO(dsl): how do we normalize the scores for these separate evidence? 00314 // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001; 00315 FLOAT32 total_cost = shape_cost * heuristic_weight_rating + 00316 width_cost * heuristic_weight_width + 00317 seam_cost * heuristic_weight_seamcut; 00318 00319 // We don't have an adjustment model for variable pitch segmentation cost 00320 // into word rating 00321 if (assume_fixed_pitch_char_segment) { 00322 float seg_bias = 1.0; 00323 if (width_cost < 1) seg_bias *= 0.85; 00324 if (width_cost > 3) 00325 seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0); 00326 if (seam_cost > 10) 00327 seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0)); 00328 if (shape_cost > 5) 00329 seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0); 00330 if (segment_adjust_debug) { 00331 tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n", 00332 total_cost, seg_bias, shape_cost, width_cost, seam_cost); 00333 } 00334 the_search->segcost_bias = seg_bias; 00335 } else { 00336 the_search->segcost_bias = 0; 00337 } 00338 00339 return total_cost; 00340 } 00341 00342 } // namespace tesseract