Tesseract  3.02
tesseract-ocr/wordrec/associate.cpp
Go to the documentation of this file.
00001 
00002 // File:        associate.cpp
00003 // Description: Functions for scoring segmentation paths according to
00004 //              their character widths, gap widths and seam cuts.
00005 // Author:      Daria Antonova
00006 // Created:     Mon Mar 8 11:26:43 PDT 2010
00007 //
00008 // (C) Copyright 2010, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 
00022 #include <stdio.h>
00023 #ifdef __UNIX__
00024 #include <assert.h>
00025 #endif
00026 #include <math.h>
00027 
00028 #include "associate.h"
00029 #include "baseline.h"
00030 #include "normalis.h"
00031 
00032 namespace tesseract {
00033 
00034 const float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f;
00035 const float AssociateUtils::kMinGap = 0.03f;
00036 
00037 void AssociateUtils::ComputeStats(int col, int row,
00038                                   const AssociateStats *parent_stats,
00039                                   int parent_path_length,
00040                                   bool fixed_pitch,
00041                                   float max_char_wh_ratio,
00042                                   const DENORM *denorm,
00043                                   CHUNKS_RECORD *chunks_record,
00044                                   int debug_level,
00045                                   AssociateStats *stats) {
00046   stats->Clear();
00047 
00048   if (debug_level > 0) {
00049     tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n",
00050             col, row, fixed_pitch ? " (fixed pitch)" : "");
00051   }
00052   float normalizing_height = BASELINE_SCALE;
00053   // TODO(rays/daria) Can unicharset.script_has_xheight be useful here?
00054   if (fixed_pitch && denorm != NULL && denorm->row() != NULL) {
00055     // For fixed pitch language like CJK, we use the full text height
00056     // as the normalizing factor so we are not dependent on xheight
00057     // calculation.
00058     if (denorm->row()->body_size() > 0.0f) {
00059       normalizing_height = denorm->y_scale() * denorm->row()->body_size();
00060     } else {
00061       normalizing_height = denorm->y_scale() *
00062           (denorm->row()->x_height() + denorm->row()->ascenders());
00063     }
00064     if (debug_level > 0) {
00065       tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n",
00066               normalizing_height, denorm->y_scale(), denorm->row()->x_height(),
00067               denorm->row()->ascenders());
00068     }
00069   }
00070   float wh_ratio =
00071     GetChunksWidth(chunks_record->chunk_widths, col, row) / normalizing_height;
00072   if (debug_level) tprintf("wh_ratio %g\n", wh_ratio);
00073   if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true;
00074   if (fixed_pitch) {
00075     bool end_row = (row == (chunks_record->ratings->dimension() - 1));
00076 
00077     // Ensure that the blob has gaps on the left and the right sides
00078     // (except for beginning and ending punctuation) and that there is
00079     // no cutting through ink at the blob boundaries.
00080     if (col > 0) {
00081       float left_gap =
00082         GetChunksGap(chunks_record->chunk_widths, col-1) / normalizing_height;
00083       SEAM *left_seam =
00084         static_cast<SEAM *>(array_value(chunks_record->splits, col-1));
00085       if (debug_level) {
00086         tprintf("left_gap %g, left_seam %g\n", left_gap, left_seam->priority);
00087       }
00088       if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) {
00089         stats->bad_shape = true;
00090       }
00091     }
00092     float right_gap = 0.0f;
00093     if (!end_row) {
00094       right_gap =
00095         GetChunksGap(chunks_record->chunk_widths, row) / normalizing_height;
00096       SEAM *right_seam =
00097         static_cast<SEAM *>(array_value(chunks_record->splits, row));
00098       if (debug_level) {
00099         tprintf("right_gap %g right_seam %g\n",
00100                 right_gap, right_seam->priority);
00101       }
00102       if (right_gap < kMinGap || right_seam->priority > 0.0f) {
00103         stats->bad_shape = true;
00104         if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true;
00105       }
00106     }
00107 
00108     // Impose additional segmentation penalties if blob widths or gaps
00109     // distribution don't fit a fixed-pitch model.
00110     // Since we only know the widths and gaps of the path explored so far,
00111     // the means and variances are computed for the path so far (not
00112     // considering characters to the right of the last character on the path).
00113     stats->full_wh_ratio = wh_ratio + right_gap;
00114     if (parent_stats != NULL) {
00115       stats->full_wh_ratio_total =
00116         (parent_stats->full_wh_ratio_total + stats->full_wh_ratio);
00117       float mean =
00118         stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1);
00119       stats->full_wh_ratio_var =
00120         parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2);
00121     } else {
00122       stats->full_wh_ratio_total = stats->full_wh_ratio;
00123     }
00124     if (debug_level) {
00125       tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n",
00126               stats->full_wh_ratio, stats->full_wh_ratio_total,
00127               stats->full_wh_ratio_var);
00128     }
00129 
00130     stats->shape_cost =
00131       FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio);
00132 
00133     // For some reason Tesseract prefers to treat the whole CJ words
00134     // as one blob when the initial segmentation is particularly bad.
00135     // This hack is to avoid favoring such states.
00136     if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) {
00137       stats->shape_cost += 10;
00138     }
00139     stats->shape_cost += stats->full_wh_ratio_var;
00140     if (debug_level) tprintf("shape_cost %g\n", stats->shape_cost);
00141   }
00142 }
00143 
00144 int AssociateUtils::GetChunksWidth(WIDTH_RECORD *width_record,
00145                                    int start_blob, int last_blob) {
00146   int result = 0;
00147   for (int x = start_blob * 2; x <= last_blob * 2; x++)
00148     result += width_record->widths[x];
00149   return result;
00150 }
00151 
00152 float AssociateUtils::FixedPitchWidthCost(float norm_width,
00153                                           float right_gap,
00154                                           bool end_pos,
00155                                           float max_char_wh_ratio) {
00156   float cost = 0.0f;
00157   if (norm_width > max_char_wh_ratio) cost += norm_width;
00158   if (norm_width > kMaxFixedPitchCharAspectRatio)
00159     cost += norm_width * norm_width;  // extra penalty for merging CJK chars
00160   // Penalize skinny blobs, except for punctuation in the last position.
00161   if (norm_width+right_gap < 0.5f && !end_pos) {
00162     cost += 1.0f - (norm_width + right_gap);
00163   }
00164   return cost;
00165 }
00166 
00167 }  // namespace tesseract