// Tesseract 3.02
00001 00002 // File: associate.cpp 00003 // Description: Functions for scoring segmentation paths according to 00004 // their character widths, gap widths and seam cuts. 00005 // Author: Daria Antonova 00006 // Created: Mon Mar 8 11:26:43 PDT 2010 00007 // 00008 // (C) Copyright 2010, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 00022 #include <stdio.h> 00023 #ifdef __UNIX__ 00024 #include <assert.h> 00025 #endif 00026 #include <math.h> 00027 00028 #include "associate.h" 00029 #include "baseline.h" 00030 #include "normalis.h" 00031 00032 namespace tesseract { 00033 00034 const float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f; 00035 const float AssociateUtils::kMinGap = 0.03f; 00036 00037 void AssociateUtils::ComputeStats(int col, int row, 00038 const AssociateStats *parent_stats, 00039 int parent_path_length, 00040 bool fixed_pitch, 00041 float max_char_wh_ratio, 00042 const DENORM *denorm, 00043 CHUNKS_RECORD *chunks_record, 00044 int debug_level, 00045 AssociateStats *stats) { 00046 stats->Clear(); 00047 00048 if (debug_level > 0) { 00049 tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", 00050 col, row, fixed_pitch ? " (fixed pitch)" : ""); 00051 } 00052 float normalizing_height = BASELINE_SCALE; 00053 // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? 
00054 if (fixed_pitch && denorm != NULL && denorm->row() != NULL) { 00055 // For fixed pitch language like CJK, we use the full text height 00056 // as the normalizing factor so we are not dependent on xheight 00057 // calculation. 00058 if (denorm->row()->body_size() > 0.0f) { 00059 normalizing_height = denorm->y_scale() * denorm->row()->body_size(); 00060 } else { 00061 normalizing_height = denorm->y_scale() * 00062 (denorm->row()->x_height() + denorm->row()->ascenders()); 00063 } 00064 if (debug_level > 0) { 00065 tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", 00066 normalizing_height, denorm->y_scale(), denorm->row()->x_height(), 00067 denorm->row()->ascenders()); 00068 } 00069 } 00070 float wh_ratio = 00071 GetChunksWidth(chunks_record->chunk_widths, col, row) / normalizing_height; 00072 if (debug_level) tprintf("wh_ratio %g\n", wh_ratio); 00073 if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true; 00074 if (fixed_pitch) { 00075 bool end_row = (row == (chunks_record->ratings->dimension() - 1)); 00076 00077 // Ensure that the blob has gaps on the left and the right sides 00078 // (except for beginning and ending punctuation) and that there is 00079 // no cutting through ink at the blob boundaries. 
00080 if (col > 0) { 00081 float left_gap = 00082 GetChunksGap(chunks_record->chunk_widths, col-1) / normalizing_height; 00083 SEAM *left_seam = 00084 static_cast<SEAM *>(array_value(chunks_record->splits, col-1)); 00085 if (debug_level) { 00086 tprintf("left_gap %g, left_seam %g\n", left_gap, left_seam->priority); 00087 } 00088 if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) { 00089 stats->bad_shape = true; 00090 } 00091 } 00092 float right_gap = 0.0f; 00093 if (!end_row) { 00094 right_gap = 00095 GetChunksGap(chunks_record->chunk_widths, row) / normalizing_height; 00096 SEAM *right_seam = 00097 static_cast<SEAM *>(array_value(chunks_record->splits, row)); 00098 if (debug_level) { 00099 tprintf("right_gap %g right_seam %g\n", 00100 right_gap, right_seam->priority); 00101 } 00102 if (right_gap < kMinGap || right_seam->priority > 0.0f) { 00103 stats->bad_shape = true; 00104 if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true; 00105 } 00106 } 00107 00108 // Impose additional segmentation penalties if blob widths or gaps 00109 // distribution don't fit a fixed-pitch model. 00110 // Since we only know the widths and gaps of the path explored so far, 00111 // the means and variances are computed for the path so far (not 00112 // considering characters to the right of the last character on the path). 
00113 stats->full_wh_ratio = wh_ratio + right_gap; 00114 if (parent_stats != NULL) { 00115 stats->full_wh_ratio_total = 00116 (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); 00117 float mean = 00118 stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1); 00119 stats->full_wh_ratio_var = 00120 parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2); 00121 } else { 00122 stats->full_wh_ratio_total = stats->full_wh_ratio; 00123 } 00124 if (debug_level) { 00125 tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", 00126 stats->full_wh_ratio, stats->full_wh_ratio_total, 00127 stats->full_wh_ratio_var); 00128 } 00129 00130 stats->shape_cost = 00131 FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); 00132 00133 // For some reason Tesseract prefers to treat the whole CJ words 00134 // as one blob when the initial segmentation is particularly bad. 00135 // This hack is to avoid favoring such states. 00136 if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { 00137 stats->shape_cost += 10; 00138 } 00139 stats->shape_cost += stats->full_wh_ratio_var; 00140 if (debug_level) tprintf("shape_cost %g\n", stats->shape_cost); 00141 } 00142 } 00143 00144 int AssociateUtils::GetChunksWidth(WIDTH_RECORD *width_record, 00145 int start_blob, int last_blob) { 00146 int result = 0; 00147 for (int x = start_blob * 2; x <= last_blob * 2; x++) 00148 result += width_record->widths[x]; 00149 return result; 00150 } 00151 00152 float AssociateUtils::FixedPitchWidthCost(float norm_width, 00153 float right_gap, 00154 bool end_pos, 00155 float max_char_wh_ratio) { 00156 float cost = 0.0f; 00157 if (norm_width > max_char_wh_ratio) cost += norm_width; 00158 if (norm_width > kMaxFixedPitchCharAspectRatio) 00159 cost += norm_width * norm_width; // extra penalty for merging CJK chars 00160 // Penalize skinny blobs, except for punctuation in the last position. 
00161 if (norm_width+right_gap < 0.5f && !end_pos) { 00162 cost += 1.0f - (norm_width + right_gap); 00163 } 00164 return cost; 00165 } 00166 00167 } // namespace tesseract