// Tesseract 3.02
00001 /********************************************************************** 00002 * File: fixxht.cpp (Formerly fixxht.c) 00003 * Description: Improve x_ht and look out for case inconsistencies 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 5 14:11:08 BST 1993 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #include <string.h> 00022 #include <ctype.h> 00023 #include "params.h" 00024 #include "float2int.h" 00025 #include "tesseractclass.h" 00026 00027 namespace tesseract { 00028 00029 // Fixxht overview. 00030 // Premise: Initial estimate of x-height is adequate most of the time, but 00031 // occasionally it is incorrect. Most notable causes of failure are: 00032 // 1. Small caps, where the top of the caps is the same as the body text 00033 // xheight. For small caps words the xheight needs to be reduced to correctly 00034 // recognize the caps in the small caps word. 00035 // 2. All xheight lines, such as summer. Here the initial estimate will have 00036 // guessed that the blob tops are caps and will have placed the xheight too low. 00037 // 3. Noise/logos beside words, or changes in font size on a line. Such 00038 // things can blow the statistics and cause an incorrect estimate. 00039 // 00040 // Algorithm. 
00041 // Compare the vertical position (top only) of alphnumerics in a word with 00042 // the range of positions in training data (in the unicharset). 00043 // See CountMisfitTops. If any characters disagree sufficiently with the 00044 // initial xheight estimate, then recalculate the xheight, re-run OCR on 00045 // the word, and if the number of vertical misfits goes down, along with 00046 // either the word rating or certainty, then keep the new xheight. 00047 // The new xheight is calculated as follows:ComputeCompatibleXHeight 00048 // For each alphanumeric character that has a vertically misplaced top 00049 // (a misfit), yet its bottom is within the acceptable range (ie it is not 00050 // likely a sub-or super-script) calculate the range of acceptable xheight 00051 // positions from its range of tops, and give each value in the range a 00052 // number of votes equal to the distance of its top from its acceptance range. 00053 // The x-height position with the median of the votes becomes the new 00054 // x-height. This assumes that most characters will be correctly recognized 00055 // even if the x-height is incorrect. This is not a terrible assumption, but 00056 // it is not great. An improvement would be to use a classifier that does 00057 // not care about vertical position or scaling at all. 00058 00059 // If the max-min top of a unicharset char is bigger than kMaxCharTopRange 00060 // then the char top cannot be used to judge misfits or suggest a new top. 00061 const int kMaxCharTopRange = 48; 00062 00063 // Returns the number of misfit blob tops in this word. 
00064 int Tesseract::CountMisfitTops(WERD_RES *word_res) { 00065 int bad_blobs = 0; 00066 TBLOB* blob = word_res->rebuild_word->blobs; 00067 int blob_id = 0; 00068 for (; blob != NULL; blob = blob->next, ++blob_id) { 00069 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00070 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { 00071 int top = blob->bounding_box().top(); 00072 if (top >= INT_FEAT_RANGE) 00073 top = INT_FEAT_RANGE - 1; 00074 int min_bottom, max_bottom, min_top, max_top; 00075 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00076 &min_top, &max_top); 00077 if (max_top - min_top > kMaxCharTopRange) 00078 continue; 00079 bool bad = top < min_top - x_ht_acceptance_tolerance || 00080 top > max_top + x_ht_acceptance_tolerance; 00081 if (bad) 00082 ++bad_blobs; 00083 if (debug_x_ht_level >= 1) { 00084 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", 00085 unicharset.id_to_unichar(class_id), 00086 bad ? "Misfit" : "OK", top, min_top, max_top, 00087 static_cast<int>(x_ht_acceptance_tolerance)); 00088 } 00089 } 00090 } 00091 return bad_blobs; 00092 } 00093 00094 // Returns a new x-height maximally compatible with the result in word_res. 00095 // See comment above for overall algorithm. 00096 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) { 00097 STATS top_stats(0, MAX_UINT8); 00098 TBLOB* blob = word_res->rebuild_word->blobs; 00099 int blob_id = 0; 00100 for (; blob != NULL; blob = blob->next, ++blob_id) { 00101 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00102 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { 00103 int top = blob->bounding_box().top(); 00104 // Clip the top to the limit of normalized feature space. 
00105 if (top >= INT_FEAT_RANGE) 00106 top = INT_FEAT_RANGE - 1; 00107 int bottom = blob->bounding_box().bottom(); 00108 int min_bottom, max_bottom, min_top, max_top; 00109 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00110 &min_top, &max_top); 00111 // Chars with a wild top range would mess up the result so ignore them. 00112 if (max_top - min_top > kMaxCharTopRange) 00113 continue; 00114 int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, 00115 top - (max_top + x_ht_acceptance_tolerance)); 00116 int height = top - kBlnBaselineOffset; 00117 if (debug_x_ht_level >= 20) { 00118 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ", 00119 unicharset.id_to_unichar(class_id), 00120 height, min_bottom, max_bottom, min_top, max_top, 00121 bottom, top); 00122 } 00123 // Use only chars that fit in the expected bottom range, and where 00124 // the range of tops is sensibly near the xheight. 00125 if (min_bottom <= bottom + x_ht_acceptance_tolerance && 00126 bottom - x_ht_acceptance_tolerance <= max_bottom && 00127 min_top > kBlnBaselineOffset && 00128 max_top - kBlnBaselineOffset >= kBlnXHeight && 00129 misfit_dist > 0) { 00130 // Compute the x-height position using proportionality between the 00131 // actual height and expected height. 00132 int min_xht = DivRounded(height * kBlnXHeight, 00133 max_top - kBlnBaselineOffset); 00134 int max_xht = DivRounded(height * kBlnXHeight, 00135 min_top - kBlnBaselineOffset); 00136 if (debug_x_ht_level >= 20) { 00137 tprintf(" xht range min=%d, max=%d\n", 00138 min_xht, max_xht); 00139 } 00140 // The range of expected heights gets a vote equal to the distance 00141 // of the actual top from the expected top. 
00142 for (int y = min_xht; y <= max_xht; ++y) 00143 top_stats.add(y, misfit_dist); 00144 } else if (debug_x_ht_level >= 20) { 00145 tprintf(" already OK\n"); 00146 } 00147 } 00148 } 00149 if (top_stats.get_total() == 0) 00150 return 0.0f; 00151 // The new xheight is just the median vote, which is then scaled out 00152 // of BLN space back to pixel space to get the x-height in pixel space. 00153 float new_xht = top_stats.median(); 00154 if (debug_x_ht_level >= 20) { 00155 tprintf("Median xht=%f\n", new_xht); 00156 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", 00157 new_xht, new_xht / word_res->denorm.y_scale()); 00158 } 00159 // The xheight must change by at least x_ht_min_change to be used. 00160 if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) 00161 return new_xht / word_res->denorm.y_scale(); 00162 else 00163 return 0.0f; 00164 } 00165 00166 } // namespace tesseract