tesseract-doc/fixxht_8cpp_source.html

00001 /**********************************************************************
00002  * File:        fixxht.cpp  (Formerly fixxht.c)
00003  * Description: Improve x_ht and look out for case inconsistencies
00004  * Author:              Phil Cheatle
00005  * Created:             Thu Aug  5 14:11:08 BST 1993
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #include "mfcpch.h"
00021 #include          <string.h>
00022 #include          <ctype.h>
00023 #include          "params.h"
00024 #include          "float2int.h"
00025 #include          "tesseractclass.h"
00026
00027 namespace tesseract {
00028
00029 // Fixxht overview.
00030 // Premise: Initial estimate of x-height is adequate most of the time, but
00031 // occasionally it is incorrect. Most notable causes of failure are:
00032 // 1. Small caps, where the top of the caps is the same as the body text
00033 // xheight. For small caps words the xheight needs to be reduced to correctly
00034 // recognize the caps in the small caps word.
00035 // 2. All xheight lines, such as summer. Here the initial estimate will have
00036 // guessed that the blob tops are caps and will have placed the xheight too low.
00037 // 3. Noise/logos beside words, or changes in font size on a line. Such
00038 // things can blow the statistics and cause an incorrect estimate.
00039 //
00040 // Algorithm.
00041 // Compare the vertical position (top only) of alphanumerics in a word with
00042 // the range of positions in training data (in the unicharset).
00043 // See CountMisfitTops. If any characters disagree sufficiently with the
00044 // initial xheight estimate, then recalculate the xheight, re-run OCR on
00045 // the word, and if the number of vertical misfits goes down, along with
00046 // either the word rating or certainty, then keep the new xheight.
00047 // The new xheight is calculated as follows (see ComputeCompatibleXheight):
00048 // For each alphanumeric character that has a vertically misplaced top
00049 // (a misfit), yet its bottom is within the acceptable range (ie it is not
00050 // likely a sub-or super-script) calculate the range of acceptable xheight
00051 // positions from its range of tops, and give each value in the range a
00052 // number of votes equal to the distance of its top from its acceptance range.
00053 // The x-height position with the median of the votes becomes the new
00054 // x-height. This assumes that most characters will be correctly recognized
00055 // even if the x-height is incorrect. This is not a terrible assumption, but
00056 // it is not great. An improvement would be to use a classifier that does
00057 // not care about vertical position or scaling at all.
00058
// Largest spread (max_top - min_top) of a unicharset character's top range
// for which that character is still consulted. Characters with a wider
// spread are skipped both when counting misfit tops and when voting for a
// replacement x-height.
static const int kMaxCharTopRange = 48;
00062
00063 // Returns the number of misfit blob tops in this word.
00064 int Tesseract::CountMisfitTops(WERD_RES *word_res) {
00065   int bad_blobs = 0;
00066   TBLOB* blob = word_res->rebuild_word->blobs;
00067   int blob_id = 0;
00068   for (; blob != NULL; blob = blob->next, ++blob_id) {
00069     UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
00070     if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
00071       int top = blob->bounding_box().top();
00072       if (top >= INT_FEAT_RANGE)
00073         top = INT_FEAT_RANGE - 1;
00074       int min_bottom, max_bottom, min_top, max_top;
00075       unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
00076                                 &min_top, &max_top);
00077       if (max_top - min_top > kMaxCharTopRange)
00078         continue;
00079       bool bad =  top < min_top - x_ht_acceptance_tolerance ||
00080                   top > max_top + x_ht_acceptance_tolerance;
00081       if (bad)
00082         ++bad_blobs;
00083       if (debug_x_ht_level >= 1) {
00084         tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
00085                 unicharset.id_to_unichar(class_id),
00086                 bad ? "Misfit" : "OK", top, min_top, max_top,
00087                 static_cast<int>(x_ht_acceptance_tolerance));
00088       }
00089     }
00090   }
00091   return bad_blobs;
00092 }
00093
00094 // Returns a new x-height maximally compatible with the result in word_res.
00095 // See comment above for overall algorithm.
00096 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
00097   STATS top_stats(0, MAX_UINT8);
00098   TBLOB* blob = word_res->rebuild_word->blobs;
00099   int blob_id = 0;
00100   for (; blob != NULL; blob = blob->next, ++blob_id) {
00101     UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
00102     if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
00103       int top = blob->bounding_box().top();
00104       // Clip the top to the limit of normalized feature space.
00105       if (top >= INT_FEAT_RANGE)
00106         top = INT_FEAT_RANGE - 1;
00107       int bottom = blob->bounding_box().bottom();
00108       int min_bottom, max_bottom, min_top, max_top;
00109       unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
00110                                 &min_top, &max_top);
00111       // Chars with a wild top range would mess up the result so ignore them.
00112       if (max_top - min_top > kMaxCharTopRange)
00113         continue;
00114       int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
00115                           top - (max_top + x_ht_acceptance_tolerance));
00116       int height = top - kBlnBaselineOffset;
00117       if (debug_x_ht_level >= 20) {
00118         tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
00119                 unicharset.id_to_unichar(class_id),
00120                 height, min_bottom, max_bottom, min_top, max_top,
00121                 bottom, top);
00122       }
00123       // Use only chars that fit in the expected bottom range, and where
00124       // the range of tops is sensibly near the xheight.
00125       if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
00126           bottom - x_ht_acceptance_tolerance <= max_bottom &&
00127           min_top > kBlnBaselineOffset &&
00128           max_top - kBlnBaselineOffset >= kBlnXHeight &&
00129           misfit_dist > 0) {
00130         // Compute the x-height position using proportionality between the
00131         // actual height and expected height.
00132         int min_xht = DivRounded(height * kBlnXHeight,
00133                                  max_top - kBlnBaselineOffset);
00134         int max_xht = DivRounded(height * kBlnXHeight,
00135                                  min_top - kBlnBaselineOffset);
00136         if (debug_x_ht_level >= 20) {
00137           tprintf(" xht range min=%d, max=%d\n",
00138                   min_xht, max_xht);
00139         }
00140         // The range of expected heights gets a vote equal to the distance
00141         // of the actual top from the expected top.
00142         for (int y = min_xht; y <= max_xht; ++y)
00143           top_stats.add(y, misfit_dist);
00144       } else if (debug_x_ht_level >= 20) {
00145         tprintf(" already OK\n");
00146       }
00147     }
00148   }
00149   if (top_stats.get_total() == 0)
00150     return 0.0f;
00151   // The new xheight is just the median vote, which is then scaled out
00152   // of BLN space back to pixel space to get the x-height in pixel space.
00153   float new_xht = top_stats.median();
00154   if (debug_x_ht_level >= 20) {
00155     tprintf("Median xht=%f\n", new_xht);
00156     tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
00157             new_xht, new_xht / word_res->denorm.y_scale());
00158   }
00159   // The xheight must change by at least x_ht_min_change to be used.
00160   if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
00161     return new_xht / word_res->denorm.y_scale();
00162   else
00163     return 0.0f;
00164 }
00165
00166 }  // namespace tesseract