Tesseract  3.02
tesseract-ocr/classify/normmatch.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    normmatch.c
00003  **     Purpose:     Simple matcher based on character normalization features.
00004  **     Author:      Dan Johnson
00005  **     History:     Wed Dec 19 16:18:06 1990, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00021 #include "normmatch.h"
00022 
00023 #include <stdio.h>
00024 #include <math.h>
00025 
00026 #include "classify.h"
00027 #include "clusttool.h"
00028 #include "const.h"
00029 #include "efio.h"
00030 #include "emalloc.h"
00031 #include "globals.h"
00032 #include "helpers.h"
00033 #include "normfeat.h"
00034 #include "scanutils.h"
00035 #include "unicharset.h"
00036 #include "params.h"
00037 
00038 struct NORM_PROTOS
00039 {
00040   int NumParams;
00041   PARAM_DESC *ParamDesc;
00042   LIST* Protos;
00043   int NumProtos;
00044 };
00045 
00049 double NormEvidenceOf(register double NormAdj);
00050 
00051 void PrintNormMatch(FILE *File,
00052                     int NumParams,
00053                     PROTOTYPE *Proto,
00054                     FEATURE Feature);
00055 
00056 NORM_PROTOS *ReadNormProtos(FILE *File);
00057 
00062 /* control knobs used to control the normalization adjustment process */
00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
00065 // Weight of width variance against height and vertical position.
00066 const double kWidthErrorWeighting = 0.125;
00067 
00071 /*---------------------------------------------------------------------------*/
00072 namespace tesseract {
00073 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
00074                                    const FEATURE_STRUCT& feature,
00075                                    BOOL8 DebugMatch) {
00076 /*
00077  **     Parameters:
00078  **             ClassId         id of class to match against
00079  **             Feature         character normalization feature
00080  **             DebugMatch      controls dump of debug info
00081  **     Globals:
00082  **             NormProtos      character normalization prototypes
00083  **     Operation: This routine compares Features against each character
00084  **             normalization proto for ClassId and returns the match
00085  **             rating of the best match.
00086  **     Return: Best match rating for Feature against protos of ClassId.
00087  **     Exceptions: none
00088  **     History: Wed Dec 19 16:56:12 1990, DSJ, Created.
00089  */
00090   LIST Protos;
00091   FLOAT32 BestMatch;
00092   FLOAT32 Match;
00093   FLOAT32 Delta;
00094   PROTOTYPE *Proto;
00095   int ProtoId;
00096 
00097   /* handle requests for classification as noise */
00098   if (ClassId == NO_CLASS) {
00099     /* kludge - clean up constants and make into control knobs later */
00100     Match = (feature.Params[CharNormLength] *
00101       feature.Params[CharNormLength] * 500.0 +
00102       feature.Params[CharNormRx] *
00103       feature.Params[CharNormRx] * 8000.0 +
00104       feature.Params[CharNormRy] *
00105       feature.Params[CharNormRy] * 8000.0);
00106     return (1.0 - NormEvidenceOf (Match));
00107   }
00108 
00109   BestMatch = MAX_FLOAT32;
00110   Protos = NormProtos->Protos[ClassId];
00111 
00112   if (DebugMatch) {
00113     tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
00114   }
00115 
00116   ProtoId = 0;
00117   iterate(Protos) {
00118     Proto = (PROTOTYPE *) first_node (Protos);
00119     Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
00120     Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
00121     if (DebugMatch) {
00122       tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00123               Proto->Mean[CharNormY], Delta,
00124               Proto->Weight.Elliptical[CharNormY], Match);
00125     }
00126     Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
00127     Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
00128     if (DebugMatch) {
00129       tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00130               Proto->Mean[CharNormRx], Delta,
00131               Proto->Weight.Elliptical[CharNormRx], Match);
00132     }
00133     // Ry is width! See intfx.cpp.
00134     Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
00135     if (DebugMatch) {
00136       tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
00137               Proto->Mean[CharNormRy], Delta,
00138               Proto->Weight.Elliptical[CharNormRy]);
00139     }
00140     Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
00141     Delta *= kWidthErrorWeighting;
00142     Match += Delta;
00143     if (DebugMatch) {
00144       tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
00145               Match, Match / classify_norm_adj_midpoint,
00146               NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
00147     }
00148 
00149     if (Match < BestMatch)
00150       BestMatch = Match;
00151 
00152     ProtoId++;
00153   }
00154   return 1.0 - NormEvidenceOf(BestMatch);
00155 }                                /* ComputeNormMatch */
00156 
00157 void Classify::FreeNormProtos() {
00158   if (NormProtos != NULL) {
00159     for (int i = 0; i < NormProtos->NumProtos; i++)
00160       FreeProtoList(&NormProtos->Protos[i]);
00161     Efree(NormProtos->Protos);
00162     Efree(NormProtos->ParamDesc);
00163     Efree(NormProtos);
00164     NormProtos = NULL;
00165   }
00166 }
00167 }  // namespace tesseract
00168 
00172 /**********************************************************************
00173  * NormEvidenceOf
00174  *
00175  * Return the new type of evidence number corresponding to this
00176  * normalization adjustment.  The equation that represents the transform is:
00177  *       1 / (1 + (NormAdj / midpoint) ^ curl)
00178  **********************************************************************/
00179 double NormEvidenceOf(register double NormAdj) {
00180   NormAdj /= classify_norm_adj_midpoint;
00181 
00182   if (classify_norm_adj_curl == 3)
00183     NormAdj = NormAdj * NormAdj * NormAdj;
00184   else if (classify_norm_adj_curl == 2)
00185     NormAdj = NormAdj * NormAdj;
00186   else
00187     NormAdj = pow (NormAdj, classify_norm_adj_curl);
00188   return (1.0 / (1.0 + NormAdj));
00189 }
00190 
00191 
00192 /*---------------------------------------------------------------------------*/
00193 void PrintNormMatch(FILE *File,
00194                     int NumParams,
00195                     PROTOTYPE *Proto,
00196                     FEATURE Feature) {
00197 /*
00198  **     Parameters:
00199  **             File            open text file to dump match debug info to
00200  **             NumParams       # of parameters in proto and feature
00201  **             Proto[]         array of prototype parameters
00202  **             Feature[]       array of feature parameters
00203  **     Globals: none
00204  **     Operation: This routine dumps out detailed normalization match info.
00205  **     Return: none
00206  **     Exceptions: none
00207  **     History: Wed Jan  2 09:49:35 1991, DSJ, Created.
00208  */
00209   int i;
00210   FLOAT32 ParamMatch;
00211   FLOAT32 TotalMatch;
00212 
00213   for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
00214     ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
00215       StandardDeviation(Proto, i);
00216 
00217     fprintf (File, " %6.1f", ParamMatch);
00218 
00219     if (i == CharNormY || i == CharNormRx)
00220       TotalMatch += ParamMatch * ParamMatch;
00221   }
00222   fprintf (File, " --> %6.1f (%4.2f)\n",
00223     TotalMatch, NormEvidenceOf (TotalMatch));
00224 
00225 }                                /* PrintNormMatch */
00226 
00227 
00228 /*---------------------------------------------------------------------------*/
00229 namespace tesseract {
00230 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
00231 /*
00232  **     Parameters:
00233  **             File    open text file to read normalization protos from
00234  **     Globals: none
00235  **     Operation: This routine allocates a new data structure to hold
00236  **             a set of character normalization protos.  It then fills in
00237  **             the data structure by reading from the specified File.
00238  **     Return: Character normalization protos.
00239  **     Exceptions: none
00240  **     History: Wed Dec 19 16:38:49 1990, DSJ, Created.
00241  */
00242   NORM_PROTOS *NormProtos;
00243   int i;
00244   char unichar[2 * UNICHAR_LEN + 1];
00245   UNICHAR_ID unichar_id;
00246   LIST Protos;
00247   int NumProtos;
00248 
00249   /* allocate and initialization data structure */
00250   NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
00251   NormProtos->NumProtos = unicharset.size();
00252   NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
00253   for (i = 0; i < NormProtos->NumProtos; i++)
00254     NormProtos->Protos[i] = NIL_LIST;
00255 
00256   /* read file header and save in data structure */
00257   NormProtos->NumParams = ReadSampleSize (File);
00258   NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
00259 
00260   /* read protos for each class into a separate list */
00261   while ((end_offset < 0 || ftell(File) < end_offset) &&
00262          fscanf(File, "%s %d", unichar, &NumProtos) == 2) {
00263     if (unicharset.contains_unichar(unichar)) {
00264       unichar_id = unicharset.unichar_to_id(unichar);
00265       Protos = NormProtos->Protos[unichar_id];
00266       for (i = 0; i < NumProtos; i++)
00267         Protos =
00268             push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
00269       NormProtos->Protos[unichar_id] = Protos;
00270     } else {
00271       cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
00272               unichar);
00273       for (i = 0; i < NumProtos; i++)
00274         FreePrototype(ReadPrototype (File, NormProtos->NumParams));
00275     }
00276     SkipNewline(File);
00277   }
00278   return (NormProtos);
00279 }                                /* ReadNormProtos */
00280 }  // namespace tesseract