Tesseract
3.02
|
00001 /****************************************************************************** 00002 ** Filename: normmatch.c 00003 ** Purpose: Simple matcher based on character normalization features. 00004 ** Author: Dan Johnson 00005 ** History: Wed Dec 19 16:18:06 1990, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00021 #include "normmatch.h" 00022 00023 #include <stdio.h> 00024 #include <math.h> 00025 00026 #include "classify.h" 00027 #include "clusttool.h" 00028 #include "const.h" 00029 #include "efio.h" 00030 #include "emalloc.h" 00031 #include "globals.h" 00032 #include "helpers.h" 00033 #include "normfeat.h" 00034 #include "scanutils.h" 00035 #include "unicharset.h" 00036 #include "params.h" 00037 00038 struct NORM_PROTOS 00039 { 00040 int NumParams; 00041 PARAM_DESC *ParamDesc; 00042 LIST* Protos; 00043 int NumProtos; 00044 }; 00045 00049 double NormEvidenceOf(register double NormAdj); 00050 00051 void PrintNormMatch(FILE *File, 00052 int NumParams, 00053 PROTOTYPE *Proto, 00054 FEATURE Feature); 00055 00056 NORM_PROTOS *ReadNormProtos(FILE *File); 00057 00062 /* control knobs used to control the normalization adjustment process */ 00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ..."); 00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ..."); 00065 // Weight of width variance against height and vertical position. 00066 const double kWidthErrorWeighting = 0.125; 00067 00071 /*---------------------------------------------------------------------------*/ 00072 namespace tesseract { 00073 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, 00074 const FEATURE_STRUCT& feature, 00075 BOOL8 DebugMatch) { 00076 /* 00077 ** Parameters: 00078 ** ClassId id of class to match against 00079 ** Feature character normalization feature 00080 ** DebugMatch controls dump of debug info 00081 ** Globals: 00082 ** NormProtos character normalization prototypes 00083 ** Operation: This routine compares Features against each character 00084 ** normalization proto for ClassId and returns the match 00085 ** rating of the best match. 00086 ** Return: Best match rating for Feature against protos of ClassId. 00087 ** Exceptions: none 00088 ** History: Wed Dec 19 16:56:12 1990, DSJ, Created. 00089 */ 00090 LIST Protos; 00091 FLOAT32 BestMatch; 00092 FLOAT32 Match; 00093 FLOAT32 Delta; 00094 PROTOTYPE *Proto; 00095 int ProtoId; 00096 00097 /* handle requests for classification as noise */ 00098 if (ClassId == NO_CLASS) { 00099 /* kludge - clean up constants and make into control knobs later */ 00100 Match = (feature.Params[CharNormLength] * 00101 feature.Params[CharNormLength] * 500.0 + 00102 feature.Params[CharNormRx] * 00103 feature.Params[CharNormRx] * 8000.0 + 00104 feature.Params[CharNormRy] * 00105 feature.Params[CharNormRy] * 8000.0); 00106 return (1.0 - NormEvidenceOf (Match)); 00107 } 00108 00109 BestMatch = MAX_FLOAT32; 00110 Protos = NormProtos->Protos[ClassId]; 00111 00112 if (DebugMatch) { 00113 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId)); 00114 } 00115 00116 ProtoId = 0; 00117 iterate(Protos) { 00118 Proto = (PROTOTYPE *) first_node (Protos); 00119 Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY]; 00120 Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY]; 00121 if (DebugMatch) { 00122 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00123 Proto->Mean[CharNormY], Delta, 00124 Proto->Weight.Elliptical[CharNormY], Match); 00125 } 00126 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx]; 00127 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx]; 00128 if (DebugMatch) { 00129 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00130 Proto->Mean[CharNormRx], Delta, 00131 Proto->Weight.Elliptical[CharNormRx], Match); 00132 } 00133 // Ry is width! See intfx.cpp. 00134 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy]; 00135 if (DebugMatch) { 00136 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", 00137 Proto->Mean[CharNormRy], Delta, 00138 Proto->Weight.Elliptical[CharNormRy]); 00139 } 00140 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy]; 00141 Delta *= kWidthErrorWeighting; 00142 Match += Delta; 00143 if (DebugMatch) { 00144 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", 00145 Match, Match / classify_norm_adj_midpoint, 00146 NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match))); 00147 } 00148 00149 if (Match < BestMatch) 00150 BestMatch = Match; 00151 00152 ProtoId++; 00153 } 00154 return 1.0 - NormEvidenceOf(BestMatch); 00155 } /* ComputeNormMatch */ 00156 00157 void Classify::FreeNormProtos() { 00158 if (NormProtos != NULL) { 00159 for (int i = 0; i < NormProtos->NumProtos; i++) 00160 FreeProtoList(&NormProtos->Protos[i]); 00161 Efree(NormProtos->Protos); 00162 Efree(NormProtos->ParamDesc); 00163 Efree(NormProtos); 00164 NormProtos = NULL; 00165 } 00166 } 00167 } // namespace tesseract 00168 00172 /********************************************************************** 00173 * NormEvidenceOf 00174 * 00175 * Return the new type of evidence number corresponding to this 00176 * normalization adjustment. The equation that represents the transform is: 00177 * 1 / (1 + (NormAdj / midpoint) ^ curl) 00178 **********************************************************************/ 00179 double NormEvidenceOf(register double NormAdj) { 00180 NormAdj /= classify_norm_adj_midpoint; 00181 00182 if (classify_norm_adj_curl == 3) 00183 NormAdj = NormAdj * NormAdj * NormAdj; 00184 else if (classify_norm_adj_curl == 2) 00185 NormAdj = NormAdj * NormAdj; 00186 else 00187 NormAdj = pow (NormAdj, classify_norm_adj_curl); 00188 return (1.0 / (1.0 + NormAdj)); 00189 } 00190 00191 00192 /*---------------------------------------------------------------------------*/ 00193 void PrintNormMatch(FILE *File, 00194 int NumParams, 00195 PROTOTYPE *Proto, 00196 FEATURE Feature) { 00197 /* 00198 ** Parameters: 00199 ** File open text file to dump match debug info to 00200 ** NumParams # of parameters in proto and feature 00201 ** Proto[] array of prototype parameters 00202 ** Feature[] array of feature parameters 00203 ** Globals: none 00204 ** Operation: This routine dumps out detailed normalization match info. 00205 ** Return: none 00206 ** Exceptions: none 00207 ** History: Wed Jan 2 09:49:35 1991, DSJ, Created. 00208 */ 00209 int i; 00210 FLOAT32 ParamMatch; 00211 FLOAT32 TotalMatch; 00212 00213 for (i = 0, TotalMatch = 0.0; i < NumParams; i++) { 00214 ParamMatch = (Feature->Params[i] - Mean(Proto, i)) / 00215 StandardDeviation(Proto, i); 00216 00217 fprintf (File, " %6.1f", ParamMatch); 00218 00219 if (i == CharNormY || i == CharNormRx) 00220 TotalMatch += ParamMatch * ParamMatch; 00221 } 00222 fprintf (File, " --> %6.1f (%4.2f)\n", 00223 TotalMatch, NormEvidenceOf (TotalMatch)); 00224 00225 } /* PrintNormMatch */ 00226 00227 00228 /*---------------------------------------------------------------------------*/ 00229 namespace tesseract { 00230 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) { 00231 /* 00232 ** Parameters: 00233 ** File open text file to read normalization protos from 00234 ** Globals: none 00235 ** Operation: This routine allocates a new data structure to hold 00236 ** a set of character normalization protos. It then fills in 00237 ** the data structure by reading from the specified File. 00238 ** Return: Character normalization protos. 00239 ** Exceptions: none 00240 ** History: Wed Dec 19 16:38:49 1990, DSJ, Created. 00241 */ 00242 NORM_PROTOS *NormProtos; 00243 int i; 00244 char unichar[2 * UNICHAR_LEN + 1]; 00245 UNICHAR_ID unichar_id; 00246 LIST Protos; 00247 int NumProtos; 00248 00249 /* allocate and initialization data structure */ 00250 NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS)); 00251 NormProtos->NumProtos = unicharset.size(); 00252 NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST)); 00253 for (i = 0; i < NormProtos->NumProtos; i++) 00254 NormProtos->Protos[i] = NIL_LIST; 00255 00256 /* read file header and save in data structure */ 00257 NormProtos->NumParams = ReadSampleSize (File); 00258 NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams); 00259 00260 /* read protos for each class into a separate list */ 00261 while ((end_offset < 0 || ftell(File) < end_offset) && 00262 fscanf(File, "%s %d", unichar, &NumProtos) == 2) { 00263 if (unicharset.contains_unichar(unichar)) { 00264 unichar_id = unicharset.unichar_to_id(unichar); 00265 Protos = NormProtos->Protos[unichar_id]; 00266 for (i = 0; i < NumProtos; i++) 00267 Protos = 00268 push_last (Protos, ReadPrototype (File, NormProtos->NumParams)); 00269 NormProtos->Protos[unichar_id] = Protos; 00270 } else { 00271 cprintf("Error: unichar %s in normproto file is not in unichar set.\n", 00272 unichar); 00273 for (i = 0; i < NumProtos; i++) 00274 FreePrototype(ReadPrototype (File, NormProtos->NumParams)); 00275 } 00276 SkipNewline(File); 00277 } 00278 return (NormProtos); 00279 } /* ReadNormProtos */ 00280 } // namespace tesseract