Tesseract
3.02
|
00001 /****************************************************************************** 00002 ** Filename: cntraining.cpp 00003 ** Purpose: Generates a normproto and pffmtable. 00004 ** Author: Dan Johnson 00005 ** Revisment: Christy Russon 00006 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created. 00007 ** 5/25/90, DSJ, Adapted to multiple feature types. 00008 ** Tuesday, May 17, 1998 Changes made to make feature specific and 00009 ** simplify structures. First step in simplifying training process. 00010 ** 00011 ** (c) Copyright Hewlett-Packard Company, 1988. 00012 ** Licensed under the Apache License, Version 2.0 (the "License"); 00013 ** you may not use this file except in compliance with the License. 00014 ** You may obtain a copy of the License at 00015 ** http://www.apache.org/licenses/LICENSE-2.0 00016 ** Unless required by applicable law or agreed to in writing, software 00017 ** distributed under the License is distributed on an "AS IS" BASIS, 00018 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00019 ** See the License for the specific language governing permissions and 00020 ** limitations under the License. 00021 ******************************************************************************/ 00022 00023 00027 #include "oldlist.h" 00028 #include "efio.h" 00029 #include "emalloc.h" 00030 #include "featdefs.h" 00031 #include "tessopt.h" 00032 #include "ocrfeatures.h" 00033 #include "clusttool.h" 00034 #include "cluster.h" 00035 #include <string.h> 00036 #include <stdio.h> 00037 #include <math.h> 00038 #include "unichar.h" 00039 #include "commontraining.h" 00040 00041 #define PROGRAM_FEATURE_TYPE "cn" 00042 00043 DECLARE_STRING_PARAM_FLAG(D); 00044 00048 int main ( 00049 int argc, 00050 char **argv); 00051 00056 void WriteNormProtos ( 00057 const char *Directory, 00058 LIST LabeledProtoList, 00059 CLUSTERER *Clusterer); 00060 00061 /* 00062 PARAMDESC *ConvertToPARAMDESC( 00063 PARAM_DESC* Param_Desc, 00064 int N); 00065 */ 00066 00067 void WriteProtos( 00068 FILE *File, 00069 uinT16 N, 00070 LIST ProtoList, 00071 BOOL8 WriteSigProtos, 00072 BOOL8 WriteInsigProtos); 00073 00077 /* global variable to hold configuration parameters to control clustering */ 00078 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3 00079 CLUSTERCONFIG CNConfig = 00080 { 00081 elliptical, 0.025, 0.05, 0.8, 1e-3, 0 00082 }; 00083 00084 00088 /*---------------------------------------------------------------------------*/ 00089 int main(int argc, char* argv[]) 00090 00091 /* 00092 ** Parameters: 00093 ** argc number of command line arguments 00094 ** argv array of command line arguments 00095 ** Globals: none 00096 ** Operation: 00097 ** This program reads in a text file consisting of feature 00098 ** samples from a training page in the following format: 00099 ** 00100 ** FontName CharName NumberOfFeatureTypes(N) 00101 ** FeatureTypeName1 NumberOfFeatures(M) 00102 ** Feature1 00103 ** ... 00104 ** FeatureM 00105 ** FeatureTypeName2 NumberOfFeatures(M) 00106 ** Feature1 00107 ** ... 00108 ** FeatureM 00109 ** ... 00110 ** FeatureTypeNameN NumberOfFeatures(M) 00111 ** Feature1 00112 ** ... 00113 ** FeatureM 00114 ** FontName CharName ... 00115 ** 00116 ** It then appends these samples into a separate file for each 00117 ** character. The name of the file is 00118 ** 00119 ** DirectoryName/FontName/CharName.FeatureTypeName 00120 ** 00121 ** The DirectoryName can be specified via a command 00122 ** line argument. If not specified, it defaults to the 00123 ** current directory. The format of the resulting files is: 00124 ** 00125 ** NumberOfFeatures(M) 00126 ** Feature1 00127 ** ... 00128 ** FeatureM 00129 ** NumberOfFeatures(M) 00130 ** ... 00131 ** 00132 ** The output files each have a header which describes the 00133 ** type of feature which the file contains. This header is 00134 ** in the format required by the clusterer. A command line 00135 ** argument can also be used to specify that only the first 00136 ** N samples of each class should be used. 00137 ** Return: none 00138 ** Exceptions: none 00139 ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. 00140 */ 00141 00142 { 00143 // Set the global Config parameters before parsing the command line. 00144 Config = CNConfig; 00145 00146 const char *PageName; 00147 FILE *TrainingPage; 00148 LIST CharList = NIL_LIST; 00149 CLUSTERER *Clusterer = NULL; 00150 LIST ProtoList = NIL_LIST; 00151 LIST NormProtoList = NIL_LIST; 00152 LIST pCharList; 00153 LABELEDLIST CharSample; 00154 FEATURE_DEFS_STRUCT FeatureDefs; 00155 InitFeatureDefs(&FeatureDefs); 00156 00157 ParseArguments(&argc, &argv); 00158 int num_fonts = 0; 00159 while ((PageName = GetNextFilename(argc, argv)) != NULL) { 00160 printf("Reading %s ...\n", PageName); 00161 TrainingPage = Efopen(PageName, "rb"); 00162 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 00163 100, NULL, TrainingPage, &CharList); 00164 fclose(TrainingPage); 00165 ++num_fonts; 00166 } 00167 printf("Clustering ...\n"); 00168 // To allow an individual font to form a separate cluster, 00169 // reduce the min samples: 00170 // Config.MinSamples = 0.5 / num_fonts; 00171 pCharList = CharList; 00172 iterate(pCharList) { 00173 //Cluster 00174 CharSample = (LABELEDLIST)first_node(pCharList); 00175 Clusterer = 00176 SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); 00177 float SavedMinSamples = Config.MinSamples; 00178 // To disable the tendency to produce a single cluster for all fonts, 00179 // make MagicSamples an impossible to achieve number: 00180 // Config.MagicSamples = CharSample->SampleCount * 10; 00181 Config.MagicSamples = CharSample->SampleCount; 00182 while (Config.MinSamples > 0.001) { 00183 ProtoList = ClusterSamples(Clusterer, &Config); 00184 if (NumberOfProtos(ProtoList, 1, 0) > 0) { 00185 break; 00186 } else { 00187 Config.MinSamples *= 0.95; 00188 printf("0 significant protos for %s." 00189 " Retrying clustering with MinSamples = %f%%\n", 00190 CharSample->Label, Config.MinSamples); 00191 } 00192 } 00193 Config.MinSamples = SavedMinSamples; 00194 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); 00195 } 00196 FreeTrainingSamples(CharList); 00197 if (Clusterer == NULL) { // To avoid a SIGSEGV 00198 fprintf(stderr, "Error: NULL clusterer!\n"); 00199 return 1; 00200 } 00201 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer); 00202 FreeNormProtoList(NormProtoList); 00203 FreeProtoList(&ProtoList); 00204 FreeClusterer(Clusterer); 00205 printf ("\n"); 00206 return 0; 00207 } // main 00208 00209 00214 /*----------------------------------------------------------------------------*/ 00215 void WriteNormProtos ( 00216 const char *Directory, 00217 LIST LabeledProtoList, 00218 CLUSTERER *Clusterer) 00219 00220 /* 00221 ** Parameters: 00222 ** Directory directory to place sample files into 00223 ** Operation: 00224 ** This routine writes the specified samples into files which 00225 ** are organized according to the font name and character name 00226 ** of the samples. 00227 ** Return: none 00228 ** Exceptions: none 00229 ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. 00230 */ 00231 00232 { 00233 FILE *File; 00234 STRING Filename; 00235 LABELEDLIST LabeledProto; 00236 int N; 00237 00238 Filename = ""; 00239 if (Directory != NULL && Directory[0] != '\0') 00240 { 00241 Filename += Directory; 00242 Filename += "/"; 00243 } 00244 Filename += "normproto"; 00245 printf ("\nWriting %s ...", Filename.string()); 00246 File = Efopen (Filename.string(), "wb"); 00247 fprintf(File,"%0d\n",Clusterer->SampleSize); 00248 WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc); 00249 iterate(LabeledProtoList) 00250 { 00251 LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); 00252 N = NumberOfProtos(LabeledProto->List, true, false); 00253 if (N < 1) { 00254 printf ("\nError! Not enough protos for %s: %d protos" 00255 " (%d significant protos" 00256 ", %d insignificant protos)\n", 00257 LabeledProto->Label, N, 00258 NumberOfProtos(LabeledProto->List, 1, 0), 00259 NumberOfProtos(LabeledProto->List, 0, 1)); 00260 exit(1); 00261 } 00262 fprintf(File, "\n%s %d\n", LabeledProto->Label, N); 00263 WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false); 00264 } 00265 fclose (File); 00266 00267 } // WriteNormProtos 00268 00269 /*-------------------------------------------------------------------------*/ 00270 void WriteProtos( 00271 FILE *File, 00272 uinT16 N, 00273 LIST ProtoList, 00274 BOOL8 WriteSigProtos, 00275 BOOL8 WriteInsigProtos) 00276 { 00277 PROTOTYPE *Proto; 00278 00279 // write prototypes 00280 iterate(ProtoList) 00281 { 00282 Proto = (PROTOTYPE *) first_node ( ProtoList ); 00283 if (( Proto->Significant && WriteSigProtos ) || 00284 ( ! Proto->Significant && WriteInsigProtos ) ) 00285 WritePrototype( File, N, Proto ); 00286 } 00287 } // WriteProtos