Tesseract  3.02
tesseract-ocr/training/cntraining.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002 **  Filename:  cntraining.cpp
00003 **  Purpose:  Generates a normproto and pffmtable.
00004 **  Author:    Dan Johnson
00005 **  Revision:  Christy Russon
00006 **  History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
00007 **         5/25/90, DSJ, Adapted to multiple feature types.
00008 **        Tuesday, May 17, 1998 Changes made to make feature specific and
00009 **        simplify structures. First step in simplifying training process.
00010 **
00011  **  (c) Copyright Hewlett-Packard Company, 1988.
00012  ** Licensed under the Apache License, Version 2.0 (the "License");
00013  ** you may not use this file except in compliance with the License.
00014  ** You may obtain a copy of the License at
00015  ** http://www.apache.org/licenses/LICENSE-2.0
00016  ** Unless required by applicable law or agreed to in writing, software
00017  ** distributed under the License is distributed on an "AS IS" BASIS,
00018  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00019  ** See the License for the specific language governing permissions and
00020  ** limitations under the License.
00021 ******************************************************************************/
00022 
00023 
00027 #include "oldlist.h"
00028 #include "efio.h"
00029 #include "emalloc.h"
00030 #include "featdefs.h"
00031 #include "tessopt.h"
00032 #include "ocrfeatures.h"
00033 #include "clusttool.h"
00034 #include "cluster.h"
00035 #include <string.h>
00036 #include <stdio.h>
00037 #include <math.h>
00038 #include "unichar.h"
00039 #include "commontraining.h"
00040 
00041 #define PROGRAM_FEATURE_TYPE "cn"
00042 
00043 DECLARE_STRING_PARAM_FLAG(D);
00044 
// Program entry point (defined below).
int main(int argc, char **argv);
00051 
00056 void WriteNormProtos (
00057      const char  *Directory,
00058      LIST  LabeledProtoList,
00059    CLUSTERER *Clusterer);
00060 
00061 /*
00062 PARAMDESC *ConvertToPARAMDESC(
00063   PARAM_DESC* Param_Desc,
00064   int N);
00065 */
00066 
00067 void WriteProtos(
00068      FILE  *File,
00069      uinT16  N,
00070      LIST  ProtoList,
00071      BOOL8  WriteSigProtos,
00072      BOOL8  WriteInsigProtos);
00073 
00077 /* global variable to hold configuration parameters to control clustering */
00078 //-M 0.025   -B 0.05   -I 0.8   -C 1e-3
00079 CLUSTERCONFIG  CNConfig =
00080 {
00081   elliptical, 0.025, 0.05, 0.8, 1e-3, 0
00082 };
00083 
00084 
00088 /*---------------------------------------------------------------------------*/
00089 int main(int  argc, char* argv[])
00090 
00091 /*
00092 **  Parameters:
00093 **    argc  number of command line arguments
00094 **    argv  array of command line arguments
00095 **  Globals: none
00096 **  Operation:
00097 **    This program reads in a text file consisting of feature
00098 **    samples from a training page in the following format:
00099 **
00100 **      FontName CharName NumberOfFeatureTypes(N)
00101 **         FeatureTypeName1 NumberOfFeatures(M)
00102 **            Feature1
00103 **            ...
00104 **            FeatureM
00105 **         FeatureTypeName2 NumberOfFeatures(M)
00106 **            Feature1
00107 **            ...
00108 **            FeatureM
00109 **         ...
00110 **         FeatureTypeNameN NumberOfFeatures(M)
00111 **            Feature1
00112 **            ...
00113 **            FeatureM
00114 **      FontName CharName ...
00115 **
00116 **    It then appends these samples into a separate file for each
00117 **    character.  The name of the file is
00118 **
00119 **      DirectoryName/FontName/CharName.FeatureTypeName
00120 **
00121 **    The DirectoryName can be specified via a command
00122 **    line argument.  If not specified, it defaults to the
00123 **    current directory.  The format of the resulting files is:
00124 **
00125 **      NumberOfFeatures(M)
00126 **         Feature1
00127 **         ...
00128 **         FeatureM
00129 **      NumberOfFeatures(M)
00130 **      ...
00131 **
00132 **    The output files each have a header which describes the
00133 **    type of feature which the file contains.  This header is
00134 **    in the format required by the clusterer.  A command line
00135 **    argument can also be used to specify that only the first
00136 **    N samples of each class should be used.
00137 **  Return: none
00138 **  Exceptions: none
00139 **  History: Fri Aug 18 08:56:17 1989, DSJ, Created.
00140 */
00141 
00142 {
00143   // Set the global Config parameters before parsing the command line.
00144   Config = CNConfig;
00145 
00146   const char  *PageName;
00147   FILE  *TrainingPage;
00148   LIST  CharList = NIL_LIST;
00149   CLUSTERER  *Clusterer = NULL;
00150   LIST    ProtoList = NIL_LIST;
00151   LIST    NormProtoList = NIL_LIST;
00152   LIST pCharList;
00153   LABELEDLIST CharSample;
00154   FEATURE_DEFS_STRUCT FeatureDefs;
00155   InitFeatureDefs(&FeatureDefs);
00156 
00157   ParseArguments(&argc, &argv);
00158   int num_fonts = 0;
00159   while ((PageName = GetNextFilename(argc, argv)) != NULL) {
00160     printf("Reading %s ...\n", PageName);
00161     TrainingPage = Efopen(PageName, "rb");
00162     ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
00163                         100, NULL, TrainingPage, &CharList);
00164     fclose(TrainingPage);
00165     ++num_fonts;
00166   }
00167   printf("Clustering ...\n");
00168   // To allow an individual font to form a separate cluster,
00169   // reduce the min samples:
00170   // Config.MinSamples = 0.5 / num_fonts;
00171   pCharList = CharList;
00172   iterate(pCharList) {
00173     //Cluster
00174     CharSample = (LABELEDLIST)first_node(pCharList);
00175     Clusterer =
00176       SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
00177     float SavedMinSamples = Config.MinSamples;
00178     // To disable the tendency to produce a single cluster for all fonts,
00179     // make MagicSamples an impossible to achieve number:
00180     // Config.MagicSamples = CharSample->SampleCount * 10;
00181     Config.MagicSamples = CharSample->SampleCount;
00182     while (Config.MinSamples > 0.001) {
00183       ProtoList = ClusterSamples(Clusterer, &Config);
00184       if (NumberOfProtos(ProtoList, 1, 0) > 0) {
00185         break;
00186       } else {
00187         Config.MinSamples *= 0.95;
00188         printf("0 significant protos for %s."
00189                " Retrying clustering with MinSamples = %f%%\n",
00190                CharSample->Label, Config.MinSamples);
00191       }
00192     }
00193     Config.MinSamples = SavedMinSamples;
00194     AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
00195   }
00196   FreeTrainingSamples(CharList);
00197   if (Clusterer == NULL) { // To avoid a SIGSEGV
00198     fprintf(stderr, "Error: NULL clusterer!\n");
00199     return 1;
00200   }
00201   WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
00202   FreeNormProtoList(NormProtoList);
00203   FreeProtoList(&ProtoList);
00204   FreeClusterer(Clusterer);
00205   printf ("\n");
00206   return 0;
00207 }  // main
00208 
00209 
00214 /*----------------------------------------------------------------------------*/
00215 void WriteNormProtos (
00216      const char  *Directory,
00217      LIST  LabeledProtoList,
00218    CLUSTERER *Clusterer)
00219 
00220 /*
00221 **  Parameters:
00222 **    Directory  directory to place sample files into
00223 **  Operation:
00224 **    This routine writes the specified samples into files which
00225 **    are organized according to the font name and character name
00226 **    of the samples.
00227 **  Return: none
00228 **  Exceptions: none
00229 **  History: Fri Aug 18 16:17:06 1989, DSJ, Created.
00230 */
00231 
00232 {
00233   FILE    *File;
00234   STRING Filename;
00235   LABELEDLIST LabeledProto;
00236   int N;
00237 
00238   Filename = "";
00239   if (Directory != NULL && Directory[0] != '\0')
00240   {
00241     Filename += Directory;
00242     Filename += "/";
00243   }
00244   Filename += "normproto";
00245   printf ("\nWriting %s ...", Filename.string());
00246   File = Efopen (Filename.string(), "wb");
00247   fprintf(File,"%0d\n",Clusterer->SampleSize);
00248   WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
00249   iterate(LabeledProtoList)
00250   {
00251     LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
00252     N = NumberOfProtos(LabeledProto->List, true, false);
00253     if (N < 1) {
00254       printf ("\nError! Not enough protos for %s: %d protos"
00255               " (%d significant protos"
00256               ", %d insignificant protos)\n",
00257               LabeledProto->Label, N,
00258               NumberOfProtos(LabeledProto->List, 1, 0),
00259               NumberOfProtos(LabeledProto->List, 0, 1));
00260       exit(1);
00261     }
00262     fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
00263     WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
00264   }
00265   fclose (File);
00266 
00267 }  // WriteNormProtos
00268 
00269 /*-------------------------------------------------------------------------*/
00270 void WriteProtos(
00271      FILE  *File,
00272      uinT16  N,
00273      LIST  ProtoList,
00274      BOOL8  WriteSigProtos,
00275      BOOL8  WriteInsigProtos)
00276 {
00277   PROTOTYPE  *Proto;
00278 
00279   // write prototypes
00280   iterate(ProtoList)
00281   {
00282     Proto = (PROTOTYPE *) first_node ( ProtoList );
00283     if (( Proto->Significant && WriteSigProtos )  ||
00284       ( ! Proto->Significant && WriteInsigProtos ) )
00285       WritePrototype( File, N, Proto );
00286   }
00287 }  // WriteProtos