Tesseract  3.02
tesseract-ocr/classify/clusttool.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:       clustertool.c
00003  **     Purpose:        Misc. tools for use with the clustering routines
00004  **     Author:         Dan Johnson
00005  **     History:        6/6/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00019 //--------------------------Include Files----------------------------------
00020 #include "clusttool.h"
00021 #include "const.h"
00022 #include "danerror.h"
00023 #include "emalloc.h"
00024 #include "scanutils.h"
00025 #include <stdio.h>
00026 #include <math.h>
00027 
00028 //---------------Global Data Definitions and Declarations--------------------
00029 #define TOKENSIZE 80             //max size of tokens read from an input file
00030 #define MAXSAMPLESIZE 65535      //max num of dimensions in feature space
00031 //#define MAXBLOCKSIZE  65535   //max num of samples in a character (block size)
00032 
00033 /*---------------------------------------------------------------------------
00034           Public Code
00035 -----------------------------------------------------------------------------*/
00046 uinT16 ReadSampleSize(FILE *File) {
00047   int SampleSize;
00048 
00049   if ((fscanf (File, "%d", &SampleSize) != 1) ||
00050     (SampleSize < 0) || (SampleSize > MAXSAMPLESIZE))
00051     DoError (ILLEGALSAMPLESIZE, "Illegal sample size");
00052   return (SampleSize);
00053 }                                // ReadSampleSize
00054 
00055 
00068 PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
00069   int i;
00070   PARAM_DESC *ParamDesc;
00071   char Token[TOKENSIZE];
00072 
00073   ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
00074   for (i = 0; i < N; i++) {
00075     if (fscanf (File, "%s", Token) != 1)
00076       DoError (ILLEGALCIRCULARSPEC,
00077         "Illegal circular/linear specification");
00078     if (Token[0] == 'c')
00079       ParamDesc[i].Circular = TRUE;
00080     else
00081       ParamDesc[i].Circular = FALSE;
00082 
00083     if (fscanf (File, "%s", Token) != 1)
00084       DoError (ILLEGALESSENTIALSPEC,
00085         "Illegal essential/non-essential spec");
00086     if (Token[0] == 'e')
00087       ParamDesc[i].NonEssential = FALSE;
00088     else
00089       ParamDesc[i].NonEssential = TRUE;
00090     if (fscanf (File, "%f%f", &(ParamDesc[i].Min), &(ParamDesc[i].Max)) !=
00091       2)
00092       DoError (ILLEGALMINMAXSPEC, "Illegal min or max specification");
00093     ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
00094     ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
00095     ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
00096   }
00097   return (ParamDesc);
00098 }                                // ReadParamDesc
00099 
00100 
00115 PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
00116   char Token[TOKENSIZE];
00117   int Status;
00118   PROTOTYPE *Proto;
00119   int SampleCount;
00120   int i;
00121 
00122   if ((Status = fscanf (File, "%s", Token)) == 1) {
00123     Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
00124     Proto->Cluster = NULL;
00125     if (Token[0] == 's')
00126       Proto->Significant = TRUE;
00127     else
00128       Proto->Significant = FALSE;
00129 
00130     Proto->Style = ReadProtoStyle (File);
00131 
00132     if ((fscanf (File, "%d", &SampleCount) != 1) || (SampleCount < 0))
00133       DoError (ILLEGALSAMPLECOUNT, "Illegal sample count");
00134     Proto->NumSamples = SampleCount;
00135 
00136     Proto->Mean = ReadNFloats (File, N, NULL);
00137     if (Proto->Mean == NULL)
00138       DoError (ILLEGALMEANSPEC, "Illegal prototype mean");
00139 
00140     switch (Proto->Style) {
00141       case spherical:
00142         if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL)
00143           DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
00144         Proto->Magnitude.Spherical =
00145           1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
00146         Proto->TotalMagnitude =
00147           pow (Proto->Magnitude.Spherical, (float) N);
00148         Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
00149         Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
00150         Proto->Distrib = NULL;
00151         break;
00152       case elliptical:
00153         Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
00154         if (Proto->Variance.Elliptical == NULL)
00155           DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
00156         Proto->Magnitude.Elliptical =
00157           (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
00158         Proto->Weight.Elliptical =
00159           (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
00160         Proto->TotalMagnitude = 1.0;
00161         for (i = 0; i < N; i++) {
00162           Proto->Magnitude.Elliptical[i] =
00163             1.0 /
00164             sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
00165           Proto->Weight.Elliptical[i] =
00166             1.0 / Proto->Variance.Elliptical[i];
00167           Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
00168         }
00169         Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
00170         Proto->Distrib = NULL;
00171         break;
00172       case mixed:
00173         Proto->Distrib =
00174           (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
00175         for (i = 0; i < N; i++) {
00176           if (fscanf (File, "%s", Token) != 1)
00177             DoError (ILLEGALDISTRIBUTION,
00178               "Illegal prototype distribution");
00179           switch (Token[0]) {
00180             case 'n':
00181               Proto->Distrib[i] = normal;
00182               break;
00183             case 'u':
00184               Proto->Distrib[i] = uniform;
00185               break;
00186             case 'r':
00187               Proto->Distrib[i] = D_random;
00188               break;
00189             default:
00190               DoError (ILLEGALDISTRIBUTION,
00191                 "Illegal prototype distribution");
00192           }
00193         }
00194         Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
00195         if (Proto->Variance.Elliptical == NULL)
00196           DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
00197         Proto->Magnitude.Elliptical =
00198           (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
00199         Proto->Weight.Elliptical =
00200           (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
00201         Proto->TotalMagnitude = 1.0;
00202         for (i = 0; i < N; i++) {
00203           switch (Proto->Distrib[i]) {
00204             case normal:
00205               Proto->Magnitude.Elliptical[i] = 1.0 /
00206                 sqrt ((double)
00207                 (2.0 * PI * Proto->Variance.Elliptical[i]));
00208               Proto->Weight.Elliptical[i] =
00209                 1.0 / Proto->Variance.Elliptical[i];
00210               break;
00211             case uniform:
00212             case D_random:
00213               Proto->Magnitude.Elliptical[i] = 1.0 /
00214                 (2.0 * Proto->Variance.Elliptical[i]);
00215               break;
00216             case DISTRIBUTION_COUNT:
00217               ASSERT_HOST(!"Distribution count not allowed!");
00218           }
00219           Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
00220         }
00221         Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
00222         break;
00223     }
00224     return (Proto);
00225   }
00226   else if (Status == EOF)
00227     return (NULL);
00228   else {
00229     DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
00230     return (NULL);
00231   }
00232 }                                // ReadPrototype
00233 
00234 
00235 /* ReadProtoStyle *************************************************************
00236 Parameters:     File    open text file to read prototype style from
00237 Globals:        None
00238 Operation:      This routine reads an single token from the specified
00239       text file and interprets it as a prototype specification.
00240 Return:         Prototype style read from text file
00241 Exceptions:     ILLEGALSTYLESPEC        illegal prototype style specification
00242 History:        6/8/89, DSJ, Created.
00243 *******************************************************************************/
00244 PROTOSTYLE ReadProtoStyle(FILE *File) {
00245   char Token[TOKENSIZE];
00246   PROTOSTYLE Style;
00247 
00248   if (fscanf (File, "%s", Token) != 1)
00249     DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
00250   switch (Token[0]) {
00251     case 's':
00252       Style = spherical;
00253       break;
00254     case 'e':
00255       Style = elliptical;
00256       break;
00257     case 'm':
00258       Style = mixed;
00259       break;
00260     case 'a':
00261       Style = automatic;
00262       break;
00263     default:
00264       Style = elliptical;
00265       DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
00266   }
00267   return (Style);
00268 }                                // ReadProtoStyle
00269 
00270 
00285 FLOAT32 *
00286 ReadNFloats (FILE * File, uinT16 N, FLOAT32 Buffer[]) {
00287   int i;
00288   int NumFloatsRead;
00289 
00290   if (Buffer == NULL)
00291     Buffer = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
00292 
00293   for (i = 0; i < N; i++) {
00294     NumFloatsRead = fscanf (File, "%f", &(Buffer[i]));
00295     if (NumFloatsRead != 1) {
00296       if ((NumFloatsRead == EOF) && (i == 0))
00297         return (NULL);
00298       else
00299         DoError (ILLEGALFLOAT, "Illegal float specification");
00300     }
00301   }
00302   return (Buffer);
00303 }                                // ReadNFloats
00304 
00305 
00317 void
00318 WriteParamDesc (FILE * File, uinT16 N, PARAM_DESC ParamDesc[]) {
00319   int i;
00320 
00321   for (i = 0; i < N; i++) {
00322     if (ParamDesc[i].Circular)
00323       fprintf (File, "circular ");
00324     else
00325       fprintf (File, "linear   ");
00326 
00327     if (ParamDesc[i].NonEssential)
00328       fprintf (File, "non-essential ");
00329     else
00330       fprintf (File, "essential     ");
00331 
00332     fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
00333   }
00334 }                                // WriteParamDesc
00335 
00336 
00348 void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto) {
00349   int i;
00350 
00351   if (Proto->Significant)
00352     fprintf (File, "significant   ");
00353   else
00354     fprintf (File, "insignificant ");
00355   WriteProtoStyle (File, (PROTOSTYLE) Proto->Style);
00356   fprintf (File, "%6d\n\t", Proto->NumSamples);
00357   WriteNFloats (File, N, Proto->Mean);
00358   fprintf (File, "\t");
00359 
00360   switch (Proto->Style) {
00361     case spherical:
00362       WriteNFloats (File, 1, &(Proto->Variance.Spherical));
00363       break;
00364     case elliptical:
00365       WriteNFloats (File, N, Proto->Variance.Elliptical);
00366       break;
00367     case mixed:
00368       for (i = 0; i < N; i++)
00369       switch (Proto->Distrib[i]) {
00370         case normal:
00371           fprintf (File, " %9s", "normal");
00372           break;
00373         case uniform:
00374           fprintf (File, " %9s", "uniform");
00375           break;
00376         case D_random:
00377           fprintf (File, " %9s", "random");
00378           break;
00379         case DISTRIBUTION_COUNT:
00380           ASSERT_HOST(!"Distribution count not allowed!");
00381       }
00382       fprintf (File, "\n\t");
00383       WriteNFloats (File, N, Proto->Variance.Elliptical);
00384   }
00385 }                                // WritePrototype
00386 
00387 
00399 void WriteNFloats(FILE * File, uinT16 N, FLOAT32 Array[]) {
00400   for (int i = 0; i < N; i++)
00401     fprintf(File, " %9.6f", Array[i]);
00402   fprintf(File, "\n");
00403 }                                // WriteNFloats
00404 
00405 
00417 void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
00418   switch (ProtoStyle) {
00419     case spherical:
00420       fprintf (File, "spherical");
00421       break;
00422     case elliptical:
00423       fprintf (File, "elliptical");
00424       break;
00425     case mixed:
00426       fprintf (File, "mixed");
00427       break;
00428     case automatic:
00429       fprintf (File, "automatic");
00430       break;
00431   }
00432 }                                // WriteProtoStyle
00433 
00434 /*---------------------------------------------------------------------------*/
00435 void WriteProtoList(
00436      FILE       *File,
00437      uinT16     N,
00438      PARAM_DESC ParamDesc[],
00439      LIST       ProtoList,
00440      BOOL8      WriteSigProtos,
00441      BOOL8      WriteInsigProtos)
00442 
00443 /*
00444 **      Parameters:
00445 **              File            open text file to write prototypes to
00446 **              N               number of dimensions in feature space
00447 **              ParamDesc       descriptions for each dimension
00448 **              ProtoList       list of prototypes to be written
00449 **              WriteSigProtos  TRUE to write out significant prototypes
00450 **              WriteInsigProtos        TRUE to write out insignificants
00451 **      Globals:
00452 **              None
00453 **      Operation:
00454 **              This routine writes a textual description of each prototype
00455 **              in the prototype list to the specified file.  It also
00456 **              writes a file header which includes the number of dimensions
00457 **              in feature space and the descriptions for each dimension.
00458 **      Return:
00459 **              None
00460 **      Exceptions:
00461 **              None
00462 **      History:
00463 **              6/12/89, DSJ, Created.
00464 */
00465 
00466 {
00467   PROTOTYPE     *Proto;
00468 
00469   /* write file header */
00470   fprintf(File,"%0d\n",N);
00471   WriteParamDesc(File,N,ParamDesc);
00472 
00473   /* write prototypes */
00474   iterate(ProtoList)
00475     {
00476       Proto = (PROTOTYPE *) first_node ( ProtoList );
00477       if (( Proto->Significant && WriteSigProtos )      ||
00478           ( ! Proto->Significant && WriteInsigProtos ) )
00479         WritePrototype( File, N, Proto );
00480     }
00481 }       /* WriteProtoList */
00482