Tesseract  3.02
tesseract-ocr/classify/cluster.h
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:       cluster.h
00003  **     Purpose:        Definition of feature space clustering routines
00004  **     Author:         Dan Johnson
00005  **     History:        5/29/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 #ifndef   CLUSTER_H
00019 #define   CLUSTER_H
00020 
00021 #include "kdtree.h"
00022 #include "oldlist.h"
00023 
00024 struct BUCKETS;
00025 
00026 #define MINBUCKETS      5
00027 #define MAXBUCKETS      39
00028 
00029 /*----------------------------------------------------------------------
00030           Types
00031 ----------------------------------------------------------------------*/
00032 typedef struct sample {
00033   unsigned Clustered:1;          // TRUE if included in a higher cluster
00034   unsigned Prototype:1;          // TRUE if cluster represented by a proto
00035   unsigned SampleCount:30;       // number of samples in this cluster
00036   struct sample *Left;           // ptr to left sub-cluster
00037   struct sample *Right;          // ptr to right sub-cluster
00038   inT32 CharID;                  // identifier of char sample came from
00039   FLOAT32 Mean[1];               // mean of cluster - SampleSize floats
00040 } CLUSTER;
00041 
00042 typedef CLUSTER SAMPLE;          // can refer to as either sample or cluster
00043 
00044 typedef enum {
00045   spherical, elliptical, mixed, automatic
00046 } PROTOSTYLE;
00047 
00048 typedef struct {                 // parameters to control clustering
00049   PROTOSTYLE ProtoStyle;         // specifies types of protos to be made
00050   FLOAT32 MinSamples;            // min # of samples per proto - % of total
00051   FLOAT32 MaxIllegal;            // max percentage of samples in a cluster which have
00052   // more than 1 feature in that cluster
00053   FLOAT32 Independence;          // desired independence between dimensions
00054   FLOAT64 Confidence;            // desired confidence in prototypes created
00055   int MagicSamples;              // Ideal number of samples in a cluster.
00056 } CLUSTERCONFIG;
00057 
00058 typedef enum {
00059   normal, uniform, D_random, DISTRIBUTION_COUNT
00060 } DISTRIBUTION;
00061 
00062 typedef union {
00063   FLOAT32 Spherical;
00064   FLOAT32 *Elliptical;
00065 } FLOATUNION;
00066 
00067 typedef struct {
00068   unsigned Significant:1;        // TRUE if prototype is significant
00069   unsigned Merged:1;             // Merged after clustering so do not output
00070                                  // but kept for display purposes. If it has no
00071                                  // samples then it was actually merged.
00072                                  // Otherwise it matched an already significant
00073                                  // cluster.
00074   unsigned Style:2;              // spherical, elliptical, or mixed
00075   unsigned NumSamples:28;        // number of samples in the cluster
00076   CLUSTER *Cluster;              // ptr to cluster which made prototype
00077   DISTRIBUTION *Distrib;         // different distribution for each dimension
00078   FLOAT32 *Mean;                 // prototype mean
00079   FLOAT32 TotalMagnitude;        // total magnitude over all dimensions
00080   FLOAT32 LogMagnitude;          // log base e of TotalMagnitude
00081   FLOATUNION Variance;           // prototype variance
00082   FLOATUNION Magnitude;          // magnitude of density function
00083   FLOATUNION Weight;             // weight of density function
00084 } PROTOTYPE;
00085 
00086 typedef struct {
00087   inT16 SampleSize;              // number of parameters per sample
00088   PARAM_DESC *ParamDesc;         // description of each parameter
00089   inT32 NumberOfSamples;         // total number of samples being clustered
00090   KDTREE *KDTree;                // for optimal nearest neighbor searching
00091   CLUSTER *Root;                 // ptr to root cluster of cluster tree
00092   LIST ProtoList;                // list of prototypes
00093   inT32 NumChar;                 // # of characters represented by samples
00094   // cache of reusable histograms by distribution type and number of buckets.
00095   BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS];
00096 } CLUSTERER;
00097 
00098 typedef struct {
00099   inT32 NumSamples;              // number of samples in list
00100   inT32 MaxNumSamples;           // maximum size of list
00101   SAMPLE *Sample[1];             // array of ptrs to sample data structures
00102 } SAMPLELIST;
00103 
00104 // low level cluster tree analysis routines.
00105 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
00106 
00107 /*--------------------------------------------------------------------------
00108         Public Function Prototypes
00109 --------------------------------------------------------------------------*/
00110 CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]);
00111 
00112 SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID);
00113 
00114 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
00115 
00116 void FreeClusterer(CLUSTERER *Clusterer);
00117 
00118 void FreeProtoList(LIST *ProtoList);
00119 
00120 void FreePrototype(void *arg);  // PROTOTYPE *Prototype);
00121 
00122 CLUSTER *NextSample(LIST *SearchState);
00123 
00124 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension);
00125 
00126 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension);
00127 
00128 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2,
00129                     FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);
00130 
00131 //--------------Global Data Definitions and Declarations---------------------------
00132 // define errors that can be trapped
00133 #define ALREADYCLUSTERED  4000
00134 #endif