Tesseract
3.02
|
00001 /****************************************************************************** 00002 ** Filename: cluster.h 00003 ** Purpose: Definition of feature space clustering routines 00004 ** Author: Dan Johnson 00005 ** History: 5/29/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 #ifndef CLUSTER_H 00019 #define CLUSTER_H 00020 00021 #include "kdtree.h" 00022 #include "oldlist.h" 00023 00024 struct BUCKETS; 00025 00026 #define MINBUCKETS 5 00027 #define MAXBUCKETS 39 00028 00029 /*---------------------------------------------------------------------- 00030 Types 00031 ----------------------------------------------------------------------*/ 00032 typedef struct sample { 00033 unsigned Clustered:1; // TRUE if included in a higher cluster 00034 unsigned Prototype:1; // TRUE if cluster represented by a proto 00035 unsigned SampleCount:30; // number of samples in this cluster 00036 struct sample *Left; // ptr to left sub-cluster 00037 struct sample *Right; // ptr to right sub-cluster 00038 inT32 CharID; // identifier of char sample came from 00039 FLOAT32 Mean[1]; // mean of cluster - SampleSize floats 00040 } CLUSTER; 00041 00042 typedef CLUSTER SAMPLE; // can refer to as either sample or cluster 00043 00044 typedef enum { 00045 spherical, elliptical, mixed, automatic 00046 } PROTOSTYLE; 00047 00048 typedef struct { // parameters to control clustering 00049 PROTOSTYLE ProtoStyle; // specifies types of protos to be made 00050 FLOAT32 MinSamples; // min # of samples per proto - % of total 00051 FLOAT32 MaxIllegal; // max percentage of samples in a cluster which have 00052 // more than 1 feature in that cluster 00053 FLOAT32 Independence; // desired independence between dimensions 00054 FLOAT64 Confidence; // desired confidence in prototypes created 00055 int MagicSamples; // Ideal number of samples in a cluster. 00056 } CLUSTERCONFIG; 00057 00058 typedef enum { 00059 normal, uniform, D_random, DISTRIBUTION_COUNT 00060 } DISTRIBUTION; 00061 00062 typedef union { 00063 FLOAT32 Spherical; 00064 FLOAT32 *Elliptical; 00065 } FLOATUNION; 00066 00067 typedef struct { 00068 unsigned Significant:1; // TRUE if prototype is significant 00069 unsigned Merged:1; // Merged after clustering so do not output 00070 // but kept for display purposes. If it has no 00071 // samples then it was actually merged. 00072 // Otherwise it matched an already significant 00073 // cluster. 00074 unsigned Style:2; // spherical, elliptical, or mixed 00075 unsigned NumSamples:28; // number of samples in the cluster 00076 CLUSTER *Cluster; // ptr to cluster which made prototype 00077 DISTRIBUTION *Distrib; // different distribution for each dimension 00078 FLOAT32 *Mean; // prototype mean 00079 FLOAT32 TotalMagnitude; // total magnitude over all dimensions 00080 FLOAT32 LogMagnitude; // log base e of TotalMagnitude 00081 FLOATUNION Variance; // prototype variance 00082 FLOATUNION Magnitude; // magnitude of density function 00083 FLOATUNION Weight; // weight of density function 00084 } PROTOTYPE; 00085 00086 typedef struct { 00087 inT16 SampleSize; // number of parameters per sample 00088 PARAM_DESC *ParamDesc; // description of each parameter 00089 inT32 NumberOfSamples; // total number of samples being clustered 00090 KDTREE *KDTree; // for optimal nearest neighbor searching 00091 CLUSTER *Root; // ptr to root cluster of cluster tree 00092 LIST ProtoList; // list of prototypes 00093 inT32 NumChar; // # of characters represented by samples 00094 // cache of reusable histograms by distribution type and number of buckets. 00095 BUCKETS* bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS + 1 - MINBUCKETS]; 00096 } CLUSTERER; 00097 00098 typedef struct { 00099 inT32 NumSamples; // number of samples in list 00100 inT32 MaxNumSamples; // maximum size of list 00101 SAMPLE *Sample[1]; // array of ptrs to sample data structures 00102 } SAMPLELIST; 00103 00104 // low level cluster tree analysis routines. 00105 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C)))) 00106 00107 /*-------------------------------------------------------------------------- 00108 Public Function Prototypes 00109 --------------------------------------------------------------------------*/ 00110 CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]); 00111 00112 SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID); 00113 00114 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 00115 00116 void FreeClusterer(CLUSTERER *Clusterer); 00117 00118 void FreeProtoList(LIST *ProtoList); 00119 00120 void FreePrototype(void *arg); // PROTOTYPE *Prototype); 00121 00122 CLUSTER *NextSample(LIST *SearchState); 00123 00124 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension); 00125 00126 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension); 00127 00128 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, 00129 FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]); 00130 00131 //--------------Global Data Definitions and Declarations--------------------------- 00132 // define errors that can be trapped 00133 #define ALREADYCLUSTERED 4000 00134 #endif