/* Tesseract 3.02 -- cluster.c (documentation-dump extraction) */
/******************************************************************************
 ** Filename:    cluster.c
 ** Purpose:     Routines for clustering points in N-D space
 ** Author:      Dan Johnson
 ** History:     5/29/89, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
#include "oldheap.h"
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "helpers.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>

#define HOTELLING 1  // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10  // Size of FTable.
#define FTABLE_Y 100 // Size of FTable.

// Table of values approximating the cumulative F-distribution for a
// confidence of 1%.  Row index is (denominator degrees of freedom - 1),
// column index is (numerator degrees of freedom - 1).
const double FTable[FTABLE_Y][FTABLE_X] = {
 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
 {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
 {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
 {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
 {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
 {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
 {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
 {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
 {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
 {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
 { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
 { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
 { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
 { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
 { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
 { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
 { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
 { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
 { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
 { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
 { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
 { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
 { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
 { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
 { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
 { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
 { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
 { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
 { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
 { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
 { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
 { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
 { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
 { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
 { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
 { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
 { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
 { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
 { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
 { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
 { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
 { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
 { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
 { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
 { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
 { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
 { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
 { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
 { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
 { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
 { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
 { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
 { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
 { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
 { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
 { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
 { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
 { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
 { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
 { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
 { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
 { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
 { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
 { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
 { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
 { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
 { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
 { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
 { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
 { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
 { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
 { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
 { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
 { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
 { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
 { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
 { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
 { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
 { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
 { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
 { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
 { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
 { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
 { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
 { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
 { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
 { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
 { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
 { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
 { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
 { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
 { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
 { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
 { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
 { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
 { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
 { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
 { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
 { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
 { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
};

/* define the variance which will be used as a minimum variance for any
  dimension of any feature. Since most features are calculated from numbers
  with a precision no better than 1 in 128, the variance should never be
  less than the square of this number for parameters whose range is 1. */
#define MINVARIANCE     0.0004

/* define the absolute minimum number of samples which must be present in
  order to accurately test hypotheses about underlying probability
  distributions.  Define separately the minimum samples that are needed
  before a statistical analysis is attempted; this number should be
  equal to MINSAMPLES but can be set to a lower number for early testing
  when very few samples are available. */
#define MINSAMPLESPERBUCKET 5
#define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
#define MINSAMPLESNEEDED    1

/* define the size of the table which maps normalized samples to
  histogram buckets.  Also define the number of standard deviations
  in a normal distribution which are considered to be significant.
  The mapping table will be defined in such a way that it covers
  the specified number of standard deviations on either side of
  the mean.  BUCKETTABLESIZE should always be even. */
#define BUCKETTABLESIZE   1024
#define NORMALEXTENT    3.0

// Pair of clusters that are candidates for merging into one new cluster.
struct TEMPCLUSTER {
  CLUSTER *Cluster;
  CLUSTER *Neighbor;
};

// Per-cluster statistics computed by ComputeStatistics().
struct STATISTICS {
  FLOAT32 AvgVariance;
  FLOAT32 *CoVariance;
  FLOAT32 *Min;                  // largest negative distance from the mean
  FLOAT32 *Max;                  // largest positive distance from the mean
};

// Histogram used to test whether samples follow a given distribution.
struct BUCKETS {
  DISTRIBUTION Distribution;       // distribution being tested for
  uinT32 SampleCount;              // # of samples in histogram
  FLOAT64 Confidence;              // confidence level of test
  FLOAT64 ChiSquared;              // test threshold
  uinT16 NumberOfBuckets;          // number of cells in histogram
  uinT16 Bucket[BUCKETTABLESIZE];  // mapping to histogram buckets
  uinT32 *Count;                   // frequency of occurrence histogram
  FLOAT32 *ExpectedCount;          // expected histogram
};

// Cached chi-squared value for a (degrees of freedom, alpha) pair.
struct CHISTRUCT{
  uinT16 DegreesOfFreedom;
  FLOAT64 Alpha;
  FLOAT64 ChiSquared;
};

// For use with KDWalk / MakePotentialClusters
struct ClusteringContext {
  HEAP *heap;               // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates;  // array of potential clusters
  KDTREE *tree;             // kd-tree to be searched for neighbors
  inT32 next;               // next candidate to be used
};

typedef FLOAT64 (*DENSITYFUNC) (inT32);
typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);

#define Odd(N) ((N)%2)
#define Mirror(N,R) ((R) - (N) - 1)
#define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )

//--------------Global Data Definitions and Declarations----------------------
/* the following variables describe a discrete normal distribution
  which is used by NormalDensity() and NormalBucket().  The
  constant NORMALEXTENT determines how many standard
  deviations of the distribution are mapped onto the fixed
  discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
  deviations and x=BUCKETTABLESIZE is mapped to
  +NORMALEXTENT standard deviations. */
#define SqrtOf2Pi     2.506628275
static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
static const FLOAT64 kNormalVariance =
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
static const FLOAT64 kNormalMagnitude =
    (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
// NOTE(review): integer division is intentional here -- BUCKETTABLESIZE is
// even, so the result is exact.
static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;

/* define lookup tables used to compute the number of histogram buckets
  that should be used for a given number of samples. */
#define LOOKUPTABLESIZE   8
#define MAXDEGREESOFFREEDOM MAXBUCKETS

static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
  MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
};  // number of samples

static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
  MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
};  // number of buckets

/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
void CreateClusterTree(CLUSTERER *Clusterer);

void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
                           inT32 Level);

CLUSTER *FindNearestNeighbor(KDTREE *Tree,
                             CLUSTER *Cluster,
                             FLOAT32 *Distance);

CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);

inT32 MergeClusters (inT16 N,
                     register PARAM_DESC ParamDesc[],
                     register inT32 n1,
                     register inT32 n2,
                     register FLOAT32 m[],
                     register FLOAT32 m1[], register FLOAT32 m2[]);

void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
                         CLUSTERCONFIG *Config,
                         CLUSTER *Cluster);

PROTOTYPE *MakeDegenerateProto(uinT16 N,
                               CLUSTER *Cluster,
                               STATISTICS *Statistics,
                               PROTOSTYLE Style,
                               inT32 MinSamples);

PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
                               CLUSTERCONFIG *Config,
                               CLUSTER *Cluster,
                               STATISTICS *Statistics);

PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
                              CLUSTER *Cluster,
                              STATISTICS *Statistics,
                              BUCKETS *Buckets);

PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
                               CLUSTER *Cluster,
                               STATISTICS *Statistics,
                               BUCKETS *Buckets);

PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
                          CLUSTER *Cluster,
                          STATISTICS *Statistics,
                          BUCKETS *NormalBuckets,
                          FLOAT64 Confidence);

void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);

STATISTICS *ComputeStatistics (inT16 N,
                               PARAM_DESC ParamDesc[], CLUSTER * Cluster);

PROTOTYPE *NewSphericalProto(uinT16 N,
                             CLUSTER *Cluster,
                             STATISTICS *Statistics);

PROTOTYPE *NewEllipticalProto(inT16 N,
                              CLUSTER *Cluster,
                              STATISTICS *Statistics);

PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);

PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);

BOOL8 Independent (PARAM_DESC ParamDesc[],
                   inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);

BUCKETS *GetBuckets(CLUSTERER* clusterer,
                    DISTRIBUTION Distribution,
                    uinT32 SampleCount,
                    FLOAT64 Confidence);

BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
                     uinT32 SampleCount,
                     FLOAT64 Confidence);

uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);

FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);

FLOAT64 NormalDensity(inT32 x);
FLOAT64 UniformDensity(inT32 x); 00327 00328 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx); 00329 00330 void FillBuckets(BUCKETS *Buckets, 00331 CLUSTER *Cluster, 00332 uinT16 Dim, 00333 PARAM_DESC *ParamDesc, 00334 FLOAT32 Mean, 00335 FLOAT32 StdDev); 00336 00337 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 00338 FLOAT32 x, 00339 FLOAT32 Mean, 00340 FLOAT32 StdDev); 00341 00342 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 00343 FLOAT32 x, 00344 FLOAT32 Mean, 00345 FLOAT32 StdDev); 00346 00347 BOOL8 DistributionOK(BUCKETS *Buckets); 00348 00349 void FreeStatistics(STATISTICS *Statistics); 00350 00351 void FreeBuckets(BUCKETS *Buckets); 00352 00353 void FreeCluster(CLUSTER *Cluster); 00354 00355 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets); 00356 00357 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 00358 void *arg2); // uinT16 *DesiredNumberOfBuckets); 00359 00360 int ListEntryMatch(void *arg1, void *arg2); 00361 00362 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount); 00363 00364 void InitBuckets(BUCKETS *Buckets); 00365 00366 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, 00367 void *arg2); // CHISTRUCT *SearchKey); 00368 00369 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha); 00370 00371 FLOAT64 Solve(SOLVEFUNC Function, 00372 void *FunctionParams, 00373 FLOAT64 InitialGuess, 00374 FLOAT64 Accuracy); 00375 00376 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x); 00377 00378 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, 00379 CLUSTER *Cluster, 00380 FLOAT32 MaxIllegal); 00381 00382 double InvertMatrix(const float* input, int size, float* inv); 00383 00384 //--------------------------Public Code-------------------------------------- 00394 CLUSTERER * 00395 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { 00396 CLUSTERER *Clusterer; 00397 int i; 00398 00399 // allocate main clusterer data structure and init simple fields 00400 Clusterer = (CLUSTERER *) Emalloc (sizeof 
(CLUSTERER)); 00401 Clusterer->SampleSize = SampleSize; 00402 Clusterer->NumberOfSamples = 0; 00403 Clusterer->NumChar = 0; 00404 00405 // init fields which will not be used initially 00406 Clusterer->Root = NULL; 00407 Clusterer->ProtoList = NIL_LIST; 00408 00409 // maintain a copy of param descriptors in the clusterer data structure 00410 Clusterer->ParamDesc = 00411 (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC)); 00412 for (i = 0; i < SampleSize; i++) { 00413 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; 00414 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; 00415 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; 00416 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; 00417 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; 00418 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; 00419 Clusterer->ParamDesc[i].MidRange = 00420 (ParamDesc[i].Max + ParamDesc[i].Min) / 2; 00421 } 00422 00423 // allocate a kd tree to hold the samples 00424 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc); 00425 00426 // Initialize cache of histogram buckets to minimize recomputing them. 
00427 for (int d = 0; d < DISTRIBUTION_COUNT; ++d) { 00428 for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c) 00429 Clusterer->bucket_cache[d][c] = NULL; 00430 } 00431 00432 return Clusterer; 00433 } // MakeClusterer 00434 00435 00450 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, 00451 inT32 CharID) { 00452 SAMPLE *Sample; 00453 int i; 00454 00455 // see if the samples have already been clustered - if so trap an error 00456 if (Clusterer->Root != NULL) 00457 DoError (ALREADYCLUSTERED, 00458 "Can't add samples after they have been clustered"); 00459 00460 // allocate the new sample and initialize it 00461 Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) + 00462 (Clusterer->SampleSize - 00463 1) * sizeof (FLOAT32)); 00464 Sample->Clustered = FALSE; 00465 Sample->Prototype = FALSE; 00466 Sample->SampleCount = 1; 00467 Sample->Left = NULL; 00468 Sample->Right = NULL; 00469 Sample->CharID = CharID; 00470 00471 for (i = 0; i < Clusterer->SampleSize; i++) 00472 Sample->Mean[i] = Feature[i]; 00473 00474 // add the sample to the KD tree - keep track of the total # of samples 00475 Clusterer->NumberOfSamples++; 00476 KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample); 00477 if (CharID >= Clusterer->NumChar) 00478 Clusterer->NumChar = CharID + 1; 00479 00480 // execute hook for monitoring clustering operation 00481 // (*SampleCreationHook)( Sample ); 00482 00483 return (Sample); 00484 } // MakeSample 00485 00486 00504 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00505 //only create cluster tree if samples have never been clustered before 00506 if (Clusterer->Root == NULL) 00507 CreateClusterTree(Clusterer); 00508 00509 //deallocate the old prototype list if one exists 00510 FreeProtoList (&Clusterer->ProtoList); 00511 Clusterer->ProtoList = NIL_LIST; 00512 00513 //compute prototypes starting at the root node in the tree 00514 ComputePrototypes(Clusterer, Config); 00515 return (Clusterer->ProtoList); 00516 } // ClusterSamples 
/**
 * Frees the memory owned by the clusterer: the param descriptors, the
 * kd-tree (if samples were never clustered), the cluster tree (if they
 * were), the cached bucket structures, and the clusterer itself.
 * NOTE(review): ProtoList is not released here -- callers appear to be
 * responsible for it via FreeProtoList().
 */
void FreeClusterer(CLUSTERER *Clusterer) {
  if (Clusterer != NULL) {
    memfree (Clusterer->ParamDesc);
    if (Clusterer->KDTree != NULL)
      FreeKDTree (Clusterer->KDTree);
    if (Clusterer->Root != NULL)
      FreeCluster (Clusterer->Root);
    // Free up all used buckets structures.
    for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
      for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
        if (Clusterer->bucket_cache[d][c] != NULL)
          FreeBuckets(Clusterer->bucket_cache[d][c]);
    }

    memfree(Clusterer);
  }
}                                // FreeClusterer

/**
 * Releases every prototype on ProtoList (via FreePrototype) along with
 * the list nodes themselves.
 */
void FreeProtoList(LIST *ProtoList) {
  destroy_nodes(*ProtoList, FreePrototype);
}                                // FreeProtoList

/**
 * Frees a single prototype and its owned arrays.  Also unmarks the
 * cluster the prototype was made from, so the cluster can be reused.
 */
void FreePrototype(void *arg) {  //PROTOTYPE *Prototype)
  PROTOTYPE *Prototype = (PROTOTYPE *) arg;

  // unmark the corresponding cluster (if there is one)
  if (Prototype->Cluster != NULL)
    Prototype->Cluster->Prototype = FALSE;

  // deallocate the prototype statistics and then the prototype itself
  if (Prototype->Distrib != NULL)
    memfree (Prototype->Distrib);
  if (Prototype->Mean != NULL)
    memfree (Prototype->Mean);
  // spherical protos store scalars in the unions; only the other styles
  // own heap-allocated per-dimension arrays
  if (Prototype->Style != spherical) {
    if (Prototype->Variance.Elliptical != NULL)
      memfree (Prototype->Variance.Elliptical);
    if (Prototype->Magnitude.Elliptical != NULL)
      memfree (Prototype->Magnitude.Elliptical);
    if (Prototype->Weight.Elliptical != NULL)
      memfree (Prototype->Weight.Elliptical);
  }
  memfree(Prototype);
}                                // FreePrototype

/**
 * Iterator over the samples (leaves) of a cluster tree.  SearchState is
 * an explicit stack of pending subtrees; initialize it with a list
 * containing the root cluster, then call repeatedly until NULL is
 * returned.  A node with Left == NULL is a leaf, i.e. an original sample.
 */
CLUSTER *NextSample(LIST *SearchState) {
  CLUSTER *Cluster;

  if (*SearchState == NIL_LIST)
    return (NULL);
  Cluster = (CLUSTER *) first_node (*SearchState);
  *SearchState = pop (*SearchState);
  while (TRUE) {
    if (Cluster->Left == NULL)
      return (Cluster);
    // descend leftwards, remembering the right subtree for later
    *SearchState = push (*SearchState, Cluster->Right);
    Cluster = Cluster->Left;
  }
}                                // NextSample

/** Returns the mean of the specified prototype in the given dimension. */
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
  return (Proto->Mean[Dimension]);
}                                // Mean

/**
 * Returns the standard deviation of the prototype in the given
 * dimension.  For mixed protos with uniform/random distributions the
 * Variance field is returned as-is (it is not a true variance there).
 */
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
  switch (Proto->Style) {
    case spherical:
      return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
    case elliptical:
      return ((FLOAT32)
        sqrt ((double) Proto->Variance.Elliptical[Dimension]));
    case mixed:
      switch (Proto->Distrib[Dimension]) {
        case normal:
          return ((FLOAT32)
            sqrt ((double) Proto->Variance.Elliptical[Dimension]));
        case uniform:
        case D_random:
          return (Proto->Variance.Elliptical[Dimension]);
        case DISTRIBUTION_COUNT:
          ASSERT_HOST(!"Distribution count not allowed!");
      }
  }
  return 0.0f;
}                                // StandardDeviation


/*---------------------------------------------------------------------------
            Private Code
----------------------------------------------------------------------------*/
/**
 * Performs bottom-up agglomerative clustering: repeatedly merges the
 * two closest unclustered clusters until only the root remains.  On
 * return Clusterer->Root is the root of the cluster tree and the
 * kd-tree has been freed.
 */
void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  HEAPENTRY HeapEntry;
  TEMPCLUSTER *PotentialCluster;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = (TEMPCLUSTER *)
    Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
  context.next = 0;
  context.heap = MakeHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) {
    PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data;

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                            &HeapEntry.Key);
      if (PotentialCluster->Neighbor != NULL) {
        HeapStore(context.heap, &HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster =
        MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                            &HeapEntry.Key);
      if (PotentialCluster->Neighbor != NULL) {
        HeapStore(context.heap, &HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  FreeKDTree(context.tree);
  Clusterer->KDTree = NULL;
  FreeHeap(context.heap);
  memfree(context.candidates);
}                                // CreateClusterTree

/**
 * KDWalk callback: builds a potential cluster for Cluster from its
 * nearest neighbor and stores it in the heap keyed by distance.
 * Clusters with no neighbor are skipped (candidate slot is reused).
 */
void MakePotentialClusters(ClusteringContext *context,
                           CLUSTER *Cluster, inT32 Level) {
  HEAPENTRY HeapEntry;
  int next = context->next;
  context->candidates[next].Cluster = Cluster;
  HeapEntry.Data = (char *) &(context->candidates[next]);
  context->candidates[next].Neighbor =
    FindNearestNeighbor(context->tree,
                        context->candidates[next].Cluster,
                        &HeapEntry.Key);
  if (context->candidates[next].Neighbor != NULL) {
    HeapStore(context->heap, &HeapEntry);
    context->next++;
  }
}                                // MakePotentialClusters

/**
 * Returns the cluster in Tree nearest to Cluster (excluding Cluster
 * itself), writing its distance to *Distance, or NULL if there is none.
 * Searches for 2 neighbors because the search can return Cluster itself.
 */
CLUSTER *
FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
#define MAXNEIGHBORS  2
#define MAXDISTANCE   MAX_FLOAT32
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  FLOAT32 Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  inT32 i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,
                          &NumberOfNeighbors, (void **)Neighbor, Dist);

  // search for the nearest neighbor that is not the cluster itself
  *Distance = MAXDISTANCE;
  BestNeighbor = NULL;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
}                                // FindNearestNeighbor

/**
 * Merges the pair in TempCluster into a new permanent cluster: the two
 * children are marked Clustered and removed from the kd-tree, and the
 * new parent (with merged mean and count) is inserted in their place.
 */
CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
  CLUSTER *Cluster;

  // allocate the new cluster and initialize it
  Cluster = (CLUSTER *) Emalloc(
      sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
  Cluster->Clustered = FALSE;
  Cluster->Prototype = FALSE;
  Cluster->Left = TempCluster->Cluster;
  Cluster->Right = TempCluster->Neighbor;
  Cluster->CharID = -1;

  // mark the old clusters as "clustered" and delete them from the kd-tree
  Cluster->Left->Clustered = TRUE;
  Cluster->Right->Clustered = TRUE;
  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);

  // compute the mean and sample count for the new cluster
  Cluster->SampleCount =
    MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
                  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
                  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);

  // add the new cluster to the KD tree
  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
  return Cluster;
}                                // MakeNewCluster

/**
 * Computes m[], the weighted mean of the n1-sample mean m1[] and the
 * n2-sample mean m2[] over N dimensions, handling wrap-around for
 * circular dimensions.  Returns the total sample count n1 + n2.
 */
inT32 MergeClusters(inT16 N,
                    PARAM_DESC ParamDesc[],
                    inT32 n1,
                    inT32 n2,
                    FLOAT32 m[],
                    FLOAT32 m1[], FLOAT32 m2[]) {
  inT32 i, n;

  n = n1 + n2;
  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
    if (ParamDesc->Circular) {
      // if distance between means is greater than allowed
      // reduce upper point by one "rotation" to compute mean
      // then normalize the mean back into the accepted range
      if ((*m2 - *m1) > ParamDesc->HalfRange) {
        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
        if (*m < ParamDesc->Min)
          *m += ParamDesc->Range;
      }
      else if ((*m1 - *m2) > ParamDesc->HalfRange) {
        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
        if (*m < ParamDesc->Min)
          *m += ParamDesc->Range;
      }
      else
        *m = (n1 * *m1 + n2 * *m2) / n;
    }
    else
      *m = (n1 * *m1 + n2 * *m2) / n;
  }
  return n;
}                                // MergeClusters

/**
 * Walks the cluster tree top-down, turning each cluster into a
 * prototype where possible (MakePrototype) and splitting into the two
 * child clusters where not.  Successful prototypes are pushed onto
 * Clusterer->ProtoList.
 */
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  LIST ClusterStack = NIL_LIST;
  CLUSTER *Cluster;
  PROTOTYPE *Prototype;

  // use a stack to keep track of clusters waiting to be processed
  // initially the only cluster on the stack is the root cluster
  if (Clusterer->Root != NULL)
    ClusterStack = push (NIL_LIST, Clusterer->Root);

  // loop until we have analyzed all clusters which are potential prototypes
  while (ClusterStack != NIL_LIST) {
    // remove the next cluster to be analyzed from the stack
    // try to make a prototype from the cluster
    // if successful, put it on the proto list, else split the cluster
    Cluster = (CLUSTER *) first_node (ClusterStack);
    ClusterStack = pop (ClusterStack);
    Prototype = MakePrototype(Clusterer, Config, Cluster);
    if (Prototype != NULL) {
      Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
    }
    else {
      ClusterStack = push (ClusterStack, Cluster->Right);
      ClusterStack = push (ClusterStack, Cluster->Left);
    }
  }
}                                // ComputePrototypes

/**
 * Attempts to create a prototype describing Cluster, in the style
 * requested by Config.  Returns NULL if the cluster should be split
 * instead (mixed characters, dependent dimensions, or no style fits).
 */
PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
                         CLUSTERCONFIG *Config,
                         CLUSTER *Cluster) {
  STATISTICS *Statistics;
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
    return NULL;

  // compute the covariance matrix and ranges for the cluster
  Statistics =
    ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(
      Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
      (inT32) (Config->MinSamples * Clusterer->NumChar));
  if (Proto != NULL) {
    FreeStatistics(Statistics);
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
                   Statistics->CoVariance, Config->Independence)) {
    FreeStatistics(Statistics);
    return NULL;
  }

  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != NULL) {
      FreeStatistics(Statistics);
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
                       Config->Confidence);

  // create a prototype based on the statistics and test it
01019 switch (Config->ProtoStyle) { 01020 case spherical: 01021 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01022 break; 01023 case elliptical: 01024 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01025 break; 01026 case mixed: 01027 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01028 Config->Confidence); 01029 break; 01030 case automatic: 01031 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01032 if (Proto != NULL) 01033 break; 01034 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01035 if (Proto != NULL) 01036 break; 01037 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01038 Config->Confidence); 01039 break; 01040 } 01041 FreeStatistics(Statistics); 01042 return Proto; 01043 } // MakePrototype 01044 01045 01067 PROTOTYPE *MakeDegenerateProto( //this was MinSample 01068 uinT16 N, 01069 CLUSTER *Cluster, 01070 STATISTICS *Statistics, 01071 PROTOSTYLE Style, 01072 inT32 MinSamples) { 01073 PROTOTYPE *Proto = NULL; 01074 01075 if (MinSamples < MINSAMPLESNEEDED) 01076 MinSamples = MINSAMPLESNEEDED; 01077 01078 if (Cluster->SampleCount < MinSamples) { 01079 switch (Style) { 01080 case spherical: 01081 Proto = NewSphericalProto (N, Cluster, Statistics); 01082 break; 01083 case elliptical: 01084 case automatic: 01085 Proto = NewEllipticalProto (N, Cluster, Statistics); 01086 break; 01087 case mixed: 01088 Proto = NewMixedProto (N, Cluster, Statistics); 01089 break; 01090 } 01091 Proto->Significant = FALSE; 01092 } 01093 return (Proto); 01094 } // MakeDegenerateProto 01095 01109 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, 01110 CLUSTERCONFIG *Config, 01111 CLUSTER *Cluster, 01112 STATISTICS *Statistics) { 01113 // Fraction of the number of samples used as a range around 1 within 01114 // which a cluster has the magic size that allows a boost to the 01115 // FTable by kFTableBoostMargin, thus allowing clusters near the 01116 // magic size 
(equal to the number of sample characters) to be more 01117 // likely to stay together. 01118 const double kMagicSampleMargin = 0.0625; 01119 const double kFTableBoostMargin = 2.0; 01120 01121 int N = Clusterer->SampleSize; 01122 CLUSTER* Left = Cluster->Left; 01123 CLUSTER* Right = Cluster->Right; 01124 if (Left == NULL || Right == NULL) 01125 return NULL; 01126 int TotalDims = Left->SampleCount + Right->SampleCount; 01127 if (TotalDims < N + 1 || TotalDims < 2) 01128 return NULL; 01129 const int kMatrixSize = N * N * sizeof(FLOAT32); 01130 FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01131 FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01132 FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32))); 01133 // Compute a new covariance matrix that only uses essential features. 01134 for (int i = 0; i < N; ++i) { 01135 int row_offset = i * N; 01136 if (!Clusterer->ParamDesc[i].NonEssential) { 01137 for (int j = 0; j < N; ++j) { 01138 if (!Clusterer->ParamDesc[j].NonEssential) 01139 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset]; 01140 else 01141 Covariance[j + row_offset] = 0.0f; 01142 } 01143 } else { 01144 for (int j = 0; j < N; ++j) { 01145 if (i == j) 01146 Covariance[j + row_offset] = 1.0f; 01147 else 01148 Covariance[j + row_offset] = 0.0f; 01149 } 01150 } 01151 } 01152 double err = InvertMatrix(Covariance, N, Inverse); 01153 if (err > 1) { 01154 tprintf("Clustering error: Matrix inverse failed with error %g\n", err); 01155 } 01156 int EssentialN = 0; 01157 for (int dim = 0; dim < N; ++dim) { 01158 if (!Clusterer->ParamDesc[dim].NonEssential) { 01159 Delta[dim] = Left->Mean[dim] - Right->Mean[dim]; 01160 ++EssentialN; 01161 } else { 01162 Delta[dim] = 0.0f; 01163 } 01164 } 01165 // Compute Hotelling's T-squared. 
01166 double Tsq = 0.0; 01167 for (int x = 0; x < N; ++x) { 01168 double temp = 0.0; 01169 for (int y = 0; y < N; ++y) { 01170 temp += Inverse[y + N*x] * Delta[y]; 01171 } 01172 Tsq += Delta[x] * temp; 01173 } 01174 memfree(Covariance); 01175 memfree(Inverse); 01176 memfree(Delta); 01177 // Changed this function to match the formula in 01178 // Statistical Methods in Medical Research p 473 01179 // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews. 01180 // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims; 01181 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN); 01182 int Fx = EssentialN; 01183 if (Fx > FTABLE_X) 01184 Fx = FTABLE_X; 01185 --Fx; 01186 int Fy = TotalDims - EssentialN - 1; 01187 if (Fy > FTABLE_Y) 01188 Fy = FTABLE_Y; 01189 --Fy; 01190 double FTarget = FTable[Fy][Fx]; 01191 if (Config->MagicSamples > 0 && 01192 TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) && 01193 TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) { 01194 // Give magic-sized clusters a magic FTable boost. 01195 FTarget += kFTableBoostMargin; 01196 } 01197 if (F < FTarget) { 01198 return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01199 } 01200 return NULL; 01201 } 01202 01203 /* MakeSphericalProto ******************************************************* 01204 Parameters: Clusterer data struct containing samples being clustered 01205 Cluster cluster to be made into a spherical prototype 01206 Statistics statistical info about cluster 01207 Buckets histogram struct used to analyze distribution 01208 Operation: This routine tests the specified cluster to see if it can 01209 be approximated by a spherical normal distribution. If it 01210 can be, then a new prototype is formed and returned to the 01211 caller. If it can't be, then NULL is returned to the caller. 01212 Return: Pointer to new spherical prototype or NULL. 01213 Exceptions: None 01214 History: 6/1/89, DSJ, Created. 
01215 ******************************************************************************/ 01216 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, 01217 CLUSTER *Cluster, 01218 STATISTICS *Statistics, 01219 BUCKETS *Buckets) { 01220 PROTOTYPE *Proto = NULL; 01221 int i; 01222 01223 // check that each dimension is a normal distribution 01224 for (i = 0; i < Clusterer->SampleSize; i++) { 01225 if (Clusterer->ParamDesc[i].NonEssential) 01226 continue; 01227 01228 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01229 Cluster->Mean[i], 01230 sqrt ((FLOAT64) (Statistics->AvgVariance))); 01231 if (!DistributionOK (Buckets)) 01232 break; 01233 } 01234 // if all dimensions matched a normal distribution, make a proto 01235 if (i >= Clusterer->SampleSize) 01236 Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics); 01237 return (Proto); 01238 } // MakeSphericalProto 01239 01240 01254 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, 01255 CLUSTER *Cluster, 01256 STATISTICS *Statistics, 01257 BUCKETS *Buckets) { 01258 PROTOTYPE *Proto = NULL; 01259 int i; 01260 01261 // check that each dimension is a normal distribution 01262 for (i = 0; i < Clusterer->SampleSize; i++) { 01263 if (Clusterer->ParamDesc[i].NonEssential) 01264 continue; 01265 01266 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01267 Cluster->Mean[i], 01268 sqrt ((FLOAT64) Statistics-> 01269 CoVariance[i * (Clusterer->SampleSize + 1)])); 01270 if (!DistributionOK (Buckets)) 01271 break; 01272 } 01273 // if all dimensions matched a normal distribution, make a proto 01274 if (i >= Clusterer->SampleSize) 01275 Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01276 return (Proto); 01277 } // MakeEllipticalProto 01278 01279 01298 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, 01299 CLUSTER *Cluster, 01300 STATISTICS *Statistics, 01301 BUCKETS *NormalBuckets, 01302 FLOAT64 Confidence) { 01303 PROTOTYPE *Proto; 01304 int i; 01305 BUCKETS 
*UniformBuckets = NULL; 01306 BUCKETS *RandomBuckets = NULL; 01307 01308 // create a mixed proto to work on - initially assume all dimensions normal*/ 01309 Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics); 01310 01311 // find the proper distribution for each dimension 01312 for (i = 0; i < Clusterer->SampleSize; i++) { 01313 if (Clusterer->ParamDesc[i].NonEssential) 01314 continue; 01315 01316 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01317 Proto->Mean[i], 01318 sqrt ((FLOAT64) Proto->Variance.Elliptical[i])); 01319 if (DistributionOK (NormalBuckets)) 01320 continue; 01321 01322 if (RandomBuckets == NULL) 01323 RandomBuckets = 01324 GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence); 01325 MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i])); 01326 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01327 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01328 if (DistributionOK (RandomBuckets)) 01329 continue; 01330 01331 if (UniformBuckets == NULL) 01332 UniformBuckets = 01333 GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence); 01334 MakeDimUniform(i, Proto, Statistics); 01335 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01336 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01337 if (DistributionOK (UniformBuckets)) 01338 continue; 01339 break; 01340 } 01341 // if any dimension failed to match a distribution, discard the proto 01342 if (i < Clusterer->SampleSize) { 01343 FreePrototype(Proto); 01344 Proto = NULL; 01345 } 01346 return (Proto); 01347 } // MakeMixedProto 01348 01349 01350 /* MakeDimRandom ************************************************************* 01351 Parameters: i index of dimension to be changed 01352 Proto prototype whose dimension is to be altered 01353 ParamDesc description of specified dimension 01354 Operation: This routine alters the ith dimension of the specified 01355 mixed prototype to be D_random. 
01356 Return: None 01357 Exceptions: None 01358 History: 6/20/89, DSJ, Created. 01359 ******************************************************************************/ 01360 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { 01361 Proto->Distrib[i] = D_random; 01362 Proto->Mean[i] = ParamDesc->MidRange; 01363 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; 01364 01365 // subtract out the previous magnitude of this dimension from the total 01366 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01367 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; 01368 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01369 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01370 01371 // note that the proto Weight is irrelevant for D_random protos 01372 } // MakeDimRandom 01373 01374 01385 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { 01386 Proto->Distrib[i] = uniform; 01387 Proto->Mean[i] = Proto->Cluster->Mean[i] + 01388 (Statistics->Min[i] + Statistics->Max[i]) / 2; 01389 Proto->Variance.Elliptical[i] = 01390 (Statistics->Max[i] - Statistics->Min[i]) / 2; 01391 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01392 Proto->Variance.Elliptical[i] = MINVARIANCE; 01393 01394 // subtract out the previous magnitude of this dimension from the total 01395 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01396 Proto->Magnitude.Elliptical[i] = 01397 1.0 / (2.0 * Proto->Variance.Elliptical[i]); 01398 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01399 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01400 01401 // note that the proto Weight is irrelevant for uniform protos 01402 } // MakeDimUniform 01403 01404 01421 STATISTICS * 01422 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) { 01423 STATISTICS *Statistics; 01424 int i, j; 01425 FLOAT32 *CoVariance; 01426 FLOAT32 *Distance; 01427 LIST SearchState; 01428 SAMPLE *Sample; 01429 uinT32 
SampleCountAdjustedForBias; 01430 01431 // allocate memory to hold the statistics results 01432 Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS)); 01433 Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32)); 01434 Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01435 Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01436 01437 // allocate temporary memory to hold the sample to mean distances 01438 Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01439 01440 // initialize the statistics 01441 Statistics->AvgVariance = 1.0; 01442 CoVariance = Statistics->CoVariance; 01443 for (i = 0; i < N; i++) { 01444 Statistics->Min[i] = 0.0; 01445 Statistics->Max[i] = 0.0; 01446 for (j = 0; j < N; j++, CoVariance++) 01447 *CoVariance = 0; 01448 } 01449 // find each sample in the cluster and merge it into the statistics 01450 InitSampleSearch(SearchState, Cluster); 01451 while ((Sample = NextSample (&SearchState)) != NULL) { 01452 for (i = 0; i < N; i++) { 01453 Distance[i] = Sample->Mean[i] - Cluster->Mean[i]; 01454 if (ParamDesc[i].Circular) { 01455 if (Distance[i] > ParamDesc[i].HalfRange) 01456 Distance[i] -= ParamDesc[i].Range; 01457 if (Distance[i] < -ParamDesc[i].HalfRange) 01458 Distance[i] += ParamDesc[i].Range; 01459 } 01460 if (Distance[i] < Statistics->Min[i]) 01461 Statistics->Min[i] = Distance[i]; 01462 if (Distance[i] > Statistics->Max[i]) 01463 Statistics->Max[i] = Distance[i]; 01464 } 01465 CoVariance = Statistics->CoVariance; 01466 for (i = 0; i < N; i++) 01467 for (j = 0; j < N; j++, CoVariance++) 01468 *CoVariance += Distance[i] * Distance[j]; 01469 } 01470 // normalize the variances by the total number of samples 01471 // use SampleCount-1 instead of SampleCount to get an unbiased estimate 01472 // also compute the geometic mean of the diagonal variances 01473 // ensure that clusters with only 1 sample are handled correctly 01474 if (Cluster->SampleCount > 1) 01475 SampleCountAdjustedForBias = 
Cluster->SampleCount - 1; 01476 else 01477 SampleCountAdjustedForBias = 1; 01478 CoVariance = Statistics->CoVariance; 01479 for (i = 0; i < N; i++) 01480 for (j = 0; j < N; j++, CoVariance++) { 01481 *CoVariance /= SampleCountAdjustedForBias; 01482 if (j == i) { 01483 if (*CoVariance < MINVARIANCE) 01484 *CoVariance = MINVARIANCE; 01485 Statistics->AvgVariance *= *CoVariance; 01486 } 01487 } 01488 Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance, 01489 1.0 / N); 01490 01491 // release temporary memory and return 01492 memfree(Distance); 01493 return (Statistics); 01494 } // ComputeStatistics 01495 01496 01510 PROTOTYPE *NewSphericalProto(uinT16 N, 01511 CLUSTER *Cluster, 01512 STATISTICS *Statistics) { 01513 PROTOTYPE *Proto; 01514 01515 Proto = NewSimpleProto (N, Cluster); 01516 01517 Proto->Variance.Spherical = Statistics->AvgVariance; 01518 if (Proto->Variance.Spherical < MINVARIANCE) 01519 Proto->Variance.Spherical = MINVARIANCE; 01520 01521 Proto->Magnitude.Spherical = 01522 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical)); 01523 Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical, 01524 (double) N); 01525 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; 01526 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01527 01528 return (Proto); 01529 } // NewSphericalProto 01530 01531 01544 PROTOTYPE *NewEllipticalProto(inT16 N, 01545 CLUSTER *Cluster, 01546 STATISTICS *Statistics) { 01547 PROTOTYPE *Proto; 01548 FLOAT32 *CoVariance; 01549 int i; 01550 01551 Proto = NewSimpleProto (N, Cluster); 01552 Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01553 Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01554 Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01555 01556 CoVariance = Statistics->CoVariance; 01557 Proto->TotalMagnitude = 1.0; 01558 for (i = 0; i < N; i++, CoVariance += N + 1) { 01559 Proto->Variance.Elliptical[i] = 
*CoVariance; 01560 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01561 Proto->Variance.Elliptical[i] = MINVARIANCE; 01562 01563 Proto->Magnitude.Elliptical[i] = 01564 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i])); 01565 Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i]; 01566 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01567 } 01568 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01569 Proto->Style = elliptical; 01570 return (Proto); 01571 } // NewEllipticalProto 01572 01573 01589 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) { 01590 PROTOTYPE *Proto; 01591 int i; 01592 01593 Proto = NewEllipticalProto (N, Cluster, Statistics); 01594 Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION)); 01595 01596 for (i = 0; i < N; i++) { 01597 Proto->Distrib[i] = normal; 01598 } 01599 Proto->Style = mixed; 01600 return (Proto); 01601 } // NewMixedProto 01602 01603 01614 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { 01615 PROTOTYPE *Proto; 01616 int i; 01617 01618 Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE)); 01619 Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01620 01621 for (i = 0; i < N; i++) 01622 Proto->Mean[i] = Cluster->Mean[i]; 01623 Proto->Distrib = NULL; 01624 01625 Proto->Significant = TRUE; 01626 Proto->Merged = FALSE; 01627 Proto->Style = spherical; 01628 Proto->NumSamples = Cluster->SampleCount; 01629 Proto->Cluster = Cluster; 01630 Proto->Cluster->Prototype = TRUE; 01631 return (Proto); 01632 } // NewSimpleProto 01633 01634 01655 BOOL8 01656 Independent (PARAM_DESC ParamDesc[], 01657 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) { 01658 int i, j; 01659 FLOAT32 *VARii; // points to ith on-diagonal element 01660 FLOAT32 *VARjj; // points to jth on-diagonal element 01661 FLOAT32 CorrelationCoeff; 01662 01663 VARii = CoVariance; 01664 for (i = 0; i < N; i++, VARii += N + 1) { 01665 if (ParamDesc[i].NonEssential) 01666 continue; 
01667 01668 VARjj = VARii + N + 1; 01669 CoVariance = VARii + 1; 01670 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { 01671 if (ParamDesc[j].NonEssential) 01672 continue; 01673 01674 if ((*VARii == 0.0) || (*VARjj == 0.0)) 01675 CorrelationCoeff = 0.0; 01676 else 01677 CorrelationCoeff = 01678 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj))); 01679 if (CorrelationCoeff > Independence) 01680 return (FALSE); 01681 } 01682 } 01683 return (TRUE); 01684 } // Independent 01685 01686 01705 BUCKETS *GetBuckets(CLUSTERER* clusterer, 01706 DISTRIBUTION Distribution, 01707 uinT32 SampleCount, 01708 FLOAT64 Confidence) { 01709 // Get an old bucket structure with the same number of buckets. 01710 uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01711 BUCKETS *Buckets = 01712 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; 01713 01714 // If a matching bucket structure is not found, make one and save it. 01715 if (Buckets == NULL) { 01716 Buckets = MakeBuckets(Distribution, SampleCount, Confidence); 01717 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = 01718 Buckets; 01719 } else { 01720 // Just adjust the existing buckets. 
01721 if (SampleCount != Buckets->SampleCount) 01722 AdjustBuckets(Buckets, SampleCount); 01723 if (Confidence != Buckets->Confidence) { 01724 Buckets->Confidence = Confidence; 01725 Buckets->ChiSquared = ComputeChiSquared( 01726 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), 01727 Confidence); 01728 } 01729 InitBuckets(Buckets); 01730 } 01731 return Buckets; 01732 } // GetBuckets 01733 01734 01755 BUCKETS *MakeBuckets(DISTRIBUTION Distribution, 01756 uinT32 SampleCount, 01757 FLOAT64 Confidence) { 01758 const DENSITYFUNC DensityFunction[] = 01759 { NormalDensity, UniformDensity, UniformDensity }; 01760 int i, j; 01761 BUCKETS *Buckets; 01762 FLOAT64 BucketProbability; 01763 FLOAT64 NextBucketBoundary; 01764 FLOAT64 Probability; 01765 FLOAT64 ProbabilityDelta; 01766 FLOAT64 LastProbDensity; 01767 FLOAT64 ProbDensity; 01768 uinT16 CurrentBucket; 01769 BOOL8 Symmetrical; 01770 01771 // allocate memory needed for data structure 01772 Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS))); 01773 Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01774 Buckets->SampleCount = SampleCount; 01775 Buckets->Confidence = Confidence; 01776 Buckets->Count = reinterpret_cast<uinT32*>( 01777 Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32))); 01778 Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>( 01779 Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32))); 01780 01781 // initialize simple fields 01782 Buckets->Distribution = Distribution; 01783 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 01784 Buckets->Count[i] = 0; 01785 Buckets->ExpectedCount[i] = 0.0; 01786 } 01787 01788 // all currently defined distributions are symmetrical 01789 Symmetrical = TRUE; 01790 Buckets->ChiSquared = ComputeChiSquared( 01791 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); 01792 01793 if (Symmetrical) { 01794 // allocate buckets so that all have approx. 
equal probability 01795 BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets); 01796 01797 // distribution is symmetric so fill in upper half then copy 01798 CurrentBucket = Buckets->NumberOfBuckets / 2; 01799 if (Odd (Buckets->NumberOfBuckets)) 01800 NextBucketBoundary = BucketProbability / 2; 01801 else 01802 NextBucketBoundary = BucketProbability; 01803 01804 Probability = 0.0; 01805 LastProbDensity = 01806 (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2); 01807 for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) { 01808 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1); 01809 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0); 01810 Probability += ProbabilityDelta; 01811 if (Probability > NextBucketBoundary) { 01812 if (CurrentBucket < Buckets->NumberOfBuckets - 1) 01813 CurrentBucket++; 01814 NextBucketBoundary += BucketProbability; 01815 } 01816 Buckets->Bucket[i] = CurrentBucket; 01817 Buckets->ExpectedCount[CurrentBucket] += 01818 (FLOAT32) (ProbabilityDelta * SampleCount); 01819 LastProbDensity = ProbDensity; 01820 } 01821 // place any leftover probability into the last bucket 01822 Buckets->ExpectedCount[CurrentBucket] += 01823 (FLOAT32) ((0.5 - Probability) * SampleCount); 01824 01825 // copy upper half of distribution to lower half 01826 for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) 01827 Buckets->Bucket[i] = 01828 Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets); 01829 01830 // copy upper half of expected counts to lower half 01831 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) 01832 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j]; 01833 } 01834 return Buckets; 01835 } // MakeBuckets 01836 01837 01838 //--------------------------------------------------------------------------- 01839 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) { 01840 /* 01841 ** Parameters: 01842 ** SampleCount number of samples to be tested 01843 ** Operation: 01844 ** This routine computes the 
optimum number of histogram 01845 ** buckets that should be used in a chi-squared goodness of 01846 ** fit test for the specified number of samples. The optimum 01847 ** number is computed based on Table 4.1 on pg. 147 of 01848 ** "Measurement and Analysis of Random Data" by Bendat & Piersol. 01849 ** Linear interpolation is used to interpolate between table 01850 ** values. The table is intended for a 0.05 level of 01851 ** significance (alpha). This routine assumes that it is 01852 ** equally valid for other alpha's, which may not be true. 01853 ** Return: 01854 ** Optimum number of histogram buckets 01855 ** Exceptions: 01856 ** None 01857 ** History: 01858 ** 6/5/89, DSJ, Created. 01859 */ 01860 uinT8 Last, Next; 01861 FLOAT32 Slope; 01862 01863 if (SampleCount < kCountTable[0]) 01864 return kBucketsTable[0]; 01865 01866 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { 01867 if (SampleCount <= kCountTable[Next]) { 01868 Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) / 01869 (FLOAT32) (kCountTable[Next] - kCountTable[Last]); 01870 return ((uinT16) (kBucketsTable[Last] + 01871 Slope * (SampleCount - kCountTable[Last]))); 01872 } 01873 } 01874 return kBucketsTable[Last]; 01875 } // OptimumNumberOfBuckets 01876 01877 01878 //--------------------------------------------------------------------------- 01879 FLOAT64 01880 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha) 01881 /* 01882 ** Parameters: 01883 ** DegreesOfFreedom determines shape of distribution 01884 ** Alpha probability of right tail 01885 ** Operation: 01886 ** This routine computes the chi-squared value which will 01887 ** leave a cumulative probability of Alpha in the right tail 01888 ** of a chi-squared distribution with the specified number of 01889 ** degrees of freedom. Alpha must be between 0 and 1. 01890 ** DegreesOfFreedom must be even. The routine maintains an 01891 ** array of lists. 
Each list corresponds to a different 01892 ** number of degrees of freedom. Each entry in the list 01893 ** corresponds to a different alpha value and its corresponding 01894 ** chi-squared value. Therefore, once a particular chi-squared 01895 ** value is computed, it is stored in the list and never 01896 ** needs to be computed again. 01897 ** Return: Desired chi-squared value 01898 ** Exceptions: none 01899 ** History: 6/5/89, DSJ, Created. 01900 */ 01901 #define CHIACCURACY 0.01 01902 #define MINALPHA (1e-200) 01903 { 01904 static LIST ChiWith[MAXDEGREESOFFREEDOM + 1]; 01905 01906 CHISTRUCT *OldChiSquared; 01907 CHISTRUCT SearchKey; 01908 01909 // limit the minimum alpha that can be used - if alpha is too small 01910 // it may not be possible to compute chi-squared. 01911 Alpha = ClipToRange(Alpha, MINALPHA, 1.0); 01912 if (Odd (DegreesOfFreedom)) 01913 DegreesOfFreedom++; 01914 01915 /* find the list of chi-squared values which have already been computed 01916 for the specified number of degrees of freedom. Search the list for 01917 the desired chi-squared. 
*/ 01918 SearchKey.Alpha = Alpha; 01919 OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom], 01920 &SearchKey, AlphaMatch)); 01921 01922 if (OldChiSquared == NULL) { 01923 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha); 01924 OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared, 01925 (FLOAT64) DegreesOfFreedom, 01926 (FLOAT64) CHIACCURACY); 01927 ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom], 01928 OldChiSquared); 01929 } 01930 else { 01931 // further optimization might move OldChiSquared to front of list 01932 } 01933 01934 return (OldChiSquared->ChiSquared); 01935 01936 } // ComputeChiSquared 01937 01938 01939 //--------------------------------------------------------------------------- 01940 FLOAT64 NormalDensity(inT32 x) { 01941 /* 01942 ** Parameters: 01943 ** x number to compute the normal probability density for 01944 ** Globals: 01945 ** kNormalMean mean of a discrete normal distribution 01946 ** kNormalVariance variance of a discrete normal distribution 01947 ** kNormalMagnitude magnitude of a discrete normal distribution 01948 ** Operation: 01949 ** This routine computes the probability density function 01950 ** of a discrete normal distribution defined by the global 01951 ** variables kNormalMean, kNormalVariance, and kNormalMagnitude. 01952 ** Normal magnitude could, of course, be computed in terms of 01953 ** the normal variance but it is precomputed for efficiency. 01954 ** Return: 01955 ** The value of the normal distribution at x. 01956 ** Exceptions: 01957 ** None 01958 ** History: 01959 ** 6/4/89, DSJ, Created. 
01960 */ 01961 FLOAT64 Distance; 01962 01963 Distance = x - kNormalMean; 01964 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); 01965 } // NormalDensity 01966 01967 01968 //--------------------------------------------------------------------------- 01969 FLOAT64 UniformDensity(inT32 x) { 01970 /* 01971 ** Parameters: 01972 ** x number to compute the uniform probability density for 01973 ** Operation: 01974 ** This routine computes the probability density function 01975 ** of a uniform distribution at the specified point. The 01976 ** range of the distribution is from 0 to BUCKETTABLESIZE. 01977 ** Return: 01978 ** The value of the uniform distribution at x. 01979 ** Exceptions: 01980 ** None 01981 ** History: 01982 ** 6/5/89, DSJ, Created. 01983 */ 01984 static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE; 01985 01986 if ((x >= 0.0) && (x <= BUCKETTABLESIZE)) 01987 return UniformDistributionDensity; 01988 else 01989 return (FLOAT64) 0.0; 01990 } // UniformDensity 01991 01992 01993 //--------------------------------------------------------------------------- 01994 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) { 01995 /* 01996 ** Parameters: 01997 ** f1 value of function at x1 01998 ** f2 value of function at x2 01999 ** Dx x2 - x1 (should always be positive) 02000 ** Operation: 02001 ** This routine computes a trapezoidal approximation to the 02002 ** integral of a function over a small delta in x. 02003 ** Return: 02004 ** Approximation of the integral of the function from x1 to x2. 02005 ** Exceptions: 02006 ** None 02007 ** History: 02008 ** 6/5/89, DSJ, Created. 
02009 */ 02010 return (f1 + f2) * Dx / 2.0; 02011 } // Integral 02012 02013 02014 //--------------------------------------------------------------------------- 02015 void FillBuckets(BUCKETS *Buckets, 02016 CLUSTER *Cluster, 02017 uinT16 Dim, 02018 PARAM_DESC *ParamDesc, 02019 FLOAT32 Mean, 02020 FLOAT32 StdDev) { 02021 /* 02022 ** Parameters: 02023 ** Buckets histogram buckets to count samples 02024 ** Cluster cluster whose samples are being analyzed 02025 ** Dim dimension of samples which is being analyzed 02026 ** ParamDesc description of the dimension 02027 ** Mean "mean" of the distribution 02028 ** StdDev "standard deviation" of the distribution 02029 ** Operation: 02030 ** This routine counts the number of cluster samples which 02031 ** fall within the various histogram buckets in Buckets. Only 02032 ** one dimension of each sample is examined. The exact meaning 02033 ** of the Mean and StdDev parameters depends on the 02034 ** distribution which is being analyzed (this info is in the 02035 ** Buckets data structure). For normal distributions, Mean 02036 ** and StdDev have the expected meanings. For uniform and 02037 ** random distributions the Mean is the center point of the 02038 ** range and the StdDev is 1/2 the range. A dimension with 02039 ** zero standard deviation cannot be statistically analyzed. 02040 ** In this case, a pseudo-analysis is used. 02041 ** Return: 02042 ** None (the Buckets data structure is filled in) 02043 ** Exceptions: 02044 ** None 02045 ** History: 02046 ** 6/5/89, DSJ, Created. 02047 */ 02048 uinT16 BucketID; 02049 int i; 02050 LIST SearchState; 02051 SAMPLE *Sample; 02052 02053 // initialize the histogram bucket counts to 0 02054 for (i = 0; i < Buckets->NumberOfBuckets; i++) 02055 Buckets->Count[i] = 0; 02056 02057 if (StdDev == 0.0) { 02058 /* if the standard deviation is zero, then we can't statistically 02059 analyze the cluster. 
Use a pseudo-analysis: samples exactly on 02060 the mean are distributed evenly across all buckets. Samples greater 02061 than the mean are placed in the last bucket; samples less than the 02062 mean are placed in the first bucket. */ 02063 02064 InitSampleSearch(SearchState, Cluster); 02065 i = 0; 02066 while ((Sample = NextSample (&SearchState)) != NULL) { 02067 if (Sample->Mean[Dim] > Mean) 02068 BucketID = Buckets->NumberOfBuckets - 1; 02069 else if (Sample->Mean[Dim] < Mean) 02070 BucketID = 0; 02071 else 02072 BucketID = i; 02073 Buckets->Count[BucketID] += 1; 02074 i++; 02075 if (i >= Buckets->NumberOfBuckets) 02076 i = 0; 02077 } 02078 } 02079 else { 02080 // search for all samples in the cluster and add to histogram buckets 02081 InitSampleSearch(SearchState, Cluster); 02082 while ((Sample = NextSample (&SearchState)) != NULL) { 02083 switch (Buckets->Distribution) { 02084 case normal: 02085 BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim], 02086 Mean, StdDev); 02087 break; 02088 case D_random: 02089 case uniform: 02090 BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim], 02091 Mean, StdDev); 02092 break; 02093 default: 02094 BucketID = 0; 02095 } 02096 Buckets->Count[Buckets->Bucket[BucketID]] += 1; 02097 } 02098 } 02099 } // FillBuckets 02100 02101 02102 //---------------------------------------------------------------------------*/ 02103 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 02104 FLOAT32 x, 02105 FLOAT32 Mean, 02106 FLOAT32 StdDev) { 02107 /* 02108 ** Parameters: 02109 ** ParamDesc used to identify circular dimensions 02110 ** x value to be normalized 02111 ** Mean mean of normal distribution 02112 ** StdDev standard deviation of normal distribution 02113 ** Operation: 02114 ** This routine determines which bucket x falls into in the 02115 ** discrete normal distribution defined by kNormalMean 02116 ** and kNormalStdDev. x values which exceed the range of 02117 ** the discrete distribution are clipped. 
02118 ** Return: 02119 ** Bucket number into which x falls 02120 ** Exceptions: 02121 ** None 02122 ** History: 02123 ** 6/5/89, DSJ, Created. 02124 */ 02125 FLOAT32 X; 02126 02127 // wraparound circular parameters if necessary 02128 if (ParamDesc->Circular) { 02129 if (x - Mean > ParamDesc->HalfRange) 02130 x -= ParamDesc->Range; 02131 else if (x - Mean < -ParamDesc->HalfRange) 02132 x += ParamDesc->Range; 02133 } 02134 02135 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; 02136 if (X < 0) 02137 return 0; 02138 if (X > BUCKETTABLESIZE - 1) 02139 return ((uinT16) (BUCKETTABLESIZE - 1)); 02140 return (uinT16) floor((FLOAT64) X); 02141 } // NormalBucket 02142 02143 02144 //--------------------------------------------------------------------------- 02145 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 02146 FLOAT32 x, 02147 FLOAT32 Mean, 02148 FLOAT32 StdDev) { 02149 /* 02150 ** Parameters: 02151 ** ParamDesc used to identify circular dimensions 02152 ** x value to be normalized 02153 ** Mean center of range of uniform distribution 02154 ** StdDev 1/2 the range of the uniform distribution 02155 ** Operation: 02156 ** This routine determines which bucket x falls into in the 02157 ** discrete uniform distribution defined by 02158 ** BUCKETTABLESIZE. x values which exceed the range of 02159 ** the discrete distribution are clipped. 02160 ** Return: 02161 ** Bucket number into which x falls 02162 ** Exceptions: 02163 ** None 02164 ** History: 02165 ** 6/5/89, DSJ, Created. 
02166 */ 02167 FLOAT32 X; 02168 02169 // wraparound circular parameters if necessary 02170 if (ParamDesc->Circular) { 02171 if (x - Mean > ParamDesc->HalfRange) 02172 x -= ParamDesc->Range; 02173 else if (x - Mean < -ParamDesc->HalfRange) 02174 x += ParamDesc->Range; 02175 } 02176 02177 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); 02178 if (X < 0) 02179 return 0; 02180 if (X > BUCKETTABLESIZE - 1) 02181 return (uinT16) (BUCKETTABLESIZE - 1); 02182 return (uinT16) floor((FLOAT64) X); 02183 } // UniformBucket 02184 02185 02186 //--------------------------------------------------------------------------- 02187 BOOL8 DistributionOK(BUCKETS *Buckets) { 02188 /* 02189 ** Parameters: 02190 ** Buckets histogram data to perform chi-square test on 02191 ** Operation: 02192 ** This routine performs a chi-square goodness of fit test 02193 ** on the histogram data in the Buckets data structure. TRUE 02194 ** is returned if the histogram matches the probability 02195 ** distribution which was specified when the Buckets 02196 ** structure was originally created. Otherwise FALSE is 02197 ** returned. 02198 ** Return: 02199 ** TRUE if samples match distribution, FALSE otherwise 02200 ** Exceptions: 02201 ** None 02202 ** History: 02203 ** 6/5/89, DSJ, Created. 
02204 */ 02205 FLOAT32 FrequencyDifference; 02206 FLOAT32 TotalDifference; 02207 int i; 02208 02209 // compute how well the histogram matches the expected histogram 02210 TotalDifference = 0.0; 02211 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02212 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; 02213 TotalDifference += (FrequencyDifference * FrequencyDifference) / 02214 Buckets->ExpectedCount[i]; 02215 } 02216 02217 // test to see if the difference is more than expected 02218 if (TotalDifference > Buckets->ChiSquared) 02219 return FALSE; 02220 else 02221 return TRUE; 02222 } // DistributionOK 02223 02224 02225 //--------------------------------------------------------------------------- 02226 void FreeStatistics(STATISTICS *Statistics) { 02227 /* 02228 ** Parameters: 02229 ** Statistics pointer to data structure to be freed 02230 ** Operation: 02231 ** This routine frees the memory used by the statistics 02232 ** data structure. 02233 ** Return: 02234 ** None 02235 ** Exceptions: 02236 ** None 02237 ** History: 02238 ** 6/5/89, DSJ, Created. 02239 */ 02240 memfree (Statistics->CoVariance); 02241 memfree (Statistics->Min); 02242 memfree (Statistics->Max); 02243 memfree(Statistics); 02244 } // FreeStatistics 02245 02246 02247 //--------------------------------------------------------------------------- 02248 void FreeBuckets(BUCKETS *buckets) { 02249 /* 02250 ** Parameters: 02251 ** buckets pointer to data structure to be freed 02252 ** Operation: 02253 ** This routine properly frees the memory used by a BUCKETS. 
02254 */ 02255 Efree(buckets->Count); 02256 Efree(buckets->ExpectedCount); 02257 Efree(buckets); 02258 } // FreeBuckets 02259 02260 02261 //--------------------------------------------------------------------------- 02262 void FreeCluster(CLUSTER *Cluster) { 02263 /* 02264 ** Parameters: 02265 ** Cluster pointer to cluster to be freed 02266 ** Operation: 02267 ** This routine frees the memory consumed by the specified 02268 ** cluster and all of its subclusters. This is done by 02269 ** recursive calls to FreeCluster(). 02270 ** Return: 02271 ** None 02272 ** Exceptions: 02273 ** None 02274 ** History: 02275 ** 6/6/89, DSJ, Created. 02276 */ 02277 if (Cluster != NULL) { 02278 FreeCluster (Cluster->Left); 02279 FreeCluster (Cluster->Right); 02280 memfree(Cluster); 02281 } 02282 } // FreeCluster 02283 02284 02285 //--------------------------------------------------------------------------- 02286 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) { 02287 /* 02288 ** Parameters: 02289 ** Distribution distribution being tested for 02290 ** HistogramBuckets number of buckets in chi-square test 02291 ** Operation: 02292 ** This routine computes the degrees of freedom that should 02293 ** be used in a chi-squared test with the specified number of 02294 ** histogram buckets. The result is always rounded up to 02295 ** the next even number so that the value of chi-squared can be 02296 ** computed more easily. This will cause the value of 02297 ** chi-squared to be higher than the optimum value, resulting 02298 ** in the chi-square test being more lenient than optimum. 02299 ** Return: The number of degrees of freedom for a chi-square test 02300 ** Exceptions: none 02301 ** History: Thu Aug 3 14:04:18 1989, DSJ, Created. 
02302 */ 02303 static uinT8 DegreeOffsets[] = { 3, 3, 1 }; 02304 02305 uinT16 AdjustedNumBuckets; 02306 02307 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution]; 02308 if (Odd (AdjustedNumBuckets)) 02309 AdjustedNumBuckets++; 02310 return (AdjustedNumBuckets); 02311 02312 } // DegreesOfFreedom 02313 02314 02315 //--------------------------------------------------------------------------- 02316 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 02317 void *arg2) { // uinT16 *DesiredNumberOfBuckets) 02318 /* 02319 ** Parameters: 02320 ** Histogram current histogram being tested for a match 02321 ** DesiredNumberOfBuckets match key 02322 ** Operation: 02323 ** This routine is used to search a list of histogram data 02324 ** structures to find one with the specified number of 02325 ** buckets. It is called by the list search routines. 02326 ** Return: TRUE if Histogram matches DesiredNumberOfBuckets 02327 ** Exceptions: none 02328 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created. 02329 */ 02330 BUCKETS *Histogram = (BUCKETS *) arg1; 02331 uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2; 02332 02333 return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets); 02334 02335 } // NumBucketsMatch 02336 02337 02338 //--------------------------------------------------------------------------- 02339 int ListEntryMatch(void *arg1, //ListNode 02340 void *arg2) { //Key 02341 /* 02342 ** Parameters: none 02343 ** Operation: 02344 ** This routine is used to search a list for a list node 02345 ** whose contents match Key. It is called by the list 02346 ** delete_d routine. 02347 ** Return: TRUE if ListNode matches Key 02348 ** Exceptions: none 02349 ** History: Thu Aug 3 14:23:58 1989, DSJ, Created. 
02350 */ 02351 return (arg1 == arg2); 02352 02353 } // ListEntryMatch 02354 02355 02356 //--------------------------------------------------------------------------- 02357 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) { 02358 /* 02359 ** Parameters: 02360 ** Buckets histogram data structure to adjust 02361 ** NewSampleCount new sample count to adjust to 02362 ** Operation: 02363 ** This routine multiplies each ExpectedCount histogram entry 02364 ** by NewSampleCount/OldSampleCount so that the histogram 02365 ** is now adjusted to the new sample count. 02366 ** Return: none 02367 ** Exceptions: none 02368 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created. 02369 */ 02370 int i; 02371 FLOAT64 AdjustFactor; 02372 02373 AdjustFactor = (((FLOAT64) NewSampleCount) / 02374 ((FLOAT64) Buckets->SampleCount)); 02375 02376 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02377 Buckets->ExpectedCount[i] *= AdjustFactor; 02378 } 02379 02380 Buckets->SampleCount = NewSampleCount; 02381 02382 } // AdjustBuckets 02383 02384 02385 //--------------------------------------------------------------------------- 02386 void InitBuckets(BUCKETS *Buckets) { 02387 /* 02388 ** Parameters: 02389 ** Buckets histogram data structure to init 02390 ** Operation: 02391 ** This routine sets the bucket counts in the specified histogram 02392 ** to zero. 02393 ** Return: none 02394 ** Exceptions: none 02395 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created. 
02396 */ 02397 int i; 02398 02399 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02400 Buckets->Count[i] = 0; 02401 } 02402 02403 } // InitBuckets 02404 02405 02406 //--------------------------------------------------------------------------- 02407 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct, 02408 void *arg2) { //CHISTRUCT *SearchKey) 02409 /* 02410 ** Parameters: 02411 ** ChiStruct chi-squared struct being tested for a match 02412 ** SearchKey chi-squared struct that is the search key 02413 ** Operation: 02414 ** This routine is used to search a list of structures which 02415 ** hold pre-computed chi-squared values for a chi-squared 02416 ** value whose corresponding alpha field matches the alpha 02417 ** field of SearchKey. 02418 ** It is called by the list search routines. 02419 ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha 02420 ** Exceptions: none 02421 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created. 02422 */ 02423 CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1; 02424 CHISTRUCT *SearchKey = (CHISTRUCT *) arg2; 02425 02426 return (ChiStruct->Alpha == SearchKey->Alpha); 02427 02428 } // AlphaMatch 02429 02430 02431 //--------------------------------------------------------------------------- 02432 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) { 02433 /* 02434 ** Parameters: 02435 ** DegreesOfFreedom degrees of freedom for new chi value 02436 ** Alpha confidence level for new chi value 02437 ** Operation: 02438 ** This routine allocates a new data structure which is used 02439 ** to hold a chi-squared value along with its associated 02440 ** number of degrees of freedom and alpha value. 02441 ** Return: none 02442 ** Exceptions: none 02443 ** History: Fri Aug 4 11:04:59 1989, DSJ, Created. 
02444 */ 02445 CHISTRUCT *NewChiStruct; 02446 02447 NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT)); 02448 NewChiStruct->DegreesOfFreedom = DegreesOfFreedom; 02449 NewChiStruct->Alpha = Alpha; 02450 return (NewChiStruct); 02451 02452 } // NewChiStruct 02453 02454 02455 //--------------------------------------------------------------------------- 02456 FLOAT64 02457 Solve (SOLVEFUNC Function, 02458 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy) 02459 /* 02460 ** Parameters: 02461 ** Function function whose zero is to be found 02462 ** FunctionParams arbitrary data to pass to function 02463 ** InitialGuess point to start solution search at 02464 ** Accuracy maximum allowed error 02465 ** Operation: 02466 ** This routine attempts to find an x value at which Function 02467 ** goes to zero (i.e. a root of the function ). It will only 02468 ** work correctly if a solution actually exists and there 02469 ** are no extrema between the solution and the InitialGuess. 02470 ** The algorithms used are extremely primitive. 02471 ** Return: Solution of function ( x for which f(x) = 0 ). 02472 ** Exceptions: none 02473 ** History: Fri Aug 4 11:08:59 1989, DSJ, Created. 02474 */ 02475 #define INITIALDELTA 0.1 02476 #define DELTARATIO 0.1 02477 { 02478 FLOAT64 x; 02479 FLOAT64 f; 02480 FLOAT64 Slope; 02481 FLOAT64 Delta; 02482 FLOAT64 NewDelta; 02483 FLOAT64 xDelta; 02484 FLOAT64 LastPosX, LastNegX; 02485 02486 x = InitialGuess; 02487 Delta = INITIALDELTA; 02488 LastPosX = MAX_FLOAT32; 02489 LastNegX = -MAX_FLOAT32; 02490 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02491 while (Abs (LastPosX - LastNegX) > Accuracy) { 02492 // keep track of outer bounds of current estimate 02493 if (f < 0) 02494 LastNegX = x; 02495 else 02496 LastPosX = x; 02497 02498 // compute the approx. 
slope of f(x) at the current point 02499 Slope = 02500 ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta; 02501 02502 // compute the next solution guess */ 02503 xDelta = f / Slope; 02504 x -= xDelta; 02505 02506 // reduce the delta used for computing slope to be a fraction of 02507 //the amount moved to get to the new guess 02508 NewDelta = Abs (xDelta) * DELTARATIO; 02509 if (NewDelta < Delta) 02510 Delta = NewDelta; 02511 02512 // compute the value of the function at the new guess 02513 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02514 } 02515 return (x); 02516 02517 } // Solve 02518 02519 02520 //--------------------------------------------------------------------------- 02521 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) { 02522 /* 02523 ** Parameters: 02524 ** ChiParams contains degrees of freedom and alpha 02525 ** x value of chi-squared to evaluate 02526 ** Operation: 02527 ** This routine computes the area under a chi density curve 02528 ** from 0 to x, minus the desired area under the curve. The 02529 ** number of degrees of freedom of the chi curve is specified 02530 ** in the ChiParams structure. The desired area is also 02531 ** specified in the ChiParams structure as Alpha ( or 1 minus 02532 ** the desired area ). This routine is intended to be passed 02533 ** to the Solve() function to find the value of chi-squared 02534 ** which will yield a desired area under the right tail of 02535 ** the chi density curve. The function will only work for 02536 ** even degrees of freedom. The equations are based on 02537 ** integrating the chi density curve in parts to obtain 02538 ** a series that can be used to compute the area under the 02539 ** curve. 02540 ** Return: Error between actual and desired area under the chi curve. 02541 ** Exceptions: none 02542 ** History: Fri Aug 4 12:48:41 1989, DSJ, Created. 
02543 */ 02544 int i, N; 02545 FLOAT64 SeriesTotal; 02546 FLOAT64 Denominator; 02547 FLOAT64 PowerOfx; 02548 02549 N = ChiParams->DegreesOfFreedom / 2 - 1; 02550 SeriesTotal = 1; 02551 Denominator = 1; 02552 PowerOfx = 1; 02553 for (i = 1; i <= N; i++) { 02554 Denominator *= 2 * i; 02555 PowerOfx *= x; 02556 SeriesTotal += PowerOfx / Denominator; 02557 } 02558 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha); 02559 02560 } // ChiArea 02561 02562 02563 //--------------------------------------------------------------------------- 02564 BOOL8 02565 MultipleCharSamples (CLUSTERER * Clusterer, 02566 CLUSTER * Cluster, FLOAT32 MaxIllegal) 02567 /* 02568 ** Parameters: 02569 ** Clusterer data structure holding cluster tree 02570 ** Cluster cluster containing samples to be tested 02571 ** MaxIllegal max percentage of samples allowed to have 02572 ** more than 1 feature in the cluster 02573 ** Operation: 02574 ** This routine looks at all samples in the specified cluster. 02575 ** It computes a running estimate of the percentage of the 02576 ** charaters which have more than 1 sample in the cluster. 02577 ** When this percentage exceeds MaxIllegal, TRUE is returned. 02578 ** Otherwise FALSE is returned. The CharID 02579 ** fields must contain integers which identify the training 02580 ** characters which were used to generate the sample. One 02581 ** integer is used for each sample. The NumChar field in 02582 ** the Clusterer must contain the number of characters in the 02583 ** training set. All CharID fields must be between 0 and 02584 ** NumChar-1. The main function of this routine is to help 02585 ** identify clusters which need to be split further, i.e. if 02586 ** numerous training characters have 2 or more features which are 02587 ** contained in the same cluster, then the cluster should be 02588 ** split. 02589 ** Return: TRUE if the cluster should be split, FALSE otherwise. 
02590 ** Exceptions: none 02591 ** History: Wed Aug 30 11:13:05 1989, DSJ, Created. 02592 ** 2/22/90, DSJ, Added MaxIllegal control rather than always 02593 ** splitting illegal clusters. 02594 */ 02595 #define ILLEGAL_CHAR 2 02596 { 02597 static BOOL8 *CharFlags = NULL; 02598 static inT32 NumFlags = 0; 02599 int i; 02600 LIST SearchState; 02601 SAMPLE *Sample; 02602 inT32 CharID; 02603 inT32 NumCharInCluster; 02604 inT32 NumIllegalInCluster; 02605 FLOAT32 PercentIllegal; 02606 02607 // initial estimate assumes that no illegal chars exist in the cluster 02608 NumCharInCluster = Cluster->SampleCount; 02609 NumIllegalInCluster = 0; 02610 02611 if (Clusterer->NumChar > NumFlags) { 02612 if (CharFlags != NULL) 02613 memfree(CharFlags); 02614 NumFlags = Clusterer->NumChar; 02615 CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8)); 02616 } 02617 02618 for (i = 0; i < NumFlags; i++) 02619 CharFlags[i] = FALSE; 02620 02621 // find each sample in the cluster and check if we have seen it before 02622 InitSampleSearch(SearchState, Cluster); 02623 while ((Sample = NextSample (&SearchState)) != NULL) { 02624 CharID = Sample->CharID; 02625 if (CharFlags[CharID] == FALSE) { 02626 CharFlags[CharID] = TRUE; 02627 } 02628 else { 02629 if (CharFlags[CharID] == TRUE) { 02630 NumIllegalInCluster++; 02631 CharFlags[CharID] = ILLEGAL_CHAR; 02632 } 02633 NumCharInCluster--; 02634 PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster; 02635 if (PercentIllegal > MaxIllegal) { 02636 destroy(SearchState); 02637 return (TRUE); 02638 } 02639 } 02640 } 02641 return (FALSE); 02642 02643 } // MultipleCharSamples 02644 02645 // Compute the inverse of a matrix using LU decomposition with partial pivoting. 02646 // The return value is the sum of norms of the off-diagonal terms of the 02647 // product of a and inv. (A measure of the error.) 02648 double InvertMatrix(const float* input, int size, float* inv) { 02649 // Allocate memory for the 2D arrays. 
02650 GENERIC_2D_ARRAY<double> U(size, size, 0.0); 02651 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0); 02652 GENERIC_2D_ARRAY<double> L(size, size, 0.0); 02653 02654 // Initialize the working matrices. U starts as input, L as I and U_inv as O. 02655 int row; 02656 int col; 02657 for (row = 0; row < size; row++) { 02658 for (col = 0; col < size; col++) { 02659 U[row][col] = input[row*size + col]; 02660 L[row][col] = row == col ? 1.0 : 0.0; 02661 U_inv[row][col] = 0.0; 02662 } 02663 } 02664 02665 // Compute forward matrix by inversion by LU decomposition of input. 02666 for (col = 0; col < size; ++col) { 02667 // Find best pivot 02668 int best_row = 0; 02669 double best_pivot = -1.0; 02670 for (row = col; row < size; ++row) { 02671 if (Abs(U[row][col]) > best_pivot) { 02672 best_pivot = Abs(U[row][col]); 02673 best_row = row; 02674 } 02675 } 02676 // Exchange pivot rows. 02677 if (best_row != col) { 02678 for (int k = 0; k < size; ++k) { 02679 double tmp = U[best_row][k]; 02680 U[best_row][k] = U[col][k]; 02681 U[col][k] = tmp; 02682 tmp = L[best_row][k]; 02683 L[best_row][k] = L[col][k]; 02684 L[col][k] = tmp; 02685 } 02686 } 02687 // Now do the pivot itself. 02688 for (row = col + 1; row < size; ++row) { 02689 double ratio = -U[row][col] / U[col][col]; 02690 for (int j = col; j < size; ++j) { 02691 U[row][j] += U[col][j] * ratio; 02692 } 02693 for (int k = 0; k < size; ++k) { 02694 L[row][k] += L[col][k] * ratio; 02695 } 02696 } 02697 } 02698 // Next invert U. 02699 for (col = 0; col < size; ++col) { 02700 U_inv[col][col] = 1.0 / U[col][col]; 02701 for (row = col - 1; row >= 0; --row) { 02702 double total = 0.0; 02703 for (int k = col; k > row; --k) { 02704 total += U[row][k] * U_inv[k][col]; 02705 } 02706 U_inv[row][col] = -total / U[row][row]; 02707 } 02708 } 02709 // Now the answer is U_inv.L. 
02710 for (row = 0; row < size; row++) { 02711 for (col = 0; col < size; col++) { 02712 double sum = 0.0; 02713 for (int k = row; k < size; ++k) { 02714 sum += U_inv[row][k] * L[k][col]; 02715 } 02716 inv[row*size + col] = sum; 02717 } 02718 } 02719 // Check matrix product. 02720 double error_sum = 0.0; 02721 for (row = 0; row < size; row++) { 02722 for (col = 0; col < size; col++) { 02723 double sum = 0.0; 02724 for (int k = 0; k < size; ++k) { 02725 sum += input[row*size + k] * inv[k *size + col]; 02726 } 02727 if (row != col) { 02728 error_sum += Abs(sum); 02729 } 02730 } 02731 } 02732 return error_sum; 02733 }