tesseract-doc/cluster_8cpp_source.html

00001 /******************************************************************************
00002  **     Filename:       cluster.c
00003  **     Purpose:        Routines for clustering points in N-D space
00004  **     Author:         Dan Johnson
00005  **     History:        5/29/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 #include "oldheap.h"
00019 #include "const.h"
00020 #include "cluster.h"
00021 #include "emalloc.h"
00022 #include "helpers.h"
00023 #include "matrix.h"
00024 #include "tprintf.h"
00025 #include "danerror.h"
00026 #include "freelist.h"
00027 #include <math.h>
00028
00029 #define HOTELLING 1  // If true use Hotelling's test to decide where to split.
00030 #define FTABLE_X 10  // Size of FTable.
00031 #define FTABLE_Y 100  // Size of FTable.
00032
00033 // Table of values approximating the cumulative F-distribution for a confidence of 1%.
00034 const double FTable[FTABLE_Y][FTABLE_X] = {
00035  {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
00036   {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
00037   {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
00038   {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
00039   {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
00040   {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
00041   {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
00042   {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
00043   {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
00044   {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
00045   { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
00046   { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
00047   { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
00048   { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
00049   { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
00050   { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
00051   { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
00052   { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
00053   { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
00054   { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
00055   { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
00056   { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
00057   { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
00058   { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
00059   { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
00060   { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
00061   { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
00062   { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
00063   { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
00064   { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
00065   { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
00066   { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
00067   { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
00068   { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
00069   { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
00070   { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
00071   { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
00072   { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
00073   { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
00074   { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
00075   { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
00076   { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
00077   { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
00078   { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
00079   { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
00080   { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
00081   { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
00082   { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
00083   { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
00084   { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
00085   { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
00086   { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
00087   { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
00088   { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
00089   { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
00090   { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
00091   { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
00092   { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
00093   { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
00094   { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
00095   { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
00096   { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
00097   { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
00098   { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
00099   { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
00100   { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
00101   { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
00102   { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
00103   { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
00104   { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
00105   { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
00106   { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
00107   { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
00108   { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
00109   { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
00110   { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
00111   { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
00112   { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
00113   { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
00114   { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
00115   { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
00116   { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
00117   { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
00118   { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
00119   { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
00120   { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
00121   { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
00122   { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
00123   { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
00124   { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
00125   { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
00126   { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
00127   { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
00128   { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
00129   { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
00130   { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
00131   { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
00132   { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
00133   { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
00134   { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
00135 };
00136
00137 /* define the variance which will be used as a minimum variance for any
00138   dimension of any feature. Since most features are calculated from numbers
00139   with a precision no better than 1 in 128, the variance should never be
00140   less than the square of this number for parameters whose range is 1. */
00141 #define MINVARIANCE     0.0004
00142
00143 /* define the absolute minimum number of samples which must be present in
00144   order to accurately test hypotheses about underlying probability
00145   distributions.  Define separately the minimum samples that are needed
00146   before a statistical analysis is attempted; this number should be
00147   equal to MINSAMPLES but can be set to a lower number for early testing
00148   when very few samples are available. */
00149 #define MINSAMPLESPERBUCKET 5
00150 #define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
00151 #define MINSAMPLESNEEDED  1
00152
00153 /* define the size of the table which maps normalized samples to
00154   histogram buckets.  Also define the number of standard deviations
00155   in a normal distribution which are considered to be significant.
00156   The mapping table will be defined in such a way that it covers
00157   the specified number of standard deviations on either side of
00158   the mean.  BUCKETTABLESIZE should always be even. */
00159 #define BUCKETTABLESIZE   1024
00160 #define NORMALEXTENT    3.0
00161
00162 struct TEMPCLUSTER {
00163   CLUSTER *Cluster;
00164   CLUSTER *Neighbor;
00165 };
00166
00167 struct STATISTICS {
00168   FLOAT32 AvgVariance;
00169   FLOAT32 *CoVariance;
00170   FLOAT32 *Min;                  // largest negative distance from the mean
00171   FLOAT32 *Max;                  // largest positive distance from the mean
00172 };
00173
00174 struct BUCKETS {
00175   DISTRIBUTION Distribution;     // distribution being tested for
00176   uinT32 SampleCount;            // # of samples in histogram
00177   FLOAT64 Confidence;            // confidence level of test
00178   FLOAT64 ChiSquared;            // test threshold
00179   uinT16 NumberOfBuckets;        // number of cells in histogram
00180   uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
00181   uinT32 *Count;                 // frequency of occurence histogram
00182   FLOAT32 *ExpectedCount;        // expected histogram
00183 };
00184
00185 struct CHISTRUCT{
00186   uinT16 DegreesOfFreedom;
00187   FLOAT64 Alpha;
00188   FLOAT64 ChiSquared;
00189 };
00190
00191 // For use with KDWalk / MakePotentialClusters
00192 struct ClusteringContext {
00193   HEAP *heap;  // heap used to hold temp clusters, "best" on top
00194   TEMPCLUSTER *candidates;  // array of potential clusters
00195   KDTREE *tree;  // kd-tree to be searched for neighbors
00196   inT32 next;  // next candidate to be used
00197 };
00198
00199 typedef FLOAT64 (*DENSITYFUNC) (inT32);
00200 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
00201
00202 #define Odd(N) ((N)%2)
00203 #define Mirror(N,R) ((R) - (N) - 1)
00204 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
00205
00206 //--------------Global Data Definitions and Declarations----------------------
00207 /* the following variables describe a discrete normal distribution
00208   which is used by NormalDensity() and NormalBucket().  The
00209   constant NORMALEXTENT determines how many standard
00210   deviations of the distribution are mapped onto the fixed
00211   discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
00212   deviations and x=BUCKETTABLESIZE is mapped to
00213   +NORMALEXTENT standard deviations. */
00214 #define SqrtOf2Pi     2.506628275
00215 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
00216 static const FLOAT64 kNormalVariance =
00217     (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
00218 static const FLOAT64 kNormalMagnitude =
00219     (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
00220 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
00221
00222 /* define lookup tables used to compute the number of histogram buckets
00223   that should be used for a given number of samples. */
00224 #define LOOKUPTABLESIZE   8
00225 #define MAXDEGREESOFFREEDOM MAXBUCKETS
00226
00227 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
00228   MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
00229 };  // number of samples
00230
00231 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
00232   MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
00233 };  // number of buckets
00234
00235 /*-------------------------------------------------------------------------
00236           Private Function Prototypes
00237 --------------------------------------------------------------------------*/
00238 void CreateClusterTree(CLUSTERER *Clusterer);
00239
00240 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
00241                            inT32 Level);
00242
00243 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
00244                              CLUSTER *Cluster,
00245                              FLOAT32 *Distance);
00246
00247 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
00248
00249 inT32 MergeClusters (inT16 N,
00250 register PARAM_DESC ParamDesc[],
00251 register inT32 n1,
00252 register inT32 n2,
00253 register FLOAT32 m[],
00254 register FLOAT32 m1[], register FLOAT32 m2[]);
00255
00256 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
00257
00258 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00259                          CLUSTERCONFIG *Config,
00260                          CLUSTER *Cluster);
00261
00262 PROTOTYPE *MakeDegenerateProto(uinT16 N,
00263                                CLUSTER *Cluster,
00264                                STATISTICS *Statistics,
00265                                PROTOSTYLE Style,
00266                                inT32 MinSamples);
00267
00268 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
00269                                CLUSTERCONFIG *Config,
00270                                CLUSTER *Cluster,
00271                                STATISTICS *Statistics);
00272
00273 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
00274                               CLUSTER *Cluster,
00275                               STATISTICS *Statistics,
00276                               BUCKETS *Buckets);
00277
00278 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
00279                                CLUSTER *Cluster,
00280                                STATISTICS *Statistics,
00281                                BUCKETS *Buckets);
00282
00283 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
00284                           CLUSTER *Cluster,
00285                           STATISTICS *Statistics,
00286                           BUCKETS *NormalBuckets,
00287                           FLOAT64 Confidence);
00288
00289 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
00290
00291 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
00292
00293 STATISTICS *ComputeStatistics (inT16 N,
00294 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
00295
00296 PROTOTYPE *NewSphericalProto(uinT16 N,
00297                              CLUSTER *Cluster,
00298                              STATISTICS *Statistics);
00299
00300 PROTOTYPE *NewEllipticalProto(inT16 N,
00301                               CLUSTER *Cluster,
00302                               STATISTICS *Statistics);
00303
00304 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
00305
00306 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
00307
00308 BOOL8 Independent (PARAM_DESC ParamDesc[],
00309 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
00310
00311 BUCKETS *GetBuckets(CLUSTERER* clusterer,
00312                     DISTRIBUTION Distribution,
00313                     uinT32 SampleCount,
00314                     FLOAT64 Confidence);
00315
00316 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
00317                      uinT32 SampleCount,
00318                      FLOAT64 Confidence);
00319
00320 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
00321
00322 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00323
00324 FLOAT64 NormalDensity(inT32 x);
00325
00326 FLOAT64 UniformDensity(inT32 x);
00327
00328 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
00329
00330 void FillBuckets(BUCKETS *Buckets,
00331                  CLUSTER *Cluster,
00332                  uinT16 Dim,
00333                  PARAM_DESC *ParamDesc,
00334                  FLOAT32 Mean,
00335                  FLOAT32 StdDev);
00336
00337 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
00338                     FLOAT32 x,
00339                     FLOAT32 Mean,
00340                     FLOAT32 StdDev);
00341
00342 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
00343                      FLOAT32 x,
00344                      FLOAT32 Mean,
00345                      FLOAT32 StdDev);
00346
00347 BOOL8 DistributionOK(BUCKETS *Buckets);
00348
00349 void FreeStatistics(STATISTICS *Statistics);
00350
00351 void FreeBuckets(BUCKETS *Buckets);
00352
00353 void FreeCluster(CLUSTER *Cluster);
00354
00355 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
00356
00357 int NumBucketsMatch(void *arg1,   // BUCKETS *Histogram,
00358                     void *arg2);  // uinT16 *DesiredNumberOfBuckets);
00359
00360 int ListEntryMatch(void *arg1, void *arg2);
00361
00362 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
00363
00364 void InitBuckets(BUCKETS *Buckets);
00365
00366 int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
00367                void *arg2);  // CHISTRUCT *SearchKey);
00368
00369 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00370
00371 FLOAT64 Solve(SOLVEFUNC Function,
00372               void *FunctionParams,
00373               FLOAT64 InitialGuess,
00374               FLOAT64 Accuracy);
00375
00376 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
00377
00378 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
00379                           CLUSTER *Cluster,
00380                           FLOAT32 MaxIllegal);
00381
00382 double InvertMatrix(const float* input, int size, float* inv);
00383
00384 //--------------------------Public Code--------------------------------------
00394 CLUSTERER *
00395 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
00396   CLUSTERER *Clusterer;
00397   int i;
00398
00399   // allocate main clusterer data structure and init simple fields
00400   Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
00401   Clusterer->SampleSize = SampleSize;
00402   Clusterer->NumberOfSamples = 0;
00403   Clusterer->NumChar = 0;
00404
00405   // init fields which will not be used initially
00406   Clusterer->Root = NULL;
00407   Clusterer->ProtoList = NIL_LIST;
00408
00409   // maintain a copy of param descriptors in the clusterer data structure
00410   Clusterer->ParamDesc =
00411     (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
00412   for (i = 0; i < SampleSize; i++) {
00413     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
00414     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
00415     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
00416     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
00417     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
00418     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
00419     Clusterer->ParamDesc[i].MidRange =
00420       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
00421   }
00422
00423   // allocate a kd tree to hold the samples
00424   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
00425
00426   // Initialize cache of histogram buckets to minimize recomputing them.
00427   for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00428     for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00429       Clusterer->bucket_cache[d][c] = NULL;
00430   }
00431
00432   return Clusterer;
00433 }                                // MakeClusterer
00434
00435
00450 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
00451                    inT32 CharID) {
00452   SAMPLE *Sample;
00453   int i;
00454
00455   // see if the samples have already been clustered - if so trap an error
00456   if (Clusterer->Root != NULL)
00457     DoError (ALREADYCLUSTERED,
00458       "Can't add samples after they have been clustered");
00459
00460   // allocate the new sample and initialize it
00461   Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
00462     (Clusterer->SampleSize -
00463     1) * sizeof (FLOAT32));
00464   Sample->Clustered = FALSE;
00465   Sample->Prototype = FALSE;
00466   Sample->SampleCount = 1;
00467   Sample->Left = NULL;
00468   Sample->Right = NULL;
00469   Sample->CharID = CharID;
00470
00471   for (i = 0; i < Clusterer->SampleSize; i++)
00472     Sample->Mean[i] = Feature[i];
00473
00474   // add the sample to the KD tree - keep track of the total # of samples
00475   Clusterer->NumberOfSamples++;
00476   KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
00477   if (CharID >= Clusterer->NumChar)
00478     Clusterer->NumChar = CharID + 1;
00479
00480   // execute hook for monitoring clustering operation
00481   // (*SampleCreationHook)( Sample );
00482
00483   return (Sample);
00484 }                                // MakeSample
00485
00486
00504 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00505   //only create cluster tree if samples have never been clustered before
00506   if (Clusterer->Root == NULL)
00507     CreateClusterTree(Clusterer);
00508
00509   //deallocate the old prototype list if one exists
00510   FreeProtoList (&Clusterer->ProtoList);
00511   Clusterer->ProtoList = NIL_LIST;
00512
00513   //compute prototypes starting at the root node in the tree
00514   ComputePrototypes(Clusterer, Config);
00515   return (Clusterer->ProtoList);
00516 }                                // ClusterSamples
00517
00518
00532 void FreeClusterer(CLUSTERER *Clusterer) {
00533   if (Clusterer != NULL) {
00534     memfree (Clusterer->ParamDesc);
00535     if (Clusterer->KDTree != NULL)
00536       FreeKDTree (Clusterer->KDTree);
00537     if (Clusterer->Root != NULL)
00538       FreeCluster (Clusterer->Root);
00539     // Free up all used buckets structures.
00540     for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00541       for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00542         if (Clusterer->bucket_cache[d][c] != NULL)
00543           FreeBuckets(Clusterer->bucket_cache[d][c]);
00544     }
00545
00546     memfree(Clusterer);
00547   }
00548 }                                // FreeClusterer
00549
00550
00560 void FreeProtoList(LIST *ProtoList) {
00561   destroy_nodes(*ProtoList, FreePrototype);
00562 }                                // FreeProtoList
00563
00564
00575 void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)
00576   PROTOTYPE *Prototype = (PROTOTYPE *) arg;
00577
00578   // unmark the corresponding cluster (if there is one
00579   if (Prototype->Cluster != NULL)
00580     Prototype->Cluster->Prototype = FALSE;
00581
00582   // deallocate the prototype statistics and then the prototype itself
00583   if (Prototype->Distrib != NULL)
00584     memfree (Prototype->Distrib);
00585   if (Prototype->Mean != NULL)
00586     memfree (Prototype->Mean);
00587   if (Prototype->Style != spherical) {
00588     if (Prototype->Variance.Elliptical != NULL)
00589       memfree (Prototype->Variance.Elliptical);
00590     if (Prototype->Magnitude.Elliptical != NULL)
00591       memfree (Prototype->Magnitude.Elliptical);
00592     if (Prototype->Weight.Elliptical != NULL)
00593       memfree (Prototype->Weight.Elliptical);
00594   }
00595   memfree(Prototype);
00596 }                                // FreePrototype
00597
00598
00614 CLUSTER *NextSample(LIST *SearchState) {
00615   CLUSTER *Cluster;
00616
00617   if (*SearchState == NIL_LIST)
00618     return (NULL);
00619   Cluster = (CLUSTER *) first_node (*SearchState);
00620   *SearchState = pop (*SearchState);
00621   while (TRUE) {
00622     if (Cluster->Left == NULL)
00623       return (Cluster);
00624     *SearchState = push (*SearchState, Cluster->Right);
00625     Cluster = Cluster->Left;
00626   }
00627 }                                // NextSample
00628
00629
00639 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
00640   return (Proto->Mean[Dimension]);
00641 }                                // Mean
00642
00643
00653 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
00654   switch (Proto->Style) {
00655     case spherical:
00656       return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
00657     case elliptical:
00658       return ((FLOAT32)
00659         sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00660     case mixed:
00661       switch (Proto->Distrib[Dimension]) {
00662         case normal:
00663           return ((FLOAT32)
00664             sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00665         case uniform:
00666         case D_random:
00667           return (Proto->Variance.Elliptical[Dimension]);
00668         case DISTRIBUTION_COUNT:
00669           ASSERT_HOST(!"Distribution count not allowed!");
00670       }
00671   }
00672   return 0.0f;
00673 }                                // StandardDeviation
00674
00675
00676 /*---------------------------------------------------------------------------
00677             Private Code
00678 ----------------------------------------------------------------------------*/
00694 void CreateClusterTree(CLUSTERER *Clusterer) {
00695   ClusteringContext context;
00696   HEAPENTRY HeapEntry;
00697   TEMPCLUSTER *PotentialCluster;
00698
00699   // each sample and its nearest neighbor form a "potential" cluster
00700   // save these in a heap with the "best" potential clusters on top
00701   context.tree = Clusterer->KDTree;
00702   context.candidates = (TEMPCLUSTER *)
00703     Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
00704   context.next = 0;
00705   context.heap = MakeHeap(Clusterer->NumberOfSamples);
00706   KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
00707
00708   // form potential clusters into actual clusters - always do "best" first
00709   while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) {
00710     PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data;
00711
00712     // if main cluster of potential cluster is already in another cluster
00713     // then we don't need to worry about it
00714     if (PotentialCluster->Cluster->Clustered) {
00715       continue;
00716     }
00717
00718     // if main cluster is not yet clustered, but its nearest neighbor is
00719     // then we must find a new nearest neighbor
00720     else if (PotentialCluster->Neighbor->Clustered) {
00721       PotentialCluster->Neighbor =
00722         FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
00723                             &HeapEntry.Key);
00724       if (PotentialCluster->Neighbor != NULL) {
00725         HeapStore(context.heap, &HeapEntry);
00726       }
00727     }
00728
00729     // if neither cluster is already clustered, form permanent cluster
00730     else {
00731       PotentialCluster->Cluster =
00732           MakeNewCluster(Clusterer, PotentialCluster);
00733       PotentialCluster->Neighbor =
00734           FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
00735                               &HeapEntry.Key);
00736       if (PotentialCluster->Neighbor != NULL) {
00737         HeapStore(context.heap, &HeapEntry);
00738       }
00739     }
00740   }
00741
00742   // the root node in the cluster tree is now the only node in the kd-tree
00743   Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
00744
00745   // free up the memory used by the K-D tree, heap, and temp clusters
00746   FreeKDTree(context.tree);
00747   Clusterer->KDTree = NULL;
00748   FreeHeap(context.heap);
00749   memfree(context.candidates);
00750 }                                // CreateClusterTree
00751
00752
00764 void MakePotentialClusters(ClusteringContext *context,
00765                            CLUSTER *Cluster, inT32 Level) {
00766   HEAPENTRY HeapEntry;
00767   int next = context->next;
00768   context->candidates[next].Cluster = Cluster;
00769   HeapEntry.Data = (char *) &(context->candidates[next]);
00770   context->candidates[next].Neighbor =
00771       FindNearestNeighbor(context->tree,
00772                           context->candidates[next].Cluster,
00773                           &HeapEntry.Key);
00774   if (context->candidates[next].Neighbor != NULL) {
00775     HeapStore(context->heap, &HeapEntry);
00776     context->next++;
00777   }
00778 }                                // MakePotentialClusters
00779
00780
00797 CLUSTER *
00798 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
00799 #define MAXNEIGHBORS  2
00800 #define MAXDISTANCE   MAX_FLOAT32
00801 {
00802   CLUSTER *Neighbor[MAXNEIGHBORS];
00803   FLOAT32 Dist[MAXNEIGHBORS];
00804   int NumberOfNeighbors;
00805   inT32 i;
00806   CLUSTER *BestNeighbor;
00807
00808   // find the 2 nearest neighbors of the cluster
00809   KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,
00810                           &NumberOfNeighbors, (void **)Neighbor, Dist);
00811
00812   // search for the nearest neighbor that is not the cluster itself
00813   *Distance = MAXDISTANCE;
00814   BestNeighbor = NULL;
00815   for (i = 0; i < NumberOfNeighbors; i++) {
00816     if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
00817       *Distance = Dist[i];
00818       BestNeighbor = Neighbor[i];
00819     }
00820   }
00821   return BestNeighbor;
00822 }                                // FindNearestNeighbor
00823
00824
00837 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
00838   CLUSTER *Cluster;
00839
00840   // allocate the new cluster and initialize it
00841   Cluster = (CLUSTER *) Emalloc(
00842       sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
00843   Cluster->Clustered = FALSE;
00844   Cluster->Prototype = FALSE;
00845   Cluster->Left = TempCluster->Cluster;
00846   Cluster->Right = TempCluster->Neighbor;
00847   Cluster->CharID = -1;
00848
00849   // mark the old clusters as "clustered" and delete them from the kd-tree
00850   Cluster->Left->Clustered = TRUE;
00851   Cluster->Right->Clustered = TRUE;
00852   KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
00853   KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
00854
00855   // compute the mean and sample count for the new cluster
00856   Cluster->SampleCount =
00857       MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
00858                     Cluster->Left->SampleCount, Cluster->Right->SampleCount,
00859                     Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
00860
00861   // add the new cluster to the KD tree
00862   KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
00863   return Cluster;
00864 }                                // MakeNewCluster
00865
00866
00882 inT32 MergeClusters(inT16 N,
00883                     PARAM_DESC ParamDesc[],
00884                     inT32 n1,
00885                     inT32 n2,
00886                     FLOAT32 m[],
00887                     FLOAT32 m1[], FLOAT32 m2[]) {
00888   inT32 i, n;
00889
00890   n = n1 + n2;
00891   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
00892     if (ParamDesc->Circular) {
00893       // if distance between means is greater than allowed
00894       // reduce upper point by one "rotation" to compute mean
00895       // then normalize the mean back into the accepted range
00896       if ((*m2 - *m1) > ParamDesc->HalfRange) {
00897         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
00898         if (*m < ParamDesc->Min)
00899           *m += ParamDesc->Range;
00900       }
00901       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
00902         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
00903         if (*m < ParamDesc->Min)
00904           *m += ParamDesc->Range;
00905       }
00906       else
00907         *m = (n1 * *m1 + n2 * *m2) / n;
00908     }
00909     else
00910       *m = (n1 * *m1 + n2 * *m2) / n;
00911   }
00912   return n;
00913 }                                // MergeClusters
00914
00915
00927 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00928   LIST ClusterStack = NIL_LIST;
00929   CLUSTER *Cluster;
00930   PROTOTYPE *Prototype;
00931
00932   // use a stack to keep track of clusters waiting to be processed
00933   // initially the only cluster on the stack is the root cluster
00934   if (Clusterer->Root != NULL)
00935     ClusterStack = push (NIL_LIST, Clusterer->Root);
00936
00937   // loop until we have analyzed all clusters which are potential prototypes
00938   while (ClusterStack != NIL_LIST) {
00939     // remove the next cluster to be analyzed from the stack
00940     // try to make a prototype from the cluster
00941     // if successful, put it on the proto list, else split the cluster
00942     Cluster = (CLUSTER *) first_node (ClusterStack);
00943     ClusterStack = pop (ClusterStack);
00944     Prototype = MakePrototype(Clusterer, Config, Cluster);
00945     if (Prototype != NULL) {
00946       Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
00947     }
00948     else {
00949       ClusterStack = push (ClusterStack, Cluster->Right);
00950       ClusterStack = push (ClusterStack, Cluster->Left);
00951     }
00952   }
00953 }                                // ComputePrototypes
00954
00955
00974 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00975                          CLUSTERCONFIG *Config,
00976                          CLUSTER *Cluster) {
00977   STATISTICS *Statistics;
00978   PROTOTYPE *Proto;
00979   BUCKETS *Buckets;
00980
00981   // filter out clusters which contain samples from the same character
00982   if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
00983     return NULL;
00984
00985   // compute the covariance matrix and ranges for the cluster
00986   Statistics =
00987       ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
00988
00989   // check for degenerate clusters which need not be analyzed further
00990   // note that the MinSamples test assumes that all clusters with multiple
00991   // character samples have been removed (as above)
00992   Proto = MakeDegenerateProto(
00993       Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
00994       (inT32) (Config->MinSamples * Clusterer->NumChar));
00995   if (Proto != NULL) {
00996     FreeStatistics(Statistics);
00997     return Proto;
00998   }
00999   // check to ensure that all dimensions are independent
01000   if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
01001                    Statistics->CoVariance, Config->Independence)) {
01002     FreeStatistics(Statistics);
01003     return NULL;
01004   }
01005
01006   if (HOTELLING && Config->ProtoStyle == elliptical) {
01007     Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
01008     if (Proto != NULL) {
01009       FreeStatistics(Statistics);
01010       return Proto;
01011     }
01012   }
01013
01014   // create a histogram data structure used to evaluate distributions
01015   Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
01016                        Config->Confidence);
01017
01018   // create a prototype based on the statistics and test it
01019   switch (Config->ProtoStyle) {
01020     case spherical:
01021       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
01022       break;
01023     case elliptical:
01024       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
01025       break;
01026     case mixed:
01027       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
01028                              Config->Confidence);
01029       break;
01030     case automatic:
01031       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
01032       if (Proto != NULL)
01033         break;
01034       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
01035       if (Proto != NULL)
01036         break;
01037       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
01038                              Config->Confidence);
01039       break;
01040   }
01041   FreeStatistics(Statistics);
01042   return Proto;
01043 }                                // MakePrototype
01044
01045
01067 PROTOTYPE *MakeDegenerateProto(  //this was MinSample
01068                                uinT16 N,
01069                                CLUSTER *Cluster,
01070                                STATISTICS *Statistics,
01071                                PROTOSTYLE Style,
01072                                inT32 MinSamples) {
01073   PROTOTYPE *Proto = NULL;
01074
01075   if (MinSamples < MINSAMPLESNEEDED)
01076     MinSamples = MINSAMPLESNEEDED;
01077
01078   if (Cluster->SampleCount < MinSamples) {
01079     switch (Style) {
01080       case spherical:
01081         Proto = NewSphericalProto (N, Cluster, Statistics);
01082         break;
01083       case elliptical:
01084       case automatic:
01085         Proto = NewEllipticalProto (N, Cluster, Statistics);
01086         break;
01087       case mixed:
01088         Proto = NewMixedProto (N, Cluster, Statistics);
01089         break;
01090     }
01091     Proto->Significant = FALSE;
01092   }
01093   return (Proto);
01094 }                                // MakeDegenerateProto
01095
01109 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
01110                                CLUSTERCONFIG *Config,
01111                                CLUSTER *Cluster,
01112                                STATISTICS *Statistics) {
01113   // Fraction of the number of samples used as a range around 1 within
01114   // which a cluster has the magic size that allows a boost to the
01115   // FTable by kFTableBoostMargin, thus allowing clusters near the
01116   // magic size (equal to the number of sample characters) to be more
01117   // likely to stay together.
01118   const double kMagicSampleMargin = 0.0625;
01119   const double kFTableBoostMargin = 2.0;
01120
01121   int N = Clusterer->SampleSize;
01122   CLUSTER* Left = Cluster->Left;
01123   CLUSTER* Right = Cluster->Right;
01124   if (Left == NULL || Right == NULL)
01125     return NULL;
01126   int TotalDims = Left->SampleCount + Right->SampleCount;
01127   if (TotalDims < N + 1 || TotalDims < 2)
01128     return NULL;
01129   const int kMatrixSize = N * N * sizeof(FLOAT32);
01130   FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
01131   FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
01132   FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
01133   // Compute a new covariance matrix that only uses essential features.
01134   for (int i = 0; i < N; ++i) {
01135     int row_offset = i * N;
01136     if (!Clusterer->ParamDesc[i].NonEssential) {
01137       for (int j = 0; j < N; ++j) {
01138         if (!Clusterer->ParamDesc[j].NonEssential)
01139           Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
01140         else
01141           Covariance[j + row_offset] = 0.0f;
01142       }
01143     } else {
01144       for (int j = 0; j < N; ++j) {
01145         if (i == j)
01146           Covariance[j + row_offset] = 1.0f;
01147         else
01148           Covariance[j + row_offset] = 0.0f;
01149       }
01150     }
01151   }
01152   double err = InvertMatrix(Covariance, N, Inverse);
01153   if (err > 1) {
01154     tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
01155   }
01156   int EssentialN = 0;
01157   for (int dim = 0; dim < N; ++dim) {
01158     if (!Clusterer->ParamDesc[dim].NonEssential) {
01159       Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
01160       ++EssentialN;
01161     } else {
01162       Delta[dim] = 0.0f;
01163     }
01164   }
01165   // Compute Hotelling's T-squared.
01166   double Tsq = 0.0;
01167   for (int x = 0; x < N; ++x) {
01168     double temp = 0.0;
01169     for (int y = 0; y < N; ++y) {
01170       temp += Inverse[y + N*x] * Delta[y];
01171     }
01172     Tsq += Delta[x] * temp;
01173   }
01174   memfree(Covariance);
01175   memfree(Inverse);
01176   memfree(Delta);
01177   // Changed this function to match the formula in
01178   // Statistical Methods in Medical Research p 473
01179   // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
01180   // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
01181   double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
01182   int Fx = EssentialN;
01183   if (Fx > FTABLE_X)
01184     Fx = FTABLE_X;
01185   --Fx;
01186   int Fy = TotalDims - EssentialN - 1;
01187   if (Fy > FTABLE_Y)
01188     Fy = FTABLE_Y;
01189   --Fy;
01190   double FTarget = FTable[Fy][Fx];
01191   if (Config->MagicSamples > 0 &&
01192       TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
01193       TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
01194     // Give magic-sized clusters a magic FTable boost.
01195     FTarget += kFTableBoostMargin;
01196   }
01197   if (F < FTarget) {
01198     return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01199   }
01200   return NULL;
01201 }
01202
01203 /* MakeSphericalProto *******************************************************
01204 Parameters:     Clusterer       data struct containing samples being clustered
01205       Cluster           cluster to be made into a spherical prototype
01206       Statistics        statistical info about cluster
01207       Buckets           histogram struct used to analyze distribution
01208 Operation:      This routine tests the specified cluster to see if it can
01209       be approximated by a spherical normal distribution.  If it
01210       can be, then a new prototype is formed and returned to the
01211       caller.  If it can't be, then NULL is returned to the caller.
01212 Return:         Pointer to new spherical prototype or NULL.
01213 Exceptions:     None
01214 History:        6/1/89, DSJ, Created.
01215 ******************************************************************************/
01216 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
01217                               CLUSTER *Cluster,
01218                               STATISTICS *Statistics,
01219                               BUCKETS *Buckets) {
01220   PROTOTYPE *Proto = NULL;
01221   int i;
01222
01223   // check that each dimension is a normal distribution
01224   for (i = 0; i < Clusterer->SampleSize; i++) {
01225     if (Clusterer->ParamDesc[i].NonEssential)
01226       continue;
01227
01228     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01229       Cluster->Mean[i],
01230       sqrt ((FLOAT64) (Statistics->AvgVariance)));
01231     if (!DistributionOK (Buckets))
01232       break;
01233   }
01234   // if all dimensions matched a normal distribution, make a proto
01235   if (i >= Clusterer->SampleSize)
01236     Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
01237   return (Proto);
01238 }                                // MakeSphericalProto
01239
01240
01254 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
01255                                CLUSTER *Cluster,
01256                                STATISTICS *Statistics,
01257                                BUCKETS *Buckets) {
01258   PROTOTYPE *Proto = NULL;
01259   int i;
01260
01261   // check that each dimension is a normal distribution
01262   for (i = 0; i < Clusterer->SampleSize; i++) {
01263     if (Clusterer->ParamDesc[i].NonEssential)
01264       continue;
01265
01266     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01267       Cluster->Mean[i],
01268       sqrt ((FLOAT64) Statistics->
01269       CoVariance[i * (Clusterer->SampleSize + 1)]));
01270     if (!DistributionOK (Buckets))
01271       break;
01272   }
01273   // if all dimensions matched a normal distribution, make a proto
01274   if (i >= Clusterer->SampleSize)
01275     Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01276   return (Proto);
01277 }                                // MakeEllipticalProto
01278
01279
01298 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
01299                           CLUSTER *Cluster,
01300                           STATISTICS *Statistics,
01301                           BUCKETS *NormalBuckets,
01302                           FLOAT64 Confidence) {
01303   PROTOTYPE *Proto;
01304   int i;
01305   BUCKETS *UniformBuckets = NULL;
01306   BUCKETS *RandomBuckets = NULL;
01307
01308   // create a mixed proto to work on - initially assume all dimensions normal*/
01309   Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
01310
01311   // find the proper distribution for each dimension
01312   for (i = 0; i < Clusterer->SampleSize; i++) {
01313     if (Clusterer->ParamDesc[i].NonEssential)
01314       continue;
01315
01316     FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01317       Proto->Mean[i],
01318       sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
01319     if (DistributionOK (NormalBuckets))
01320       continue;
01321
01322     if (RandomBuckets == NULL)
01323       RandomBuckets =
01324         GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
01325     MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
01326     FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01327       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01328     if (DistributionOK (RandomBuckets))
01329       continue;
01330
01331     if (UniformBuckets == NULL)
01332       UniformBuckets =
01333         GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
01334     MakeDimUniform(i, Proto, Statistics);
01335     FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01336       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01337     if (DistributionOK (UniformBuckets))
01338       continue;
01339     break;
01340   }
01341   // if any dimension failed to match a distribution, discard the proto
01342   if (i < Clusterer->SampleSize) {
01343     FreePrototype(Proto);
01344     Proto = NULL;
01345   }
01346   return (Proto);
01347 }                                // MakeMixedProto
01348
01349
01350 /* MakeDimRandom *************************************************************
01351 Parameters:     i               index of dimension to be changed
01352       Proto             prototype whose dimension is to be altered
01353       ParamDesc description of specified dimension
01354 Operation:      This routine alters the ith dimension of the specified
01355       mixed prototype to be D_random.
01356 Return:         None
01357 Exceptions:     None
01358 History:        6/20/89, DSJ, Created.
01359 ******************************************************************************/
01360 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
01361   Proto->Distrib[i] = D_random;
01362   Proto->Mean[i] = ParamDesc->MidRange;
01363   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
01364
01365   // subtract out the previous magnitude of this dimension from the total
01366   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01367   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
01368   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01369   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01370
01371   // note that the proto Weight is irrelevant for D_random protos
01372 }                                // MakeDimRandom
01373
01374
01385 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
01386   Proto->Distrib[i] = uniform;
01387   Proto->Mean[i] = Proto->Cluster->Mean[i] +
01388     (Statistics->Min[i] + Statistics->Max[i]) / 2;
01389   Proto->Variance.Elliptical[i] =
01390     (Statistics->Max[i] - Statistics->Min[i]) / 2;
01391   if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01392     Proto->Variance.Elliptical[i] = MINVARIANCE;
01393
01394   // subtract out the previous magnitude of this dimension from the total
01395   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01396   Proto->Magnitude.Elliptical[i] =
01397     1.0 / (2.0 * Proto->Variance.Elliptical[i]);
01398   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01399   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01400
01401   // note that the proto Weight is irrelevant for uniform protos
01402 }                                // MakeDimUniform
01403
01404
01421 STATISTICS *
01422 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
01423   STATISTICS *Statistics;
01424   int i, j;
01425   FLOAT32 *CoVariance;
01426   FLOAT32 *Distance;
01427   LIST SearchState;
01428   SAMPLE *Sample;
01429   uinT32 SampleCountAdjustedForBias;
01430
01431   // allocate memory to hold the statistics results
01432   Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
01433   Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
01434   Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01435   Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01436
01437   // allocate temporary memory to hold the sample to mean distances
01438   Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01439
01440   // initialize the statistics
01441   Statistics->AvgVariance = 1.0;
01442   CoVariance = Statistics->CoVariance;
01443   for (i = 0; i < N; i++) {
01444     Statistics->Min[i] = 0.0;
01445     Statistics->Max[i] = 0.0;
01446     for (j = 0; j < N; j++, CoVariance++)
01447       *CoVariance = 0;
01448   }
01449   // find each sample in the cluster and merge it into the statistics
01450   InitSampleSearch(SearchState, Cluster);
01451   while ((Sample = NextSample (&SearchState)) != NULL) {
01452     for (i = 0; i < N; i++) {
01453       Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
01454       if (ParamDesc[i].Circular) {
01455         if (Distance[i] > ParamDesc[i].HalfRange)
01456           Distance[i] -= ParamDesc[i].Range;
01457         if (Distance[i] < -ParamDesc[i].HalfRange)
01458           Distance[i] += ParamDesc[i].Range;
01459       }
01460       if (Distance[i] < Statistics->Min[i])
01461         Statistics->Min[i] = Distance[i];
01462       if (Distance[i] > Statistics->Max[i])
01463         Statistics->Max[i] = Distance[i];
01464     }
01465     CoVariance = Statistics->CoVariance;
01466     for (i = 0; i < N; i++)
01467       for (j = 0; j < N; j++, CoVariance++)
01468         *CoVariance += Distance[i] * Distance[j];
01469   }
01470   // normalize the variances by the total number of samples
01471   // use SampleCount-1 instead of SampleCount to get an unbiased estimate
01472   // also compute the geometic mean of the diagonal variances
01473   // ensure that clusters with only 1 sample are handled correctly
01474   if (Cluster->SampleCount > 1)
01475     SampleCountAdjustedForBias = Cluster->SampleCount - 1;
01476   else
01477     SampleCountAdjustedForBias = 1;
01478   CoVariance = Statistics->CoVariance;
01479   for (i = 0; i < N; i++)
01480   for (j = 0; j < N; j++, CoVariance++) {
01481     *CoVariance /= SampleCountAdjustedForBias;
01482     if (j == i) {
01483       if (*CoVariance < MINVARIANCE)
01484         *CoVariance = MINVARIANCE;
01485       Statistics->AvgVariance *= *CoVariance;
01486     }
01487   }
01488   Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
01489                                        1.0 / N);
01490
01491   // release temporary memory and return
01492   memfree(Distance);
01493   return (Statistics);
01494 }                                // ComputeStatistics
01495
01496
01510 PROTOTYPE *NewSphericalProto(uinT16 N,
01511                              CLUSTER *Cluster,
01512                              STATISTICS *Statistics) {
01513   PROTOTYPE *Proto;
01514
01515   Proto = NewSimpleProto (N, Cluster);
01516
01517   Proto->Variance.Spherical = Statistics->AvgVariance;
01518   if (Proto->Variance.Spherical < MINVARIANCE)
01519     Proto->Variance.Spherical = MINVARIANCE;
01520
01521   Proto->Magnitude.Spherical =
01522     1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
01523   Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
01524                                      (double) N);
01525   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
01526   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01527
01528   return (Proto);
01529 }                                // NewSphericalProto
01530
01531
01544 PROTOTYPE *NewEllipticalProto(inT16 N,
01545                               CLUSTER *Cluster,
01546                               STATISTICS *Statistics) {
01547   PROTOTYPE *Proto;
01548   FLOAT32 *CoVariance;
01549   int i;
01550
01551   Proto = NewSimpleProto (N, Cluster);
01552   Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01553   Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01554   Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01555
01556   CoVariance = Statistics->CoVariance;
01557   Proto->TotalMagnitude = 1.0;
01558   for (i = 0; i < N; i++, CoVariance += N + 1) {
01559     Proto->Variance.Elliptical[i] = *CoVariance;
01560     if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01561       Proto->Variance.Elliptical[i] = MINVARIANCE;
01562
01563     Proto->Magnitude.Elliptical[i] =
01564       1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
01565     Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
01566     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01567   }
01568   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01569   Proto->Style = elliptical;
01570   return (Proto);
01571 }                                // NewEllipticalProto
01572
01573
01589 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
01590   PROTOTYPE *Proto;
01591   int i;
01592
01593   Proto = NewEllipticalProto (N, Cluster, Statistics);
01594   Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
01595
01596   for (i = 0; i < N; i++) {
01597     Proto->Distrib[i] = normal;
01598   }
01599   Proto->Style = mixed;
01600   return (Proto);
01601 }                                // NewMixedProto
01602
01603
01614 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
01615   PROTOTYPE *Proto;
01616   int i;
01617
01618   Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
01619   Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01620
01621   for (i = 0; i < N; i++)
01622     Proto->Mean[i] = Cluster->Mean[i];
01623   Proto->Distrib = NULL;
01624
01625   Proto->Significant = TRUE;
01626   Proto->Merged = FALSE;
01627   Proto->Style = spherical;
01628   Proto->NumSamples = Cluster->SampleCount;
01629   Proto->Cluster = Cluster;
01630   Proto->Cluster->Prototype = TRUE;
01631   return (Proto);
01632 }                                // NewSimpleProto
01633
01634
01655 BOOL8
01656 Independent (PARAM_DESC ParamDesc[],
01657 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
01658   int i, j;
01659   FLOAT32 *VARii;                // points to ith on-diagonal element
01660   FLOAT32 *VARjj;                // points to jth on-diagonal element
01661   FLOAT32 CorrelationCoeff;
01662
01663   VARii = CoVariance;
01664   for (i = 0; i < N; i++, VARii += N + 1) {
01665     if (ParamDesc[i].NonEssential)
01666       continue;
01667
01668     VARjj = VARii + N + 1;
01669     CoVariance = VARii + 1;
01670     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
01671       if (ParamDesc[j].NonEssential)
01672         continue;
01673
01674       if ((*VARii == 0.0) || (*VARjj == 0.0))
01675         CorrelationCoeff = 0.0;
01676       else
01677         CorrelationCoeff =
01678           sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
01679       if (CorrelationCoeff > Independence)
01680         return (FALSE);
01681     }
01682   }
01683   return (TRUE);
01684 }                                // Independent
01685
01686
01705 BUCKETS *GetBuckets(CLUSTERER* clusterer,
01706                     DISTRIBUTION Distribution,
01707                     uinT32 SampleCount,
01708                     FLOAT64 Confidence) {
01709   // Get an old bucket structure with the same number of buckets.
01710   uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
01711   BUCKETS *Buckets =
01712       clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
01713
01714   // If a matching bucket structure is not found, make one and save it.
01715   if (Buckets == NULL) {
01716     Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
01717     clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
01718         Buckets;
01719   } else {
01720     // Just adjust the existing buckets.
01721     if (SampleCount != Buckets->SampleCount)
01722       AdjustBuckets(Buckets, SampleCount);
01723     if (Confidence != Buckets->Confidence) {
01724       Buckets->Confidence = Confidence;
01725       Buckets->ChiSquared = ComputeChiSquared(
01726           DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
01727           Confidence);
01728     }
01729     InitBuckets(Buckets);
01730   }
01731   return Buckets;
01732 }                                // GetBuckets
01733
01734
01755 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
01756                      uinT32 SampleCount,
01757                      FLOAT64 Confidence) {
01758   const DENSITYFUNC DensityFunction[] =
01759     { NormalDensity, UniformDensity, UniformDensity };
01760   int i, j;
01761   BUCKETS *Buckets;
01762   FLOAT64 BucketProbability;
01763   FLOAT64 NextBucketBoundary;
01764   FLOAT64 Probability;
01765   FLOAT64 ProbabilityDelta;
01766   FLOAT64 LastProbDensity;
01767   FLOAT64 ProbDensity;
01768   uinT16 CurrentBucket;
01769   BOOL8 Symmetrical;
01770
01771   // allocate memory needed for data structure
01772   Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
01773   Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
01774   Buckets->SampleCount = SampleCount;
01775   Buckets->Confidence = Confidence;
01776   Buckets->Count = reinterpret_cast<uinT32*>(
01777       Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
01778   Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
01779       Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
01780
01781   // initialize simple fields
01782   Buckets->Distribution = Distribution;
01783   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
01784     Buckets->Count[i] = 0;
01785     Buckets->ExpectedCount[i] = 0.0;
01786   }
01787
01788   // all currently defined distributions are symmetrical
01789   Symmetrical = TRUE;
01790   Buckets->ChiSquared = ComputeChiSquared(
01791       DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
01792
01793   if (Symmetrical) {
01794     // allocate buckets so that all have approx. equal probability
01795     BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
01796
01797     // distribution is symmetric so fill in upper half then copy
01798     CurrentBucket = Buckets->NumberOfBuckets / 2;
01799     if (Odd (Buckets->NumberOfBuckets))
01800       NextBucketBoundary = BucketProbability / 2;
01801     else
01802       NextBucketBoundary = BucketProbability;
01803
01804     Probability = 0.0;
01805     LastProbDensity =
01806       (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
01807     for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
01808       ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
01809       ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
01810       Probability += ProbabilityDelta;
01811       if (Probability > NextBucketBoundary) {
01812         if (CurrentBucket < Buckets->NumberOfBuckets - 1)
01813           CurrentBucket++;
01814         NextBucketBoundary += BucketProbability;
01815       }
01816       Buckets->Bucket[i] = CurrentBucket;
01817       Buckets->ExpectedCount[CurrentBucket] +=
01818         (FLOAT32) (ProbabilityDelta * SampleCount);
01819       LastProbDensity = ProbDensity;
01820     }
01821     // place any leftover probability into the last bucket
01822     Buckets->ExpectedCount[CurrentBucket] +=
01823       (FLOAT32) ((0.5 - Probability) * SampleCount);
01824
01825     // copy upper half of distribution to lower half
01826     for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
01827       Buckets->Bucket[i] =
01828         Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
01829
01830     // copy upper half of expected counts to lower half
01831     for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
01832       Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
01833   }
01834   return Buckets;
01835 }                                // MakeBuckets
01836
01837
01838 //---------------------------------------------------------------------------
01839 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
01840 /*
01841  **     Parameters:
01842  **             SampleCount     number of samples to be tested
01843   **    Operation:
01844  **             This routine computes the optimum number of histogram
01845  **             buckets that should be used in a chi-squared goodness of
01846  **             fit test for the specified number of samples.  The optimum
01847  **             number is computed based on Table 4.1 on pg. 147 of
01848  **             "Measurement and Analysis of Random Data" by Bendat & Piersol.
01849  **             Linear interpolation is used to interpolate between table
01850  **             values.  The table is intended for a 0.05 level of
01851  **             significance (alpha).  This routine assumes that it is
01852  **             equally valid for other alpha's, which may not be true.
01853  **     Return:
01854  **             Optimum number of histogram buckets
01855  **     Exceptions:
01856  **             None
01857  **     History:
01858  **             6/5/89, DSJ, Created.
01859  */
01860   uinT8 Last, Next;
01861   FLOAT32 Slope;
01862
01863   if (SampleCount < kCountTable[0])
01864     return kBucketsTable[0];
01865
01866   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
01867     if (SampleCount <= kCountTable[Next]) {
01868       Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
01869           (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
01870       return ((uinT16) (kBucketsTable[Last] +
01871           Slope * (SampleCount - kCountTable[Last])));
01872     }
01873   }
01874   return kBucketsTable[Last];
01875 }                                // OptimumNumberOfBuckets
01876
01877
01878 //---------------------------------------------------------------------------
01879 FLOAT64
01880 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
01881 /*
01882  **     Parameters:
01883  **             DegreesOfFreedom        determines shape of distribution
01884  **             Alpha                   probability of right tail
01885  **     Operation:
01886  **             This routine computes the chi-squared value which will
01887  **             leave a cumulative probability of Alpha in the right tail
01888  **             of a chi-squared distribution with the specified number of
01889  **             degrees of freedom.  Alpha must be between 0 and 1.
01890  **             DegreesOfFreedom must be even.  The routine maintains an
01891  **             array of lists.  Each list corresponds to a different
01892  **             number of degrees of freedom.  Each entry in the list
01893  **             corresponds to a different alpha value and its corresponding
01894  **             chi-squared value.  Therefore, once a particular chi-squared
01895  **             value is computed, it is stored in the list and never
01896  **             needs to be computed again.
01897  **     Return: Desired chi-squared value
01898  **     Exceptions: none
01899  **     History: 6/5/89, DSJ, Created.
01900  */
01901 #define CHIACCURACY     0.01
01902 #define MINALPHA  (1e-200)
01903 {
01904   static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
01905
01906   CHISTRUCT *OldChiSquared;
01907   CHISTRUCT SearchKey;
01908
01909   // limit the minimum alpha that can be used - if alpha is too small
01910   //      it may not be possible to compute chi-squared.
01911   Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
01912   if (Odd (DegreesOfFreedom))
01913     DegreesOfFreedom++;
01914
01915   /* find the list of chi-squared values which have already been computed
01916      for the specified number of degrees of freedom.  Search the list for
01917      the desired chi-squared. */
01918   SearchKey.Alpha = Alpha;
01919   OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
01920     &SearchKey, AlphaMatch));
01921
01922   if (OldChiSquared == NULL) {
01923     OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
01924     OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
01925       (FLOAT64) DegreesOfFreedom,
01926       (FLOAT64) CHIACCURACY);
01927     ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
01928       OldChiSquared);
01929   }
01930   else {
01931     // further optimization might move OldChiSquared to front of list
01932   }
01933
01934   return (OldChiSquared->ChiSquared);
01935
01936 }                                // ComputeChiSquared
01937
01938
01939 //---------------------------------------------------------------------------
01940 FLOAT64 NormalDensity(inT32 x) {
01941 /*
01942  **     Parameters:
01943  **             x       number to compute the normal probability density for
01944  **     Globals:
01945  **             kNormalMean     mean of a discrete normal distribution
01946  **             kNormalVariance variance of a discrete normal distribution
01947  **             kNormalMagnitude        magnitude of a discrete normal distribution
01948  **     Operation:
01949  **             This routine computes the probability density function
01950  **             of a discrete normal distribution defined by the global
01951  **             variables kNormalMean, kNormalVariance, and kNormalMagnitude.
01952  **             Normal magnitude could, of course, be computed in terms of
01953  **             the normal variance but it is precomputed for efficiency.
01954  **     Return:
01955  **             The value of the normal distribution at x.
01956  **     Exceptions:
01957  **             None
01958  **     History:
01959  **             6/4/89, DSJ, Created.
01960  */
01961   FLOAT64 Distance;
01962
01963   Distance = x - kNormalMean;
01964   return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
01965 }                                // NormalDensity
01966
01967
01968 //---------------------------------------------------------------------------
01969 FLOAT64 UniformDensity(inT32 x) {
01970 /*
01971  **     Parameters:
01972  **             x       number to compute the uniform probability density for
01973  **     Operation:
01974  **             This routine computes the probability density function
01975  **             of a uniform distribution at the specified point.  The
01976  **             range of the distribution is from 0 to BUCKETTABLESIZE.
01977  **     Return:
01978  **             The value of the uniform distribution at x.
01979  **     Exceptions:
01980  **             None
01981  **     History:
01982  **             6/5/89, DSJ, Created.
01983  */
01984   static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
01985
01986   if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
01987     return UniformDistributionDensity;
01988   else
01989     return (FLOAT64) 0.0;
01990 }                                // UniformDensity
01991
01992
01993 //---------------------------------------------------------------------------
01994 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
01995 /*
01996  **     Parameters:
01997  **             f1      value of function at x1
01998  **             f2      value of function at x2
01999  **             Dx      x2 - x1 (should always be positive)
02000  **     Operation:
02001  **             This routine computes a trapezoidal approximation to the
02002  **             integral of a function over a small delta in x.
02003  **     Return:
02004  **             Approximation of the integral of the function from x1 to x2.
02005  **     Exceptions:
02006  **             None
02007  **     History:
02008  **             6/5/89, DSJ, Created.
02009  */
02010   return (f1 + f2) * Dx / 2.0;
02011 }                                // Integral
02012
02013
02014 //---------------------------------------------------------------------------
02015 void FillBuckets(BUCKETS *Buckets,
02016                  CLUSTER *Cluster,
02017                  uinT16 Dim,
02018                  PARAM_DESC *ParamDesc,
02019                  FLOAT32 Mean,
02020                  FLOAT32 StdDev) {
02021 /*
02022  **     Parameters:
02023  **             Buckets         histogram buckets to count samples
02024  **             Cluster         cluster whose samples are being analyzed
02025  **             Dim             dimension of samples which is being analyzed
02026  **             ParamDesc       description of the dimension
02027  **             Mean            "mean" of the distribution
02028  **             StdDev          "standard deviation" of the distribution
02029  **     Operation:
02030  **             This routine counts the number of cluster samples which
02031  **             fall within the various histogram buckets in Buckets.  Only
02032  **             one dimension of each sample is examined.  The exact meaning
02033  **             of the Mean and StdDev parameters depends on the
02034  **             distribution which is being analyzed (this info is in the
02035  **             Buckets data structure).  For normal distributions, Mean
02036  **             and StdDev have the expected meanings.  For uniform and
02037  **             random distributions the Mean is the center point of the
02038  **             range and the StdDev is 1/2 the range.  A dimension with
02039  **             zero standard deviation cannot be statistically analyzed.
02040  **             In this case, a pseudo-analysis is used.
02041  **     Return:
02042  **             None (the Buckets data structure is filled in)
02043  **     Exceptions:
02044  **             None
02045  **     History:
02046  **             6/5/89, DSJ, Created.
02047  */
02048   uinT16 BucketID;
02049   int i;
02050   LIST SearchState;
02051   SAMPLE *Sample;
02052
02053   // initialize the histogram bucket counts to 0
02054   for (i = 0; i < Buckets->NumberOfBuckets; i++)
02055     Buckets->Count[i] = 0;
02056
02057   if (StdDev == 0.0) {
02058     /* if the standard deviation is zero, then we can't statistically
02059        analyze the cluster.  Use a pseudo-analysis: samples exactly on
02060        the mean are distributed evenly across all buckets.  Samples greater
02061        than the mean are placed in the last bucket; samples less than the
02062        mean are placed in the first bucket. */
02063
02064     InitSampleSearch(SearchState, Cluster);
02065     i = 0;
02066     while ((Sample = NextSample (&SearchState)) != NULL) {
02067       if (Sample->Mean[Dim] > Mean)
02068         BucketID = Buckets->NumberOfBuckets - 1;
02069       else if (Sample->Mean[Dim] < Mean)
02070         BucketID = 0;
02071       else
02072         BucketID = i;
02073       Buckets->Count[BucketID] += 1;
02074       i++;
02075       if (i >= Buckets->NumberOfBuckets)
02076         i = 0;
02077     }
02078   }
02079   else {
02080     // search for all samples in the cluster and add to histogram buckets
02081     InitSampleSearch(SearchState, Cluster);
02082     while ((Sample = NextSample (&SearchState)) != NULL) {
02083       switch (Buckets->Distribution) {
02084         case normal:
02085           BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
02086             Mean, StdDev);
02087           break;
02088         case D_random:
02089         case uniform:
02090           BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
02091             Mean, StdDev);
02092           break;
02093         default:
02094           BucketID = 0;
02095       }
02096       Buckets->Count[Buckets->Bucket[BucketID]] += 1;
02097     }
02098   }
02099 }                                // FillBuckets
02100
02101
02102 //---------------------------------------------------------------------------*/
02103 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
02104                     FLOAT32 x,
02105                     FLOAT32 Mean,
02106                     FLOAT32 StdDev) {
02107 /*
02108  **     Parameters:
02109  **             ParamDesc       used to identify circular dimensions
02110  **             x               value to be normalized
02111  **             Mean            mean of normal distribution
02112  **             StdDev          standard deviation of normal distribution
02113  **     Operation:
02114  **             This routine determines which bucket x falls into in the
02115  **             discrete normal distribution defined by kNormalMean
02116  **             and kNormalStdDev.  x values which exceed the range of
02117  **             the discrete distribution are clipped.
02118  **     Return:
02119  **             Bucket number into which x falls
02120  **     Exceptions:
02121  **             None
02122  **     History:
02123  **             6/5/89, DSJ, Created.
02124  */
02125   FLOAT32 X;
02126
02127   // wraparound circular parameters if necessary
02128   if (ParamDesc->Circular) {
02129     if (x - Mean > ParamDesc->HalfRange)
02130       x -= ParamDesc->Range;
02131     else if (x - Mean < -ParamDesc->HalfRange)
02132       x += ParamDesc->Range;
02133   }
02134
02135   X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
02136   if (X < 0)
02137     return 0;
02138   if (X > BUCKETTABLESIZE - 1)
02139     return ((uinT16) (BUCKETTABLESIZE - 1));
02140   return (uinT16) floor((FLOAT64) X);
02141 }                                // NormalBucket
02142
02143
02144 //---------------------------------------------------------------------------
02145 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
02146                      FLOAT32 x,
02147                      FLOAT32 Mean,
02148                      FLOAT32 StdDev) {
02149 /*
02150  **     Parameters:
02151  **             ParamDesc       used to identify circular dimensions
02152  **             x               value to be normalized
02153  **             Mean            center of range of uniform distribution
02154  **             StdDev          1/2 the range of the uniform distribution
02155  **     Operation:
02156  **             This routine determines which bucket x falls into in the
02157  **             discrete uniform distribution defined by
02158  **             BUCKETTABLESIZE.  x values which exceed the range of
02159  **             the discrete distribution are clipped.
02160  **     Return:
02161  **             Bucket number into which x falls
02162  **     Exceptions:
02163  **             None
02164  **     History:
02165  **             6/5/89, DSJ, Created.
02166  */
02167   FLOAT32 X;
02168
02169   // wraparound circular parameters if necessary
02170   if (ParamDesc->Circular) {
02171     if (x - Mean > ParamDesc->HalfRange)
02172       x -= ParamDesc->Range;
02173     else if (x - Mean < -ParamDesc->HalfRange)
02174       x += ParamDesc->Range;
02175   }
02176
02177   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
02178   if (X < 0)
02179     return 0;
02180   if (X > BUCKETTABLESIZE - 1)
02181     return (uinT16) (BUCKETTABLESIZE - 1);
02182   return (uinT16) floor((FLOAT64) X);
02183 }                                // UniformBucket
02184
02185
02186 //---------------------------------------------------------------------------
02187 BOOL8 DistributionOK(BUCKETS *Buckets) {
02188 /*
02189  **     Parameters:
02190  **             Buckets         histogram data to perform chi-square test on
02191  **     Operation:
02192  **             This routine performs a chi-square goodness of fit test
02193  **             on the histogram data in the Buckets data structure.  TRUE
02194  **             is returned if the histogram matches the probability
02195  **             distribution which was specified when the Buckets
02196  **             structure was originally created.  Otherwise FALSE is
02197  **             returned.
02198  **     Return:
02199  **             TRUE if samples match distribution, FALSE otherwise
02200  **     Exceptions:
02201  **             None
02202  **     History:
02203  **             6/5/89, DSJ, Created.
02204  */
02205   FLOAT32 FrequencyDifference;
02206   FLOAT32 TotalDifference;
02207   int i;
02208
02209   // compute how well the histogram matches the expected histogram
02210   TotalDifference = 0.0;
02211   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02212     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
02213     TotalDifference += (FrequencyDifference * FrequencyDifference) /
02214       Buckets->ExpectedCount[i];
02215   }
02216
02217   // test to see if the difference is more than expected
02218   if (TotalDifference > Buckets->ChiSquared)
02219     return FALSE;
02220   else
02221     return TRUE;
02222 }                                // DistributionOK
02223
02224
02225 //---------------------------------------------------------------------------
02226 void FreeStatistics(STATISTICS *Statistics) {
02227 /*
02228  **     Parameters:
02229  **             Statistics      pointer to data structure to be freed
02230  **     Operation:
02231  **             This routine frees the memory used by the statistics
02232  **             data structure.
02233  **     Return:
02234  **             None
02235  **     Exceptions:
02236  **             None
02237  **     History:
02238  **             6/5/89, DSJ, Created.
02239  */
02240   memfree (Statistics->CoVariance);
02241   memfree (Statistics->Min);
02242   memfree (Statistics->Max);
02243   memfree(Statistics);
02244 }                                // FreeStatistics
02245
02246
02247 //---------------------------------------------------------------------------
02248 void FreeBuckets(BUCKETS *buckets) {
02249 /*
02250  **  Parameters:
02251  **      buckets  pointer to data structure to be freed
02252  **  Operation:
02253  **      This routine properly frees the memory used by a BUCKETS.
02254  */
02255   Efree(buckets->Count);
02256   Efree(buckets->ExpectedCount);
02257   Efree(buckets);
02258 }                                // FreeBuckets
02259
02260
02261 //---------------------------------------------------------------------------
02262 void FreeCluster(CLUSTER *Cluster) {
02263 /*
02264  **     Parameters:
02265  **             Cluster         pointer to cluster to be freed
02266  **     Operation:
02267  **             This routine frees the memory consumed by the specified
02268  **             cluster and all of its subclusters.  This is done by
02269  **             recursive calls to FreeCluster().
02270  **     Return:
02271  **             None
02272  **     Exceptions:
02273  **             None
02274  **     History:
02275  **             6/6/89, DSJ, Created.
02276  */
02277   if (Cluster != NULL) {
02278     FreeCluster (Cluster->Left);
02279     FreeCluster (Cluster->Right);
02280     memfree(Cluster);
02281   }
02282 }                                // FreeCluster
02283
02284
02285 //---------------------------------------------------------------------------
02286 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
02287 /*
02288  **     Parameters:
02289  **             Distribution            distribution being tested for
02290  **             HistogramBuckets        number of buckets in chi-square test
02291  **     Operation:
02292  **             This routine computes the degrees of freedom that should
02293  **             be used in a chi-squared test with the specified number of
02294  **             histogram buckets.  The result is always rounded up to
02295  **             the next even number so that the value of chi-squared can be
02296  **             computed more easily.  This will cause the value of
02297  **             chi-squared to be higher than the optimum value, resulting
02298  **             in the chi-square test being more lenient than optimum.
02299  **     Return: The number of degrees of freedom for a chi-square test
02300  **     Exceptions: none
02301  **     History: Thu Aug  3 14:04:18 1989, DSJ, Created.
02302  */
02303   static uinT8 DegreeOffsets[] = { 3, 3, 1 };
02304
02305   uinT16 AdjustedNumBuckets;
02306
02307   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
02308   if (Odd (AdjustedNumBuckets))
02309     AdjustedNumBuckets++;
02310   return (AdjustedNumBuckets);
02311
02312 }                                // DegreesOfFreedom
02313
02314
02315 //---------------------------------------------------------------------------
02316 int NumBucketsMatch(void *arg1,    // BUCKETS *Histogram,
02317                     void *arg2) {  // uinT16 *DesiredNumberOfBuckets)
02318 /*
02319  **     Parameters:
02320  **             Histogram       current histogram being tested for a match
02321  **             DesiredNumberOfBuckets  match key
02322  **     Operation:
02323  **             This routine is used to search a list of histogram data
02324  **             structures to find one with the specified number of
02325  **             buckets.  It is called by the list search routines.
02326  **     Return: TRUE if Histogram matches DesiredNumberOfBuckets
02327  **     Exceptions: none
02328  **     History: Thu Aug  3 14:17:33 1989, DSJ, Created.
02329  */
02330   BUCKETS *Histogram = (BUCKETS *) arg1;
02331   uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
02332
02333   return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
02334
02335 }                                // NumBucketsMatch
02336
02337
02338 //---------------------------------------------------------------------------
02339 int ListEntryMatch(void *arg1,    //ListNode
02340                    void *arg2) {  //Key
02341 /*
02342  **     Parameters: none
02343  **     Operation:
02344  **             This routine is used to search a list for a list node
02345  **             whose contents match Key.  It is called by the list
02346  **             delete_d routine.
02347  **     Return: TRUE if ListNode matches Key
02348  **     Exceptions: none
02349  **     History: Thu Aug  3 14:23:58 1989, DSJ, Created.
02350  */
02351   return (arg1 == arg2);
02352
02353 }                                // ListEntryMatch
02354
02355
02356 //---------------------------------------------------------------------------
02357 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
02358 /*
02359  **     Parameters:
02360  **             Buckets         histogram data structure to adjust
02361  **             NewSampleCount  new sample count to adjust to
02362  **     Operation:
02363  **             This routine multiplies each ExpectedCount histogram entry
02364  **             by NewSampleCount/OldSampleCount so that the histogram
02365  **             is now adjusted to the new sample count.
02366  **     Return: none
02367  **     Exceptions: none
02368  **     History: Thu Aug  3 14:31:14 1989, DSJ, Created.
02369  */
02370   int i;
02371   FLOAT64 AdjustFactor;
02372
02373   AdjustFactor = (((FLOAT64) NewSampleCount) /
02374     ((FLOAT64) Buckets->SampleCount));
02375
02376   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02377     Buckets->ExpectedCount[i] *= AdjustFactor;
02378   }
02379
02380   Buckets->SampleCount = NewSampleCount;
02381
02382 }                                // AdjustBuckets
02383
02384
02385 //---------------------------------------------------------------------------
02386 void InitBuckets(BUCKETS *Buckets) {
02387 /*
02388  **     Parameters:
02389  **             Buckets         histogram data structure to init
02390  **     Operation:
02391  **             This routine sets the bucket counts in the specified histogram
02392  **             to zero.
02393  **     Return: none
02394  **     Exceptions: none
02395  **     History: Thu Aug  3 14:31:14 1989, DSJ, Created.
02396  */
02397   int i;
02398
02399   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02400     Buckets->Count[i] = 0;
02401   }
02402
02403 }                                // InitBuckets
02404
02405
02406 //---------------------------------------------------------------------------
02407 int AlphaMatch(void *arg1,    //CHISTRUCT                             *ChiStruct,
02408                void *arg2) {  //CHISTRUCT                             *SearchKey)
02409 /*
02410  **     Parameters:
02411  **             ChiStruct       chi-squared struct being tested for a match
02412  **             SearchKey       chi-squared struct that is the search key
02413  **     Operation:
02414  **             This routine is used to search a list of structures which
02415  **             hold pre-computed chi-squared values for a chi-squared
02416  **             value whose corresponding alpha field matches the alpha
02417  **             field of SearchKey.
02418  **             It is called by the list search routines.
02419  **     Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
02420  **     Exceptions: none
02421  **     History: Thu Aug  3 14:17:33 1989, DSJ, Created.
02422  */
02423   CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
02424   CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
02425
02426   return (ChiStruct->Alpha == SearchKey->Alpha);
02427
02428 }                                // AlphaMatch
02429
02430
02431 //---------------------------------------------------------------------------
02432 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
02433 /*
02434  **     Parameters:
02435  **             DegreesOfFreedom        degrees of freedom for new chi value
02436  **             Alpha                   confidence level for new chi value
02437  **     Operation:
02438  **             This routine allocates a new data structure which is used
02439  **             to hold a chi-squared value along with its associated
02440  **             number of degrees of freedom and alpha value.
02441  **     Return: none
02442  **     Exceptions: none
02443  **     History: Fri Aug  4 11:04:59 1989, DSJ, Created.
02444  */
02445   CHISTRUCT *NewChiStruct;
02446
02447   NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
02448   NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
02449   NewChiStruct->Alpha = Alpha;
02450   return (NewChiStruct);
02451
02452 }                                // NewChiStruct
02453
02454
02455 //---------------------------------------------------------------------------
02456 FLOAT64
02457 Solve (SOLVEFUNC Function,
02458 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
02459 /*
02460  **     Parameters:
02461  **             Function        function whose zero is to be found
02462  **             FunctionParams  arbitrary data to pass to function
02463  **             InitialGuess    point to start solution search at
02464  **             Accuracy        maximum allowed error
02465  **     Operation:
02466  **             This routine attempts to find an x value at which Function
02467  **             goes to zero (i.e. a root of the function ).  It will only
02468  **             work correctly if a solution actually exists and there
02469  **             are no extrema between the solution and the InitialGuess.
02470  **             The algorithms used are extremely primitive.
02471  **     Return: Solution of function ( x for which f(x) = 0 ).
02472  **     Exceptions: none
02473  **     History: Fri Aug  4 11:08:59 1989, DSJ, Created.
02474  */
02475 #define INITIALDELTA    0.1
02476 #define  DELTARATIO     0.1
02477 {
02478   FLOAT64 x;
02479   FLOAT64 f;
02480   FLOAT64 Slope;
02481   FLOAT64 Delta;
02482   FLOAT64 NewDelta;
02483   FLOAT64 xDelta;
02484   FLOAT64 LastPosX, LastNegX;
02485
02486   x = InitialGuess;
02487   Delta = INITIALDELTA;
02488   LastPosX = MAX_FLOAT32;
02489   LastNegX = -MAX_FLOAT32;
02490   f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02491   while (Abs (LastPosX - LastNegX) > Accuracy) {
02492     // keep track of outer bounds of current estimate
02493     if (f < 0)
02494       LastNegX = x;
02495     else
02496       LastPosX = x;
02497
02498     // compute the approx. slope of f(x) at the current point
02499     Slope =
02500       ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
02501
02502     // compute the next solution guess */
02503     xDelta = f / Slope;
02504     x -= xDelta;
02505
02506     // reduce the delta used for computing slope to be a fraction of
02507     //the amount moved to get to the new guess
02508     NewDelta = Abs (xDelta) * DELTARATIO;
02509     if (NewDelta < Delta)
02510       Delta = NewDelta;
02511
02512     // compute the value of the function at the new guess
02513     f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02514   }
02515   return (x);
02516
02517 }                                // Solve
02518
02519
02520 //---------------------------------------------------------------------------
02521 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
02522 /*
02523  **     Parameters:
02524  **             ChiParams       contains degrees of freedom and alpha
02525  **             x               value of chi-squared to evaluate
02526  **     Operation:
02527  **             This routine computes the area under a chi density curve
02528  **             from 0 to x, minus the desired area under the curve.  The
02529  **             number of degrees of freedom of the chi curve is specified
02530  **             in the ChiParams structure.  The desired area is also
02531  **             specified in the ChiParams structure as Alpha ( or 1 minus
02532  **             the desired area ).  This routine is intended to be passed
02533  **             to the Solve() function to find the value of chi-squared
02534  **             which will yield a desired area under the right tail of
02535  **             the chi density curve.  The function will only work for
02536  **             even degrees of freedom.  The equations are based on
02537  **             integrating the chi density curve in parts to obtain
02538  **             a series that can be used to compute the area under the
02539  **             curve.
02540  **     Return: Error between actual and desired area under the chi curve.
02541  **     Exceptions: none
02542  **     History: Fri Aug  4 12:48:41 1989, DSJ, Created.
02543  */
02544   int i, N;
02545   FLOAT64 SeriesTotal;
02546   FLOAT64 Denominator;
02547   FLOAT64 PowerOfx;
02548
02549   N = ChiParams->DegreesOfFreedom / 2 - 1;
02550   SeriesTotal = 1;
02551   Denominator = 1;
02552   PowerOfx = 1;
02553   for (i = 1; i <= N; i++) {
02554     Denominator *= 2 * i;
02555     PowerOfx *= x;
02556     SeriesTotal += PowerOfx / Denominator;
02557   }
02558   return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
02559
02560 }                                // ChiArea
02561
02562
02563 //---------------------------------------------------------------------------
02564 BOOL8
02565 MultipleCharSamples (CLUSTERER * Clusterer,
02566 CLUSTER * Cluster, FLOAT32 MaxIllegal)
02567 /*
02568  **     Parameters:
02569  **             Clusterer       data structure holding cluster tree
02570  **             Cluster         cluster containing samples to be tested
02571  **             MaxIllegal      max percentage of samples allowed to have
02572  **                             more than 1 feature in the cluster
02573  **     Operation:
02574  **             This routine looks at all samples in the specified cluster.
02575  **             It computes a running estimate of the percentage of the
02576  **             charaters which have more than 1 sample in the cluster.
02577  **             When this percentage exceeds MaxIllegal, TRUE is returned.
02578  **             Otherwise FALSE is returned.  The CharID
02579  **             fields must contain integers which identify the training
02580  **             characters which were used to generate the sample.  One
02581  **             integer is used for each sample.  The NumChar field in
02582  **             the Clusterer must contain the number of characters in the
02583  **             training set.  All CharID fields must be between 0 and
02584  **             NumChar-1.  The main function of this routine is to help
02585  **             identify clusters which need to be split further, i.e. if
02586  **             numerous training characters have 2 or more features which are
02587  **             contained in the same cluster, then the cluster should be
02588  **             split.
02589  **     Return: TRUE if the cluster should be split, FALSE otherwise.
02590  **     Exceptions: none
02591  **     History: Wed Aug 30 11:13:05 1989, DSJ, Created.
02592  **             2/22/90, DSJ, Added MaxIllegal control rather than always
02593  **                             splitting illegal clusters.
02594  */
02595 #define ILLEGAL_CHAR    2
02596 {
02597   static BOOL8 *CharFlags = NULL;
02598   static inT32 NumFlags = 0;
02599   int i;
02600   LIST SearchState;
02601   SAMPLE *Sample;
02602   inT32 CharID;
02603   inT32 NumCharInCluster;
02604   inT32 NumIllegalInCluster;
02605   FLOAT32 PercentIllegal;
02606
02607   // initial estimate assumes that no illegal chars exist in the cluster
02608   NumCharInCluster = Cluster->SampleCount;
02609   NumIllegalInCluster = 0;
02610
02611   if (Clusterer->NumChar > NumFlags) {
02612     if (CharFlags != NULL)
02613       memfree(CharFlags);
02614     NumFlags = Clusterer->NumChar;
02615     CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
02616   }
02617
02618   for (i = 0; i < NumFlags; i++)
02619     CharFlags[i] = FALSE;
02620
02621   // find each sample in the cluster and check if we have seen it before
02622   InitSampleSearch(SearchState, Cluster);
02623   while ((Sample = NextSample (&SearchState)) != NULL) {
02624     CharID = Sample->CharID;
02625     if (CharFlags[CharID] == FALSE) {
02626       CharFlags[CharID] = TRUE;
02627     }
02628     else {
02629       if (CharFlags[CharID] == TRUE) {
02630         NumIllegalInCluster++;
02631         CharFlags[CharID] = ILLEGAL_CHAR;
02632       }
02633       NumCharInCluster--;
02634       PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
02635       if (PercentIllegal > MaxIllegal) {
02636         destroy(SearchState);
02637         return (TRUE);
02638       }
02639     }
02640   }
02641   return (FALSE);
02642
02643 }                                // MultipleCharSamples
02644
02645 // Compute the inverse of a matrix using LU decomposition with partial pivoting.
02646 // The return value is the sum of norms of the off-diagonal terms of the
02647 // product of a and inv. (A measure of the error.)
02648 double InvertMatrix(const float* input, int size, float* inv) {
02649   // Allocate memory for the 2D arrays.
02650   GENERIC_2D_ARRAY<double> U(size, size, 0.0);
02651   GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
02652   GENERIC_2D_ARRAY<double> L(size, size, 0.0);
02653
02654   // Initialize the working matrices. U starts as input, L as I and U_inv as O.
02655   int row;
02656   int col;
02657   for (row = 0; row < size; row++) {
02658     for (col = 0; col < size; col++) {
02659       U[row][col] = input[row*size + col];
02660       L[row][col] = row == col ? 1.0 : 0.0;
02661       U_inv[row][col] = 0.0;
02662     }
02663   }
02664
02665   // Compute forward matrix by inversion by LU decomposition of input.
02666   for (col = 0; col < size; ++col) {
02667     // Find best pivot
02668     int best_row = 0;
02669     double best_pivot = -1.0;
02670     for (row = col; row < size; ++row) {
02671       if (Abs(U[row][col]) > best_pivot) {
02672         best_pivot = Abs(U[row][col]);
02673         best_row = row;
02674       }
02675     }
02676     // Exchange pivot rows.
02677     if (best_row != col) {
02678       for (int k = 0; k < size; ++k) {
02679         double tmp = U[best_row][k];
02680         U[best_row][k] = U[col][k];
02681         U[col][k] = tmp;
02682         tmp = L[best_row][k];
02683         L[best_row][k] = L[col][k];
02684         L[col][k] = tmp;
02685       }
02686     }
02687     // Now do the pivot itself.
02688     for (row = col + 1; row < size; ++row) {
02689       double ratio = -U[row][col] / U[col][col];
02690       for (int j = col; j < size; ++j) {
02691         U[row][j] += U[col][j] * ratio;
02692       }
02693       for (int k = 0; k < size; ++k) {
02694         L[row][k] += L[col][k] * ratio;
02695       }
02696     }
02697   }
02698   // Next invert U.
02699   for (col = 0; col < size; ++col) {
02700     U_inv[col][col] = 1.0 / U[col][col];
02701     for (row = col - 1; row >= 0; --row) {
02702       double total = 0.0;
02703       for (int k = col; k > row; --k) {
02704         total += U[row][k] * U_inv[k][col];
02705       }
02706       U_inv[row][col] = -total / U[row][row];
02707     }
02708   }
02709   // Now the answer is U_inv.L.
02710   for (row = 0; row < size; row++) {
02711     for (col = 0; col < size; col++) {
02712       double sum = 0.0;
02713       for (int k = row; k < size; ++k) {
02714         sum += U_inv[row][k] * L[k][col];
02715       }
02716       inv[row*size + col] = sum;
02717     }
02718   }
02719   // Check matrix product.
02720   double error_sum = 0.0;
02721   for (row = 0; row < size; row++) {
02722     for (col = 0; col < size; col++) {
02723       double sum = 0.0;
02724       for (int k = 0; k < size; ++k) {
02725         sum += input[row*size + k] * inv[k *size + col];
02726       }
02727       if (row != col) {
02728         error_sum += Abs(sum);
02729       }
02730     }
02731   }
02732   return error_sum;
02733 }