Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: statistc.h (Formerly stats.h) 00003 * Description: Class description for STATS class. 00004 * Author: Ray Smith 00005 * Created: Mon Feb 04 16:19:07 GMT 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_ 00021 #define TESSERACT_CCSTRUCT_STATISTC_H_ 00022 00023 #include <stdio.h> 00024 #include "host.h" 00025 #include "scrollview.h" 00026 00027 // Simple histogram-based statistics for integer values in a known 00028 // range, such that the range is small compared to the number of samples. 00029 class STATS { 00030 public: 00031 // The histogram buckets are in the range 00032 // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e. 00033 // [min_bucket_value, max_bucket_value]. 00034 // Any data under min_bucket value is silently mapped to min_bucket_value, 00035 // and likewise, any data over max_bucket_value is silently mapped to 00036 // max_bucket_value. 00037 // In the internal array, min_bucket_value maps to 0 and 00038 // max_bucket_value_plus_1 - min_bucket_value to the array size. 00039 // TODO(rays) This is ugly. Convert the second argument to 00040 // max_bucket_value and all the code that uses it. 00041 STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1); 00042 STATS(); // empty for arrays 00043 00044 ~STATS(); 00045 00046 // (Re)Sets the range and clears the counts. 00047 // See the constructor for info on max and min values. 00048 bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1); 00049 00050 void clear(); // empty buckets 00051 00052 void add(inT32 value, inT32 count); 00053 00054 // "Accessors" return various statistics on the data. 00055 inT32 mode() const; // get mode of samples 00056 double mean() const; // get mean of samples 00057 double sd() const; // standard deviation 00058 // Returns the fractile value such that frac fraction (in [0,1]) of samples 00059 // has a value less than the return value. 00060 double ile(double frac) const; 00061 // Returns the minimum used entry in the histogram (ie the minimum of the 00062 // data, NOT the minimum of the supplied range, nor is it an index.) 00063 // Would normally be called min(), but that is a reserved word in VC++. 00064 inT32 min_bucket() const; // Find min 00065 // Returns the maximum used entry in the histogram (ie the maximum of the 00066 // data, NOT the maximum of the supplied range, nor is it an index.) 00067 inT32 max_bucket() const; // Find max 00068 // Finds a more useful estimate of median than ile(0.5). 00069 // Overcomes a problem with ile() - if the samples are, for example, 00070 // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway 00071 // between 6 and 13 = 9.5 00072 double median() const; // get median of samples 00073 // Returns the count of the given value. 00074 inT32 pile_count(inT32 value ) const { 00075 if (value <= rangemin_) 00076 return buckets_[0]; 00077 if (value >= rangemax_ - 1) 00078 return buckets_[rangemax_ - rangemin_ - 1]; 00079 return buckets_[value - rangemin_]; 00080 } 00081 // Returns the total count of all buckets. 00082 inT32 get_total() const { 00083 return total_count_; // total of all piles 00084 } 00085 // Returns true if x is a local min. 00086 bool local_min(inT32 x) const; 00087 00088 // Apply a triangular smoothing filter to the stats. 00089 // This makes the modes a bit more useful. 00090 // The factor gives the height of the triangle, i.e. the weight of the 00091 // centre. 00092 void smooth(inT32 factor); 00093 00094 // Cluster the samples into max_cluster clusters. 00095 // Each call runs one iteration. The array of clusters must be 00096 // max_clusters+1 in size as cluster 0 is used to indicate which samples 00097 // have been used. 00098 // The return value is the current number of clusters. 00099 inT32 cluster(float lower, // thresholds 00100 float upper, 00101 float multiple, // distance threshold 00102 inT32 max_clusters, // max no to make 00103 STATS *clusters); // array of clusters 00104 00105 00106 // Prints a summary and table of the histogram. 00107 void print() const; 00108 // Prints summary stats only of the histogram. 00109 void print_summary() const; 00110 00111 #ifndef GRAPHICS_DISABLED 00112 // Draws the histogram as a series of rectangles. 00113 void plot(ScrollView* window, // window to draw in 00114 float xorigin, // origin of histo 00115 float yorigin, // gram 00116 float xscale, // size of one unit 00117 float yscale, // size of one uint 00118 ScrollView::Color colour) const; // colour to draw in 00119 00120 // Draws a line graph of the histogram. 00121 void plotline(ScrollView* window, // window to draw in 00122 float xorigin, // origin of histo 00123 float yorigin, // gram 00124 float xscale, // size of one unit 00125 float yscale, // size of one uint 00126 ScrollView::Color colour) const; // colour to draw in 00127 #endif // GRAPHICS_DISABLED 00128 00129 private: 00130 inT32 rangemin_; // min of range 00131 // rangemax_ is not well named as it is really one past the max. 00132 inT32 rangemax_; // max of range 00133 inT32 total_count_; // no of samples 00134 inT32* buckets_; // array of cells 00135 }; 00136 00137 // Returns the nth ordered item from the array, as if they were 00138 // ordered, but without ordering them, in linear time. 00139 // The array does get shuffled! 00140 inT32 choose_nth_item(inT32 index, // index to choose 00141 float *array, // array of items 00142 inT32 count); // no of items 00143 // Generic version uses a defined comparator (with qsort semantics). 00144 inT32 choose_nth_item(inT32 index, // index to choose 00145 void *array, // array of items 00146 inT32 count, // no of items 00147 size_t size, // element size 00148 int (*compar)(const void*, const void*)); // comparator 00149 // Swaps 2 entries in an array in-place. 00150 void swap_entries(void *array, // array of entries 00151 size_t size, // size of entry 00152 inT32 index1, // entries to swap 00153 inT32 index2); 00154 00155 #endif // TESSERACT_CCSTRUCT_STATISTC_H_