Tesseract  3.02
tesseract-ocr/ccstruct/statistc.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        statistc.h  (Formerly stats.h)
00003  * Description: Class description for STATS class.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Mon Feb 04 16:19:07 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
00021 #define TESSERACT_CCSTRUCT_STATISTC_H_
00022 
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "scrollview.h"
00026 
00027 // Simple histogram-based statistics for integer values in a known
00028 // range, such that the range is small compared to the number of samples.
00029 class STATS {
00030  public:
00031   // The histogram buckets are in the range
00032   // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
00033   // [min_bucket_value, max_bucket_value].
00034   // Any data under min_bucket value is silently mapped to min_bucket_value,
00035   // and likewise, any data over max_bucket_value is silently mapped to
00036   // max_bucket_value.
00037   // In the internal array, min_bucket_value maps to 0 and
00038   // max_bucket_value_plus_1 - min_bucket_value to the array size.
00039   // TODO(rays) This is ugly. Convert the second argument to
00040   // max_bucket_value and all the code that uses it.
00041   STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00042   STATS();  // empty for arrays
00043 
00044   ~STATS();
00045 
00046   // (Re)Sets the range and clears the counts.
00047   // See the constructor for info on max and min values.
00048   bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00049 
00050   void clear();  // empty buckets
00051 
00052   void add(inT32 value, inT32 count);
00053 
00054   // "Accessors" return various statistics on the data.
00055   inT32 mode() const;  // get mode of samples
00056   double mean() const;  // get mean of samples
00057   double sd() const;  // standard deviation
00058   // Returns the fractile value such that frac fraction (in [0,1]) of samples
00059   // has a value less than the return value.
00060   double ile(double frac) const;
00061   // Returns the minimum used entry in the histogram (ie the minimum of the
00062   // data, NOT the minimum of the supplied range, nor is it an index.)
00063   // Would normally be called min(), but that is a reserved word in VC++.
00064   inT32 min_bucket() const;  // Find min
00065   // Returns the maximum used entry in the histogram (ie the maximum of the
00066   // data, NOT the maximum of the supplied range, nor is it an index.)
00067   inT32 max_bucket() const;  // Find max
00068   // Finds a more useful estimate of median than ile(0.5).
00069   // Overcomes a problem with ile() - if the samples are, for example,
00070   // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
00071   // between 6 and 13 = 9.5
00072   double median() const;  // get median of samples
00073   // Returns the count of the given value.
00074   inT32 pile_count(inT32 value ) const {
00075     if (value <= rangemin_)
00076       return buckets_[0];
00077     if (value >= rangemax_ - 1)
00078       return buckets_[rangemax_ - rangemin_ - 1];
00079     return buckets_[value - rangemin_];
00080   }
00081   // Returns the total count of all buckets.
00082   inT32 get_total() const {
00083     return total_count_;        // total of all piles
00084   }
00085   // Returns true if x is a local min.
00086   bool local_min(inT32 x) const;
00087 
00088   // Apply a triangular smoothing filter to the stats.
00089   // This makes the modes a bit more useful.
00090   // The factor gives the height of the triangle, i.e. the weight of the
00091   // centre.
00092   void smooth(inT32 factor);
00093 
00094   // Cluster the samples into max_cluster clusters.
00095   // Each call runs one iteration. The array of clusters must be
00096   // max_clusters+1 in size as cluster 0 is used to indicate which samples
00097   // have been used.
00098   // The return value is the current number of clusters.
00099   inT32 cluster(float lower,         // thresholds
00100                 float upper,
00101                 float multiple,      // distance threshold
00102                 inT32 max_clusters,  // max no to make
00103                 STATS *clusters);    // array of clusters
00104 
00105 
00106   // Prints a summary and table of the histogram.
00107   void print() const;
00108   // Prints summary stats only of the histogram.
00109   void print_summary() const;
00110 
00111   #ifndef GRAPHICS_DISABLED
00112   // Draws the histogram as a series of rectangles.
00113   void plot(ScrollView* window,   // window to draw in
00114             float xorigin,   // origin of histo
00115             float yorigin,   // gram
00116             float xscale,    // size of one unit
00117             float yscale,    // size of one uint
00118             ScrollView::Color colour) const;  // colour to draw in
00119 
00120   // Draws a line graph of the histogram.
00121   void plotline(ScrollView* window,   // window to draw in
00122                 float xorigin,   // origin of histo
00123                 float yorigin,   // gram
00124                 float xscale,    // size of one unit
00125                 float yscale,    // size of one uint
00126                 ScrollView::Color colour) const;  // colour to draw in
00127   #endif  // GRAPHICS_DISABLED
00128 
00129  private:
00130   inT32 rangemin_;                // min of range
00131   // rangemax_ is not well named as it is really one past the max.
00132   inT32 rangemax_;                // max of range
00133   inT32 total_count_;             // no of samples
00134   inT32* buckets_;                // array of cells
00135 };
00136 
00137 // Returns the nth ordered item from the array, as if they were
00138 // ordered, but without ordering them, in linear time.
00139 // The array does get shuffled!
00140 inT32 choose_nth_item(inT32 index,   // index to choose
00141                       float *array,  // array of items
00142                       inT32 count);  // no of items
00143 // Generic version uses a defined comparator (with qsort semantics).
00144 inT32 choose_nth_item(inT32 index,   // index to choose
00145                       void *array,   // array of items
00146                       inT32 count,   // no of items
00147                       size_t size,   // element size
00148                       int (*compar)(const void*, const void*));  // comparator
00149 // Swaps 2 entries in an array in-place.
00150 void swap_entries(void *array,   // array of entries
00151                   size_t size,   // size of entry
00152                   inT32 index1,  // entries to swap
00153                   inT32 index2);
00154 
00155 #endif  // TESSERACT_CCSTRUCT_STATISTC_H_