Tesseract  3.02
tesseract::ClassPruner Class Reference

List of all members.

Public Member Functions

 ClassPruner (int max_classes)
 ~ClassPruner ()
void ComputeScores (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features)
void AdjustForExpectedNumFeatures (const uinT16 *expected_num_features, int cutoff_strength)
void DisableDisabledClasses (const UNICHARSET &unicharset)
void DisableFragments (const UNICHARSET &unicharset)
void NormalizeForXheight (int norm_multiplier, const uinT8 *normalization_factors)
void NoNormalization ()
void PruneAndSort (int pruning_factor, bool max_of_non_fragments, const UNICHARSET &unicharset)
void DebugMatch (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const INT_FEATURE_STRUCT *features) const
void SummarizeResult (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const uinT16 *expected_num_features, int norm_multiplier, const uinT8 *normalization_factors) const
int SetupResults (CP_RESULT_STRUCT *results) const

Detailed Description

Definition at line 108 of file intmatcher.cpp.


Constructor & Destructor Documentation

tesseract::ClassPruner::ClassPruner ( int  max_classes) [inline]

Definition at line 110 of file intmatcher.cpp.

                               {
    // ComputeScores consumes a whole pruner word at a time in an unrolled
    // loop, so it can touch entries beyond max_classes. The array sizes are
    // therefore rounded up so that those extra entries exist. Each pruner
    // word is of size BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so
    // there are BITS_PER_WERD / NUM_BITS_PER_CLASS entries per word.
    // See ComputeScores.
    max_classes_ = max_classes;
    rounded_classes_ = RoundUp(
        max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
    // Value-initialize (zero) both count arrays. norm_count_ is read by
    // PruneAndSort and DebugMatch, so it must never hold indeterminate
    // values, even if those run before a normalization step has filled it.
    class_count_ = new int[rounded_classes_]();
    norm_count_ = new int[rounded_classes_]();
    // The sort arrays carry one extra slot because PruneAndSort fills them
    // starting at index 1. Only slots 1..num_classes_ are ever read.
    sort_key_ = new int[rounded_classes_ + 1];
    sort_index_ = new int[rounded_classes_ + 1];
    pruning_threshold_ = 0;
    num_features_ = 0;
    num_classes_ = 0;
  }
tesseract::ClassPruner::~ClassPruner ( ) [inline]

Definition at line 132 of file intmatcher.cpp.

                 {
    // Release the four arrays that the constructor allocated with new[].
    delete[] sort_index_;
    delete[] sort_key_;
    delete[] norm_count_;
    delete[] class_count_;
  }

Member Function Documentation

void tesseract::ClassPruner::AdjustForExpectedNumFeatures ( const uinT16 * expected_num_features,
int  cutoff_strength 
) [inline]

Definition at line 212 of file intmatcher.cpp.

                                                         {
    // Scale down the count of every class that expects more features than
    // were actually seen. Classes whose expectation is met are left alone.
    for (int id = 0; id < max_classes_; ++id) {
      const int expected = expected_num_features[id];
      if (expected <= num_features_)
        continue;
      const int shortfall = expected - num_features_;
      const int divisor = num_features_ * cutoff_strength + shortfall;
      class_count_[id] -= class_count_[id] * shortfall / divisor;
    }
  }
void tesseract::ClassPruner::ComputeScores ( const INT_TEMPLATES_STRUCT * int_templates,
int  num_features,
const INT_FEATURE_STRUCT * features 
) [inline]

Definition at line 141 of file intmatcher.cpp.

                                                                           {
    // Accumulates per-class weights into class_count_ for the given features.
    // Each feature is quantized to an (x, y, theta) bucket; the packed
    // NUM_BITS_PER_CLASS-bit weights stored for that bucket in every class
    // pruner are then added to consecutive class_count_ entries.
    // The weights are added to whatever class_count_ already holds; this
    // function does not reset the counts. It also records num_features in
    // num_features_ for use by the other member functions.
    num_features_ = num_features;
    int num_pruners = int_templates->NumClassPruners;
    for (int f = 0; f < num_features; ++f) {
      const INT_FEATURE_STRUCT* feature = &features[f];
      // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
      int x = feature->X * NUM_CP_BUCKETS >> 8;
      int y = feature->Y * NUM_CP_BUCKETS >> 8;
      int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
      // class_id runs continuously across all words of all pruner sets and
      // restarts at 0 for each feature.
      int class_id = 0;
      // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
      // we need a collection of them, indexed by pruner_set.
      for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
        // Look up quantized feature in a 3-D array, an array of weights for
        // each class.
        const uinT32* pruner_word_ptr =
            int_templates->ClassPruners[pruner_set]->p[x][y][theta];
        for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
          uinT32 pruner_word = *pruner_word_ptr++;
          // This inner loop is unrolled to speed up the ClassPruner.
          // Currently gcc would not unroll it unless it is set to O3
          // level of optimization or -funroll-loops is specified.
          /*
          uinT32 class_mask = (1 << NUM_BITS_PER_CLASS) - 1;
          for (int bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
            class_count_[class_id++] += pruner_word & class_mask;
            pruner_word >>= NUM_BITS_PER_CLASS;
          }
          */
          // 16 unrolled steps follow, each consuming the low
          // NUM_BITS_PER_CLASS bits of the word. Every step always executes,
          // so class_id can advance past the number of real classes.
          // NOTE(review): presumably class_count_ is sized (rounded up) to
          // absorb those extra writes - confirm against the constructor.
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
          pruner_word >>= NUM_BITS_PER_CLASS;
          // Last entry of the word: no shift needed, the word is discarded.
          class_count_[class_id++] += pruner_word & CLASS_PRUNER_CLASS_MASK;
        }
      }
    }
  }
void tesseract::ClassPruner::DebugMatch ( const Classify & classify,
const INT_TEMPLATES_STRUCT * int_templates,
const INT_FEATURE_STRUCT * features 
) const [inline]

Definition at line 299 of file intmatcher.cpp.

                                                            {
    // For each of the num_features_ features, prints the feature itself and
    // then, for every class whose normalized count reached
    // pruning_threshold_, the raw pruner weight that feature contributes.
    const int pruner_set_count = int_templates->NumClassPruners;
    const int class_limit = int_templates->NumClasses;
    for (int f = 0; f < num_features_; ++f) {
      const INT_FEATURE_STRUCT& feat = features[f];
      tprintf("F=%3d(%d,%d,%d),", f, feat.X, feat.Y, feat.Theta);
      // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
      const int x = feat.X * NUM_CP_BUCKETS >> 8;
      const int y = feat.Y * NUM_CP_BUCKETS >> 8;
      const int theta = feat.Theta * NUM_CP_BUCKETS >> 8;
      int class_id = 0;
      for (int set = 0; set < pruner_set_count; ++set) {
        // The quantized feature selects one vector of packed per-class
        // weights inside this pruner set.
        const uinT32* words = int_templates->ClassPruners[set]->p[x][y][theta];
        for (int w = 0; w < WERDS_PER_CP_VECTOR; ++w) {
          uinT32 packed = words[w];
          // Unpack up to 16 weights from the word, stopping early once
          // every class in the templates has been visited.
          int slot = 0;
          while (slot < 16 && class_id < class_limit) {
            if (norm_count_[class_id] >= pruning_threshold_) {
              tprintf(" %s=%d,",
                      classify.ClassIDToDebugStr(int_templates,
                                                 class_id, 0).string(),
                      packed & CLASS_PRUNER_CLASS_MASK);
            }
            packed >>= NUM_BITS_PER_CLASS;
            ++slot;
            ++class_id;
          }
        }
        // One output line per pruner set.
        tprintf("\n");
      }
    }
  }
void tesseract::ClassPruner::DisableDisabledClasses ( const UNICHARSET & unicharset) [inline]

Definition at line 225 of file intmatcher.cpp.

                                                            {
    // Zero the count of every class the unicharset reports as not enabled.
    for (int id = 0; id < max_classes_; ++id) {
      if (unicharset.get_enabled(id))
        continue;
      class_count_[id] = 0;  // This char is disabled!
    }
  }
void tesseract::ClassPruner::DisableFragments ( const UNICHARSET & unicharset) [inline]

Definition at line 233 of file intmatcher.cpp.

                                                      {
    // Zero the count of every class for which the unicharset returns a
    // fragment, so that character fragments drop out of the pruner results.
    int id = 0;
    while (id < max_classes_) {
      if (unicharset.get_fragment(id))
        class_count_[id] = 0;
      ++id;
    }
  }
void tesseract::ClassPruner::NoNormalization ( ) [inline]

Definition at line 256 of file intmatcher.cpp.

                         {
    // Copy the first max_classes_ raw counts into norm_count_ unchanged.
    const int* from = class_count_;
    int* to = norm_count_;
    for (int remaining = max_classes_; remaining > 0; --remaining) {
      *to++ = *from++;
    }
  }
void tesseract::ClassPruner::NormalizeForXheight ( int  norm_multiplier,
const uinT8 * normalization_factors 
) [inline]

Definition at line 247 of file intmatcher.cpp.

                                                               {
    // norm_count_ becomes the raw count minus a per-class adjustment: the
    // class's normalization factor scaled by norm_multiplier / 256.
    for (int id = 0; id < max_classes_; ++id) {
      const int adjustment = (norm_multiplier * normalization_factors[id]) >> 8;
      norm_count_[id] = class_count_[id] - adjustment;
    }
  }
void tesseract::ClassPruner::PruneAndSort ( int  pruning_factor,
bool  max_of_non_fragments,
const UNICHARSET & unicharset 
) [inline]

Definition at line 265 of file intmatcher.cpp.

                                                  {
    // Step 1: find the largest normalized count. When max_of_non_fragments
    // is set, fragment classes are skipped so that the threshold is driven
    // by a non-fragmented character and at least one such match survives.
    // TODO(daria): verify that this helps accuracy and does not
    // hurt performance.
    int best = 0;
    for (int c = 0; c < max_classes_; ++c) {
      if (norm_count_[c] <= best)
        continue;
      if (max_of_non_fragments && unicharset.get_fragment(c))
        continue;
      best = norm_count_[c];
    }

    // Step 2: derive the pruning threshold (best * pruning_factor / 256),
    // never letting it fall below 1.
    const int scaled = (best * pruning_factor) >> 8;
    pruning_threshold_ = scaled < 1 ? 1 : scaled;

    // Step 3: collect the surviving classes. Slots are filled from index 1
    // upward; index 0 of the sort arrays is left unused.
    num_classes_ = 0;
    for (int id = 0; id < max_classes_; ++id) {
      if (norm_count_[id] < pruning_threshold_)
        continue;
      ++num_classes_;
      sort_index_[num_classes_] = id;
      sort_key_[num_classes_] = norm_count_[id];
    }

    // Step 4: order the survivors with HeapSort when there is more than one.
    if (num_classes_ > 1) {
      HeapSort(num_classes_, sort_key_, sort_index_);
    }
  }
int tesseract::ClassPruner::SetupResults ( CP_RESULT_STRUCT * results) const [inline]

Definition at line 359 of file intmatcher.cpp.

                                                    {
    // Copy the sorted classes into results, reading the sort arrays from
    // slot num_classes_ down to slot 1. Returns the number of entries
    // written.
    int slot = num_classes_;
    for (int out = 0; out < num_classes_; ++out, --slot) {
      results[out].Class = sort_index_[slot];
      // Rating is 1 minus the count divided by (mask * num_features_).
      results[out].Rating = 1.0 - sort_key_[slot] /
        (static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_);
    }
    return num_classes_;
  }
void tesseract::ClassPruner::SummarizeResult ( const Classify & classify,
const INT_TEMPLATES_STRUCT * int_templates,
const uinT16 * expected_num_features,
int  norm_multiplier,
const uinT8 * normalization_factors 
) const [inline]

Definition at line 336 of file intmatcher.cpp.

                                                                 {
    // Prints a header line followed by one line per surviving class, reading
    // the sort arrays from slot num_classes_ down to slot 1.
    tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
    for (int slot = num_classes_; slot > 0; --slot) {
      const int class_id = sort_index_[slot];
      const int sorted_count = sort_key_[slot];
      STRING class_string = classify.ClassIDToDebugStr(int_templates,
                                                       class_id, 0);
      // Columns: raw count, expected feature count, x-height adjustment,
      // normalized count, and the count expressed as a percentage rating.
      tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
              class_string.string(),
              class_count_[class_id],
              expected_num_features[class_id],
              (norm_multiplier * normalization_factors[class_id]) >> 8,
              sorted_count,
              100.0 - 100.0 * sorted_count /
                (CLASS_PRUNER_CLASS_MASK * num_features_));
    }
  }

The documentation for this class was generated from the following file: