Tesseract
3.02
|
Public Member Functions | |
ClassPruner (int max_classes) | |
~ClassPruner () | |
void | ComputeScores (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features) |
void | AdjustForExpectedNumFeatures (const uinT16 *expected_num_features, int cutoff_strength) |
void | DisableDisabledClasses (const UNICHARSET &unicharset) |
void | DisableFragments (const UNICHARSET &unicharset) |
void | NormalizeForXheight (int norm_multiplier, const uinT8 *normalization_factors) |
void | NoNormalization () |
void | PruneAndSort (int pruning_factor, bool max_of_non_fragments, const UNICHARSET &unicharset) |
void | DebugMatch (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const INT_FEATURE_STRUCT *features) const |
void | SummarizeResult (const Classify &classify, const INT_TEMPLATES_STRUCT *int_templates, const uinT16 *expected_num_features, int norm_multiplier, const uinT8 *normalization_factors) const |
int | SetupResults (CP_RESULT_STRUCT *results) const |
Definition at line 108 of file intmatcher.cpp.
tesseract::ClassPruner::ClassPruner | ( | int | max_classes | ) | [inline] |
Definition at line 110 of file intmatcher.cpp.
{
  // Allocates and clears the working arrays for up to max_classes classes.
  //
  // The unrolled loop in ComputeScores means that the array sizes need to
  // be rounded up so that the array is big enough to accommodate the extra
  // entries accessed by the unrolling. Each pruner word is of size
  // BITS_PER_WERD and each entry is NUM_BITS_PER_CLASS, so there are
  // BITS_PER_WERD / NUM_BITS_PER_CLASS entries.
  // See ComputeScores.
  max_classes_ = max_classes;
  rounded_classes_ = RoundUp(
      max_classes, WERDS_PER_CP_VECTOR * BITS_PER_WERD / NUM_BITS_PER_CLASS);
  class_count_ = new int[rounded_classes_];
  norm_count_ = new int[rounded_classes_];
  // The sort arrays carry one extra slot: PruneAndSort stores its first
  // entry at index 1, not index 0.
  sort_key_ = new int[rounded_classes_ + 1];
  sort_index_ = new int[rounded_classes_ + 1];
  for (int i = 0; i < rounded_classes_; i++) {
    class_count_[i] = 0;
    // norm_count_ is read by PruneAndSort and DebugMatch; clear it as well so
    // those reads never see indeterminate values if no normalization step
    // has been run yet.
    norm_count_[i] = 0;
  }
  pruning_threshold_ = 0;
  num_features_ = 0;
  num_classes_ = 0;
}
tesseract::ClassPruner::~ClassPruner | ( | ) | [inline] |
Definition at line 132 of file intmatcher.cpp.
{
  // Free the four arrays allocated in the constructor.
  delete[] sort_index_;
  delete[] sort_key_;
  delete[] norm_count_;
  delete[] class_count_;
}
void tesseract::ClassPruner::AdjustForExpectedNumFeatures | ( | const uinT16 * | expected_num_features, |
int | cutoff_strength | ||
) | [inline] |
Definition at line 212 of file intmatcher.cpp.
{
  // Reduces the count of every class that expects more features than the
  // num_features_ recorded by ComputeScores. Classes whose expectation is
  // met are left untouched.
  for (int class_id = 0; class_id < max_classes_; ++class_id) {
    const int expected = expected_num_features[class_id];
    if (expected <= num_features_) continue;
    const int shortfall = expected - num_features_;
    const int divisor = num_features_ * cutoff_strength + shortfall;
    // Remove the fraction shortfall / divisor of the current count.
    class_count_[class_id] -= class_count_[class_id] * shortfall / divisor;
  }
}
void tesseract::ClassPruner::ComputeScores | ( | const INT_TEMPLATES_STRUCT * | int_templates, |
int | num_features, | ||
const INT_FEATURE_STRUCT * | features | ||
) | [inline] |
Definition at line 141 of file intmatcher.cpp.
{
  // Accumulates, for every feature, the per-class weights stored in the
  // class pruner tables into class_count_, and records the feature count.
  num_features_ = num_features;
  const int pruner_count = int_templates->NumClassPruners;
  for (int f = 0; f < num_features; ++f) {
    const INT_FEATURE_STRUCT& feat = features[f];
    // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
    const int x = feat.X * NUM_CP_BUCKETS >> 8;
    const int y = feat.Y * NUM_CP_BUCKETS >> 8;
    const int theta = feat.Theta * NUM_CP_BUCKETS >> 8;
    // Write cursor into class_count_; it restarts at class 0 for each feature
    // and advances by one class per extracted entry.
    int* count = class_count_;
    // Each CLASS_PRUNER_STRUCT only covers CLASSES_PER_CP(32) classes, so
    // a collection of them is needed, one per pruner set.
    for (int set = 0; set < pruner_count; ++set) {
      // The quantized feature selects, in a 3-D table, the vector of packed
      // weights for the classes of this pruner set.
      const uinT32* word_ptr = int_templates->ClassPruners[set]->p[x][y][theta];
      for (int w = 0; w < WERDS_PER_CP_VECTOR; ++w) {
        uinT32 bits = *word_ptr++;
        // The extraction below is unrolled by hand to speed up the
        // ClassPruner: gcc would not unroll it unless built with O3 or
        // -funroll-loops. The rolled form would be:
        //   for (bit = 0; bit < BITS_PER_WERD/NUM_BITS_PER_CLASS; bit++) {
        //     *count++ += bits & mask;  bits >>= NUM_BITS_PER_CLASS;
        //   }
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
        bits >>= NUM_BITS_PER_CLASS;
        // Sixteenth and last entry of the word: no further shift needed.
        *count++ += bits & CLASS_PRUNER_CLASS_MASK;
      }
    }
  }
}
void tesseract::ClassPruner::DebugMatch | ( | const Classify & | classify, |
const INT_TEMPLATES_STRUCT * | int_templates, | ||
const INT_FEATURE_STRUCT * | features | ||
) | const [inline] |
Definition at line 299 of file intmatcher.cpp.
{
  // Prints, for every feature, the raw pruner weight of each class whose
  // normalized count reached pruning_threshold_. Reads num_features_,
  // norm_count_ and pruning_threshold_ as left by earlier calls on this
  // object; modifies nothing.
  const int num_pruners = int_templates->NumClassPruners;
  const int max_num_classes = int_templates->NumClasses;
  // Number of class entries packed into one pruner word. Derived from the
  // same constants the constructor uses for sizing, instead of a literal 16.
  const int classes_per_word =
      static_cast<int>(BITS_PER_WERD / NUM_BITS_PER_CLASS);
  for (int f = 0; f < num_features_; ++f) {
    const INT_FEATURE_STRUCT* feature = &features[f];
    tprintf("F=%3d(%d,%d,%d),", f, feature->X, feature->Y, feature->Theta);
    // Quantize the feature to NUM_CP_BUCKETS*NUM_CP_BUCKETS*NUM_CP_BUCKETS.
    int x = feature->X * NUM_CP_BUCKETS >> 8;
    int y = feature->Y * NUM_CP_BUCKETS >> 8;
    int theta = feature->Theta * NUM_CP_BUCKETS >> 8;
    int class_id = 0;
    for (int pruner_set = 0; pruner_set < num_pruners; ++pruner_set) {
      // Look up quantized feature in a 3-D array, an array of weights for
      // each class.
      const uinT32* pruner_word_ptr =
          int_templates->ClassPruners[pruner_set]->p[x][y][theta];
      for (int word = 0; word < WERDS_PER_CP_VECTOR; ++word) {
        uinT32 pruner_word = *pruner_word_ptr++;
        // Stop at max_num_classes: the last word may hold padding entries.
        for (int word_class = 0;
             word_class < classes_per_word && class_id < max_num_classes;
             ++word_class, ++class_id) {
          if (norm_count_[class_id] >= pruning_threshold_) {
            // Cast so the argument type matches the %d conversion.
            tprintf(" %s=%d,",
                    classify.ClassIDToDebugStr(int_templates,
                                               class_id, 0).string(),
                    static_cast<int>(pruner_word & CLASS_PRUNER_CLASS_MASK));
          }
          pruner_word >>= NUM_BITS_PER_CLASS;
        }
      }
      // One output line per pruner set.
      tprintf("\n");
    }
  }
}
void tesseract::ClassPruner::DisableDisabledClasses | ( | const UNICHARSET & | unicharset | ) | [inline] |
Definition at line 225 of file intmatcher.cpp.
{
  // Zeroes the count of every class the unicharset reports as not enabled.
  for (int id = 0; id < max_classes_; ++id) {
    if (unicharset.get_enabled(id)) continue;
    class_count_[id] = 0;  // This char is disabled!
  }
}
void tesseract::ClassPruner::DisableFragments | ( | const UNICHARSET & | unicharset | ) | [inline] |
Definition at line 233 of file intmatcher.cpp.
{
  // Zeroes the count of every class for which the unicharset returns a
  // fragment, so character fragments drop out of the class pruner results.
  // Intended for use when disable_character_fragments is true; that flag is
  // not checked here.
  for (int id = 0; id < max_classes_; ++id) {
    if (!unicharset.get_fragment(id)) continue;
    class_count_[id] = 0;
  }
}
void tesseract::ClassPruner::NoNormalization | ( | ) | [inline] |
Definition at line 256 of file intmatcher.cpp.
{
  // Uses the raw class counts, unchanged, as the normalized counts.
  int id = 0;
  while (id < max_classes_) {
    norm_count_[id] = class_count_[id];
    ++id;
  }
}
void tesseract::ClassPruner::NormalizeForXheight | ( | int | norm_multiplier, |
const uinT8 * | normalization_factors | ||
) | [inline] |
Definition at line 247 of file intmatcher.cpp.
{
  // Fills norm_count_ with the raw count of each class minus a per-class
  // adjustment taken from normalization_factors.
  for (int id = 0; id < max_classes_; ++id) {
    // Adjustment: factor times norm_multiplier, shifted right by 8 bits.
    const int adjustment = (norm_multiplier * normalization_factors[id]) >> 8;
    norm_count_[id] = class_count_[id] - adjustment;
  }
}
void tesseract::ClassPruner::PruneAndSort | ( | int | pruning_factor, |
bool | max_of_non_fragments, | ||
const UNICHARSET & | unicharset | ||
) | [inline] |
Definition at line 265 of file intmatcher.cpp.
{
  // Step 1: find the best normalized count.
  int best = 0;
  for (int c = 0; c < max_classes_; ++c) {
    if (norm_count_[c] <= best) continue;
    // When max_of_non_fragments is set, fragments may not define the
    // maximum. This is meant to ensure that the classifier returns at least
    // one non-fragmented character match.
    // TODO(daria): verify that this helps accuracy and does not
    // hurt performance.
    if (max_of_non_fragments && unicharset.get_fragment(c)) continue;
    best = norm_count_[c];
  }

  // Step 2: derive the pruning threshold from the best count; it is never
  // allowed to drop below 1.
  pruning_threshold_ = (best * pruning_factor) >> 8;
  if (pruning_threshold_ < 1) {
    pruning_threshold_ = 1;
  }

  // Step 3: collect every class at or above the threshold. Entries are
  // stored from index 1 upwards, so index 0 of the sort arrays stays unused.
  num_classes_ = 0;
  for (int id = 0; id < max_classes_; ++id) {
    if (norm_count_[id] < pruning_threshold_) continue;
    ++num_classes_;
    sort_index_[num_classes_] = id;
    sort_key_[num_classes_] = norm_count_[id];
  }

  // Step 4: sort the survivors with heapsort; nothing to do for 0 or 1.
  if (num_classes_ > 1) {
    HeapSort(num_classes_, sort_key_, sort_index_);
  }
}
int tesseract::ClassPruner::SetupResults | ( | CP_RESULT_STRUCT * | results | ) | const [inline] |
Definition at line 359 of file intmatcher.cpp.
{
  // Copies the selected classes into results and returns how many there are.
  // Slots num_classes_ down to 1 of the sort arrays are written to
  // results[0], results[1], ... in that order.
  // Rating denominator: CLASS_PRUNER_CLASS_MASK times the feature count.
  const float scale =
      static_cast<float>(CLASS_PRUNER_CLASS_MASK) * num_features_;
  int out = 0;
  for (int slot = num_classes_; slot >= 1; --slot, ++out) {
    results[out].Class = sort_index_[slot];
    results[out].Rating = 1.0 - sort_key_[slot] / scale;
  }
  return num_classes_;
}
void tesseract::ClassPruner::SummarizeResult | ( | const Classify & | classify, |
const INT_TEMPLATES_STRUCT * | int_templates, | ||
const uinT16 * | expected_num_features, | ||
int | norm_multiplier, | ||
const uinT8 * | normalization_factors | ||
) | const [inline] |
Definition at line 336 of file intmatcher.cpp.
{
  // Prints a header line followed by one line per selected class, visiting
  // the sort arrays from slot num_classes_ down to slot 1.
  tprintf("CP:%d classes, %d features:\n", num_classes_, num_features_);
  for (int rank = 0; rank < num_classes_; ++rank) {
    const int slot = num_classes_ - rank;
    const int class_id = sort_index_[slot];
    const int key = sort_key_[slot];
    // Same adjustment expression as used by NormalizeForXheight.
    const int xheight_adjust =
        (norm_multiplier * normalization_factors[class_id]) >> 8;
    STRING class_string =
        classify.ClassIDToDebugStr(int_templates, class_id, 0);
    tprintf("%s:Initial=%d, E=%d, Xht-adj=%d, N=%d, Rat=%.2f\n",
            class_string.string(),
            class_count_[class_id],
            expected_num_features[class_id],
            xheight_adjust,
            key,
            100.0 - 100.0 * key / (CLASS_PRUNER_CLASS_MASK * num_features_));
  }
}