#include <classify.h>

Inheritance diagram for tesseract::Classify:

Public Member Functions
	Classify ()
virtual	~Classify ()
Dict &	getDict ()
const ShapeTable *	shape_table () const
ADAPT_TEMPLATES	NewAdaptedTemplates (bool InitFromUnicharset)
int	GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
int	PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
void	ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
void	PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
void	WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
ADAPT_TEMPLATES	ReadAdaptedTemplates (FILE *File)
FLOAT32	ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
void	FreeNormProtos ()
NORM_PROTOS *	ReadNormProtos (FILE *File, inT64 end_offset)
void	ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
INT_TEMPLATES	CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
void	LearnWord (const char filename, const char rejmap, WERD_RES *word)
void	LearnPieces (const char filename, int start, int length, float threshold, CharSegmentationType segmentation, const char correct_text, WERD_RES *word)
void	InitAdaptiveClassifier (bool load_pre_trained_templates)
void	InitAdaptedClass (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
void	AdaptToPunc (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void	AmbigClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
void	MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
void	ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
double	ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
void	ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void	AddNewResult (ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
int	GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
void	DebugAdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void	GetAdaptThresholds (TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
PROTO_ID	MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
int	MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void	MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
void	PrintAdaptiveMatchResults (FILE *File, ADAPT_RESULTS *Results)
void	RemoveExtraPuncs (ADAPT_RESULTS *Results)
void	RemoveBadMatches (ADAPT_RESULTS *Results)
void	SetAdaptiveThreshold (FLOAT32 Threshold)
void	ShowBestMatchFor (TBLOB Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS Results)
STRING	ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int	ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
int	ShapeIDToClassID (int shape_id) const
UNICHAR_ID *	BaselineClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int	CharNormClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int	CharNormTrainingSample (bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
UNICHAR_ID *	GetAmbiguities (TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
void	DoAdaptiveMatch (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void	AdaptToChar (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void	DisplayAdaptedChar (TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
int	AdaptableWord (TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
void	EndAdaptiveClassifier ()
void	PrintAdaptiveStatistics (FILE *File)
void	SettupPass1 ()
void	SettupPass2 ()
void	AdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
void	ClassifyAsNoise (ADAPT_RESULTS *Results)
void	ResetAdaptiveClassifierInternal ()
int	GetBaselineFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
int	GetCharNormFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
void	ComputeCharNormArrays (FEATURE_STRUCT norm_feature, INT_TEMPLATES_STRUCT templates, uinT8 char_norm_array, uinT8 pruner_array)
bool	TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
void	UpdateAmbigsGroup (CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
void	ResetFeaturesHaveBeenExtracted ()
bool	AdaptiveClassifierIsFull ()
bool	LooksLikeGarbage (const DENORM &denorm, TBLOB *blob)
void	RefreshDebugWindow (ScrollView *win, const char msg, int y_offset, const TBOX &wbox)
void	ClearCharNormArray (uinT8 *char_norm_array)
void	ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
void	ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
INT_TEMPLATES	ReadIntTemplates (FILE *File)
void	WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
CLASS_ID	GetClassToDebug (const char Prompt, bool adaptive_on, bool pretrained_on, int shape_id)
void	ShowMatchDisplay ()
UnicityTable< FontInfo > &	get_fontinfo_table ()
UnicityTable< FontSet > &	get_fontset_table ()
void	NormalizeOutlines (LIST Outlines, FLOAT32 XScale, FLOAT32 YScale)
FEATURE_SET	ExtractOutlineFeatures (TBLOB *Blob)
FEATURE_SET	ExtractPicoFeatures (TBLOB *Blob)
ReadClassFile
Read in the training data from a file. All of the classes are read in. The results are stored in the global variable, 'TrainingData'.
void	ReadClassFile ()
Public Attributes
bool	prioritize_division = 0
int	tessedit_single_match = 0
bool	classify_enable_learning = true
int	classify_debug_level = 0
int	classify_norm_method = character
double	classify_char_norm_range = 0.2
double	classify_min_norm_scale_x = 0.0
double	classify_max_norm_scale_x = 0.325
double	classify_min_norm_scale_y = 0.0
double	classify_max_norm_scale_y = 0.325
bool	tess_cn_matching = 0
bool	tess_bn_matching = 0
bool	classify_enable_adaptive_matcher = 1
bool	classify_use_pre_adapted_templates = 0
bool	classify_save_adapted_templates = 0
bool	classify_enable_adaptive_debugger = 0
int	matcher_debug_level = 0
int	matcher_debug_flags = 0
int	classify_learning_debug_level = 0
double	matcher_good_threshold = 0.125
double	matcher_great_threshold = 0.0
double	matcher_perfect_threshold = 0.02
double	matcher_bad_match_pad = 0.15
double	matcher_rating_margin = 0.1
double	matcher_avg_noise_size = 12.0
int	matcher_permanent_classes_min = 1
int	matcher_min_examples_for_prototyping = 3
int	matcher_sufficient_examples_for_prototyping = 5
double	matcher_clustering_max_angle_delta = 0.015
double	classify_misfit_junk_penalty = 0.0
double	rating_scale = 1.5
double	certainty_scale = 20.0
double	tessedit_class_miss_scale = 0.00390625
int	classify_adapt_proto_threshold = 230
int	classify_adapt_feature_threshold = 230
bool	disable_character_fragments = 1
double	classify_character_fragments_garbage_certainty_threshold = -3.0
bool	classify_debug_character_fragments = 0
bool	matcher_debug_separate_windows = 0
char *	classify_learn_debug_str = ""
int	classify_class_pruner_threshold = 229
int	classify_class_pruner_multiplier = 30
int	classify_cp_cutoff_strength = 7
int	classify_integer_matcher_multiplier = 14
INT_TEMPLATES	PreTrainedTemplates
ADAPT_TEMPLATES	AdaptedTemplates
BIT_VECTOR	AllProtosOn
BIT_VECTOR	PrunedProtos
BIT_VECTOR	AllConfigsOn
BIT_VECTOR	AllProtosOff
BIT_VECTOR	AllConfigsOff
BIT_VECTOR	TempProtoMask
bool	EnableLearning
NORM_PROTOS *	NormProtos
UnicityTable< FontInfo >	fontinfo_table_
UnicityTable< FontSet >	fontset_table_
int	il1_adaption_test = 0
bool	classify_bln_numeric_mode = 0
Protected Attributes
IntegerMatcher	im_
FEATURE_DEFS_STRUCT	feature_defs_
ShapeTable *	shape_table_

Detailed Description

Definition at line 58 of file classify.h.

Constructor & Destructor Documentation

tesseract::Classify::Classify ( )

Definition at line 29 of file classify.cpp.

  // Member-initializer list. Each *_MEMBER(name, default, help, params) entry
  // sets up one tunable attribute with the default value given here; the
  // macros themselves are defined elsewhere (presumably they register the
  // attribute with this->params() -- TODO confirm against the macro source).
  : BOOL_MEMBER(prioritize_division, FALSE,
                "Prioritize blob division over chopping", this->params()),
    INT_MEMBER(tessedit_single_match, FALSE,
               "Top choice only from CP", this->params()),
    BOOL_MEMBER(classify_enable_learning, true,
                "Enable adaptive classifier", this->params()),
    INT_MEMBER(classify_debug_level, 0, "Classify debug level",
               this->params()),
    INT_MEMBER(classify_norm_method, character, "Normalization Method   ...",
               this->params()),
    double_MEMBER(classify_char_norm_range, 0.2,
                  "Character Normalization Range ...", this->params()),
    double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
                  this->params()),  /* PREV DEFAULT 0.1 */
    double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...",
                  this->params()),  /* PREV DEFAULT 0.3 */
    double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
                  this->params()),  /* PREV DEFAULT 0.1 */
    double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
                  this->params()),  /* PREV DEFAULT 0.3 */
    BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
                this->params()),
    BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
                this->params()),
    BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
                "Enable adaptive classifier",
                this->params()),
    BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
                "Use pre-adapted classifier templates", this->params()),
    BOOL_MEMBER(classify_save_adapted_templates, 0,
               "Save adapted templates to a file", this->params()),
    BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
                this->params()),
    INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
    INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
    INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
               this->params()),
    double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
                  this->params()),
    double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)",
                  this->params()),
    double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
                  this->params()),
    double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
                  this->params()),
    double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
                  this->params()),
    double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
                  this->params()),
    INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
               this->params()),
    INT_MEMBER(matcher_min_examples_for_prototyping, 3,
               "Reliable Config Threshold", this->params()),
    INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
               "Enable adaption even if the ambiguities have not been seen",
               this->params()),
    double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
                  "Maximum angle delta for prototype clustering",
                  this->params()),
    double_MEMBER(classify_misfit_junk_penalty, 0.0,
                  "Penalty to apply when a non-alnum is vertically out of "
                  "its expected textline position",
                  this->params()),
    double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
    double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
                  this->params()),
    double_MEMBER(tessedit_class_miss_scale, 0.00390625,
                  "Scale factor for features not used", this->params()),
    INT_MEMBER(classify_adapt_proto_threshold, 230,
               "Threshold for good protos during adaptive 0-255",
               this->params()),
    INT_MEMBER(classify_adapt_feature_threshold, 230,
               "Threshold for good features during adaptive 0-255",
               this->params()),
    BOOL_MEMBER(disable_character_fragments, TRUE,
                "Do not include character fragments in the"
                " results of the classifier", this->params()),
    double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
                  -3.0, "Exclude fragments that do not look like whole"
                  " characters from training and adaption", this->params()),
    BOOL_MEMBER(classify_debug_character_fragments, FALSE,
                "Bring up graphical debugging windows for fragments training",
                this->params()),
    BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
                "Use two different windows for debugging the matching: "
                "One for the protos and one for the features.", this->params()),
    STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
                  this->params()),
    INT_MEMBER(classify_class_pruner_threshold, 229,
               "Class Pruner Threshold 0-255", this->params()),
    INT_MEMBER(classify_class_pruner_multiplier, 30,
               "Class Pruner Multiplier 0-255:       ", this->params()),
    INT_MEMBER(classify_cp_cutoff_strength, 7,
               "Class Pruner CutoffStrength:         ", this->params()),
    INT_MEMBER(classify_integer_matcher_multiplier, 14,
               "Integer Matcher Multiplier  0-255:   ", this->params()),
    EnableLearning(true),
    INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
               this->params()),
    BOOL_MEMBER(classify_bln_numeric_mode, 0,
                "Assume the input is numbers [0-9].", this->params()),
    shape_table_(NULL),
    dict_(&image_) {
  // Install the comparison and clear callbacks on both font tables.
  fontinfo_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontInfo));
  fontinfo_table_.set_clear_callback(
      NewPermanentTessCallback(FontInfoDeleteCallback));
  fontset_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontSet));
  fontset_table_.set_clear_callback(
      NewPermanentTessCallback(FontSetDeleteCallback));
  // Templates, bit-vector masks and normalization protos start out unset;
  // nothing is allocated for them in this constructor.
  AdaptedTemplates = NULL;
  PreTrainedTemplates = NULL;
  AllProtosOn = NULL;
  PrunedProtos = NULL;
  AllConfigsOn = NULL;
  AllProtosOff = NULL;
  AllConfigsOff = NULL;
  TempProtoMask = NULL;
  NormProtos = NULL;

  // Reset every call/adaptation counter to zero.
  AdaptiveMatcherCalls = 0;
  BaselineClassifierCalls = 0;
  CharNormClassifierCalls = 0;
  AmbigClassifierCalls = 0;
  NumWordsAdaptedTo = 0;
  NumCharsAdaptedTo = 0;
  NumBaselineClassesTried = 0;
  NumCharNormClassesTried = 0;
  NumAmbigClassesTried = 0;
  NumClassesOutput = 0;
  NumAdaptationsFailed = 0;

  // Initial feature-extraction flags; no debug windows exist yet.
  FeaturesHaveBeenExtracted = false;
  FeaturesOK = true;
  learn_debug_win_ = NULL;
  learn_fragmented_word_debug_win_ = NULL;
  learn_fragments_debug_win_ = NULL;

  // Per-class cutoff arrays (MAX_NUM_CLASSES entries each). They are
  // allocated with new[] and left uninitialized here; ~Classify() releases
  // them with delete[].
  CharNormCutoffs = new uinT16[MAX_NUM_CLASSES];
  BaselineCutoffs = new uinT16[MAX_NUM_CLASSES];
}

tesseract::Classify::~Classify ( ) [virtual]

Definition at line 173 of file classify.cpp.

                    {
  // Shut down the adaptive classifier first (EndAdaptiveClassifier is
  // defined elsewhere; its exact cleanup is not visible here).
  EndAdaptiveClassifier();
  // The constructor sets these window pointers to NULL; deleting a null
  // pointer is a no-op, so this is safe even if no window was ever created.
  delete learn_debug_win_;
  delete learn_fragmented_word_debug_win_;
  delete learn_fragments_debug_win_;
  // Arrays allocated with new[] in the constructor.
  delete[] CharNormCutoffs;
  delete[] BaselineCutoffs;
}

Member Function Documentation

int tesseract::Classify::AdaptableWord	(	TWERD *	Word,
		const WERD_CHOICE &	BestChoiceWord,
		const WERD_CHOICE &	RawChoiceWord
	)

Return TRUE if the specified word is acceptable for adaptation.

Globals: none

Parameters:

Word	current word
BestChoiceWord	best overall choice for word with context
RawChoiceWord	best choice for word without context

Returns: TRUE or FALSE

Note: Exceptions: none; History: Thu May 30 14:25:06 1991, DSJ, Created.

Definition at line 896 of file adaptmatch.cpp.

                                                              {
  const int choice_len = BestChoiceWord.length();
  // Largest adjust factor the best choice may have and still be adaptable.
  const float score_limit =
      getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;

  // Structural checks on the word itself - cheapest, so they run first.
  if (choice_len <= 0)
    return 0;
  if (choice_len != Word->NumBlobs())
    return 0;
  if (choice_len > MAX_ADAPTABLE_WERD_SIZE)
    return 0;

  // Dictionary checks, in the same order as a short-circuit && chain.
  if (!(getDict().CurrentBestChoiceAdjustFactor() <= score_limit))
    return 0;
  if (!getDict().AlternativeChoicesWorseThan(score_limit))
    return 0;
  return getDict().CurrentBestChoiceIs(BestChoiceWord) ? 1 : 0;
}

void tesseract::Classify::AdaptiveClassifier	(	TBLOB *	Blob,
		const DENORM &	denorm,
		BLOB_CHOICE_LIST *	Choices,
		CLASS_PRUNER_RESULTS	CPResults
	)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Note: Exceptions: none; History: Mon Mar 11 10:00:58 1991, DSJ, Created.

Parameters:

	Blob	blob to be classified
	denorm	normalization/denormalization parameters
[out]	Choices	List of choices found by adaptive matcher.
[out]	CPResults	Array of CPResultStruct of size MAX_NUM_CLASSES is filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 178 of file adaptmatch.cpp.

                                                                  {
  // Classifies Blob with the adaptive matcher and appends the resulting
  // choices to *Choices.
  //
  //   Blob       blob to be classified
  //   denorm     normalization/denormalization parameters
  //   Choices    [out] caller-owned list that receives the choices; must not
  //              be NULL
  //   CPResults  [out] optional; when non-NULL it receives a copy of the
  //              first Results->NumMatches entries of the class pruner
  //              results
  assert(Choices != NULL);
  ADAPT_RESULTS *Results = new ADAPT_RESULTS();

  // Create the adapted templates lazily on first use.
  if (AdaptedTemplates == NULL)
    AdaptedTemplates = NewAdaptedTemplates (true);

  Results->Initialize();

  DoAdaptiveMatch(Blob, denorm, Results);
  if (CPResults != NULL)
    memcpy(CPResults, Results->CPResults,
           sizeof(CPResults[0]) * Results->NumMatches);

  // Drop bad matches before sorting so that they never need to be sorted.
  RemoveBadMatches(Results);
  qsort((void *)Results->match, Results->NumMatches,
        sizeof(ScoredClass), CompareByRating);
  RemoveExtraPuncs(Results);
  ConvertMatchesToChoices(denorm, Blob->bounding_box(), Results, Choices);

  if (matcher_debug_level >= 1) {
    cprintf ("AD Matches =  ");
    PrintAdaptiveMatchResults(stdout, Results);
  }

  // Simple speckle filtering.
  if (LargeSpeckle(Blob))
    AddLargeSpeckleTo(Choices);

#ifndef GRAPHICS_DISABLED
  if (classify_enable_adaptive_debugger)
    DebugAdaptiveClassifier(Blob, denorm, Results);
#endif

  NumClassesOutput += Choices->length();
  if (Choices->length() == 0) {
    if (!classify_bln_numeric_mode)
      tprintf ("Empty classification!\n");  // Should never normally happen.
    // Add the fallback choice to the caller's list. Choices is a by-value
    // pointer parameter, so rebinding it to a freshly allocated list (as
    // was done previously) left the caller's list empty and leaked the new
    // list together with the choice stored in it.
    BLOB_CHOICE_IT temp_it;
    temp_it.set_to_list(Choices);
    temp_it.add_to_end(
        new BLOB_CHOICE(0, 50.0f, -20.0f, -1, -1, NULL, 0, 0, false));
  }

  delete Results;
}                                /* AdaptiveClassifier */

bool tesseract::Classify::AdaptiveClassifierIsFull ( ) [inline]

Definition at line 319 of file classify.h.

{
  // Reported as full as soon as at least one adaptation has failed.
  const bool any_adaptation_failed = NumAdaptationsFailed > 0;
  return any_adaptation_failed;
}

void tesseract::Classify::AdaptToChar	(	TBLOB *	Blob,
		const DENORM &	denorm,
		CLASS_ID	ClassId,
		int	FontinfoId,
		FLOAT32	Threshold
	)

Parameters:

Blob	blob to add to templates for ClassId
denorm	normalization/denormalization parameters
ClassId	class to add blob to
FontinfoId	font information from pre-trained templates
Threshold	minimum match rating to existing template

Globals:

AdaptedTemplates current set of adapted templates
AllProtosOn dummy mask to match against all protos
AllConfigsOn dummy mask to match against all configs

Returns: none

Note: Exceptions: none; History: Thu Mar 14 09:36:03 1991, DSJ, Created.

Definition at line 928 of file adaptmatch.cpp.

                                              {
  // Adapts the templates for ClassId to Blob. An empty adapted class is
  // initialized from the blob. Otherwise the blob is matched against the
  // class's configs that carry the same font id: a good match to a
  // permanent config ends the call, a good match to a temporary config
  // increases that config's count, and a poor match creates a new temporary
  // config. A temporary config that TempConfigReliable() accepts is made
  // permanent.
  int NumFeatures;
  INT_FEATURE_ARRAY IntFeatures;
  INT_RESULT_STRUCT IntResult;
  INT_CLASS IClass;
  ADAPT_CLASS Class;
  TEMP_CONFIG TempConfig;
  FEATURE_SET FloatFeatures;
  int NewTempConfigId;

  ResetFeaturesHaveBeenExtracted();
  // Counted before the class id is validated, so illegal ids are counted too.
  NumCharsAdaptedTo++;
  if (!LegalClassId (ClassId))
    return;

  Class = AdaptedTemplates->Class[ClassId];
  assert(Class != NULL);
  if (IsEmptyAdaptedClass(Class)) {
    InitAdaptedClass(Blob, denorm, ClassId, FontinfoId, Class,
                     AdaptedTemplates);
  }
  else {
    IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId);

    NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
    // NOTE(review): FloatFeatures is not freed on this early return. Whether
    // GetAdaptiveFeatures hands out a feature set when it returns <= 0 is
    // not visible here - confirm there is no leak.
    if (NumFeatures <= 0)
      return;

    im_.SetBaseLineMatch();
    // Only match configs with the matching font.
    // The mask is sized by MAX_NUM_PROTOS but indexed by config id below.
    BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
    for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
      if (GetFontinfoId(Class, cfg) == FontinfoId) {
        SET_BIT(MatchingFontConfigs, cfg);
      } else {
        reset_bit(MatchingFontConfigs, cfg);
      }
    }
    im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
              NumFeatures, IntFeatures,
              &IntResult, classify_adapt_feature_threshold,
              NO_DEBUG, matcher_debug_separate_windows);
    FreeBitVector(MatchingFontConfigs);

    SetAdaptiveThreshold(Threshold);

    // A rating at or below Threshold counts as a good match.
    if (IntResult.Rating <= Threshold) {
      if (ConfigIsPermanent (Class, IntResult.Config)) {
        if (classify_learning_debug_level >= 1)
          cprintf ("Found good match to perm config %d = %4.1f%%.\n",
            IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
        // Already matches a permanent config: nothing to adapt.
        FreeFeatureSet(FloatFeatures);
        return;
      }

      // Good match to a temporary config: record another sighting of it.
      TempConfig = TempConfigFor (Class, IntResult.Config);
      IncreaseConfidence(TempConfig);
      if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
        Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
      }
      if (classify_learning_debug_level >= 1)
        cprintf ("Increasing reliability of temp config %d to %d.\n",
          IntResult.Config, TempConfig->NumTimesSeen);

      if (TempConfigReliable(ClassId, TempConfig)) {
        MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, denorm,
                      Blob);
        UpdateAmbigsGroup(ClassId, denorm, Blob);
      }
    }
    else {
      // Poor match: add a new temporary config built from this blob.
      if (classify_learning_debug_level >= 1) {
        cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
          IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
        // NOTE(review): unlike the DisplayAdaptedChar call further down,
        // this one is not wrapped in #ifndef GRAPHICS_DISABLED.
        if (classify_learning_debug_level > 2)
          DisplayAdaptedChar(Blob, denorm, IClass);
      }
      NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates,
                                               ClassId,
                                               FontinfoId,
                                               NumFeatures,
                                               IntFeatures,
                                               FloatFeatures);
      // A negative id is treated as "no config was created".
      if (NewTempConfigId >= 0 &&
          TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
        MakePermanent(AdaptedTemplates, ClassId, NewTempConfigId, denorm, Blob);
        UpdateAmbigsGroup(ClassId, denorm, Blob);
      }

#ifndef GRAPHICS_DISABLED
      if (classify_learning_debug_level > 1) {
        DisplayAdaptedChar(Blob, denorm, IClass);
      }
#endif
    }
    FreeFeatureSet(FloatFeatures);
  }
}                                /* AdaptToChar */

void tesseract::Classify::AdaptToPunc	(	TBLOB *	Blob,
		const DENORM &	denorm,
		CLASS_ID	ClassId,
		int	FontinfoId,
		FLOAT32	Threshold
	)

Parameters:

Blob	blob to add to templates for ClassId
denorm	normalization/denormalization parameters
ClassId	class to add blob to
FontinfoId	font information from pre-trained templates
Threshold	minimum match rating to existing template

Globals:

PreTrainedTemplates current set of built-in templates

Note: Exceptions: none; History: Thu Mar 14 09:36:03 1991, DSJ, Created.

Definition at line 1077 of file adaptmatch.cpp.

                                              {
  // Classify the blob against the pre-trained templates and keep only the
  // matches that survive RemoveBadMatches(). The blob is adapted to only
  // when exactly one match remains.
  ADAPT_RESULTS *Results = new ADAPT_RESULTS();
  Results->Initialize();
  CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
  RemoveBadMatches(Results);

  if (Results->NumMatches == 1) {
    #ifndef SECURE_NAMES
    if (classify_learning_debug_level >= 1) {
      cprintf ("Adapting to punc = %s, thr= %g\n",
               unicharset.id_to_unichar(ClassId), Threshold);
    }
    #endif
    AdaptToChar(Blob, denorm, ClassId, FontinfoId, Threshold);
  } else if (classify_learning_debug_level >= 1) {
    // Rejected: report the class and every surviving alternative.
    cprintf ("Rejecting punc = %s (Alternatives = ",
             unicharset.id_to_unichar(ClassId));
    for (int alt = 0; alt < Results->NumMatches; ++alt) {
      tprintf("%s", unicharset.id_to_unichar(Results->match[alt].unichar_id));
    }
    tprintf(")\n");
  }

  delete Results;
}                                /* AdaptToPunc */

void tesseract::Classify::AddNewResult	(	ADAPT_RESULTS *	results,
		CLASS_ID	class_id,
		int	shape_id,
		FLOAT32	rating,
		bool	adapted,
		int	config,
		int	fontinfo_id,
		int	fontinfo_id2
	)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

matcher_bad_match_pad defines limits of an acceptable match

Parameters:

[out]	results	results to add new result to
	class_id	class of new result
	shape_id	shape index
	rating	rating of new result
	adapted	adapted match or not
	config	config id of new result
	fontinfo_id	font information of the new result
	fontinfo_id2	font information of the 2nd choice result

Note: Exceptions: none; History: Tue Mar 12 18:19:29 1991, DSJ, Created.

Definition at line 1137 of file adaptmatch.cpp.

                                              {
  // Entry already recorded for this class, if any.
  ScoredClass *existing = FindScoredUnichar(results, class_id);
  ScoredClass candidate = {
      class_id,
      shape_id,
      rating,
      adapted,
      static_cast<inT16>(config),
      static_cast<inT16>(fontinfo_id),
      static_cast<inT16>(fontinfo_id2)
  };

  // Too far behind the best rating so far: it would be stripped later anyway.
  if (rating > results->best_match.rating + matcher_bad_match_pad) {
    return;
  }
  // Not an improvement over what is already stored for this class.
  if (existing && rating >= existing->rating) {
    return;
  }

  if (!unicharset.get_fragment(class_id)) {
    results->HasNonfragment = true;
  }

  if (existing) {
    // Only the rating of the stored entry is replaced.
    existing->rating = rating;
  } else {
    results->match[results->NumMatches] = candidate;
    results->NumMatches++;
  }

  // Ensure that fragments do not affect best rating, class and config.
  // This is needed so that at least one non-fragmented character is
  // always present in the results.
  // TODO(daria): verify that this helps accuracy and does not
  // hurt performance.
  const bool beats_best = rating < results->best_match.rating;
  if (beats_best && !unicharset.get_fragment(class_id)) {
    results->best_match = candidate;
  }
}                                /* AddNewResult */

void tesseract::Classify::AmbigClassifier	(	TBLOB *	Blob,
		const DENORM &	denorm,
		INT_TEMPLATES	Templates,
		ADAPT_CLASS *	Classes,
		UNICHAR_ID *	Ambiguities,
		ADAPT_RESULTS *	Results
	)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

AllProtosOn mask that enables all protos
AllConfigsOn mask that enables all configs

Parameters:

	Blob	blob to be classified
	denorm	normalization/denormalization parameters
	Templates	built-in templates to classify against
	Classes	adapted class templates
	Ambiguities	array of class id's to match against
[out]	Results	place to put match results

Note: Exceptions: none; History: Tue Mar 12 19:40:36 1991, DSJ, Created.

Definition at line 1200 of file adaptmatch.cpp.

                                                       {
  // Scratch array with one entry per unichar; released before returning.
  uinT8* norm_array = new uinT8[unicharset.size()];
  INT_FEATURE_ARRAY int_features;
  INT_RESULT_STRUCT match_result;

  AmbigClassifierCalls++;

  const int feature_count =
      GetCharNormFeatures(Blob, denorm, Templates, int_features,
                          NULL, norm_array,
                          &(Results->BlobLength), NULL);

  // Without features there is nothing to match.
  if (feature_count > 0) {
    const bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
    if (debug) {
      tprintf("AM Matches =  ");
    }

    const int top = Blob->bounding_box().top();
    const int bottom = Blob->bounding_box().bottom();

    // Ambiguities ends at the first negative entry. Every listed class is
    // matched directly, without any class pruning.
    for (; *Ambiguities >= 0; ++Ambiguities) {
      const CLASS_ID ambig_class = *Ambiguities;

      im_.SetCharNormMatch(classify_integer_matcher_multiplier);
      im_.Match(ClassForClassId(Templates, ambig_class),
                AllProtosOn, AllConfigsOn,
                feature_count, int_features,
                &match_result,
                classify_adapt_feature_threshold, NO_DEBUG,
                matcher_debug_separate_windows);

      ExpandShapesAndApplyCorrections(NULL, debug, ambig_class, bottom, top, 0,
                                      Results->BlobLength, norm_array,
                                      match_result, Results);

      NumAmbigClassesTried++;
    }
  }

  delete [] norm_array;
}                                /* AmbigClassifier */

UNICHAR_ID * tesseract::Classify::BaselineClassifier	(	TBLOB *	Blob,
		const DENORM &	denorm,
		ADAPT_TEMPLATES	Templates,
		ADAPT_RESULTS *	Results
	)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

BaselineCutoffs expected num features for each class

Parameters:

Blob	blob to be classified
denorm	normalization/denormalization parameters
Templates	current set of adapted templates
Results	place to put match results

Returns: Array of possible ambiguous chars that should be checked.

Note: Exceptions: none; History: Tue Mar 12 19:38:03 1991, DSJ, Created.

Definition at line 1418 of file adaptmatch.cpp.

                                                                 {
  // Extracts baseline-normalized features from Blob, prunes the candidate
  // classes, runs the integer matcher over the survivors and adds the
  // matches to Results. Returns the Ambigs array of the best match's
  // config, or NULL when no features were extracted or no class matched.
  int NumFeatures;
  int NumClasses;
  INT_FEATURE_ARRAY IntFeatures;
  // Scratch array with one entry per unichar; freed on every exit path.
  uinT8* CharNormArray = new uinT8[unicharset.size()];
  CLASS_ID ClassId;

  BaselineClassifierCalls++;

  NumFeatures = GetBaselineFeatures(
      Blob, denorm, Templates->Templates, IntFeatures, CharNormArray,
      &(Results->BlobLength));
  if (NumFeatures <= 0) {
    delete [] CharNormArray;
    return NULL;
  }

  // Class pruning, using BaselineCutoffs as the expected feature counts.
  NumClasses = PruneClasses(Templates->Templates, NumFeatures, IntFeatures,
                            CharNormArray, BaselineCutoffs, Results->CPResults);

  NumBaselineClassesTried += NumClasses;

  if (matcher_debug_level >= 2 || classify_debug_level > 1)
    cprintf ("BL Matches =  ");

  im_.SetBaseLineMatch();
  MasterMatcher(Templates->Templates, NumFeatures, IntFeatures, CharNormArray,
                Templates->Class, matcher_debug_flags, NumClasses,
                Blob->bounding_box(), Results->CPResults, Results);

  delete [] CharNormArray;
  ClassId = Results->best_match.unichar_id;
  if (ClassId == NO_CLASS)
    return (NULL);
  /* this is a bug - maybe should return "" */

  // NOTE(review): this reads the config through .Perm without checking that
  // the best match's config is permanent - presumably guaranteed by the
  // matcher; confirm. The returned pointer refers to storage inside
  // Templates; nothing is allocated for it here.
  return Templates->Class[ClassId]->
      Config[Results->best_match.config].Perm->Ambigs;
}                                /* BaselineClassifier */

int tesseract::Classify::CharNormClassifier	(	TBLOB *	Blob,
		const DENORM &	denorm,
		INT_TEMPLATES	Templates,
		ADAPT_RESULTS *	Results
	)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters:

Blob	blob to be classified
denorm	normalization/denormalization parameters
Templates	templates to classify unknown against
Results	place to put match results

Globals:

CharNormCutoffs expected num features for each class
AllProtosOn mask that enables all protos
AllConfigsOn mask that enables all configs

Note: Exceptions: none. History: Tue Mar 12 16:02:52 1991, DSJ, Created.

Definition at line 1482 of file adaptmatch.cpp.

                                                         {
  // Matches the blob against Templates using character-normalized features.
  // Two normalization arrays are used: one sized for the class pruner and
  // one indexed by unichar_id for the master matcher.
  // Returns the number of features extracted, or 0 if there were none.
  INT_FEATURE_ARRAY IntFeatures;

  CharNormClassifierCalls++;

  uinT8* matcher_norm = new uinT8[unicharset.size()];
  // The pruner array must cover whichever is larger: the unicharset or the
  // number of classes in the pre-trained templates.
  int pruner_size = unicharset.size();
  if (PreTrainedTemplates->NumClasses > pruner_size)
    pruner_size = PreTrainedTemplates->NumClasses;
  uinT8* pruner_norm = new uinT8[pruner_size];

  const int feature_count = GetCharNormFeatures(Blob, denorm, Templates,
                                                IntFeatures, pruner_norm,
                                                matcher_norm,
                                                &(Results->BlobLength), NULL);
  if (feature_count <= 0) {
    delete [] matcher_norm;
    delete [] pruner_norm;
    return 0;
  }

  int class_count = PruneClasses(Templates, feature_count, IntFeatures,
                                 pruner_norm,
                                 shape_table_ != NULL ? &shapetable_cutoffs_[0]
                                                      : CharNormCutoffs,
                                 Results->CPResults);

  // In single-match mode only the top pruner candidate is passed on.
  if (tessedit_single_match && class_count > 1)
    class_count = 1;
  NumCharNormClassesTried += class_count;

  im_.SetCharNormMatch(classify_integer_matcher_multiplier);
  MasterMatcher(Templates, feature_count, IntFeatures, matcher_norm,
                NULL, matcher_debug_flags, class_count,
                Blob->bounding_box(), Results->CPResults, Results);

  delete [] matcher_norm;
  delete [] pruner_norm;
  return feature_count;
}                                /* CharNormClassifier */

int tesseract::Classify::CharNormTrainingSample	(	bool	pruner_only,
		const TrainingSample &	sample,
		GenericVector< ShapeRating > *	results
	)

Definition at line 1526 of file adaptmatch.cpp.

                                                                          {
  // Classifies a training sample against PreTrainedTemplates and writes the
  // outcome to *results as ShapeRating entries (rating = 1 - match rating).
  // With pruner_only set, only the class pruner output is reported;
  // otherwise the master matcher is run and the results are sorted with
  // ShapeRating::SortDescendingRating.
  // Returns the number of features in the sample.
  results->clear();
  ADAPT_RESULTS* matches = new ADAPT_RESULTS();
  matches->Initialize();

  // Grow a bounding box around every feature position.
  const int num_features = sample.num_features();
  TBOX blob_box;
  for (int index = 0; index < num_features; ++index) {
    const INT_FEATURE_STRUCT point = sample.features()[index];
    TBOX point_box(point.X, point.Y, point.X, point.Y);
    blob_box += point_box;
  }

  // Rebuild the char-norm feature from the values saved with the sample.
  FEATURE cn_feature = NewFeature(&CharNormDesc);
  cn_feature->Params[CharNormY] = sample.cn_feature(CharNormY);
  cn_feature->Params[CharNormLength] = sample.cn_feature(CharNormLength);
  cn_feature->Params[CharNormRx] = sample.cn_feature(CharNormRx);
  cn_feature->Params[CharNormRy] = sample.cn_feature(CharNormRy);

  uinT8* matcher_norm = new uinT8[unicharset.size()];
  int pruner_size = unicharset.size();
  if (PreTrainedTemplates->NumClasses > pruner_size)
    pruner_size = PreTrainedTemplates->NumClasses;
  uinT8* pruner_norm = new uinT8[pruner_size];

  matches->BlobLength =
      static_cast<int>(ActualOutlineLength(cn_feature) * 20 + 0.5);
  // Note: ComputeCharNormArrays also frees cn_feature.
  ComputeCharNormArrays(cn_feature, PreTrainedTemplates, matcher_norm,
                        pruner_norm);

  const int num_classes = PruneClasses(
      PreTrainedTemplates, num_features, sample.features(), pruner_norm,
      shape_table_ != NULL ? &shapetable_cutoffs_[0] : CharNormCutoffs,
      matches->CPResults);
  delete [] pruner_norm;

  if (!pruner_only) {
    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
                  matcher_norm,
                  NULL, matcher_debug_flags, num_classes,
                  blob_box, matches->CPResults, matches);
    // Convert master matcher results to output format.
    for (int m = 0; m < matches->NumMatches; m++) {
      ScoredClass scored = matches->match[m];
      results->push_back(ShapeRating(scored.shape_id, 1.0f - scored.rating));
    }
    results->sort(&ShapeRating::SortDescendingRating);
  } else {
    // Convert pruner results to output format.
    for (int k = 0; k < num_classes; ++k) {
      const int class_id = matches->CPResults[k].Class;
      int shape_id = class_id;
      if (shape_table_ != NULL) {
        // All shapes in a class have the same combination of unichars, so
        // it doesn't really matter which config we give it, as we aren't
        // trying to get the font here.
        shape_id = ClassAndConfigIDToFontOrShapeID(class_id, 0);
      }
      results->push_back(
          ShapeRating(shape_id, 1.0f - matches->CPResults[k].Rating));
    }
  }

  delete [] matcher_norm;
  delete matches;
  return num_features;
}                                /* CharNormTrainingSample */

int tesseract::Classify::ClassAndConfigIDToFontOrShapeID	(	int	class_id,
		int	int_result_config
	)		const

Definition at line 2734 of file adaptmatch.cpp.

                                                                           {
  // Looks up the font set of the pre-trained class and returns the entry
  // stored for the given config index. Returns kBlankFontinfoId when the
  // class carries no font set (negative font_set_id).
  const int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
  if (font_set_id >= 0) {
    const FontSet &font_set = fontset_table_.get(font_set_id);
    ASSERT_HOST(int_result_config >= 0 &&
                int_result_config < font_set.size);
    return font_set.configs[int_result_config];
  }
  // Older inttemps have no font_ids.
  return kBlankFontinfoId;
}

STRING tesseract::Classify::ClassIDToDebugStr	(	const INT_TEMPLATES_STRUCT *	templates,
		int	class_id,
		int	config_id
	)		const

Definition at line 2721 of file adaptmatch.cpp.

                                                                      {
  // Builds a debug string for a class/config pair. When matching against the
  // pre-trained templates with a shape table loaded, the string describes
  // the shape; in every other case it describes the unichar.
  if (templates != PreTrainedTemplates || shape_table_ == NULL)
    return unicharset.debug_str(class_id);

  const int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
  return shape_table_->DebugStr(shape_id);
}

void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS * Results )

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters:

Results results to add noise classification to

Globals:

matcher_avg_noise_size avg. length of a noise blob

Note: Exceptions: none. History: Tue Mar 12 18:36:52 1991, DSJ, Created.

Definition at line 1610 of file adaptmatch.cpp.

                                                     {
  // Adds a NO_CLASS result whose rating is r^2 / (1 + r^2), where
  // r = BlobLength / matcher_avg_noise_size. Results->BlobLength must already
  // be filled in by the caller.
  // The 'register' storage-class specifier was dropped: it is deprecated in
  // C++11 and ill-formed from C++17 on, and had no effect on the result.
  FLOAT32 Rating;

  Rating = Results->BlobLength / matcher_avg_noise_size;
  Rating *= Rating;
  Rating /= 1.0 + Rating;

  AddNewResult(Results, NO_CLASS, -1, Rating, false, -1,
               kBlankFontinfoId, kBlankFontinfoId);
}                                /* ClassifyAsNoise */

void tesseract::Classify::ClearCharNormArray ( uinT8 * char_norm_array )

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

none

Parameters:

char_norm_array array to be cleared

Note: Exceptions: none. History: Wed Feb 20 11:20:54 1991, DSJ, Created.

Definition at line 48 of file float2int.cpp.

                                                        {
  // Zero one entry per unichar in the unicharset.
  const int entry_count = unicharset.size();
  for (int id = 0; id < entry_count; ++id)
    char_norm_array[id] = 0;
}                                /* ClearCharNormArray */

void tesseract::Classify::ComputeCharNormArrays	(	FEATURE_STRUCT *	norm_feature,
		INT_TEMPLATES_STRUCT *	templates,
		uinT8 *	char_norm_array,
		uinT8 *	pruner_array
	)

Definition at line 2087 of file adaptmatch.cpp.

                                                          {
  // Fills char_norm_array (indexed by unichar_id) from norm_feature and, if
  // pruner_array is non-NULL, fills it too: a straight copy of the same
  // computation when there is no shape table, otherwise a per-class minimum
  // over the unichars of the class's shapes.
  // norm_feature is freed before returning.
  ComputeIntCharNormArray(*norm_feature, char_norm_array);

  if (pruner_array != NULL && shape_table_ == NULL) {
    ComputeIntCharNormArray(*norm_feature, pruner_array);
  } else if (pruner_array != NULL) {
    const int class_count = templates->NumClasses;
    memset(pruner_array, MAX_UINT8, class_count * sizeof(pruner_array[0]));
    // Each entry in the pruner norm array is the MIN of all the entries of
    // the corresponding unichars in the CharNormArray.
    for (int class_id = 0; class_id < class_count; ++class_id) {
      const FontSet &font_set =
          fontset_table_.get(templates->Class[class_id]->font_set_id);
      for (int cfg = 0; cfg < font_set.size; ++cfg) {
        const Shape& shape = shape_table_->GetShape(font_set.configs[cfg]);
        for (int s = 0; s < shape.size(); ++s) {
          const int unichar_id = shape[s].unichar_id;
          if (char_norm_array[unichar_id] < pruner_array[class_id])
            pruner_array[class_id] = char_norm_array[unichar_id];
        }
      }
    }
  }

  FreeFeature(norm_feature);
}

double tesseract::Classify::ComputeCorrectedRating	(	bool	debug,
		int	unichar_id,
		double	cp_rating,
		double	im_rating,
		int	feature_misses,
		int	bottom,
		int	top,
		int	blob_length,
		const uinT8 *	cn_factors
	)

Definition at line 1355 of file adaptmatch.cpp.

                                                                 {
  // Combines the integer matcher rating with three corrections and returns
  // the sum, capped at WORST_POSSIBLE_RATING:
  //   - the char-norm correction applied by im_.ApplyCNCorrection,
  //   - a penalty proportional to the number of feature misses,
  //   - a fixed penalty for non-alphanumerics whose top/bottom fall outside
  //     the range the unicharset reports for that unichar.
  // cp_rating is only used for the debug printout.
  const double cn_corrected = im_.ApplyCNCorrection(im_rating, blob_length,
                                                    cn_factors[unichar_id]);
  const double miss_penalty = tessedit_class_miss_scale * feature_misses;

  // Penalize non-alnums for being vertical misfits.
  double vertical_penalty = 0.0;
  const bool is_alnum = unicharset.get_isalpha(unichar_id) ||
                        unicharset.get_isdigit(unichar_id);
  if (!is_alnum && cn_factors[unichar_id] != 0 &&
      classify_misfit_junk_penalty > 0.0) {
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
                              &min_top, &max_top);
    if (debug) {
      tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
              top, min_top, max_top, bottom, min_bottom, max_bottom);
    }
    const bool top_misfit = top < min_top || top > max_top;
    const bool misfit =
        top_misfit || bottom < min_bottom || bottom > max_bottom;
    if (misfit)
      vertical_penalty = classify_misfit_junk_penalty;
  }

  double total = cn_corrected + miss_penalty + vertical_penalty;
  if (total > WORST_POSSIBLE_RATING)
    total = WORST_POSSIBLE_RATING;

  if (debug) {
    tprintf("%s: %2.1f(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
            unicharset.id_to_unichar(unichar_id),
            total * 100.0,
            cp_rating * 100.0,
            im_rating * 100.0,
            (cn_corrected - im_rating) * 100.0,
            cn_factors[unichar_id],
            miss_penalty * 100.0,
            vertical_penalty * 100.0);
  }
  return total;
}

void tesseract::Classify::ComputeIntCharNormArray	(	const FEATURE_STRUCT &	norm_feature,
		uinT8 *	char_norm_array
	)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

none

Parameters:

	norm_feature	character normalization feature
[out]	char_norm_array	place to put results of size unicharset.size()

Note: Exceptions: none. History: Wed Feb 20 11:20:54 1991, DSJ, Created.

Definition at line 69 of file float2int.cpp.

                                                               {
  // For every unichar id, scale its norm match by INT_CHAR_NORM_RANGE,
  // truncate to int, and clip into [0, MAX_INT_CHAR_NORM].
  const int class_count = unicharset.size();
  int class_id = 0;
  while (class_id < class_count) {
    const int scaled = static_cast<int>(INT_CHAR_NORM_RANGE *
      ComputeNormMatch(class_id, norm_feature, FALSE));
    char_norm_array[class_id] = ClipToRange(scaled, 0, MAX_INT_CHAR_NORM);
    ++class_id;
  }
}                                /* ComputeIntCharNormArray */

void tesseract::Classify::ComputeIntFeatures	(	FEATURE_SET	Features,
		INT_FEATURE_ARRAY	IntFeatures
	)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

none

Parameters:

	Features	floating point pico-features to be converted
[out]	IntFeatures	array to put converted features into

Note: Exceptions: none. History: Wed Feb 20 10:58:45 1991, DSJ, Created.

Definition at line 94 of file float2int.cpp.

                                                                 {
  // Quantizes each pico-feature's X, Y and direction into integer buckets
  // and resets its CP_misses counter. The Y shift depends on whether
  // baseline normalization is in effect.
  const FLOAT32 y_shift =
      (classify_norm_method == baseline) ? BASELINE_Y_SHIFT : Y_SHIFT;

  const int feature_count = Features->NumFeatures;
  for (int index = 0; index < feature_count; index++) {
    FEATURE source = Features->Features[index];
    INT_FEATURE_STRUCT &target = IntFeatures[index];

    target.X = BucketFor (source->Params[PicoFeatX],
      X_SHIFT, INT_FEAT_RANGE);
    target.Y = BucketFor (source->Params[PicoFeatY],
      y_shift, INT_FEAT_RANGE);
    target.Theta = CircBucketFor (source->Params[PicoFeatDir],
      ANGLE_SHIFT, INT_FEAT_RANGE);
    target.CP_misses = 0;
  }
}                                /* ComputeIntFeatures */

FLOAT32 tesseract::Classify::ComputeNormMatch	(	CLASS_ID	ClassId,
		const FEATURE_STRUCT &	feature,
		BOOL8	DebugMatch
	)

Definition at line 73 of file normmatch.cpp.

                                                     {
/*
 **     Parameters:
 **             ClassId         id of class to match against
 **             feature         character normalization feature
 **             DebugMatch      controls dump of debug info
 **     Globals:
 **             NormProtos      character normalization prototypes
 **     Operation: This routine compares feature against each character
 **             normalization proto for ClassId and keeps the smallest
 **             weighted squared distance. ClassId == NO_CLASS is handled
 **             separately with a fixed-weight distance.
 **     Return: 1.0 - NormEvidenceOf(best distance).
 **     Exceptions: none
 **     History: Wed Dec 19 16:56:12 1990, DSJ, Created.
 */
  LIST Protos;
  FLOAT32 BestMatch;
  FLOAT32 Match;
  FLOAT32 Delta;
  PROTOTYPE *Proto;
  // The former ProtoId counter was incremented but never read, so it has
  // been removed.

  /* handle requests for classification as noise */
  if (ClassId == NO_CLASS) {
    /* kludge - clean up constants and make into control knobs later */
    Match = (feature.Params[CharNormLength] *
      feature.Params[CharNormLength] * 500.0 +
      feature.Params[CharNormRx] *
      feature.Params[CharNormRx] * 8000.0 +
      feature.Params[CharNormRy] *
      feature.Params[CharNormRy] * 8000.0);
    return (1.0 - NormEvidenceOf (Match));
  }

  BestMatch = MAX_FLOAT32;
  Protos = NormProtos->Protos[ClassId];

  if (DebugMatch) {
    tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
  }

  iterate(Protos) {
    Proto = (PROTOTYPE *) first_node (Protos);

    // Y position term.
    Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
    Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
    if (DebugMatch) {
      tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
              Proto->Mean[CharNormY], Delta,
              Proto->Weight.Elliptical[CharNormY], Match);
    }

    // Rx term (reported as "Height" in the debug output).
    Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
    Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
    if (DebugMatch) {
      tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
              Proto->Mean[CharNormRx], Delta,
              Proto->Weight.Elliptical[CharNormRx], Match);
    }

    // Ry is width! See intfx.cpp.
    Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
    if (DebugMatch) {
      tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
              Proto->Mean[CharNormRy], Delta,
              Proto->Weight.Elliptical[CharNormRy]);
    }
    // The width term is additionally scaled by kWidthErrorWeighting.
    Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
    Delta *= kWidthErrorWeighting;
    Match += Delta;
    if (DebugMatch) {
      tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
              Match, Match / classify_norm_adj_midpoint,
              NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
    }

    if (Match < BestMatch)
      BestMatch = Match;
  }
  return 1.0 - NormEvidenceOf(BestMatch);
}                                /* ComputeNormMatch */

void tesseract::Classify::ConvertMatchesToChoices	(	const DENORM &	denorm,
		const TBOX &	box,
		ADAPT_RESULTS *	Results,
		BLOB_CHOICE_LIST *	Choices
	)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1670 of file adaptmatch.cpp.

                                                                  {
  // Appends one BLOB_CHOICE to Choices for each entry of Results->match,
  // up to max_matches entries, and then overwrites Results->NumMatches with
  // the number of choices actually appended.
  assert(Choices != NULL);
  FLOAT32 Rating;
  FLOAT32 Certainty;
  BLOB_CHOICE_IT temp_it;
  // Becomes true once at least one non-fragment choice has been added.
  bool contains_nonfrag = false;
  temp_it.set_to_list(Choices);
  // Number of choices appended by this call.
  int choices_length = 0;
  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
  // number of returned results, but with a shape_table_ we want to have room
  // for at least the biggest shape (which might contain hundreds of Indic
  // grapheme fragments) and more, so use double the size of the biggest shape
  // if that is more than the default.
  int max_matches = MAX_MATCHES;
  if (shape_table_ != NULL) {
    max_matches = shape_table_->MaxNumUnichars() * 2;
    if (max_matches < MAX_MATCHES)
      max_matches = MAX_MATCHES;
  }

  for (int i = 0; i < Results->NumMatches; i++) {
    ScoredClass next = Results->match[i];
    int fontinfo_id = next.fontinfo_id;
    int fontinfo_id2 = next.fontinfo_id2;
    bool adapted = next.adapted;
    bool current_is_frag = (unicharset.get_fragment(next.unichar_id) != NULL);
    // Only one slot is left and everything so far is a fragment: skip further
    // fragments so the slot stays free for a non-fragment.
    if (temp_it.length()+1 == max_matches &&
        !contains_nonfrag && current_is_frag) {
      continue;  // look for a non-fragmented character to fill the
                 // last spot in Choices if only fragments are present
    }
    // BlobLength can never be legally 0, this means recognition failed.
    // But we must return a classification result because some invoking
    // functions (chopper/permuter) do not anticipate a null blob choice.
    // So we need to assign a poor, but not infinitely bad score.
    if (Results->BlobLength == 0) {
      Certainty = -20;
      Rating = 100;    // should be -certainty * real_blob_length
    } else {
      // Both values start from the match rating: Rating is scaled by
      // rating_scale and the blob length, Certainty by the negated
      // dictionary certainty_scale.
      Rating = Certainty = next.rating;
      Rating *= rating_scale * Results->BlobLength;
      Certainty *= -(getDict().certainty_scale);
    }
    inT16 min_xheight, max_xheight;
    denorm.XHeightRange(next.unichar_id, unicharset, box,
                        &min_xheight, &max_xheight);
    temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty,
                                        fontinfo_id, fontinfo_id2,
                                        unicharset.get_script(next.unichar_id),
                                        min_xheight, max_xheight, adapted));
    contains_nonfrag |= !current_is_frag;  // update contains_nonfrag
    choices_length++;
    if (choices_length >= max_matches) break;
  }
  Results->NumMatches = choices_length;
}  // ConvertMatchesToChoices

void tesseract::Classify::ConvertProto	(	PROTO	Proto,
		int	ProtoId,
		INT_CLASS	Class
	)

Definition at line 528 of file intproto.cpp.

                                                                     {
/*
 ** Parameters:
 **   Proto    floating-pt proto to be converted to integer format
 **   ProtoId  id of proto; must be less than Class->NumProtos
 **   Class    integer class to add converted proto to
 ** Globals: none
 ** Operation: Scales the A, B, C, Angle and Length fields of Proto,
 **   truncates them to their integer ranges, and stores them as proto
 **   ProtoId of Class.
 ** Return: none
 ** Exceptions: none
 ** History: Fri Feb  8 11:22:43 1991, DSJ, Created.
 */
  assert(ProtoId < Class->NumProtos);

  INT_PROTO int_proto = ProtoForProtoId(Class, ProtoId);

  const FLOAT32 scaled_a = Proto->A * 128;
  int_proto->A = TruncateParam(scaled_a, -128, 127, NULL);

  // B is negated before scaling.
  const FLOAT32 scaled_b = -Proto->B * 256;
  int_proto->B = TruncateParam(scaled_b, 0, 255, NULL);

  const FLOAT32 scaled_c = Proto->C * 128;
  int_proto->C = TruncateParam(scaled_c, -128, 127, NULL);

  // Angles outside [0, 256) after scaling are stored as 0.
  const FLOAT32 scaled_angle = Proto->Angle * 256;
  int_proto->Angle =
      (scaled_angle < 0 || scaled_angle >= 256) ? 0 : (uinT8) scaled_angle;

  /* round proto length to nearest integer number of pico-features */
  const FLOAT32 pico_length = (Proto->Length / GetPicoFeatureLength()) + 0.5;
  Class->ProtoLengths[ProtoId] = TruncateParam(pico_length, 1, 255, NULL);

  if (classify_learning_debug_level >= 2)
    cprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)",
            int_proto->A, int_proto->B, int_proto->C,
            Class->ProtoLengths[ProtoId]);
}                                /* ConvertProto */

INT_TEMPLATES tesseract::Classify::CreateIntTemplates	(	CLASSES	FloatProtos,
		const UNICHARSET &	target_unicharset
	)

Definition at line 573 of file intproto.cpp.

                                                              {
/*
 ** Parameters:
 **   FloatProtos        prototypes in old floating pt format, indexed by
 **                      class id
 **   target_unicharset  unicharset whose size gives the number of classes
 **                      to convert, and whose strings are used in warnings
 ** Globals: none
 ** Operation: This routine converts from the old floating point format
 **   to the new integer format. For every class it also registers the
 **   class's font set in fontset_table_ (reusing an existing entry when
 **   an equal one is already present).
 ** Return: New set of training templates in integer format.
 ** Exceptions: none
 ** History: Thu Feb  7 14:40:42 1991, DSJ, Created.
 */
  INT_TEMPLATES IntTemplates;
  CLASS_TYPE FClass;
  INT_CLASS IClass;
  int ClassId;
  int ProtoId;
  int ConfigId;

  IntTemplates = NewIntTemplates();

  for (ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
    FClass = &(FloatProtos[ClassId]);
    // Warn about classes with no protos and no configs, except the space
    // character.
    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
      cprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
              target_unicharset.id_to_unichar(ClassId));
    }
    assert(UnusedClassIdIn(IntTemplates, ClassId));
    IClass = NewIntClass(FClass->NumProtos, FClass->NumConfigs);
    // Build a FontSet holding a copy of the class's font ids.
    FontSet fs;
    fs.size = FClass->font_set.size();
    fs.configs = new int[fs.size];
    for (int i = 0; i < fs.size; ++i) {
      fs.configs[i] = FClass->font_set.get(i);
    }
    if (this->fontset_table_.contains(fs)) {
      // An equal font set already exists: reuse its id and free our copy.
      IClass->font_set_id = this->fontset_table_.get_id(fs);
      delete[] fs.configs;
    } else {
      // New font set: the configs array is not freed here.
      // NOTE(review): presumably fontset_table_ takes ownership - confirm.
      IClass->font_set_id = this->fontset_table_.push_back(fs);
    }
    AddIntClass(IntTemplates, ClassId, IClass);

    // Convert each proto and register it with the proto and class pruners.
    for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
      AddIntProto(IClass);
      ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
      AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
                            classify_learning_debug_level >= 2);
      AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
    }

    // Convert each configuration.
    for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
      AddIntConfig(IClass);
      ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
    }
  }
  return (IntTemplates);
}                                /* CreateIntTemplates */

void tesseract::Classify::DebugAdaptiveClassifier	(	TBLOB *	Blob,
		const DENORM &	denorm,
		ADAPT_RESULTS *	Results
	)

Parameters:

Blob	blob whose classification is being debugged
denorm	normalization/denormalization parameters
Results	results of match being debugged

Globals: none

Note: Exceptions: none. History: Wed Mar 13 16:44:41 1991, DSJ, Created.

Definition at line 1743 of file adaptmatch.cpp.

                                                               {
  // Interactive debugging loop: picks the lowest-rated entry of
  // Results->match as the best match, shows it, then keeps showing whatever
  // class GetClassToDebug returns until it returns 0.
  for (int m = 0; m < Results->NumMatches; m++) {
    if (Results->match[m].rating < Results->best_match.rating)
      Results->best_match = Results->match[m];
  }

  const char *Prompt =
    "Left-click in IntegerMatch Window to continue or right click to debug...";
  CLASS_ID unichar_id = Results->best_match.unichar_id;
  int shape_id = Results->best_match.shape_id;
  bool adaptive_on = true;
  bool pretrained_on = true;

  for (;;) {
    // Describe which template sets are currently enabled.
    const char* debug_mode = "All Templates";
    if (!pretrained_on)
      debug_mode = "Adaptive Templates Only";
    else if (!adaptive_on)
      debug_mode = "PreTrained Templates Only";

    ShowMatchDisplay();
    tprintf("Debugging class %d = %s in mode %s ...",
            unichar_id, unicharset.id_to_unichar(unichar_id), debug_mode);
    if (shape_id >= 0 && shape_table_ != NULL) {
      tprintf(" from shape %s\n", shape_table_->DebugStr(shape_id).string());
    }
    ShowBestMatchFor(Blob, denorm, unichar_id, shape_id, adaptive_on,
                     pretrained_on, Results);
    UpdateMatchDisplay();

    // Ask for the next class to inspect; 0 ends the session.
    unichar_id = GetClassToDebug(Prompt, &adaptive_on,
                                 &pretrained_on, &shape_id);
    if (unichar_id == 0)
      break;
  }
}                                /* DebugAdaptiveClassifier */

void tesseract::Classify::DisplayAdaptedChar	(	TBLOB *	blob,
		const DENORM &	denorm,
		INT_CLASS_STRUCT *	int_class
	)

Definition at line 1031 of file adaptmatch.cpp.

                                                               {
  // Matches the blob's baseline features against int_class and prints the
  // best-matching config with its rating. At learning debug level >= 2 the
  // match is repeated restricted to that config, with the match display
  // shown. Compiled out entirely when GRAPHICS_DISABLED is defined.
#ifndef GRAPHICS_DISABLED
  int bloblength = 0;
  INT_FEATURE_ARRAY features;
  // The norm array is only needed as an output buffer for
  // GetBaselineFeatures; its contents are not used here.
  uinT8* norm_array = new uinT8[unicharset.size()];
  int num_features = GetBaselineFeatures(blob, denorm, PreTrainedTemplates,
                                         features,
                                         norm_array, &bloblength);
  delete [] norm_array;
  INT_RESULT_STRUCT IntResult;

  im_.Match(int_class, AllProtosOn, AllConfigsOn,
            num_features, features,
            &IntResult, classify_adapt_feature_threshold,
            NO_DEBUG, matcher_debug_separate_windows);
  cprintf ("Best match to temp config %d = %4.1f%%.\n",
    IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
  if (classify_learning_debug_level >= 2) {
    // Build a one-bit mask selecting only the best config. The shift is done
    // on an unsigned value: shifting the signed literal 1 overflows int when
    // the config index is 31.
    uinT32 ConfigMask;
    ConfigMask = static_cast<uinT32>(1) << IntResult.Config;
    ShowMatchDisplay();
    im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
              num_features, features,
              &IntResult, classify_adapt_feature_threshold,
              6 | 0x19, matcher_debug_separate_windows);
    UpdateMatchDisplay();
  }
#endif
}

void tesseract::Classify::DoAdaptiveMatch	(	TBLOB *	Blob,
		const DENORM &	denorm,
		ADAPT_RESULTS *	Results
	)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters:

Blob	blob to be classified
denorm	normalization/denormalization parameters
Results	place to put match results

Globals:

PreTrainedTemplates built-in training templates
AdaptedTemplates templates adapted for this page
matcher_great_threshold rating limit for a great match

Note: Exceptions: none. History: Tue Mar 12 08:50:11 1991, DSJ, Created.

Definition at line 1803 of file adaptmatch.cpp.

                                                       {
  // Chooses between the pre-trained (char-norm) and adapted (baseline)
  // classifiers, possibly running both, and finally adds a noise
  // classification if no non-fragment result was produced.
  AdaptiveMatcherCalls++;
  InitIntFX();

  const bool pretrained_only =
      AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min ||
      tess_cn_matching;

  if (!pretrained_only) {
    UNICHAR_ID *ambigs =
        BaselineClassifier(Blob, denorm, AdaptedTemplates, Results);
    // Fall back to the pre-trained templates when the adapted match is
    // either missing or only marginal (unless baseline-only matching is on).
    const bool marginal = Results->NumMatches > 0 &&
                          MarginalMatch (Results->best_match.rating) &&
                          !tess_bn_matching;
    if (marginal || Results->NumMatches == 0) {
      CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
    } else if (ambigs != NULL && *ambigs >= 0 && !tess_bn_matching) {
      // Good adapted match: also check the classes listed as ambiguous.
      AmbigClassifier(Blob, denorm,
                      PreTrainedTemplates,
                      AdaptedTemplates->Class,
                      ambigs,
                      Results);
    }
  } else {
    CharNormClassifier(Blob, denorm, PreTrainedTemplates, Results);
  }

  // Force the blob to be classified as noise
  // if the results contain only fragments.
  // TODO(daria): verify that this is better than
  // just adding a NULL classification.
  const bool only_fragments = !Results->HasNonfragment;
  if (only_fragments || Results->NumMatches == 0)
    ClassifyAsNoise(Results);
}   /* DoAdaptiveMatch */

void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

AdaptedTemplates current set of adapted templates
classify_save_adapted_templates TRUE if templates should be saved
classify_enable_adaptive_matcher TRUE if adaptive matcher is enabled

Note: Exceptions: none. History: Tue Mar 19 14:37:06 1991, DSJ, Created.

Definition at line 478 of file adaptmatch.cpp.

                                     {
  // Shuts the adaptive classifier down: optionally writes the adapted
  // templates to <imagefile><ADAPT_TEMPLATE_SUFFIX>, then frees the adapted
  // and pre-trained templates, the norm protos, the bit vectors and the
  // shape table, resetting each pointer to NULL.
  #ifndef SECURE_NAMES
  if (AdaptedTemplates != NULL &&
      classify_enable_adaptive_matcher && classify_save_adapted_templates) {
    STRING Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
    FILE *File = fopen (Filename.string(), "wb");
    if (File != NULL) {
      cprintf ("\nSaving adapted templates to %s ...", Filename.string());
      fflush(stdout);
      WriteAdaptedTemplates(File, AdaptedTemplates);
      cprintf ("\n");
      fclose(File);
    } else {
      cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
    }
  }
  #endif

  if (AdaptedTemplates != NULL) {
    free_adapted_templates(AdaptedTemplates);
    AdaptedTemplates = NULL;
  }

  if (PreTrainedTemplates != NULL) {
    free_int_templates(PreTrainedTemplates);
    PreTrainedTemplates = NULL;
  }

  getDict().EndDangerousAmbigs();
  FreeNormProtos();

  // AllProtosOn serves as the marker for the whole group of bit vectors.
  if (AllProtosOn != NULL) {
    FreeBitVector(AllProtosOn);
    AllProtosOn = NULL;
    FreeBitVector(PrunedProtos);
    PrunedProtos = NULL;
    FreeBitVector(AllConfigsOn);
    AllConfigsOn = NULL;
    FreeBitVector(AllProtosOff);
    AllProtosOff = NULL;
    FreeBitVector(AllConfigsOff);
    AllConfigsOff = NULL;
    FreeBitVector(TempProtoMask);
    TempProtoMask = NULL;
  }

  delete shape_table_;
  shape_table_ = NULL;
}                                /* EndAdaptiveClassifier */

void tesseract::Classify::ExpandShapesAndApplyCorrections	(	ADAPT_CLASS *	classes,
		bool	debug,
		int	class_id,
		int	bottom,
		int	top,
		float	cp_rating,
		int	blob_length,
		const uinT8 *	cn_factors,
		INT_RESULT_STRUCT &	int_result,
		ADAPT_RESULTS *	final_results
	)

Definition at line 1290 of file adaptmatch.cpp.

                                                                 {
  // Converts one integer-matcher result for class_id into one or more
  // entries in final_results, replacing the raw rating by the value returned
  // from ComputeCorrectedRating.  A non-NULL `classes` marks an adapted
  // result; NULL marks a pre-trained result.  On return int_result.Rating
  // holds the corrected rating (the minimum over all unichars of the shape
  // when the shape-table path is taken).
  // Compute the fontinfo_ids.
  int fontinfo_id = kBlankFontinfoId;
  int fontinfo_id2 = kBlankFontinfoId;
  if (classes != NULL) {
    // Adapted result.
    fontinfo_id = GetFontinfoId(classes[class_id], int_result.Config);
    if (int_result.Config2 >= 0)
      fontinfo_id2 = GetFontinfoId(classes[class_id], int_result.Config2);
  } else {
    // Pre-trained result.
    fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, int_result.Config);
    if (int_result.Config2 >= 0) {
      fontinfo_id2 = ClassAndConfigIDToFontOrShapeID(class_id,
                                                     int_result.Config2);
    }
    if (shape_table_ != NULL) {
      // Actually fontinfo_id is an index into the shape_table_ and it
      // contains a list of unchar_id/font_id pairs.
      int shape_id = fontinfo_id;
      const Shape& shape = shape_table_->GetShape(fontinfo_id);
      // Stays 0.0 if the shape has no entries, because the loop below then
      // never runs.
      double min_rating = 0.0;
      for (int c = 0; c < shape.size(); ++c) {
        int unichar_id = shape[c].unichar_id;
        fontinfo_id = shape[c].font_ids[0];
        // NOTE(review): fontinfo_id2 is overwritten inside this loop, and the
        // else-branch below uses it as a shape index.  On any iteration after
        // it has been replaced by a font id, that lookup would be indexing
        // the shape table with a font id -- confirm this is intended.
        if (shape[c].font_ids.size() > 1)
          fontinfo_id2 = shape[c].font_ids[1];
        else if (fontinfo_id2 != kBlankFontinfoId)
          fontinfo_id2 = shape_table_->GetShape(fontinfo_id2)[0].font_ids[0];
        double rating = ComputeCorrectedRating(debug, unichar_id, cp_rating,
                                               int_result.Rating,
                                               int_result.FeatureMisses,
                                               bottom, top, blob_length,
                                               cn_factors);
        // Track the smallest corrected rating seen over the shape's unichars.
        if (c == 0 || rating < min_rating)
          min_rating = rating;
        // Disabled unichars still contribute to min_rating but are not
        // reported as results.
        if (unicharset.get_enabled(unichar_id)) {
          AddNewResult(final_results, unichar_id, shape_id, rating,
                       classes != NULL, int_result.Config,
                       fontinfo_id, fontinfo_id2);
        }
      }
      int_result.Rating = min_rating;
      return;
    }
  }
  // No shape table in play: class_id is used directly as the unichar id and
  // the result is added with a shape id of -1.
  double rating = ComputeCorrectedRating(debug, class_id, cp_rating,
                                         int_result.Rating,
                                         int_result.FeatureMisses,
                                         bottom, top, blob_length,
                                         cn_factors);
  if (unicharset.get_enabled(class_id)) {
    AddNewResult(final_results, class_id, -1, rating,
                 classes != NULL, int_result.Config,
                 fontinfo_id, fontinfo_id2);
  }
  int_result.Rating = rating;
}

FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB * Blob )

Definition at line 36 of file outfeat.cpp.

                                                        {
/*
 **     Parameters:
 **             Blob            blob to extract outline-features from
 **     Operation: Converts the blob to a list of outlines, normalizes
 **             them, and turns every outline into outline-features.  When
 **             classify_norm_method is baseline the X coordinates of the
 **             resulting features are normalized as well.
 **     Return: Newly allocated feature set; it is empty when Blob is NULL.
 */
  FEATURE_SET outline_features = NewFeatureSet(MAX_OUTLINE_FEATURES);
  if (Blob == NULL) {
    return outline_features;
  }

  LIST all_outlines = ConvertBlob(Blob);

  // The scale factors are outputs of NormalizeOutlines that are not needed
  // any further here.
  FLOAT32 x_scale, y_scale;
  NormalizeOutlines(all_outlines, &x_scale, &y_scale);

  // Walk the list with a separate cursor so that the head stays available
  // for FreeOutlines below.
  LIST cursor = all_outlines;
  iterate(cursor) {
    MFOUTLINE current = (MFOUTLINE) first_node(cursor);
    ConvertToOutlineFeatures(current, outline_features);
  }

  if (classify_norm_method == baseline) {
    NormalizeOutlineX(outline_features);
  }

  FreeOutlines(all_outlines);
  return outline_features;
}                                /* ExtractOutlineFeatures */

FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB * Blob )

Definition at line 57 of file picofeat.cpp.

                                                     {
/*
 **     Parameters:
 **             Blob            blob to extract pico-features from
 **     Globals:
 **             classify_norm_method    normalization method currently specified
 **     Operation: Converts the blob to a list of outlines, normalizes
 **             them, and turns every outline into pico-features.  When
 **             classify_norm_method is baseline the X coordinates of the
 **             resulting features are normalized as well.
 **     Return: Newly allocated pico-feature set for Blob.
 */
  FEATURE_SET pico_features = NewFeatureSet(MAX_PICO_FEATURES);
  LIST all_outlines = ConvertBlob(Blob);

  // The scale factors are outputs of NormalizeOutlines that are not needed
  // any further here.
  FLOAT32 x_scale, y_scale;
  NormalizeOutlines(all_outlines, &x_scale, &y_scale);

  // Walk the list with a separate cursor so that the head stays available
  // for FreeOutlines below.
  LIST cursor = all_outlines;
  iterate(cursor) {
    MFOUTLINE current = (MFOUTLINE) first_node(cursor);
    ConvertToPicoFeatures2(current, pico_features);
  }

  if (classify_norm_method == baseline) {
    NormalizePicoX(pico_features);
  }

  FreeOutlines(all_outlines);
  return pico_features;
}                                /* ExtractPicoFeatures */

void tesseract::Classify::FreeNormProtos ( )

Definition at line 157 of file normmatch.cpp.

                              {
  // Releases the normalization protos and resets the pointer; does nothing
  // when no protos are loaded.
  if (NormProtos == NULL) {
    return;
  }

  // Free each per-class proto list before the arrays that hold them.
  for (int proto = 0; proto < NormProtos->NumProtos; ++proto) {
    FreeProtoList(&NormProtos->Protos[proto]);
  }
  Efree(NormProtos->Protos);
  Efree(NormProtos->ParamDesc);
  Efree(NormProtos);
  NormProtos = NULL;
}

UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table ( ) [inline]

Definition at line 336 of file classify.h.

                                               {
    // Gives callers direct, mutable access to the fontinfo_table_ member.
    return fontinfo_table_;
  }

UnicityTable<FontSet>& tesseract::Classify::get_fontset_table ( ) [inline]

Definition at line 339 of file classify.h.

                                             {
    // Gives callers direct, mutable access to the fontset_table_ member.
    return fontset_table_;
  }

int tesseract::Classify::GetAdaptiveFeatures	(	TBLOB *	Blob,
		INT_FEATURE_ARRAY	IntFeatures,
		FEATURE_SET *	FloatFeatures
	)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters:

	Blob	blob to extract features from
[out]	IntFeatures	array to fill with integer features
[out]	FloatFeatures	place to return actual floating-pt features

Returns:: Number of pico-features returned (0 if an error occurred)

Note:: Exceptions: none; History: Tue Mar 12 17:55:18 1991, DSJ, Created.

Definition at line 856 of file adaptmatch.cpp.

                                                              {
  // Pico-features are always extracted with baseline normalization here.
  classify_norm_method.set_value(baseline);
  FEATURE_SET pico_features = ExtractPicoFeatures(Blob);

  const int feature_count = pico_features->NumFeatures;
  if (feature_count <= UNLIKELY_NUM_FEAT) {
    // Normal case: fill the integer array and hand the float features over
    // to the caller through FloatFeatures.
    ComputeIntFeatures(pico_features, IntFeatures);
    *FloatFeatures = pico_features;
    return feature_count;
  }

  // Too many features: discard them and report failure.  Note that
  // *FloatFeatures is left untouched on this path.
  FreeFeatureSet(pico_features);
  return 0;
}                                /* GetAdaptiveFeatures */

void tesseract::Classify::GetAdaptThresholds	(	TWERD *	Word,
		const DENORM &	denorm,
		const WERD_CHOICE &	BestChoice,
		const WERD_CHOICE &	BestRawChoice,
		FLOAT32	Thresholds[]
	)

This routine tries to estimate how tight the adaptation threshold should be set for each character in the current word. In general, the routine tries to set tighter thresholds for a character when the current set of templates would have made an error on that character. It tries to set a threshold tight enough to eliminate the error. Two different sets of rules can be used to determine the desired thresholds.

Parameters:

	Word	current word
	denorm	normalization/denormalization parameters
	BestChoice	best choice for current word with context
	BestRawChoice	best choice for current word without context
[out]	Thresholds	array of thresholds to be filled in

Globals:

matcher_good_threshold
matcher_perfect_threshold
matcher_rating_margin

Returns:: none (results are returned in Thresholds)

Note:: Exceptions: none; History: Fri May 31 09:22:08 1991, DSJ, Created.

Definition at line 1864 of file adaptmatch.cpp.

                                                        {
  // The whole computation is delegated to the dictionary, which fills
  // Thresholds from the three matcher_* parameters.  Word, denorm,
  // BestChoice and BestRawChoice are not referenced in this body.
  getDict().FindClassifierErrors(matcher_perfect_threshold,
                                 matcher_good_threshold,
                                 matcher_rating_margin,
                                 Thresholds);
}                              /* GetAdaptThresholds */

UNICHAR_ID * tesseract::Classify::GetAmbiguities	(	TBLOB *	Blob,
		const DENORM &	denorm,
		CLASS_ID	CorrectClass
	)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters:

Blob	blob to get classification ambiguities for
denorm	normalization/denormalization parameters
CorrectClass	correct class for Blob

Globals:

CurrentRatings used by qsort compare routine
PreTrainedTemplates built-in templates

Returns:: String containing all possible ambiguous classes.

Note:: Exceptions: none; History: Fri Mar 15 08:08:22 1991, DSJ, Created.

Definition at line 1893 of file adaptmatch.cpp.

                                                            {
  // Classify the blob against the pre-trained templates, drop the bad
  // matches and order what remains with CompareByRating.
  ADAPT_RESULTS *matches = new ADAPT_RESULTS();
  matches->Initialize();
  CharNormClassifier(Blob, denorm, PreTrainedTemplates, matches);
  RemoveBadMatches(matches);
  qsort((void *)matches->match, matches->NumMatches,
        sizeof(ScoredClass), CompareByRating);

  // The returned array is terminated by -1, hence the extra slot.
  UNICHAR_ID *ambigs = (UNICHAR_ID *) Emalloc(sizeof(UNICHAR_ID) *
                                              (matches->NumMatches + 1));

  // Class ids are only copied when something other than the correct class
  // alone was matched; otherwise the result is just the terminator.
  const bool has_ambiguity =
      matches->NumMatches > 1 ||
      (matches->NumMatches == 1 &&
       matches->match[0].unichar_id != CorrectClass);
  int count = 0;
  if (has_ambiguity) {
    while (count < matches->NumMatches) {
      ambigs[count] = matches->match[count].unichar_id;
      ++count;
    }
  }
  ambigs[count] = -1;

  delete matches;
  return ambigs;
}                              /* GetAmbiguities */

int tesseract::Classify::GetBaselineFeatures	(	TBLOB *	Blob,
		const DENORM &	denorm,
		INT_TEMPLATES	Templates,
		INT_FEATURE_ARRAY	IntFeatures,
		uinT8 *	CharNormArray,
		inT32 *	BlobLength
	)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob. The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features. It then copies the baseline features into the IntFeatures array provided by the caller.

Parameters:

Blob	blob to extract features from
denorm	normalization/denormalization parameters
Templates	used to compute char norm adjustments
IntFeatures	array to fill with integer features
CharNormArray	array to fill with dummy char norm adjustments
BlobLength	length of blob in baseline-normalized units

Globals:

FeaturesHaveBeenExtracted TRUE if fx has been done
BaselineFeatures holds extracted baseline feat
CharNormFeatures holds extracted char norm feat
FXInfo holds misc. FX info

Returns:: Number of features extracted or 0 if an error occurred.

Note:: Exceptions: none; History: Tue May 28 10:40:52 1991, DSJ, Created.

Definition at line 1952 of file adaptmatch.cpp.

                                                     {
  // Run the feature extractor at most once; the outcome is cached in the
  // FeaturesOK / FeaturesHaveBeenExtracted state.
  if (!FeaturesHaveBeenExtracted) {
    FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
                                CharNormFeatures, &FXInfo, NULL);
    FeaturesHaveBeenExtracted = TRUE;
  }

  // The blob length is reported even when extraction failed.
  if (!FeaturesOK) {
    *BlobLength = FXInfo.NumBL;
    return 0;
  }

  // Copy the cached baseline features into the caller's array.
  for (int i = 0; i < FXInfo.NumBL; ++i) {
    IntFeatures[i] = BaselineFeatures[i];
  }

  ClearCharNormArray(CharNormArray);
  *BlobLength = FXInfo.NumBL;
  return FXInfo.NumBL;
}                              /* GetBaselineFeatures */

int tesseract::Classify::GetCharNormFeatures	(	TBLOB *	Blob,
		const DENORM &	denorm,
		INT_TEMPLATES	Templates,
		INT_FEATURE_ARRAY	IntFeatures,
		uinT8 *	PrunerNormArray,
		uinT8 *	CharNormArray,
		inT32 *	BlobLength,
		inT32 *	FeatureOutlineArray
	)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters:

Blob	blob to extract features from
denorm	normalization/denormalization parameters
Templates	used to compute char norm adjustments
IntFeatures	array to fill with integer features
PrunerNormArray	Array of factors from blob normalization process
CharNormArray	array to fill with dummy char norm adjustments
BlobLength	length of blob in baseline-normalized units
FeatureOutlineArray	optional (may be NULL) array to fill with the outline index of each char norm feature

Globals:

FeaturesHaveBeenExtracted TRUE if fx has been done
BaselineFeatures holds extracted baseline feat
CharNormFeatures holds extracted char norm feat
FXInfo holds misc. FX info

Returns:: Number of features extracted or 0 if an error occurred.

Note:: Exceptions: none; History: Tue May 28 10:40:52 1991, DSJ, Created.

Definition at line 2040 of file adaptmatch.cpp.

                                                              {
  // Scratch buffer that receives the per-feature outline index from the
  // extractor.
  // NOTE(review): it is only filled when the extractor runs in this call;
  // if the features were already extracted earlier, the copy into
  // FeatureOutlineArray below appears to read uninitialized values -- confirm.
  inT32 outline_index[MAX_NUM_INT_FEATURES];

  // Run the feature extractor at most once; the outcome is cached in the
  // FeaturesOK / FeaturesHaveBeenExtracted state.
  if (!FeaturesHaveBeenExtracted) {
    FeaturesOK = ExtractIntFeat(Blob, denorm, BaselineFeatures,
                                CharNormFeatures, &FXInfo,
                                outline_index);
    FeaturesHaveBeenExtracted = TRUE;
  }

  // The blob length is reported even when extraction failed.
  if (!FeaturesOK) {
    *BlobLength = FXInfo.NumBL;
    return 0;
  }

  // Copy the cached char norm features into the caller's array.
  for (int i = 0; i < FXInfo.NumCN; ++i) {
    IntFeatures[i] = CharNormFeatures[i];
  }
  // The outline index array is optional.
  if (FeatureOutlineArray != NULL) {
    for (int i = 0; i < FXInfo.NumCN; ++i) {
      FeatureOutlineArray[i] = outline_index[i];
    }
  }

  // Build the char norm feature from the extractor statistics and let
  // ComputeCharNormArrays fill both normalization arrays from it.
  FEATURE norm_feature = NewFeature(&CharNormDesc);
  const FLOAT32 baseline_offset = BASELINE_OFFSET;
  const FLOAT32 scale = MF_SCALE_FACTOR;
  norm_feature->Params[CharNormY] = (FXInfo.Ymean - baseline_offset) * scale;
  norm_feature->Params[CharNormLength] =
    FXInfo.Length * scale / LENGTH_COMPRESSION;
  norm_feature->Params[CharNormRx] = FXInfo.Rx * scale;
  norm_feature->Params[CharNormRy] = FXInfo.Ry * scale;
  ComputeCharNormArrays(norm_feature, Templates, CharNormArray,
                        PrunerNormArray);

  *BlobLength = FXInfo.NumBL;
  return FXInfo.NumCN;
}                              /* GetCharNormFeatures */

CLASS_ID tesseract::Classify::GetClassToDebug	(	const char *	Prompt,
		bool *	adaptive_on,
		bool *	pretrained_on,
		int *	shape_id
	)

Definition at line 1432 of file intproto.cpp.

                                                                       {
/*
 ** Parameters:
 **   Prompt         prompt to print while waiting for input from window
 **   adaptive_on    out: set to true when the adaptive classifier was chosen
 **   pretrained_on  out: set to true when the pre-trained classifier was chosen
 **   shape_id       out: shape index entered by the user for a shape-index
 **                  command; -1 for the adaptive command or when no shape
 **                  table is loaded; left untouched otherwise
 ** Globals: none
 ** Operation: This routine prompts the user with Prompt and waits
 **   for the user to enter something in the debug window.  Popup events
 **   select either a shape index or a character class; the loop ends
 **   without a selection when a click event arrives.
 ** Return: Unichar id selected in the debug window, or 0 if the loop was
 **   ended by a click.
 ** Exceptions: none
 ** History: Thu Mar 21 16:55:13 1991, DSJ, Created.
 */
  tprintf("%s\n", Prompt);
  SVEvent* ev;
  SVEventType ev_type;
  int unichar_id = INVALID_UNICHAR_ID;
  // Wait until a click or popup event.
  do {
    ev = IntMatchWindow->AwaitEvent(SVET_ANY);
    ev_type = ev->type;
    if (ev_type == SVET_POPUP) {
      if (ev->command_id == IDA_SHAPE_INDEX) {
        if (shape_table_ != NULL) {
          *shape_id = atoi(ev->parameter);
          *adaptive_on = false;
          *pretrained_on = true;
          if (*shape_id >= 0 && *shape_id < shape_table_->NumShapes()) {
            int font_id;
            shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id,
                                                 &font_id);
            tprintf("Shape %d, first unichar=%d, font=%d\n",
                    *shape_id, unichar_id, font_id);
            // The event is deleted at the bottom of the loop on every other
            // path, so it must be released here too before leaving early.
            delete ev;
            return unichar_id;
          }
          tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
        } else {
          tprintf("No shape table loaded!\n");
        }
      } else {
        if (unicharset.contains_unichar(ev->parameter)) {
          unichar_id = unicharset.unichar_to_id(ev->parameter);
          if (ev->command_id == IDA_ADAPTIVE) {
            *adaptive_on = true;
            *pretrained_on = false;
            *shape_id = -1;
          } else if (ev->command_id == IDA_STATIC) {
            *adaptive_on = false;
            *pretrained_on = true;
          } else {
            *adaptive_on = true;
            *pretrained_on = true;
          }
          if (ev->command_id == IDA_ADAPTIVE || shape_table_ == NULL) {
            *shape_id = -1;
            // Same as above: release the event before the early return.
            delete ev;
            return unichar_id;
          }
          // A shape table is loaded: list every shape that contains the
          // chosen unichar, then keep waiting for further events.
          for (int s = 0; s < shape_table_->NumShapes(); ++s) {
            if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
              tprintf("%s\n", shape_table_->DebugStr(s).string());
            }
          }
        } else {
          tprintf("Char class '%s' not found in unicharset",
                  ev->parameter);
        }
      }
    }
    delete ev;
  } while (ev_type != SVET_CLICK);
  return 0;
}                                /* GetClassToDebug */

Dict& tesseract::Classify::getDict ( ) [inline]

Definition at line 62 of file classify.h.

                  {
    // Gives callers direct, mutable access to the dict_ member.
    return dict_;
  }

int tesseract::Classify::GetFontinfoId	(	ADAPT_CLASS	Class,
		uinT8	ConfigId
	)

Definition at line 190 of file adaptive.cpp.

                                                             {
  // The font info id is stored on the config itself; which accessor applies
  // depends on whether the config is permanent or still temporary.
  if (ConfigIsPermanent(Class, ConfigId)) {
    return PermConfigFor(Class, ConfigId)->FontinfoId;
  }
  return TempConfigFor(Class, ConfigId)->FontinfoId;
}

void tesseract::Classify::InitAdaptedClass	(	TBLOB *	Blob,
		const DENORM &	denorm,
		CLASS_ID	ClassId,
		int	FontinfoId,
		ADAPT_CLASS	Class,
		ADAPT_TEMPLATES	Templates
	)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters:

Blob	blob to model new class after
denorm	normalization/denormalization parameters
ClassId	id of the class to be initialized
FontinfoId	font information inferred from pre-trained templates
Class	adapted class to be initialized
Templates	adapted templates to add new class to

Globals:

AllProtosOn dummy mask with all 1's
BaselineCutoffs kludge needed to get cutoffs
PreTrainedTemplates kludge needed to get cutoffs

Note:: Exceptions: none; History: Thu Mar 14 12:49:39 1991, DSJ, Created.

Definition at line 760 of file adaptmatch.cpp.

                                                           {
  // Builds the first (temporary) config of an adapted class from the
  // outline features of Blob: every feature becomes one temp proto that is
  // added to the class's integer templates and to config 0.
  FEATURE_SET Features;
  int Fid, Pid;
  FEATURE Feature;
  int NumFeatures;
  TEMP_PROTO TempProto;
  PROTO Proto;
  INT_CLASS IClass;
  TEMP_CONFIG Config;

  // Outline features are always extracted with baseline normalization here.
  classify_norm_method.set_value(baseline);
  Features = ExtractOutlineFeatures(Blob);
  NumFeatures = Features->NumFeatures;
  // Give up (leaving the class untouched) when there are no features or an
  // implausibly large number of them.
  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
    FreeFeatureSet(Features);
    return;
  }

  // The new config is stored as config 0 of the class.
  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
  TempConfigFor(Class, 0) = Config;

  /* this is a kludge to construct cutoffs for adapted templates */
  if (Templates == AdaptedTemplates)
    BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];

  IClass = ClassForClassId (Templates->Templates, ClassId);

  // One integer proto / temp proto pair per extracted feature.
  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
    Pid = AddIntProto (IClass);
    assert (Pid != NO_PROTO);

    Feature = Features->Features[Fid];
    TempProto = NewTempProto ();
    Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Angle = Feature->Params[OutlineFeatDir];
    Proto->X = Feature->Params[OutlineFeatX];
    Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
    Proto->Length = Feature->Params[OutlineFeatLength];
    FillABC(Proto);

    // Record the proto id on the temp proto and mark it as part of the
    // new config.
    TempProto->ProtoId = Pid;
    SET_BIT (Config->Protos, Pid);

    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push (Class->TempProtos, TempProto);
  }
  FreeFeatureSet(Features);

  // Register the config in the integer class with all protos enabled.
  AddIntConfig(IClass);
  ConvertConfig (AllProtosOn, 0, IClass);

  if (classify_learning_debug_level >= 1) {
    cprintf ("Added new class '%s' with class id %d and %d protos.\n",
             unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
    if (classify_learning_debug_level > 1)
      DisplayAdaptedChar(Blob, denorm, IClass);
  }

  // NOTE(review): the counter is bumped while IsEmptyAdaptedClass still
  // reports the class as empty; presumably "empty" there refers to
  // permanent configs/protos only -- confirm against its definition.
  if (IsEmptyAdaptedClass(Class))
    (Templates->NumNonEmptyClasses)++;
}                                /* InitAdaptedClass */

void tesseract::Classify::InitAdaptiveClassifier ( bool load_pre_trained_templates )

This routine reads in the training information needed by the adaptive classifier and saves it into global variables. Parameters: load_pre_trained_templates: indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file. Globals: BuiltInTemplatesFile: file to get built-in temps from; BuiltInCutoffsFile: file to get avg. feat per class from; classify_use_pre_adapted_templates: enables use of pre-adapted templates.

Note:: History: Mon Mar 11 12:49:34 1991, DSJ, Created.

Definition at line 547 of file adaptmatch.cpp.

                                                                     {
  // Nothing is set up at all when the adaptive matcher is disabled.
  if (!classify_enable_adaptive_matcher)
    return;
  // AllProtosOn is allocated further down, so a non-NULL value means a
  // previous initialization is still live.
  if (AllProtosOn != NULL)
    EndAdaptiveClassifier();  // Don't leak with multiple inits.

  // If there is no language_data_path_prefix, the classifier will be
  // adaptive only.
  if (language_data_path_prefix.length() > 0 &&
      load_pre_trained_templates) {
    // inttemp is mandatory once pre-trained templates were requested.
    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
    PreTrainedTemplates =
      ReadIntTemplates(tessdata_manager.GetDataFilePtr());
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");

    // The shape table is optional; a failed load is reported and the table
    // is dropped again.
    if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) {
      shape_table_ = new ShapeTable(unicharset);
      if (!shape_table_->DeSerialize(tessdata_manager.swap(),
                                     tessdata_manager.GetDataFilePtr())) {
        tprintf("Error loading shape table!\n");
        delete shape_table_;
        shape_table_ = NULL;
      } else if (tessdata_manager.DebugLevel() > 0) {
        tprintf("Successfully loaded shape table!\n");
      }
    }

    // pffmtable (cutoffs) and normproto are mandatory as well.
    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
    ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
                   tessdata_manager.swap(),
                   tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
                   CharNormCutoffs);
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");

    ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
    NormProtos =
      ReadNormProtos(tessdata_manager.GetDataFilePtr(),
                     tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
    if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
  }

  im_.Init(&classify_debug_level, classify_integer_matcher_multiplier);
  InitIntegerFX();

  // Allocate the shared bit-vector masks.  The *On vectors (and
  // PrunedProtos) start with all bits set, the *Off vectors with all bits
  // cleared.
  // NOTE(review): TempProtoMask is allocated but neither set nor zeroed
  // here -- confirm that every user initializes it before reading.
  AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
  PrunedProtos = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
  AllProtosOff = NewBitVector(MAX_NUM_PROTOS);
  AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
  TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
  set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(PrunedProtos, WordsInVectorOfSize(MAX_NUM_PROTOS));
  set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
  zero_all_bits(AllProtosOff, WordsInVectorOfSize(MAX_NUM_PROTOS));
  zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));

  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
     BaselineCutoffs[i] = 0;
  }

  if (classify_use_pre_adapted_templates) {
    FILE *File;
    STRING Filename;

    // Pre-adapted templates live next to the image file; if that file
    // cannot be opened, start from fresh templates instead.
    Filename = imagefile;
    Filename += ADAPT_TEMPLATE_SUFFIX;
    File = fopen(Filename.string(), "rb");
    // NOTE(review): unlike the else-branch at the bottom, neither path here
    // frees an existing AdaptedTemplates before overwriting the pointer.
    if (File == NULL) {
      AdaptedTemplates = NewAdaptedTemplates(true);
    } else {
      #ifndef SECURE_NAMES
      cprintf("\nReading pre-adapted templates from %s ...\n",
              Filename.string());
      fflush(stdout);
      #endif
      AdaptedTemplates = ReadAdaptedTemplates(File);
      cprintf("\n");
      fclose(File);
      PrintAdaptedTemplates(stdout, AdaptedTemplates);

      // Seed the baseline cutoffs of every loaded class from the char norm
      // cutoffs.
      for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
        BaselineCutoffs[i] = CharNormCutoffs[i];
      }
    }
  } else {
    // Always start from fresh adapted templates, releasing any old set.
    if (AdaptedTemplates != NULL)
      free_adapted_templates(AdaptedTemplates);
    AdaptedTemplates = NewAdaptedTemplates(true);
  }
}                                /* InitAdaptiveClassifier */

void tesseract::Classify::LearnPieces	(	const char *	filename,
		int	start,
		int	length,
		float	threshold,
		CharSegmentationType	segmentation,
		const char *	correct_text,
		WERD_RES *	word
	)

Definition at line 396 of file adaptmatch.cpp.

                                                                     {
  // Learns (training, filename != NULL) or adapts to (filename == NULL) the
  // character made of `length` chopped blobs starting at index `start`.
  // The pieces are joined for the duration of the call and broken apart
  // again at the end.
  // TODO(daria) Remove/modify this if/when we want
  // to train and/or adapt to n-grams.
  // Only whole characters are handled, plus fragments unless they are
  // disabled.
  if (segmentation != CST_WHOLE &&
      (segmentation != CST_FRAGMENT || disable_character_fragments))
    return;

  if (length > 1) {
    join_pieces(word->chopped_word->blobs, word->seam_array,
                start, start + length - 1);
  }
  // Walk the blob list to the blob at index `start`.
  TBLOB* blob = word->chopped_word->blobs;
  for (int i = 0; i < start; ++i)
    blob = blob->next;
  // Rotate the blob if needed for classification.
  const DENORM* denorm = &word->denorm;
  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded(&denorm);
  // A NULL result means the original blob is used as is.
  if (rotated_blob == NULL)
    rotated_blob = blob;

  #ifndef GRAPHICS_DISABLED
  // Draw debug windows showing the blob that is being learned if needed.
  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
    RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
                       word->chopped_word->bounding_box());
    rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
    learn_debug_win_->Update();
    window_wait(learn_debug_win_);
  }
  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
    ASSERT_HOST(learn_fragments_debug_win_ != NULL);  // set up in LearnWord
    blob->plot(learn_fragments_debug_win_,
               ScrollView::BLUE, ScrollView::BROWN);
    learn_fragments_debug_win_->Update();
  }
  #endif  // GRAPHICS_DISABLED

  if (filename != NULL) {
    // Training: write the blob's features out via LearnBlob.
    classify_norm_method.set_value(character);  // force char norm spc 30/11/93
    tess_bn_matching.set_value(false);    // turn it off
    tess_cn_matching.set_value(false);
    LearnBlob(feature_defs_, filename, rotated_blob, *denorm,
              correct_text);
  } else if (unicharset.contains_unichar(correct_text)) {
    UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
    // Fall back to font id 0 when the word carries no font information.
    int font_id = word->fontinfo != NULL
                ? fontinfo_table_.get_id(*word->fontinfo)
                : 0;
    if (classify_learning_debug_level >= 1)
      tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
              unicharset.id_to_unichar(class_id), threshold, font_id);
    // filename is NULL on this branch, i.e. we are doing recognition
    // (as opposed to training), so the word fonts are expected to have
    // been set already.
    AdaptToChar(rotated_blob, *denorm, class_id, font_id, threshold);
  } else if (classify_debug_level >= 1) {
    tprintf("Can't adapt to %s not in unicharset\n", correct_text);
  }
  // A distinct rotated blob is deleted here, together with the denorm that
  // ClassifyNormalizeIfNeeded was given a pointer to.
  if (rotated_blob != blob) {
    delete rotated_blob;
    delete denorm;
  }

  // Undo the join from the top of the function.
  break_pieces(blob, word->seam_array, start, start + length - 1);
}  // LearnPieces.

void tesseract::Classify::LearnWord	(	const char *	filename,
		const char *	rejmap,
		WERD_RES *	word
	)

Definition at line 256 of file adaptmatch.cpp.

                                         {
  // Learns from every character of word->correct_text.  With a non-NULL
  // filename this is training; with a NULL filename it is adaption during
  // recognition, using per-character thresholds from GetAdaptThresholds.
  // rejmap, when given, supplies one character per text position; only
  // positions marked '1' are learned.
  int word_len = word->correct_text.size();
  if (word_len == 0) return;

  float* thresholds = NULL;
  if (filename == NULL) {
    // Adaption mode.
    if (!EnableLearning || word->best_choice == NULL ||
        // If word->best_choice is not recorded at the top of accumulator's
        // best choices (which could happen for choices that are
        // altered with ReplaceAmbig()) we skip the adaption.
        !getDict().CurrentBestChoiceIs(*(word->best_choice)))
      return;  // Can't or won't adapt.

    NumWordsAdaptedTo++;
    if (classify_learning_debug_level >= 1)
      tprintf("\n\nAdapting to word = %s\n",
              word->best_choice->debug_string().string());
    // Released by the delete [] at the end of the function.
    thresholds = new float[word_len];
    GetAdaptThresholds(word->rebuild_word, word->denorm, *word->best_choice,
                       *word->raw_choice, thresholds);
  }
  // Index of the first chopped blob belonging to the current character.
  int start_blob = 0;
  // Only read by the commented-out block further down.
  char prev_map_char = '0';

  #ifndef GRAPHICS_DISABLED
  if (classify_debug_character_fragments) {
    if (learn_fragmented_word_debug_win_ != NULL) {
      window_wait(learn_fragmented_word_debug_win_);
    }
    RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
                       word->chopped_word->bounding_box());
    RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
                       word->chopped_word->bounding_box());
    word->chopped_word->plot(learn_fragmented_word_debug_win_);
    ScrollView::Update();
  }
  #endif  // GRAPHICS_DISABLED

  for (int ch = 0; ch < word_len; ++ch) {
    if (classify_debug_character_fragments) {
      tprintf("\nLearning %s\n",  word->correct_text[ch].string());
    }
    // Without a rejmap every character counts as accepted ('1').
    char rej_map_char = rejmap != NULL ? *rejmap++ : '1';

    if (word->correct_text[ch].length() > 0 && rej_map_char == '1') {
      // In training mode there are no thresholds, so 0 is passed on.
      float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;

      // First learn the character as a whole.
      LearnPieces(filename, start_blob, word->best_state[ch],
                  threshold, CST_WHOLE, word->correct_text[ch].string(), word);

      // Then, for characters made of several blobs, optionally learn each
      // blob as a fragment.
      if (word->best_state[ch] > 1 && !disable_character_fragments) {
        // Check that the character breaks into meaningful fragments
        // that each match a whole character with at least
        // classify_character_fragments_garbage_certainty_threshold
        bool garbage = false;
        TBLOB* frag_blob = word->chopped_word->blobs;
        for (int i = 0; i < start_blob; ++i) frag_blob = frag_blob->next;
        int frag;
        for (frag = 0; frag < word->best_state[ch]; ++frag) {
          // The garbage test only runs for a negative threshold.
          if (classify_character_fragments_garbage_certainty_threshold < 0) {
            garbage |= LooksLikeGarbage(word->denorm, frag_blob);
          }
          frag_blob = frag_blob->next;
        }
        // Learn the fragments.
        if (!garbage) {
          bool pieces_all_natural = word->PiecesAllNatural(start_blob,
              word->best_state[ch]);
          if (pieces_all_natural || !prioritize_division) {
            for (frag = 0; frag < word->best_state[ch]; ++frag) {
              // Replace the first space-separated token of the correct text
              // by its fragment representation and re-join the tokens.
              GenericVector<STRING> tokens;
              word->correct_text[ch].split(' ', &tokens);

              tokens[0] = CHAR_FRAGMENT::to_string(
                  tokens[0].string(), frag, word->best_state[ch],
                  pieces_all_natural);

              STRING full_string;
              for (int i = 0; i < tokens.size(); i++) {
                full_string += tokens[i];
                if (i != tokens.size() - 1)
                  full_string += ' ';
              }
              LearnPieces(filename, start_blob + frag, 1,
                          threshold, CST_FRAGMENT, full_string.string(), word);
            }
          }
        }
      }

      // TODO(rays): re-enable this part of the code when we switch to the
      // new classifier that needs to see examples of garbage.
      /*
      char next_map_char = ch + 1 < word_len
                           ? (rejmap != NULL ? *rejmap : '1')
                           : '0';
      if (word->best_state[ch] > 1) {
        // If the next blob is good, make junk with the rightmost fragment.
        if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
            next_map_char == '1') {
          LearnPieces(filename, start_blob + word->best_state[ch] - 1,
                      word->best_state[ch + 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
        // If the previous blob is good, make junk with the leftmost fragment.
        if (ch > 0 && word->correct_text[ch - 1].length() > 0 &&
            prev_map_char == '1') {
          LearnPieces(filename, start_blob - word->best_state[ch - 1],
                      word->best_state[ch - 1] + 1,
                      threshold, CST_IMPROPER, INVALID_UNICHAR, word);
        }
      }
      // If the next blob is good, make a join with it.
      if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0 &&
          next_map_char == '1') {
        STRING joined_text = word->correct_text[ch];
        joined_text += word->correct_text[ch + 1];
        LearnPieces(filename, start_blob,
                    word->best_state[ch] + word->best_state[ch + 1],
                    threshold, CST_NGRAM, joined_text.string(), word);
      }
      */
    }
    // Advance past the blobs of this character whether or not it was
    // learned.
    start_blob += word->best_state[ch];
    prev_map_char = rej_map_char;
  }
  delete [] thresholds;
}  // LearnWord.

bool tesseract::Classify::LooksLikeGarbage	(	const DENORM &	denorm,
		TBLOB *	blob
	)

Definition at line 1986 of file adaptmatch.cpp.

                                                                 {
  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
  AdaptiveClassifier(blob, denorm, ratings, NULL);
  BLOB_CHOICE_IT ratings_it(ratings);
  const UNICHARSET &unicharset = getDict().getUnicharset();
  if (classify_debug_character_fragments) {
    print_ratings_list("======================\nLooksLikeGarbage() got ",
                       ratings, unicharset);
  }
  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
       ratings_it.forward()) {
    if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != NULL) {
      continue;
    }
    delete ratings;
    return (ratings_it.data()->certainty() <
            classify_character_fragments_garbage_certainty_threshold);
  }
  delete ratings;
  return true;  // no whole characters in ratings
}

int tesseract::Classify::MakeNewTemporaryConfig	(	ADAPT_TEMPLATES	Templates,
		CLASS_ID	ClassId,
		int	FontinfoId,
		int	NumFeatures,
		INT_FEATURE_ARRAY	Features,
		FEATURE_SET	FloatFeatures
	)

Parameters:

Templates	adapted templates to add new config to
ClassId	class id to associate with new config
FontinfoId	font information inferred from pre-trained templates
NumFeatures	number of features in IntFeatures
Features	features describing model for new config
FloatFeatures	floating-pt representation of features

Returns:: The id of the new config created, a negative integer in case of error.

Note:: Exceptions: none; History: Fri Mar 15 08:49:46 1991, DSJ, Created.

Definition at line 2131 of file adaptmatch.cpp.

                                                      {
  // Integer-matcher debug flags: full match detail from learning debug
  // level 3 upwards, otherwise silent.
  int matcher_debug = NO_DEBUG;
  if (classify_learning_debug_level >= 3) {
    matcher_debug =
        PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES;
  }

  INT_CLASS int_class = ClassForClassId(Templates->Templates, ClassId);
  ADAPT_CLASS adapt_class = Templates->Class[ClassId];

  // Refuse to grow the class beyond the config limit.
  if (int_class->NumConfigs >= MAX_NUM_CONFIGS) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temporary config: maximum number exceeded.\n");
    return -1;
  }

  // Remembered only so the debug message can report how many protos are new.
  const int prev_max_proto_id = int_class->NumProtos - 1;
  const int blob_length = 0;

  // Step 1: find the existing protos that already match the features and
  // record them in TempProtoMask.
  PROTO_ID good_protos[MAX_NUM_PROTOS];
  const int num_good_protos =
      im_.FindGoodProtos(int_class, AllProtosOn, AllConfigsOff,
                         blob_length, NumFeatures, Features,
                         good_protos, classify_adapt_proto_threshold,
                         matcher_debug);

  const int mask_words = WordsInVectorOfSize(MAX_NUM_PROTOS);
  zero_all_bits(TempProtoMask, mask_words);
  for (int p = 0; p < num_good_protos; ++p)
    SET_BIT(TempProtoMask, good_protos[p]);

  // Step 2: find the features that the selected protos do not explain and
  // turn them into new temporary protos.
  FEATURE_ID unmatched_features[MAX_NUM_INT_FEATURES];
  const int num_unmatched =
      im_.FindBadFeatures(int_class, TempProtoMask, AllConfigsOn,
                          blob_length, NumFeatures, Features,
                          unmatched_features,
                          classify_adapt_feature_threshold,
                          matcher_debug);

  const int max_proto_id =
      MakeNewTempProtos(FloatFeatures, num_unmatched, unmatched_features,
                        int_class, adapt_class, TempProtoMask);
  if (max_proto_id == NO_PROTO) {
    ++NumAdaptationsFailed;
    if (classify_learning_debug_level >= 1)
      cprintf("Cannot make new temp protos: maximum number exceeded.\n");
    return -1;
  }

  // Step 3: create the config in both the integer class and the adapted
  // class, and copy the proto mask into it.
  const int new_config_id = AddIntConfig(int_class);
  ConvertConfig(TempProtoMask, new_config_id, int_class);
  TEMP_CONFIG new_config = NewTempConfig(max_proto_id, FontinfoId);
  TempConfigFor(adapt_class, new_config_id) = new_config;
  copy_all_bits(TempProtoMask, new_config->Protos,
                new_config->ProtoVectorSize);

  if (classify_learning_debug_level >= 1) {
    cprintf("Making new temp config %d fontinfo id %d"
            " using %d old and %d new protos.\n",
            new_config_id, new_config->FontinfoId,
            num_good_protos, max_proto_id - prev_max_proto_id);
  }

  return new_config_id;
}                              /* MakeNewTemporaryConfig */

PROTO_ID tesseract::Classify::MakeNewTempProtos	(	FEATURE_SET	Features,
		int	NumBadFeat,
		FEATURE_ID	BadFeat[],
		INT_CLASS	IClass,
		ADAPT_CLASS	Class,
		BIT_VECTOR	TempProtoMask
	)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters:

Features	floating-pt features describing new character
NumBadFeat	number of bad features to turn into protos
BadFeat	feature id's of bad features
IClass	integer class templates to add new protos to
Class	adapted class templates to add new protos to
TempProtoMask	proto mask to add new protos to

Globals: none

Returns:: Max proto id in class after all protos have been added, or NO_PROTO if AddIntProto could not allocate another proto. Exceptions: none. History: Fri Mar 15 11:39:38 1991, DSJ, Created.

Definition at line 2228 of file adaptmatch.cpp.

                                                               {
  // Walk BadFeat as a sequence of runs. Each run starts at run_start and is
  // extended while the following features stay within the angle tolerance
  // and within the accumulated segment length of the run's first feature.
  // Every run becomes one new temporary proto.
  FEATURE_ID *const bad_end = BadFeat + NumBadFeat;
  FEATURE_ID *run_start = BadFeat;

  while (run_start < bad_end) {
    FEATURE first = Features->Features[*run_start];
    FLOAT32 X1 = first->Params[PicoFeatX];
    FLOAT32 Y1 = first->Params[PicoFeatY];
    FLOAT32 A1 = first->Params[PicoFeatDir];

    // Grow the run; SegmentLength gains one pico-feature length per member.
    FLOAT32 SegmentLength = GetPicoFeatureLength();
    FEATURE_ID *run_end = run_start + 1;
    for (; run_end < bad_end;
         run_end++, SegmentLength += GetPicoFeatureLength()) {
      FEATURE candidate = Features->Features[*run_end];
      FLOAT32 X2 = candidate->Params[PicoFeatX];
      FLOAT32 Y2 = candidate->Params[PicoFeatY];
      FLOAT32 A2 = candidate->Params[PicoFeatDir];

      // A difference above 0.5 is folded back via 1.0 - delta.
      FLOAT32 AngleDelta = fabs(A1 - A2);
      if (AngleDelta > 0.5)
        AngleDelta = 1.0 - AngleDelta;

      if (AngleDelta > matcher_clustering_max_angle_delta ||
          fabs(X1 - X2) > SegmentLength ||
          fabs(Y1 - Y2) > SegmentLength)
        break;
    }

    // The last feature that made it into the run supplies the far endpoint.
    FEATURE last = Features->Features[*(run_end - 1)];
    FLOAT32 X2 = last->Params[PicoFeatX];
    FLOAT32 Y2 = last->Params[PicoFeatY];

    PROTO_ID Pid = AddIntProto(IClass);
    if (Pid == NO_PROTO)
      return (NO_PROTO);

    TEMP_PROTO TempProto = NewTempProto();
    PROTO Proto = &(TempProto->Proto);

    /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
       ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
       instead of the -0.25 to 0.75 used in baseline normalization */
    Proto->Length = SegmentLength;
    Proto->Angle = A1;
    Proto->X = (X1 + X2) / 2.0;
    Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
    FillABC(Proto);

    TempProto->ProtoId = Pid;
    SET_BIT(TempProtoMask, Pid);

    // Register the proto with the integer class and its proto pruner, then
    // push it onto the adapted class's temp-proto list.
    ConvertProto(Proto, Pid, IClass);
    AddProtoToProtoPruner(Proto, Pid, IClass,
                          classify_learning_debug_level >= 2);

    Class->TempProtos = push(Class->TempProtos, TempProto);

    // The next run begins where this one stopped.
    run_start = run_end;
  }
  return IClass->NumProtos - 1;
}                              /* MakeNewTempProtos */

void tesseract::Classify::MakePermanent	(	ADAPT_TEMPLATES	Templates,
		CLASS_ID	ClassId,
		int	ConfigId,
		const DENORM &	denorm,
		TBLOB *	Blob
	)

Parameters:

Templates	current set of adaptive templates
ClassId	class containing config to be made permanent
ConfigId	config to be made permanent
denorm	normalization/denormalization parameters
Blob	current blob being adapted to

Globals: none

Note:: Exceptions: none; History: Thu Mar 14 15:54:08 1991, DSJ, Created.

Definition at line 2318 of file adaptmatch.cpp.

                                          {
  ADAPT_CLASS Class = Templates->Class[ClassId];
  // Fetch the temporary config up front; it is freed further down, once its
  // font info has been copied into the permanent config.
  TEMP_CONFIG temp_config = TempConfigFor(Class, ConfigId);

  // Flag the config as permanent and update the permanent counters.
  MakeConfigPermanent(Class, ConfigId);
  if (Class->NumPermConfigs == 0)
    Templates->NumPermClasses++;
  Class->NumPermConfigs++;

  // Build the permanent config record.
  UNICHAR_ID *ambig_ids = GetAmbiguities(Blob, denorm, ClassId);
  PERM_CONFIG Perm = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT),
                                                "PERM_CONFIG_STRUCT");
  Perm->Ambigs = ambig_ids;
  Perm->FontinfoId = temp_config->FontinfoId;

  // ADAPTED_CONFIG is a union, so everything belonging to the temporary
  // config has to be cleaned up before the permanent config is stored.
  PROTO_KEY proto_key;
  proto_key.Templates = Templates;
  proto_key.ClassId = ClassId;
  proto_key.ConfigId = ConfigId;
  Class->TempProtos =
      delete_d(Class->TempProtos, &proto_key, MakeTempProtoPerm);
  FreeTempConfig(temp_config);

  // Store the permanent config in the slot the temporary one occupied.
  PermConfigFor(Class, ConfigId) = Perm;

  if (classify_learning_debug_level < 1)
    return;

  tprintf("Making config %d for %s (ClassId %d) permanent:"
          " fontinfo id %d, ambiguities '",
          ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
          ClassId, Perm->FontinfoId);
  // The ambiguity array ends at the first negative id.
  UNICHAR_ID *cursor = ambig_ids;
  while (*cursor >= 0) {
    tprintf("%s", unicharset.id_to_unichar(*cursor));
    ++cursor;
  }
  tprintf("'.\n");
}                              /* MakePermanent */

void tesseract::Classify::MasterMatcher	(	INT_TEMPLATES	templates,
		inT16	num_features,
		const INT_FEATURE_STRUCT *	features,
		const uinT8 *	norm_factors,
		ADAPT_CLASS *	classes,
		int	debug,
		int	num_classes,
		const TBOX &	blob_box,
		CLASS_PRUNER_RESULTS	results,
		ADAPT_RESULTS *	final_results
	)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

Definition at line 1252 of file adaptmatch.cpp.

                                                           {
  // Vertical extent of the blob, passed on to the corrections step.
  const int top = blob_box.top();
  const int bottom = blob_box.bottom();

  for (int c = 0; c < num_classes; c++) {
    const CLASS_ID class_id = results[c].Class;
    INT_RESULT_STRUCT& int_result = results[c].IMResult;

    // With adapted classes, match only against their permanent protos and
    // configs; without them, everything is enabled.
    BIT_VECTOR protos = AllProtosOn;
    BIT_VECTOR configs = AllConfigsOn;
    if (classes != NULL) {
      protos = classes[class_id]->PermProtos;
      configs = classes[class_id]->PermConfigs;
    }

    // The integer matcher gets the caller-supplied debug flags.
    im_.Match(ClassForClassId(templates, class_id),
              protos, configs,
              num_features, features,
              &int_result, classify_adapt_feature_threshold, debug,
              matcher_debug_separate_windows);

    // The corrections step uses its own on/off switch derived from the
    // global debug levels, not the caller's debug flags.
    const bool debug_corrections =
        matcher_debug_level >= 2 || classify_debug_level > 1;
    ExpandShapesAndApplyCorrections(classes, debug_corrections, class_id,
                                    bottom, top,
                                    results[c].Rating,
                                    final_results->BlobLength, norm_factors,
                                    int_result, final_results);
  }
}

ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool InitFromUnicharset )

Allocates memory for adapted templates.

Parameters:

InitFromUnicharset	if true, add an empty class for each char in unicharset to the newly created templates

Returns:: Ptr to new adapted templates.

Note:: Globals: none; Exceptions: none; History: Fri Mar 8 10:15:28 1991, DSJ, Created.

Definition at line 167 of file adaptive.cpp.

                                                                     {
  ADAPT_TEMPLATES Templates;
  int i;

  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));

  Templates->Templates = NewIntTemplates ();
  Templates->NumPermClasses = 0;
  Templates->NumNonEmptyClasses = 0;

  /* Insert an empty class for each unichar id in unicharset */
  for (i = 0; i < MAX_NUM_CLASSES; i++) {
    Templates->Class[i] = NULL;
    if (InitFromUnicharset && i < unicharset.size()) {
      AddAdaptedClass(Templates, NewAdaptedClass(), i);
    }
  }

  return (Templates);

}                                /* NewAdaptedTemplates */

void tesseract::Classify::NormalizeOutlines	(	LIST	Outlines,
		FLOAT32 *	XScale,
		FLOAT32 *	YScale
	)

Definition at line 346 of file mfoutline.cpp.

                                                  {
/*
 ** Normalizes every outline in Outlines with the method selected by
 ** classify_norm_method and reports the scale factors that were applied.
 **
 ** Parameters:
 **   Outlines  list of outlines to be normalized (modified in place)
 **   XScale    out: x-direction scale factor used
 **   YScale    out: y-direction scale factor used
 ** Globals:
 **   classify_norm_method      normalization method in use
 **   classify_char_norm_range  map radius of gyration to this value
 ** The returned scale factors represent the x and y sizes in the
 ** normalized coordinate system that correspond to 1 pixel in the
 ** original coordinate system.
 ** Return: none
 ** Exceptions: none
 ** History: Fri Dec 14 08:14:55 1990, DSJ, Created.
 */
  switch (classify_norm_method) {
    case baseline: {
      iterate(Outlines) {
        NormalizeOutline((MFOUTLINE) first_node(Outlines), 0.0);
      }
      *XScale = *YScale = MF_SCALE_FACTOR;
      break;
    }

    case character: {
      OUTLINE_STATS stats;
      ComputeOutlineStats(Outlines, &stats);

      /* limit scale factor to avoid overscaling small blobs (.,`'),
         thin blobs (l1ift), and merged blobs */
      FLOAT32 base_scale;
      *XScale = *YScale = base_scale = MF_SCALE_FACTOR;
      *XScale *= stats.Ry;
      *YScale *= stats.Rx;

      // Lower bounds on both axes.
      if (*XScale < classify_min_norm_scale_x) {
        *XScale = classify_min_norm_scale_x;
      }
      if (*YScale < classify_min_norm_scale_y) {
        *YScale = classify_min_norm_scale_y;
      }
      // The x upper bound applies only while y is within its own maximum.
      if (*XScale > classify_max_norm_scale_x &&
          *YScale <= classify_max_norm_scale_y) {
        *XScale = classify_max_norm_scale_x;
      }

      // Convert the clamped sizes into the final scale factors.
      *XScale = classify_char_norm_range * base_scale / *XScale;
      *YScale = classify_char_norm_range * base_scale / *YScale;

      iterate(Outlines) {
        CharNormalizeOutline((MFOUTLINE) first_node(Outlines),
                             stats.x, stats.y,
                             *XScale, *YScale);
      }
      break;
    }
  }
}                                /* NormalizeOutlines */

void tesseract::Classify::PrintAdaptedTemplates	(	FILE *	File,
		ADAPT_TEMPLATES	Templates
	)

This routine prints a summary of the adapted templates in Templates to File.

Parameters:

File	open text file to print Templates to
Templates	adapted templates to print to File

Note:: Globals: none; Exceptions: none; History: Wed Mar 20 13:35:29 1991, DSJ, Created.

Definition at line 273 of file adaptive.cpp.

                                                                          {
  int i;
  INT_CLASS IClass;
  ADAPT_CLASS AClass;

  #ifndef SECURE_NAMES
  fprintf (File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
  fprintf (File, "Num classes = %d;  Num permanent classes = %d\n\n",
           Templates->NumNonEmptyClasses, Templates->NumPermClasses);
  fprintf (File, "   Id  NC NPC  NP NPP\n");
  fprintf (File, "------------------------\n");

  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    IClass = Templates->Templates->Class[i];
    AClass = Templates->Class[i];
    if (!IsEmptyAdaptedClass (AClass)) {
      fprintf (File, "%5d  %s %3d %3d %3d %3d\n",
        i, unicharset.id_to_unichar(i),
      IClass->NumConfigs, AClass->NumPermConfigs,
      IClass->NumProtos,
      IClass->NumProtos - count (AClass->TempProtos));
    }
  }
  #endif
  fprintf (File, "\n");

}                                /* PrintAdaptedTemplates */

void tesseract::Classify::PrintAdaptiveMatchResults	(	FILE *	File,
		ADAPT_RESULTS *	Results
	)

This routine writes the matches in Results to the debug output via tprintf. The File argument is not referenced by the implementation shown below.

Parameters:

File	open text file to write Results to
Results	match results to write to File

Globals: none

Note:: Exceptions: none; History: Mon Mar 18 09:24:53 1991, DSJ, Created.

Definition at line 2419 of file adaptmatch.cpp.

                                                                           {
  for (int i = 0; i < Results->NumMatches; ++i) {
    tprintf("%s(%d), shape %d, %.2f  ",
            unicharset.debug_str(Results->match[i].unichar_id).string(),
            Results->match[i].unichar_id, Results->match[i].shape_id,
            Results->match[i].rating * 100.0);
  }
  tprintf("\n");
}                              /* PrintAdaptiveMatchResults */

void tesseract::Classify::PrintAdaptiveStatistics ( FILE * File )

Print to File the statistics which have been gathered for the adaptive matcher.

Parameters:

File	open text file to print adaptive statistics to

Globals: none

Note:: Exceptions: none; History: Thu Apr 18 14:37:37 1991, DSJ, Created.

Definition at line 661 of file adaptmatch.cpp.

                                                 {
  #ifndef SECURE_NAMES

  fprintf (File, "\nADAPTIVE MATCHER STATISTICS:\n");
  fprintf (File, "\tNum blobs classified = %d\n", AdaptiveMatcherCalls);
  fprintf (File, "\tNum classes output   = %d (Avg = %4.2f)\n",
    NumClassesOutput,
    ((AdaptiveMatcherCalls == 0) ? (0.0) :
  ((float) NumClassesOutput / AdaptiveMatcherCalls)));
  fprintf (File, "\t\tBaseline Classifier: %4d calls (%4.2f classes/call)\n",
    BaselineClassifierCalls,
    ((BaselineClassifierCalls == 0) ? (0.0) :
  ((float) NumBaselineClassesTried / BaselineClassifierCalls)));
  fprintf (File, "\t\tCharNorm Classifier: %4d calls (%4.2f classes/call)\n",
    CharNormClassifierCalls,
    ((CharNormClassifierCalls == 0) ? (0.0) :
  ((float) NumCharNormClassesTried / CharNormClassifierCalls)));
  fprintf (File, "\t\tAmbig    Classifier: %4d calls (%4.2f classes/call)\n",
    AmbigClassifierCalls,
    ((AmbigClassifierCalls == 0) ? (0.0) :
  ((float) NumAmbigClassesTried / AmbigClassifierCalls)));

  fprintf (File, "\nADAPTIVE LEARNER STATISTICS:\n");
  fprintf (File, "\tNumber of words adapted to: %d\n", NumWordsAdaptedTo);
  fprintf (File, "\tNumber of chars adapted to: %d\n", NumCharsAdaptedTo);

  PrintAdaptedTemplates(File, AdaptedTemplates);
  #endif
}                                /* PrintAdaptiveStatistics */

int tesseract::Classify::PruneClasses	(	const INT_TEMPLATES_STRUCT *	int_templates,
		int	num_features,
		const INT_FEATURE_STRUCT *	features,
		const uinT8 *	normalization_factors,
		const uinT16 *	expected_num_features,
		CP_RESULT_STRUCT *	results
	)

Definition at line 405 of file intmatcher.cpp.

                                                      {
/*
 **  Operation:
 **    Runs the class pruner over the given features and writes a sorted
 **    short-list of candidate classes into results.
 **  Return: Number of pruned classes (the value returned by
 **    ClassPruner::SetupResults).
 **  Exceptions: none
 **  History: Tue Feb 19 10:24:24 MST 1991, RWM, Created.
 */
  // Several steps apply only when no shape table is loaded.
  const bool no_shape_table = (shape_table_ == NULL);

  ClassPruner pruner(int_templates->NumClasses);

  // Initial match scores for all classes, then the adjustment for the
  // number of features each class expects.
  pruner.ComputeScores(int_templates, num_features, features);
  pruner.AdjustForExpectedNumFeatures(expected_num_features,
                                      classify_cp_cutoff_strength);

  if (no_shape_table) {
    // Apply the classes disabled in unicharset.
    pruner.DisableDisabledClasses(unicharset);
    // If fragments are disabled, remove them as well.
    if (disable_character_fragments)
      pruner.DisableFragments(unicharset);
  }

  // Normalization factors are applied only when the caller supplies them
  // (i.e. when good x-heights are available).
  if (normalization_factors == NULL) {
    pruner.NoNormalization();
  } else {
    pruner.NormalizeForXheight(classify_class_pruner_multiplier,
                               normalization_factors);
  }

  // Do the actual pruning and sort the short-list.
  pruner.PruneAndSort(classify_class_pruner_threshold,
                      no_shape_table, unicharset);

  // Optional debug output: full match detail above level 2, summary above
  // level 1.
  if (classify_debug_level > 2)
    pruner.DebugMatch(*this, int_templates, features);
  if (classify_debug_level > 1)
    pruner.SummarizeResult(*this, int_templates, expected_num_features,
                           classify_class_pruner_multiplier,
                           normalization_factors);

  // Convert to the expected output format.
  return pruner.SetupResults(results);
}

ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( FILE * File )

Read a set of adapted templates from File and return a ptr to the templates.

Parameters:

File	open text file to read adapted templates from

Returns:: Ptr to adapted templates read from File.

Note:: Globals: none; Exceptions: none; History: Mon Mar 18 15:18:10 1991, DSJ, Created.

Definition at line 371 of file adaptive.cpp.

                                                         {
  int i;
  ADAPT_TEMPLATES Templates;

  /* first read the high level adaptive template struct */
  Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
  fread ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);

  /* then read in the basic integer templates */
  Templates->Templates = ReadIntTemplates (File);

  /* then read in the adaptive info for each class */
  for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
    Templates->Class[i] = ReadAdaptedClass (File);
  }
  return (Templates);

}                                /* ReadAdaptedTemplates */

void tesseract::Classify::ReadClassFile ( )

Definition at line 293 of file protos.cpp.

                             {
 FILE *File;
 char TextLine[CHARS_PER_LINE];
 char unichar[CHARS_PER_LINE];

 cprintf ("Reading training data from '%s' ...",
          static_cast<STRING>(classify_training_file).string());
 fflush(stdout);

 File = open_file(static_cast<STRING>(classify_training_file).string(), "r");
 while (fgets (TextLine, CHARS_PER_LINE, File) != NULL) {

   sscanf(TextLine, "%s", unichar);
   ReadClassFromFile (File, unicharset.unichar_to_id(unichar));
   fgets(TextLine, CHARS_PER_LINE, File);
   fgets(TextLine, CHARS_PER_LINE, File);
 }
 fclose(File);
 new_line();
}

INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( FILE * File )

Definition at line 786 of file intproto.cpp.

                                                     {
/*
 ** Parameters:
 **   File    open file to read templates from
 ** Globals: none
 ** Operation: This routine reads a set of integer templates from
 **   File.  File must already be open and must be in the
 **   correct binary format.
 **   The layout read here is: unicharset size, class count (a negative
 **   value announces a version id followed by the real class count),
 **   class pruner count, optional index tables (versions < 2), the class
 **   pruners, each class (header, config lengths, proto lengths, proto
 **   sets, font set id for versions >= 4) and, for versions >= 4, the
 **   font info and font set tables.
 **   A failed read prints "Bad read of inttemp!" and parsing continues.
 ** Return: Pointer to integer templates read from File.
 ** Exceptions: none
 ** History: Wed Feb 27 11:48:46 1991, DSJ, Created.
 */
  int i, j, w, x, y, z;
  BOOL8 swap;
  int nread;
  int unicharset_size;
  int version_id = 0;
  INT_TEMPLATES Templates;
  CLASS_PRUNER_STRUCT* Pruner;
  INT_CLASS Class;
  uinT8 *Lengths;
  PROTO_SET ProtoSet;

  /* variables for conversion from older inttemp formats */
  int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
  CLASS_ID class_id, max_class_id;
  // Scratch tables used only for versions < 2; released at the end.
  inT16 *IndexFor = new inT16[MAX_NUM_CLASSES];
  CLASS_ID *ClassIdFor = new CLASS_ID[MAX_NUM_CLASSES];
  CLASS_PRUNER_STRUCT **TempClassPruner =
      new CLASS_PRUNER_STRUCT*[MAX_NUM_CLASS_PRUNERS];
  uinT32 SetBitsForMask =           // word with NUM_BITS_PER_CLASS
    (1 << NUM_BITS_PER_CLASS) - 1;  // set starting at bit 0
  uinT32 Mask, NewMask, ClassBits;
  int MaxNumConfigs = MAX_NUM_CONFIGS;
  int WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;

  /* first read the high level template struct */
  Templates = NewIntTemplates();
  // Read Templates in parts for 64 bit compatibility.
  if (fread(&unicharset_size, sizeof(int), 1, File) != 1)
    cprintf("Bad read of inttemp!\n");
  if (fread(&Templates->NumClasses,
            sizeof(Templates->NumClasses), 1, File) != 1 ||
      fread(&Templates->NumClassPruners,
            sizeof(Templates->NumClassPruners), 1, File) != 1)
    cprintf("Bad read of inttemp!\n");
  // Swap status is determined automatically.
  // An out-of-range pruner count is taken to mean the file was written
  // with the opposite byte order.
  swap = Templates->NumClassPruners < 0 ||
    Templates->NumClassPruners > MAX_NUM_CLASS_PRUNERS;
  if (swap) {
    Reverse32(&Templates->NumClassPruners);
    Reverse32(&Templates->NumClasses);
    Reverse32(&unicharset_size);
  }
  if (Templates->NumClasses < 0) {
    // This file has a version id!
    version_id = -Templates->NumClasses;
    if (fread(&Templates->NumClasses, sizeof(Templates->NumClasses),
              1, File) != 1)
      cprintf("Bad read of inttemp!\n");
    if (swap)
      Reverse32(&Templates->NumClasses);
  }

  // Files older than version 3 use the old (smaller) config limits.
  if (version_id < 3) {
    MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
    WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
  }

  if (version_id < 2) {
    // NOTE(review): unicharset_size and NumClasses come straight from the
    // file and are not checked against MAX_NUM_CLASSES before they are used
    // to index IndexFor / ClassIdFor below.
    for (i = 0; i < unicharset_size; ++i) {
      if (fread(&IndexFor[i], sizeof(inT16), 1, File) != 1)
        cprintf("Bad read of inttemp!\n");
    }
    for (i = 0; i < Templates->NumClasses; ++i) {
      if (fread(&ClassIdFor[i], sizeof(CLASS_ID), 1, File) != 1)
        cprintf("Bad read of inttemp!\n");
    }
    if (swap) {
      // NOTE(review): IndexFor was filled with unicharset_size entries but
      // is byte-swapped over NumClasses entries; IndexFor is not read again
      // in this function.
      for (i = 0; i < Templates->NumClasses; i++)
        Reverse16(&IndexFor[i]);
      for (i = 0; i < Templates->NumClasses; i++)
        Reverse32(&ClassIdFor[i]);
    }
  }

  /* then read in the class pruners */
  for (i = 0; i < Templates->NumClassPruners; i++) {
    Pruner = new CLASS_PRUNER_STRUCT;
    if ((nread =
         fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
                File)) != sizeof(CLASS_PRUNER_STRUCT))
      cprintf("Bad read of inttemp!\n");
    if (swap) {
      for (x = 0; x < NUM_CP_BUCKETS; x++) {
        for (y = 0; y < NUM_CP_BUCKETS; y++) {
          for (z = 0; z < NUM_CP_BUCKETS; z++) {
            for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
              Reverse32(&Pruner->p[x][y][z][w]);
            }
          }
        }
      }
    }
    // Old-format pruners are parked in TempClassPruner for conversion;
    // current-format pruners go directly into the templates.
    if (version_id < 2) {
      TempClassPruner[i] = Pruner;
    } else {
      Templates->ClassPruners[i] = Pruner;
    }
  }

  /* fix class pruners if they came from an old version of inttemp */
  if (version_id < 2) {
    // Allocate enough class pruners to cover all the class ids.
    max_class_id = 0;
    for (i = 0; i < Templates->NumClasses; i++)
      if (ClassIdFor[i] > max_class_id)
        max_class_id = ClassIdFor[i];
    for (i = 0; i <= CPrunerIdFor(max_class_id); i++) {
      Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
      memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
    }
    // Convert class pruners from the old format (indexed by class index)
    // to the new format (indexed by class id).
    last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
    for (i = 0; i < Templates->NumClassPruners; i++) {
      for (x = 0; x < NUM_CP_BUCKETS; x++)
        for (y = 0; y < NUM_CP_BUCKETS; y++)
          for (z = 0; z < NUM_CP_BUCKETS; z++)
            for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
              if (TempClassPruner[i]->p[x][y][z][w] == 0)
                continue;
              for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
                bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
                if (bit_number > last_cp_bit_number)
                  break; // the rest of the bits in this word are not used
                class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
                // Single out NUM_BITS_PER_CLASS bits relating to class_id.
                Mask = SetBitsForMask << b;
                ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
                // Move these bits to the new position in which they should
                // appear (indexed corresponding to the class_id).
                new_i = CPrunerIdFor(class_id);
                new_w = CPrunerWordIndexFor(class_id);
                new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
                if (new_b > b) {
                  ClassBits <<= (new_b - b);
                } else {
                  ClassBits >>= (b - new_b);
                }
                // Copy bits relating to class_id to the correct position
                // in Templates->ClassPruner.
                NewMask = SetBitsForMask << new_b;
                Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
                Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
              }
            }
    }
    // The old-format pruners are no longer needed once converted.
    for (i = 0; i < Templates->NumClassPruners; i++) {
      delete TempClassPruner[i];
    }
  }

  /* then read in each class */
  for (i = 0; i < Templates->NumClasses; i++) {
    /* first read in the high level struct for the class */
    Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT));
    if (fread(&Class->NumProtos, sizeof(Class->NumProtos), 1, File) != 1 ||
        fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
        fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
      cprintf ("Bad read of inttemp!\n");
    if (version_id == 0) {
      // Only version 0 writes 5 pointless pointers to the file.
      for (j = 0; j < 5; ++j) {
        int junk;
        if (fread(&junk, sizeof(junk), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
    }
    // Versions < 4 store a fixed MaxNumConfigs config lengths; later
    // versions store only NumConfigs of them.
    if (version_id < 4) {
      for (j = 0; j < MaxNumConfigs; ++j) {
        if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
      if (swap) {
        Reverse16(&Class->NumProtos);
        for (j = 0; j < MaxNumConfigs; j++)
          Reverse16(&Class->ConfigLengths[j]);
      }
    } else {
      ASSERT_HOST(Class->NumConfigs < MaxNumConfigs);
      for (j = 0; j < Class->NumConfigs; ++j) {
        if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
          cprintf ("Bad read of inttemp!\n");
      }
      if (swap) {
        Reverse16(&Class->NumProtos);
        // NOTE(review): this swaps MaxNumConfigs entries although only
        // NumConfigs entries were read in this branch.
        for (j = 0; j < MaxNumConfigs; j++)
          Reverse16(&Class->ConfigLengths[j]);
      }
    }
    // Old files place the class at its recorded class id; newer files use
    // the running index.
    if (version_id < 2) {
      ClassForClassId (Templates, ClassIdFor[i]) = Class;
    } else {
      ClassForClassId (Templates, i) = Class;
    }

    /* then read in the proto lengths */
    Lengths = NULL;
    if (MaxNumIntProtosIn (Class) > 0) {
      Lengths = (uinT8 *)Emalloc(sizeof(uinT8) * MaxNumIntProtosIn(Class));
      if ((nread =
           fread((char *)Lengths, sizeof(uinT8),
                 MaxNumIntProtosIn(Class), File)) != MaxNumIntProtosIn (Class))
        cprintf ("Bad read of inttemp!\n");
    }
    Class->ProtoLengths = Lengths;

    /* then read in the proto sets */
    for (j = 0; j < Class->NumProtoSets; j++) {
      ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT));
      if (version_id < 3) {
        // Old layout: read the proto pruner, then each proto field by field
        // with the old number of config words.
        if ((nread =
             fread((char *) &ProtoSet->ProtoPruner, 1,
                    sizeof(PROTO_PRUNER), File)) != sizeof(PROTO_PRUNER))
          cprintf("Bad read of inttemp!\n");
        for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
          if ((nread = fread((char *) &ProtoSet->Protos[x].A, 1,
                             sizeof(inT8), File)) != sizeof(inT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].B, 1,
                             sizeof(uinT8), File)) != sizeof(uinT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].C, 1,
                             sizeof(inT8), File)) != sizeof(inT8) ||
              (nread = fread((char *) &ProtoSet->Protos[x].Angle, 1,
                             sizeof(uinT8), File)) != sizeof(uinT8))
            cprintf("Bad read of inttemp!\n");
          for (y = 0; y < WerdsPerConfigVec; y++)
            if ((nread = fread((char *) &ProtoSet->Protos[x].Configs[y], 1,
                               sizeof(uinT32), File)) != sizeof(uinT32))
              cprintf("Bad read of inttemp!\n");
        }
      } else {
        // Current layout: the whole proto set is read as one raw struct.
        if ((nread =
             fread((char *) ProtoSet, 1, sizeof(PROTO_SET_STRUCT),
                   File)) != sizeof(PROTO_SET_STRUCT))
          cprintf("Bad read of inttemp!\n");
      }
      if (swap) {
        for (x = 0; x < NUM_PP_PARAMS; x++)
          for (y = 0; y < NUM_PP_BUCKETS; y++)
            for (z = 0; z < WERDS_PER_PP_VECTOR; z++)
              Reverse32(&ProtoSet->ProtoPruner[x][y][z]);
        for (x = 0; x < PROTOS_PER_PROTO_SET; x++)
          for (y = 0; y < WerdsPerConfigVec; y++)
            Reverse32(&ProtoSet->Protos[x].Configs[y]);
      }
      Class->ProtoSets[j] = ProtoSet;
    }
    // Font set ids exist only from version 4 on; older files get -1.
    if (version_id < 4)
      Class->font_set_id = -1;
    else {
      // NOTE(review): unlike the other reads in this function, the result
      // of this fread is not checked.
      fread(&Class->font_set_id, sizeof(int), 1, File);
      if (swap)
        Reverse32(&Class->font_set_id);
    }
  }

  if (version_id < 2) {
    /* add an empty NULL class with class id 0 */
    assert(UnusedClassIdIn (Templates, 0));
    ClassForClassId (Templates, 0) = NewIntClass (1, 1);
    ClassForClassId (Templates, 0)->font_set_id = -1;
    Templates->NumClasses++;
    /* make sure the classes are contiguous */
    for (i = 0; i < MAX_NUM_CLASSES; i++) {
      if (i < Templates->NumClasses) {
        if (ClassForClassId (Templates, i) == NULL) {
          fprintf(stderr, "Non-contiguous class ids in inttemp\n");
          exit(1);
        }
      } else {
        if (ClassForClassId (Templates, i) != NULL) {
          fprintf(stderr, "Class id %d exceeds NumClassesIn (Templates) %d\n",
                  i, Templates->NumClasses);
          exit(1);
        }
      }
    }
  }
  // Versions >= 4 append the font info table (plus spacing info from
  // version 5) and the font set table.
  if (version_id >= 4) {
    this->fontinfo_table_.read(File, NewPermanentTessCallback(read_info), swap);
    if (version_id >= 5) {
      this->fontinfo_table_.read(File,
                                 NewPermanentTessCallback(read_spacing_info),
                                 swap);
    }
    this->fontset_table_.read(File, NewPermanentTessCallback(read_set), swap);
  }

  // Clean up.
  delete[] IndexFor;
  delete[] ClassIdFor;
  delete[] TempClassPruner;

  return (Templates);
}                                /* ReadIntTemplates */

void tesseract::Classify::ReadNewCutoffs	(	FILE *	CutoffFile,
		bool	swap,
		inT64	end_offset,
		CLASS_CUTOFF_ARRAY	Cutoffs
	)

Definition at line 42 of file cutoffs.cpp.

                                                          {
/*
 **     Parameters:
 **             CutoffFile      open file containing cutoff definitions
 **             swap            forwarded to shapetable_cutoffs_.DeSerialize
 **             end_offset      file offset at which reading stops; a
 **                             negative value disables the offset check
 **             Cutoffs         array to put cutoffs into
 **     Globals: none
 **     Operation: If a shape table is present, first deserialize the
 **             shape-table cutoffs from CutoffFile.  Then read all of the
 **             class-id/cutoff pairs and insert them into the Cutoffs
 **             array.  Cutoffs are indexed in the array by class id.
 **             Unused entries in the array are set to MAX_CUTOFF.
 **             A pair whose class is not in the unicharset, or whose id
 **             does not fit in the array, is reported and skipped rather
 **             than written through an invalid index.
 **     Return: none
 **     Exceptions: none
 **     History: Wed Feb 20 09:38:26 1991, DSJ, Created.
 */
  char Class[UNICHAR_LEN + 1];
  CLASS_ID ClassId;
  int Cutoff;
  int i;

  if (shape_table_ != NULL) {
    if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
      tprintf("Error during read of shapetable pffmtable!\n");
    }
  }
  for (i = 0; i < MAX_NUM_CLASSES; i++)
    Cutoffs[i] = MAX_CUTOFF;

  while ((end_offset < 0 || ftell(CutoffFile) < end_offset) &&
         fscanf(CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d",
                Class, &Cutoff) == 2) {
    /* the token "NULL" in the file stands for the space unichar */
    const char *unichar = (strcmp(Class, "NULL") == 0) ? " " : Class;
    if (!unicharset.contains_unichar(unichar)) {
      /* same guard ReadNormProtos applies before calling unichar_to_id */
      tprintf("Error: unichar %s in cutoff file is not in unichar set.\n",
              Class);
    } else {
      ClassId = unicharset.unichar_to_id(unichar);
      /* Cutoffs holds MAX_NUM_CLASSES entries (see the init loop above) */
      if (ClassId >= 0 && ClassId < MAX_NUM_CLASSES) {
        Cutoffs[ClassId] = Cutoff;
      } else {
        tprintf("Error: class id %d for unichar %s is out of range.\n",
                static_cast<int>(ClassId), Class);
      }
    }
    SkipNewline(CutoffFile);
  }
}                                /* ReadNewCutoffs */

NORM_PROTOS * tesseract::Classify::ReadNormProtos	(	FILE *	File,
		inT64	end_offset
	)

Definition at line 230 of file normmatch.cpp.

                                                                  {
/*
 **     Parameters:
 **             File            open text file to read normalization protos from
 **             end_offset      file offset at which reading stops; a
 **                             negative value disables the offset check
 **     Globals: none
 **     Operation: This routine allocates a new data structure to hold
 **             a set of character normalization protos.  It then fills in
 **             the data structure by reading from the specified File.
 **             Protos for a unichar that is not in the unicharset are
 **             read and discarded after an error message.
 **     Return: Character normalization protos.
 **     Exceptions: none
 **     History: Wed Dec 19 16:38:49 1990, DSJ, Created.
 */
  NORM_PROTOS *NormProtos;
  int i;
  char unichar[2 * UNICHAR_LEN + 1];
  char format[32];
  UNICHAR_ID unichar_id;
  LIST Protos;
  int NumProtos;

  /* allocate and initialization data structure */
  NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
  NormProtos->NumProtos = unicharset.size();
  NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
  for (i = 0; i < NormProtos->NumProtos; i++)
    NormProtos->Protos[i] = NIL_LIST;

  /* read file header and save in data structure */
  NormProtos->NumParams = ReadSampleSize (File);
  NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);

  /* limit the %s conversion to the capacity of unichar (the +1 in its
     declaration is the room for the terminating NUL), so an over-long
     token in the file cannot overrun the buffer */
  snprintf(format, sizeof(format), "%%%ds %%d",
           static_cast<int>(2 * UNICHAR_LEN));

  /* read protos for each class into a separate list */
  while ((end_offset < 0 || ftell(File) < end_offset) &&
         fscanf(File, format, unichar, &NumProtos) == 2) {
    if (unicharset.contains_unichar(unichar)) {
      unichar_id = unicharset.unichar_to_id(unichar);
      Protos = NormProtos->Protos[unichar_id];
      for (i = 0; i < NumProtos; i++)
        Protos =
            push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
      NormProtos->Protos[unichar_id] = Protos;
    } else {
      cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
              unichar);
      /* still consume this class's protos so the next header lines up */
      for (i = 0; i < NumProtos; i++)
        FreePrototype(ReadPrototype (File, NormProtos->NumParams));
    }
    SkipNewline(File);
  }
  return (NormProtos);
}                                /* ReadNormProtos */

void tesseract::Classify::RefreshDebugWindow	(	ScrollView **	win,
		const char *	msg,
		int	y_offset,
		const TBOX &	wbox
	)

Definition at line 230 of file adaptmatch.cpp.

                                                                  {
  #ifndef GRAPHICS_DISABLED
  // Horizontal extent of the guide lines on either side of the origin; the
  // window itself is created twice this wide.
  const int kSampleSpaceWidth = 500;
  ScrollView *window = *win;
  if (window == NULL) {
    // First use: create the window and hand it back through win.
    window = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
                            kSampleSpaceWidth * 2, 200, true);
    *win = window;
  }
  window->Clear();
  window->Pen(64, 64, 64);
  // Guide line at the baseline offset.
  window->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnBaselineOffset);
  // Guide line one x-height above the baseline offset.
  window->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
               kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
  // Zoom the view onto the supplied box.
  window->ZoomToRectangle(wbox.left(), wbox.top(),
                          wbox.right(), wbox.bottom());
  #endif  // GRAPHICS_DISABLED
}

void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS * Results )

This routine steps thru each matching class in Results and removes it from the match list if its rating is worse than the BestRating plus a pad. In other words, all good matches get moved to the front of the classes array.

Parameters:

Results contains matches to be filtered

Globals:

matcher_bad_match_pad defines a "bad match"

Note:: Exceptions: none; History: Tue Mar 12 13:51:03 1991, DSJ, Created.

Definition at line 2445 of file adaptmatch.cpp.

                                                      {
  // Space-separated letters that are kept in numeric mode even though they
  // are alphabetic.
  static const char* romans = "i v x I V X";
  const FLOAT32 BadMatchThreshold =
      Results->best_match.rating + matcher_bad_match_pad;
  int kept = 0;  // next free slot at the front of Results->match

  if (!classify_bln_numeric_mode) {
    // Normal mode: keep every match whose rating is within the threshold.
    for (int idx = 0; idx < Results->NumMatches; ++idx) {
      if (Results->match[idx].rating <= BadMatchThreshold)
        Results->match[kept++] = Results->match[idx];
    }
    Results->NumMatches = kept;
    return;
  }

  // Numeric mode: look up the entries scored for "1" and "0" (id -1 is used
  // when the unicharset lacks them) so that "l" and "O" can be swapped out.
  UNICHAR_ID unichar_id_one = -1;
  if (unicharset.contains_unichar("1"))
    unichar_id_one = unicharset.unichar_to_id("1");
  UNICHAR_ID unichar_id_zero = -1;
  if (unicharset.contains_unichar("0"))
    unichar_id_zero = unicharset.unichar_to_id("0");
  ScoredClass scored_one = ScoredUnichar(Results, unichar_id_one);
  ScoredClass scored_zero = ScoredUnichar(Results, unichar_id_zero);

  for (int idx = 0; idx < Results->NumMatches; ++idx) {
    // Work on a copy: the slot being written may be the one being read.
    ScoredClass match = Results->match[idx];
    if (!(match.rating <= BadMatchThreshold))
      continue;

    if (!unicharset.get_isalpha(match.unichar_id) ||
        strstr(romans,
               unicharset.id_to_unichar(match.unichar_id)) != NULL) {
      // Non-alphabetic, or one of the listed roman-numeral letters: keep.
      Results->match[kept++] = match;
    } else if (unicharset.eq(match.unichar_id, "l") &&
               scored_one.rating >= BadMatchThreshold) {
      // Replace "l" by the "1" entry, carrying over the rating of "l".
      ScoredClass substitute = scored_one;
      substitute.rating = match.rating;
      Results->match[kept++] = substitute;
    } else if (unicharset.eq(match.unichar_id, "O") &&
               scored_zero.rating >= BadMatchThreshold) {
      // Replace "O" by the "0" entry, carrying over the rating of "O".
      ScoredClass substitute = scored_zero;
      substitute.rating = match.rating;
      Results->match[kept++] = substitute;
    }
    // Any other alphabetic match is dropped.
  }
  Results->NumMatches = kept;
}                              /* RemoveBadMatches */

void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS * Results )

This routine steps through each matching class in Results and removes surplus punctuation and digit matches: only the first two punctuation matches and the first digit match are kept. All other matches are kept, and the surviving matches are moved to the front of the match array in their original order.

Parameters:

Results contains matches to be filtered

Globals:

none (matcher_bad_match_pad, which defines a "bad match", is not consulted by this routine)

Note:: Exceptions: none; History: Tue Mar 12 13:51:03 1991, DSJ, Created.

Definition at line 2504 of file adaptmatch.cpp.

                                                      {
  // Space-separated unichars counted as punctuation ("garbage") and as
  // digits respectively.
  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";

  int puncs_seen = 0;   // punctuation matches met so far
  int digits_seen = 0;  // digit matches met so far
  int kept = 0;         // next free slot at the front of Results->match

  for (int idx = 0; idx < Results->NumMatches; ++idx) {
    ScoredClass match = Results->match[idx];
    bool keep = true;
    if (strstr(punc_chars,
               unicharset.id_to_unichar(match.unichar_id)) != NULL) {
      // Only the first two punctuation matches survive.
      keep = puncs_seen < 2;
      ++puncs_seen;
    } else if (strstr(digit_chars,
                      unicharset.id_to_unichar(match.unichar_id)) != NULL) {
      // Only the first digit match survives.
      keep = digits_seen < 1;
      ++digits_seen;
    }
    if (keep)
      Results->match[kept++] = match;
  }
  Results->NumMatches = kept;
}                              /* RemoveExtraPuncs */

void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 638 of file adaptmatch.cpp.

                                               {
  // Log the reset, including the failure count that is about to be cleared,
  // when learning debug output is enabled.
  const bool debug_enabled = classify_learning_debug_level > 0;
  if (debug_enabled)
    tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
            NumAdaptationsFailed);

  // Release the adapted templates and return to the "nothing adapted" state.
  free_adapted_templates(AdaptedTemplates);
  NumAdaptationsFailed = 0;
  AdaptedTemplates = NULL;
}

void tesseract::Classify::ResetFeaturesHaveBeenExtracted ( )

Definition at line 1980 of file adaptmatch.cpp.

                                              {
  // Clear the FeaturesHaveBeenExtracted flag.
  // NOTE(review): presumably this forces features to be re-extracted for the
  // next blob -- the code that reads the flag is outside this section.
  FeaturesHaveBeenExtracted = FALSE;
}

void tesseract::Classify::SetAdaptiveThreshold ( FLOAT32 Threshold )

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters:

Threshold threshold for creating new templates

Globals:

matcher_good_threshold default good match rating

Note:: Exceptions: none; History: Tue Apr 9 08:33:13 1991, DSJ, Created.

Definition at line 2549 of file adaptmatch.cpp.

                                                     {
  // A request for exactly the default good-match threshold maps to a fixed
  // 0.9; any other value is inverted (1 - Threshold).
  if (Threshold == matcher_good_threshold)
    Threshold = 0.9;
  else
    Threshold = 1.0 - Threshold;

  // Scale to the 0..255 range and apply the same value to both the proto
  // and the feature adaption thresholds.
  const int scaled_threshold = ClipToRange<int>(255 * Threshold, 0, 255);
  classify_adapt_proto_threshold.set_value(scaled_threshold);
  classify_adapt_feature_threshold.set_value(scaled_threshold);
}                              /* SetAdaptiveThreshold */

void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note:: this is somewhat redundant, it simply says that if learning is enabled then it will remain enabled on the first pass. If it is disabled, then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

EnableLearning set to the value of classify_enable_learning by this routine

Note:: Exceptions: none; History: Mon Apr 15 16:39:29 1991, DSJ, Created.

Definition at line 712 of file adaptmatch.cpp.

                           {
  // Learning on the first pass follows the classify_enable_learning setting:
  // it is copied here, not forced on.
  EnableLearning = classify_enable_learning;

  // Let the dictionary prepare its stopper for pass 1 as well.
  getDict().SettupStopperPass1();

}                                /* SettupPass1 */

void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

EnableLearning set to FALSE by this routine

Note:: Exceptions: none; History: Mon Apr 15 16:39:29 1991, DSJ, Created.

Definition at line 732 of file adaptmatch.cpp.

                           {
  // No further learning on the second pass, whatever the global setting.
  EnableLearning = FALSE;
  // Let the dictionary prepare its stopper for pass 2 as well.
  getDict().SettupStopperPass2();

}                                /* SettupPass2 */

const ShapeTable* tesseract::Classify::shape_table ( ) const [inline]

Definition at line 66 of file classify.h.

                                        {
    // Read-only accessor for shape_table_; other code in this class compares
    // it against NULL, so callers should be prepared for a null pointer.
    return shape_table_;
  }

int tesseract::Classify::ShapeIDToClassID ( int shape_id ) const

Definition at line 2747 of file adaptmatch.cpp.

                                                 {
  // Scan the pre-trained classes in id order and return the first class
  // whose font set lists shape_id among its configs.
  for (int class_id = 0; class_id < PreTrainedTemplates->NumClasses;
       ++class_id) {
    const int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
    ASSERT_HOST(font_set_id >= 0);
    const FontSet &font_set = fontset_table_.get(font_set_id);

    int config = 0;
    while (config < font_set.size && font_set.configs[config] != shape_id)
      ++config;
    if (config < font_set.size)
      return class_id;
  }
  // No class refers to this shape.
  tprintf("Shape %d not found\n", shape_id);
  return -1;
}

void tesseract::Classify::ShowBestMatchFor	(	TBLOB *	Blob,
		const DENORM &	denorm,
		CLASS_ID	ClassId,
		int	shape_id,
		BOOL8	AdaptiveOn,
		BOOL8	PreTrainedOn,
		ADAPT_RESULTS *	Results
	)

This routine compares Blob to both sets of templates (adaptive and pre-trained) and then displays debug information for the config which matched best.

Parameters:

Blob	blob to show best matching config for
denorm	normalization/denormalization parameters
ClassId	class whose configs are to be searched
shape_id	shape index
AdaptiveOn	TRUE if adaptive configs are enabled
PreTrainedOn	TRUE if pretrained configs are enabled
Results	results of match being debugged

Globals:

PreTrainedTemplates built-in training
AdaptedTemplates adaptive templates
AllProtosOn dummy proto mask
AllConfigsOn dummy config mask

Note:: Exceptions: none; History: Fri Mar 22 08:43:52 1991, DSJ, Created.

Definition at line 2580 of file adaptmatch.cpp.

                                                        {
  // Debug helper.  Matches Blob against the requested class in the
  // pre-trained templates (char-norm features) and/or the adapted templates
  // (baseline features) without debug output, then repeats the better of the
  // two matches with matcher_debug_flags so that the matcher displays it.
  int NumCNFeatures = 0, NumBLFeatures = 0;
  INT_FEATURE_ARRAY CNFeatures, BLFeatures;
  INT_RESULT_STRUCT CNResult, BLResult;
  inT32 BlobLength;
  uinT32 ConfigMask;
  // Adapted config to display on the next call; static, so it persists
  // between calls, and is reset whenever the pre-trained side is requested.
  static int next_config = -1;

  if (PreTrainedOn) next_config = -1;

  // Start both ratings at the same sentinel; a side only "wins" below if its
  // Match call replaced the sentinel (presumably with a rating below 2.0).
  CNResult.Rating = BLResult.Rating = 2.0;

  if (!LegalClassId (ClassId)) {
    cprintf ("%d is not a legal class id!!\n", ClassId);
    return;
  }

  // Per-class adjustment arrays: CNAdjust is filled by GetCharNormFeatures,
  // BLAdjust by GetBaselineFeatures.
  uinT8 *CNAdjust = new uinT8[MAX_NUM_CLASSES];
  uinT8 *BLAdjust = new uinT8[MAX_NUM_CLASSES];

  // From here on shape_id is the index into PreTrainedTemplates: the class
  // id itself without a shape table, otherwise the class that owns the shape
  // (-1 if ShapeIDToClassID could not find one).
  if (shape_table_ == NULL)
    shape_id = ClassId;
  else
    shape_id = ShapeIDToClassID(shape_id);
  if (PreTrainedOn && shape_id >= 0) {
    if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
      tprintf("No built-in templates for class/shape %d\n", shape_id);
    } else {
      NumCNFeatures = GetCharNormFeatures(Blob, denorm, PreTrainedTemplates,
                                          CNFeatures, NULL, CNAdjust,
                                          &BlobLength, NULL);
      if (NumCNFeatures <= 0) {
        tprintf("Illegal blob (char norm features)!\n");
      } else {
        im_.SetCharNormMatch(classify_integer_matcher_multiplier);
        im_.Match(ClassForClassId(PreTrainedTemplates, shape_id),
                  AllProtosOn, AllConfigsOn,
                  NumCNFeatures, CNFeatures,
                  &CNResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        ExpandShapesAndApplyCorrections(NULL, false, shape_id,
                                        Blob->bounding_box().bottom(),
                                        Blob->bounding_box().top(),
                                        0, BlobLength, CNAdjust,
                                        CNResult, Results);
      }
    }
  }

  if (AdaptiveOn) {
    if (ClassId < 0 || ClassId >= AdaptedTemplates->Templates->NumClasses) {
      tprintf("Invalid adapted class id: %d\n", ClassId);
    } else if (UnusedClassIdIn(AdaptedTemplates->Templates, ClassId) ||
               AdaptedTemplates->Class[ClassId] == NULL ||
               IsEmptyAdaptedClass(AdaptedTemplates->Class[ClassId])) {
      tprintf("No AD templates for class %d = %s\n",
              ClassId, unicharset.id_to_unichar(ClassId));
    } else {
      NumBLFeatures = GetBaselineFeatures(Blob,
                                          denorm,
                                          AdaptedTemplates->Templates,
                                          BLFeatures, BLAdjust,
                                          &BlobLength);
      if (NumBLFeatures <= 0)
        tprintf("Illegal blob (baseline features)!\n");
      else {
        im_.SetBaseLineMatch();
        im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
                  AllProtosOn, AllConfigsOn,
                  NumBLFeatures, BLFeatures,
                  &BLResult,
                  classify_adapt_feature_threshold, NO_DEBUG,
                  matcher_debug_separate_windows);
        // Use the adjustments produced for the baseline features.  CNAdjust
        // belongs to the char-norm match and is uninitialized when
        // PreTrainedOn is false.
        ExpandShapesAndApplyCorrections(
            AdaptedTemplates->Class, false,
            ClassId, Blob->bounding_box().bottom(),
            Blob->bounding_box().top(), 0, BlobLength, BLAdjust,
            BLResult, Results);
      }
    }
  }

  tprintf("\n");
  if (BLResult.Rating < CNResult.Rating) {
    // The adapted match is better: show its best config the first time, then
    // step through the configs one per call.
    if (next_config < 0) {
      ConfigMask = 1 << BLResult.Config;
      next_config = 0;
    } else {
      ConfigMask = 1 << next_config;
      ++next_config;
    }
    classify_norm_method.set_value(baseline);

    im_.SetBaseLineMatch();
    tprintf("Adaptive Class ID: %d\n", ClassId);
    im_.Match(ClassForClassId(AdaptedTemplates->Templates, ClassId),
              AllProtosOn, (BIT_VECTOR) &ConfigMask,
              NumBLFeatures, BLFeatures,
              &BLResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
    ExpandShapesAndApplyCorrections(
        AdaptedTemplates->Class, true,
        ClassId, Blob->bounding_box().bottom(),
        Blob->bounding_box().top(), 0, BlobLength, BLAdjust,
        BLResult, Results);
  } else if (shape_id >= 0 && NumCNFeatures > 0) {
    // Replay the static match only if char-norm features were actually
    // extracted above; otherwise CNResult.Config, CNFeatures and CNAdjust
    // have not been written by anything in this function.
    ConfigMask = 1 << CNResult.Config;
    classify_norm_method.set_value(character);

    tprintf("Static Shape ID: %d\n", shape_id);
    im_.SetCharNormMatch(classify_integer_matcher_multiplier);
    im_.Match(ClassForClassId (PreTrainedTemplates, shape_id),
              AllProtosOn, (BIT_VECTOR) & ConfigMask,
              NumCNFeatures, CNFeatures,
              &CNResult,
              classify_adapt_feature_threshold,
              matcher_debug_flags,
              matcher_debug_separate_windows);
    ExpandShapesAndApplyCorrections(NULL, true, shape_id,
                                    Blob->bounding_box().bottom(),
                                    Blob->bounding_box().top(),
                                    0, BlobLength, CNAdjust,
                                    CNResult, Results);
  }

  // Clean up.
  delete[] CNAdjust;
  delete[] BLAdjust;
}                              /* ShowBestMatchFor */

void tesseract::Classify::ShowMatchDisplay ( )

Definition at line 1096 of file intproto.cpp.

                                {
/*
 ** Parameters: none
 ** Globals:
 **   IntMatchWindow        match debugger window
 **   ProtoDisplayWindow    optional window, skipped when null
 **   FeatureDisplayWindow  optional window, skipped when null
 **   classify_norm_method  selects how the feature space window is cleared
 ** Operation: This routine prepares the match debugger windows for a new
 **   display: the optional windows are cleared, the feature space of
 **   IntMatchWindow is cleared, and every window is zoomed to the
 **   INT_MIN_X/INT_MIN_Y .. INT_MAX_X/INT_MAX_Y rectangle.
 ** Return: none
 ** Exceptions: none
 ** History: Thu Mar 21 15:47:33 1991, DSJ, Created.
 */
  // NOTE(review): IntMatchWindow is used below without a null check;
  // presumably InitIntMatchWindowIfReqd guarantees it exists -- confirm.
  InitIntMatchWindowIfReqd();
  if (ProtoDisplayWindow) {
    ProtoDisplayWindow->Clear();
  }
  if (FeatureDisplayWindow) {
    FeatureDisplayWindow->Clear();
  }
  // classify_norm_method is converted to int and then to NORM_METHOD.
  ClearFeatureSpaceWindow(
      static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
      IntMatchWindow);
  // Give all windows the same view rectangle.
  IntMatchWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
                                  INT_MAX_X, INT_MAX_Y);
  if (ProtoDisplayWindow) {
    ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
                                        INT_MAX_X, INT_MAX_Y);
  }
  if (FeatureDisplayWindow) {
    FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y,
                                          INT_MAX_X, INT_MAX_Y);
  }
}                                /* ShowMatchDisplay */

bool tesseract::Classify::TempConfigReliable	(	CLASS_ID	class_id,
		const TEMP_CONFIG &	config
	)

Definition at line 2763 of file adaptmatch.cpp.

                                                             {
  if (classify_learning_debug_level >= 1) {
    tprintf("NumTimesSeen for config of %s is %d\n",
            getDict().getUnicharset().debug_str(class_id).string(),
            config->NumTimesSeen);
  }

  // Seen often enough to be trusted without consulting the ambiguities.
  if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping)
    return true;
  // Seen too rarely to be trusted at all.
  if (config->NumTimesSeen < matcher_min_examples_for_prototyping)
    return false;
  // In between: reliable unless the ambiguity check below objects.
  if (!use_ambigs_for_adaption)
    return true;

  // Every class listed as an ambiguity of class_id must itself either have a
  // permanent config or have been seen the minimum number of times.
  const UnicharIdVector *ambigs =
    getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
  const int num_ambigs = (ambigs == NULL) ? 0 : ambigs->size();
  for (int idx = 0; idx < num_ambigs; ++idx) {
    ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[idx]];
    assert(ambig_class != NULL);
    if (ambig_class->NumPermConfigs != 0 ||
        ambig_class->MaxNumTimesSeen >=
        matcher_min_examples_for_prototyping) {
      continue;  // this ambiguity is well enough established
    }
    if (classify_learning_debug_level >= 1) {
      tprintf("Ambig %s has not been seen enough times,"
              " not making config for %s permanent\n",
              getDict().getUnicharset().debug_str(
                  (*ambigs)[idx]).string(),
              getDict().getUnicharset().debug_str(class_id).string());
    }
    return false;
  }
  return true;
}

void tesseract::Classify::UpdateAmbigsGroup	(	CLASS_ID	class_id,
		const DENORM &	denorm,
		TBLOB *	Blob
	)

Definition at line 2800 of file adaptmatch.cpp.

                                              {
  // Classes whose adaption depends on class_id (reverse ambiguity lookup).
  const UnicharIdVector *ambigs =
    getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
  const int num_ambigs = (ambigs == NULL) ? 0 : ambigs->size();
  if (classify_learning_debug_level >= 1) {
    tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
            getDict().getUnicharset().debug_str(class_id).string(), class_id);
  }

  for (int idx = 0; idx < num_ambigs; ++idx) {
    CLASS_ID ambig_class_id = (*ambigs)[idx];
    const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
    // Promote every temporary config of this class that is now reliable.
    for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
      if (ConfigIsPermanent(ambigs_class, cfg))
        continue;  // already permanent
      const TEMP_CONFIG config =
        TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
      if (config == NULL || !TempConfigReliable(ambig_class_id, config))
        continue;  // no config in this slot, or not reliable yet
      if (classify_learning_debug_level >= 1) {
        tprintf("Making config %d of %s permanent\n", cfg,
                getDict().getUnicharset().debug_str(
                    ambig_class_id).string());
      }
      MakePermanent(AdaptedTemplates, ambig_class_id, cfg, denorm, Blob);
    }
  }
}

void tesseract::Classify::WriteAdaptedTemplates	(	FILE *	File,
		ADAPT_TEMPLATES	Templates
	)

This routine saves Templates to File in a binary format.

Parameters:

File	open file to write Templates to (the data is written in binary format)
Templates	set of adapted templates to write to File

Note:: Globals: none; Exceptions: none; History: Mon Mar 18 15:07:32 1991, DSJ, Created.

Definition at line 507 of file adaptive.cpp.

                                                                          {
  /* 1. the top-level adaptive template struct, dumped as raw bytes */
  fwrite ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);

  /* 2. the basic integer templates */
  const INT_TEMPLATES int_templates = Templates->Templates;
  WriteIntTemplates (File, int_templates, unicharset);

  /* 3. the adaptive info of every class, in class order; each is written
        together with the config count of the matching integer class */
  for (int class_index = 0; class_index < int_templates->NumClasses;
       ++class_index) {
    WriteAdaptedClass (File, Templates->Class[class_index],
                       int_templates->Class[class_index]->NumConfigs);
  }
}                                /* WriteAdaptedTemplates */

void tesseract::Classify::WriteIntTemplates	(	FILE *	File,
		INT_TEMPLATES	Templates,
		const UNICHARSET &	target_unicharset
	)

Definition at line 1155 of file intproto.cpp.

                                                                      {
/*
 ** Parameters:
 **   File    open file to write templates to
 **   Templates templates to save into File
 **   target_unicharset  unicharset whose size is written as the first
 **     field and compared against Templates->NumClasses
 ** Globals: none
 ** Operation: This routine writes Templates to File.  The format
 **   is an efficient binary format.  File must already be open
 **   for writing.  Layout, in order: unicharset size, version id,
 **   NumClassPruners, NumClasses, the class pruners, each class
 **   (counts, config lengths, proto lengths, proto sets, font set id),
 **   then the font info, font spacing info and font set tables.
 **   None of the fwrite results are checked.
 ** Return: none
 ** Exceptions: none
 ** History: Wed Feb 27 11:48:46 1991, DSJ, Created.
 */
  int i, j;
  INT_CLASS Class;
  int unicharset_size = target_unicharset.size();
  int version_id = -5;  // When negated by the reader -1 becomes +1 etc.

  /* a size mismatch is only reported; writing continues regardless */
  if (Templates->NumClasses != unicharset_size) {
    cprintf("Warning: executing WriteIntTemplates() with %d classes in"
            " Templates, while target_unicharset size is %d\n",
            Templates->NumClasses, unicharset_size);
  }

  /* first write the high level template struct */
  fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
  fwrite(&version_id, sizeof(version_id), 1, File);
  fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners),
         1, File);
  fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);

  /* then write out the class pruners */
  for (i = 0; i < Templates->NumClassPruners; i++)
    fwrite(Templates->ClassPruners[i],
           sizeof(CLASS_PRUNER_STRUCT), 1, File);

  /* then write out each class */
  for (i = 0; i < Templates->NumClasses; i++) {
    Class = Templates->Class[i];

    /* first write out the high level struct for the class */
    fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
    fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
    /* consistency check: the class must have exactly as many configs as
       its font set has entries */
    ASSERT_HOST(Class->NumConfigs == this->fontset_table_.get(Class->font_set_id).size);
    fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
    /* one uinT16 length per config */
    for (j = 0; j < Class->NumConfigs; ++j) {
      fwrite(&Class->ConfigLengths[j], sizeof(uinT16), 1, File);
    }

    /* then write out the proto lengths */
    if (MaxNumIntProtosIn (Class) > 0) {
      fwrite ((char *) (Class->ProtoLengths), sizeof (uinT8),
              MaxNumIntProtosIn (Class), File);
    }

    /* then write out the proto sets */
    for (j = 0; j < Class->NumProtoSets; j++)
      fwrite ((char *) Class->ProtoSets[j],
              sizeof (PROTO_SET_STRUCT), 1, File);

    /* then write the fonts info */
    fwrite(&Class->font_set_id, sizeof(int), 1, File);
  }

  /* Write the fonts info tables */
  /* fontinfo_table_ is written twice with different callbacks: first via
     write_info, then via write_spacing_info */
  this->fontinfo_table_.write(File, NewPermanentTessCallback(write_info));
  this->fontinfo_table_.write(File,
                              NewPermanentTessCallback(write_spacing_info));
  this->fontset_table_.write(File, NewPermanentTessCallback(write_set));
}                                /* WriteIntTemplates */