Tesseract  3.02
tesseract::MasterTrainer Class Reference

#include <mastertrainer.h>

List of all members.

Public Member Functions

 MasterTrainer (NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
 ~MasterTrainer ()
bool Serialize (FILE *fp) const
bool DeSerialize (bool swap, FILE *fp)
void LoadUnicharset (const char *filename)
void SetFeatureSpace (const IntFeatureSpace &fs)
void ReadTrainingSamples (FILE *fp, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
void AddSample (bool verification, const char *unichar_str, TrainingSample *sample)
void LoadPageImages (const char *filename)
void PostLoadCleanup ()
void PreTrainingSetup ()
void SetupMasterShapes ()
void IncludeJunk ()
void ReplicateAndRandomizeSamplesIfRequired ()
bool LoadFontInfo (const char *filename)
bool LoadXHeights (const char *filename)
bool AddSpacingInfo (const char *filename)
int GetFontInfoId (const char *font_name)
int GetBestMatchingFontInfoId (const char *filename)
void SetupFlatShapeTable (ShapeTable *shape_table)
CLUSTERER * SetupForClustering (const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
void WriteInttempAndPFFMTable (const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
const UNICHARSET & unicharset () const
TrainingSampleSet * GetSamples ()
const ShapeTable & master_shapes () const
void DebugCanonical (const char *unichar_str1, const char *unichar_str2)
void DisplaySamples (const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
void TestClassifierOnSamples (int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
double TestClassifier (int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
float ShapeDistance (const ShapeTable &shapes, int s1, int s2)

Detailed Description

Definition at line 68 of file mastertrainer.h.


Constructor & Destructor Documentation

tesseract::MasterTrainer::MasterTrainer ( NormalizationMode  norm_mode,
bool  shape_analysis,
bool  replicate_samples,
int  debug_level 
)

Definition at line 46 of file mastertrainer.cpp.

  : norm_mode_(norm_mode), samples_(fontinfo_table_),
    junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
    charsetsize_(0),
    enable_shape_anaylsis_(shape_analysis),
    enable_replication_(replicate_samples),
    // fragments_ starts out NULL (it is allocated by LoadUnicharset) and
    // prev_unichar_id_ starts at -1, i.e. "no previous sample".
    fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
  // All three sample sets above were constructed with fontinfo_table_.
  // Install the comparison and clear callbacks on that table.
  fontinfo_table_.set_compare_callback(
      NewPermanentTessCallback(CompareFontInfo));
  fontinfo_table_.set_clear_callback(
      NewPermanentTessCallback(FontInfoDeleteCallback));
}
tesseract::MasterTrainer::~MasterTrainer ( )

Definition at line 62 of file mastertrainer.cpp.

                              {
  // fragments_ is a new[] array (see LoadUnicharset), so it is released
  // with the array form of delete.
  delete [] fragments_;
  // Destroy every page image that was loaded into page_images_.
  const int num_pages = page_images_.size();
  for (int page = 0; page < num_pages; ++page) {
    pixDestroy(&page_images_[page]);
  }
}

Member Function Documentation

void tesseract::MasterTrainer::AddSample ( bool  verification,
const char *  unichar_str,
TrainingSample *  sample 
)

Definition at line 179 of file mastertrainer.cpp.

                                                      {
  // Routes one sample to the verification set, the main sample set or the
  // junk set, and maintains the fragments_ bookkeeping that links a unichar
  // to the junk (fragment) class that directly followed it.

  // Verification samples are stored separately and break any fragment run.
  if (verification) {
    verify_samples_.AddSample(unichar, sample);
    prev_unichar_id_ = -1;
    return;
  }

  // A unichar known to the unicharset goes into the main sample set.
  if (unicharset_.contains_unichar(unichar)) {
    if (prev_unichar_id_ >= 0) {
      // The previous unichar was followed by another known unichar, so mark
      // its fragment slot with -1.
      fragments_[prev_unichar_id_] = -1;
    }
    prev_unichar_id_ = samples_.AddSample(unichar, sample);
    // Make sure the flat shape table has a (unichar, font) entry.
    const bool shape_missing =
        flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0;
    if (shape_missing) {
      flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
    }
    return;
  }

  // Anything else is junk.
  const int junk_id = junk_samples_.AddSample(unichar, sample);
  if (prev_unichar_id_ >= 0) {
    CHAR_FRAGMENT* fragment = CHAR_FRAGMENT::parse_from_string(unichar);
    if (fragment != NULL && fragment->is_natural()) {
      int& slot = fragments_[prev_unichar_id_];
      if (slot == 0) {
        // First natural fragment seen after this unichar: remember it.
        slot = junk_id;
      } else if (slot != junk_id) {
        // A different fragment (or an earlier -1 mark): set the slot to -1.
        slot = -1;
      }
    }
    delete fragment;
  }
  prev_unichar_id_ = -1;
}
bool tesseract::MasterTrainer::AddSpacingInfo ( const char *  filename)

Definition at line 417 of file mastertrainer.cpp.

                                                       {
  // Reads per-unichar spacing and kerning data for one font from |filename|
  // and attaches it to the matching entry of fontinfo_table_.
  // Returns true on success, and also when the file cannot be opened (a
  // missing file is deliberately not an error). Returns false when no font
  // name matches the filename, when the font's x-height is zero, or when the
  // file is malformed.
  FILE* fontinfo_file = fopen(filename, "rb");
  if (fontinfo_file == NULL)
    return true;  // We silently ignore missing files!
  // Find the fontinfo_id.
  int fontinfo_id = GetBestMatchingFontInfoId(filename);
  if (fontinfo_id < 0) {
    tprintf("No font found matching fontinfo filename %s\n", filename);
    fclose(fontinfo_file);
    return false;
  }
  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
  // The x-height is the divisor of the scale below; a zero value (which an
  // x-heights file can supply) would otherwise be an integer division by
  // zero.
  if (xheights_[fontinfo_id] == 0) {
    tprintf("Zero x-height for font %d, cannot scale spacing from %s\n",
            fontinfo_id, filename);
    fclose(fontinfo_file);
    return false;
  }
  // TODO(rays) scale should probably be a double, but keep as an int for now
  // to duplicate current behavior.
  int scale = kBlnXHeight / xheights_[fontinfo_id];
  int num_unichars;
  // NOTE(review): the "%s" conversions below are unbounded; a token longer
  // than UNICHAR_LEN - 1 bytes would overflow these buffers.
  char uch[UNICHAR_LEN];
  char kerned_uch[UNICHAR_LEN];
  int x_gap, x_gap_before, x_gap_after, num_kerned;
  ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
  FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id);
  fi->init_spacing(unicharset_.size());
  FontSpacingInfo *spacing = NULL;
  for (int l = 0; l < num_unichars; ++l) {
    if (fscanf(fontinfo_file, "%s %d %d %d",
               uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
      tprintf("Bad format of font spacing file %s\n", filename);
      fclose(fontinfo_file);
      return false;
    }
    // Entries for unichars outside the unicharset are still parsed (to stay
    // in step with the file) but their values are discarded.
    bool valid = unicharset_.contains_unichar(uch);
    if (valid) {
      spacing = new FontSpacingInfo();
      spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
      spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
    }
    for (int k = 0; k < num_kerned; ++k) {
      if (fscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
        tprintf("Bad format of font spacing file %s\n", filename);
        // The record being built has not been handed to the FontInfo yet, so
        // it must be released here. When !valid, |spacing| is either NULL or
        // a record already passed to add_spacing, and must not be deleted.
        if (valid) delete spacing;
        fclose(fontinfo_file);
        return false;
      }
      if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
      spacing->kerned_unichar_ids.push_back(
          unicharset_.unichar_to_id(kerned_uch));
      spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
    }
    if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
  }
  fclose(fontinfo_file);
  return true;
}
void tesseract::MasterTrainer::DebugCanonical ( const char *  unichar_str1,
const char *  unichar_str2 
)

Definition at line 635 of file mastertrainer.cpp.

                                                             {
  // Prints a table of cluster distances between the fonts of unichar_str1
  // (rows) and the fonts of unichar_str2 (columns). If unichar_str2 is not
  // in the unicharset, unichar_str1 is compared against itself.
  const int first_id = unicharset_.unichar_to_id(unichar_str1);
  int second_id = unicharset_.unichar_to_id(unichar_str2);
  if (second_id == INVALID_UNICHAR_ID) {
    second_id = first_id;
  }
  if (first_id == INVALID_UNICHAR_ID) {
    tprintf("No unicharset entry found for %s\n", unichar_str1);
    return;
  }
  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
          first_id, unichar_str1, second_id, unichar_str2);

  const int font_count = samples_.NumFonts();
  const IntFeatureMap& feature_map = feature_map_;

  // Header row: the ids of the fonts that have samples of the second class.
  tprintf("      ");
  for (int col = 0; col < font_count; ++col) {
    if (samples_.NumClassSamples(col, second_id, false) != 0) {
      tprintf("%6d", col);
    }
  }
  tprintf("\n");

  // One row per font that has samples of the first class, giving its
  // similarity with the other fonts of the second class.
  for (int row = 0; row < font_count; ++row) {
    if (samples_.NumClassSamples(row, first_id, false) == 0) {
      continue;
    }
    tprintf("%4d  ", row);
    for (int col = 0; col < font_count; ++col) {
      if (samples_.NumClassSamples(col, second_id, false) == 0) {
        continue;
      }
      const float dist = samples_.ClusterDistance(row, first_id,
                                                  col, second_id,
                                                  feature_map);
      tprintf(" %5.3f", dist);
    }
    tprintf("\n");
  }

  // Build a fake ShapeTable containing all the sample types.
  ShapeTable shapes(unicharset_);
  const bool two_classes = first_id != second_id;
  for (int font = 0; font < font_count; ++font) {
    if (samples_.NumClassSamples(font, first_id, true) > 0) {
      shapes.AddShape(first_id, font);
    }
    if (two_classes && samples_.NumClassSamples(font, second_id, true) > 0) {
      shapes.AddShape(second_id, font);
    }
  }
}
bool tesseract::MasterTrainer::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 90 of file mastertrainer.cpp.

                                                   {
  // Restores the trainer state from fp, in the same field order that
  // Serialize writes it. |swap| requests byte reversal of the stored data.
  // Returns false as soon as any component fails to read.
  const bool mode_read = fread(&norm_mode_, sizeof(norm_mode_), 1, fp) == 1;
  if (!mode_read) {
    return false;
  }
  if (swap) {
    ReverseN(&norm_mode_, sizeof(norm_mode_));
  }

  if (!unicharset_.load_from_file(fp)) {
    return false;
  }
  charsetsize_ = unicharset_.size();

  if (!feature_space_.DeSerialize(swap, fp)) {
    return false;
  }
  // The feature map is not stored; it is rebuilt from the feature space.
  feature_map_.Init(feature_space_);

  // Sample sets and shape tables. Short-circuit evaluation keeps the order
  // and stops at the first failure.
  if (!samples_.DeSerialize(swap, fp) ||
      !junk_samples_.DeSerialize(swap, fp) ||
      !verify_samples_.DeSerialize(swap, fp) ||
      !master_shapes_.DeSerialize(swap, fp) ||
      !flat_shapes_.DeSerialize(swap, fp)) {
    return false;
  }

  // The font table is read in two passes: the font info, then the spacing
  // info.
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap)) {
    return false;
  }
  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info),
                            swap)) {
    return false;
  }

  if (!xheights_.DeSerialize(swap, fp)) {
    return false;
  }
  return true;
}
void tesseract::MasterTrainer::DisplaySamples ( const char *  unichar_str1,
int  cloud_font,
const char *  unichar_str2,
int  canonical_font 
)

Definition at line 695 of file mastertrainer.cpp.

                                                       {
  // Opens a "Features" window showing the canonical sample of
  // (canonical_font, unichar_str2) in red and the cloud features of
  // (cloud_font, unichar_str1) in green, plus a "Samples" window. Each click
  // in the feature window shows the samples of (unichar_str1, cloud_font)
  // that have the clicked feature. Runs until the feature window reports a
  // destroy event.
  const IntFeatureMap& feature_map = feature_map_;
  const IntFeatureSpace& feature_space = feature_map.feature_space();
  ScrollView* feature_win = CreateFeatureSpaceWindow("Features", 100, 500);
  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,
                          feature_win);

  // Canonical sample of the second unichar, drawn in red.
  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
    const TrainingSample* canonical =
        samples_.GetCanonicalSample(canonical_font, class_id2);
    for (int i = 0; i < canonical->num_features(); ++i) {
      RenderIntFeature(feature_win, &canonical->features()[i],
                       ScrollView::RED);
    }
  }

  // Cloud features of the first unichar, drawn in green.
  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
    const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
    for (int i = 0; i < cloud.size(); ++i) {
      if (!cloud[i]) {
        continue;
      }
      INT_FEATURE_STRUCT feature = feature_map.InverseIndexFeature(i);
      RenderIntFeature(feature_win, &feature, ScrollView::GREEN);
    }
  }
  feature_win->Update();

  ScrollView* sample_win = CreateFeatureSpaceWindow("Samples", 100, 500);
  for (;;) {
    // Wait until a click or popup event.
    SVEvent* event = feature_win->AwaitEvent(SVET_ANY);
    const SVEventType event_type = event->type;
    if (event_type == SVET_CLICK) {
      int feature_index = feature_space.XYToFeatureIndex(event->x, event->y);
      if (feature_index >= 0) {
        // Iterate samples and display those with the feature.
        Shape shape;
        shape.AddToShape(class_id1, cloud_font);
        sample_win->Clear();
        samples_.DisplaySamplesWithFeature(feature_index, shape,
                                           feature_space, ScrollView::GREEN,
                                           sample_win);
        sample_win->Update();
      }
    }
    delete event;
    if (event_type == SVET_DESTROY) {
      break;
    }
  }
}
int tesseract::MasterTrainer::GetBestMatchingFontInfoId ( const char *  filename)

Definition at line 487 of file mastertrainer.cpp.

                                                                 {
  // Returns the index of the font whose name is the longest one occurring
  // as a substring of |filename|, or -1 if no font name occurs in it.
  int best_id = -1;
  int longest = 0;
  for (int font = 0; font < fontinfo_table_.size(); ++font) {
    if (strstr(filename, fontinfo_table_.get(font).name) == NULL) {
      continue;
    }
    int name_len = strlen(fontinfo_table_.get(font).name);
    // Use the longest matching length in case a substring of a font matched.
    if (name_len > longest) {
      longest = name_len;
      best_id = font;
    }
  }
  return best_id;
}
int tesseract::MasterTrainer::GetFontInfoId ( const char *  font_name)

Definition at line 472 of file mastertrainer.cpp.

                                                      {
  // Looks up |font_name| in fontinfo_table_ and returns its id, or -1 if the
  // table has no such entry.
  FontInfo probe;
  // The probe only borrows the string, so casting away const is OK.
  probe.name = const_cast<char*>(font_name);
  probe.properties = 0;  // Not used to lookup in the table
  probe.universal_id = 0;
  if (fontinfo_table_.contains(probe)) {
    return fontinfo_table_.get_id(probe);
  }
  return -1;
}
TrainingSampleSet* tesseract::MasterTrainer::GetSamples ( ) [inline]

Definition at line 185 of file mastertrainer.h.

                                  {
    // Returns a mutable pointer to the main sample set held by the trainer.
    return &samples_;
  }
void tesseract::MasterTrainer::IncludeJunk ( )

Definition at line 307 of file mastertrainer.cpp.

                                {
  // Moves every junk sample into the main sample set. Each sample is
  // re-labelled with the id its unichar has in the main set's unicharset,
  // or with id 0 when that unicharset does not contain it.
  const UNICHARSET& junk_unicharset = junk_samples_.unicharset();
  const UNICHARSET& main_unicharset = samples_.unicharset();
  const int junk_count = junk_samples_.num_samples();
  tprintf("Moving %d junk samples to master sample set.\n", junk_count);
  for (int index = 0; index < junk_count; ++index) {
    TrainingSample* junk = junk_samples_.mutable_sample(index);
    // Translate the class id through the UTF-8 string, since the two sets
    // use different unicharsets.
    const char* utf8 = junk_unicharset.id_to_unichar(junk->class_id());
    int new_id = main_unicharset.unichar_to_id(utf8);
    if (new_id == INVALID_UNICHAR_ID) {
      new_id = 0;
    }
    junk->set_class_id(new_id);
    // Take the sample out of the junk set before adding it to the main set.
    junk_samples_.extract_sample(index);
    samples_.AddSample(new_id, junk);
  }
  junk_samples_.DeleteDeadSamples();
  samples_.OrganizeByFontAndClass();
}
bool tesseract::MasterTrainer::LoadFontInfo ( const char *  filename)

Definition at line 345 of file mastertrainer.cpp.

                                                     {
  // Loads the font_properties file |filename| into fontinfo_table_.
  // Each record is "<name> <italic> <bold> <fixed> <serif> <fraktur>"; the
  // five flags are packed into FontInfo::properties as bits 0..4.
  // Records that fail to parse are skipped. Returns false only if the file
  // cannot be opened.
  FILE* fp = fopen(filename, "rb");
  if (fp == NULL) {
    fprintf(stderr, "Failed to load font_properties from %s\n", filename);
    return false;
  }
  int italic, bold, fixed, serif, fraktur;
  while (!feof(fp)) {
    FontInfo fontinfo;
    char* font_name = new char[1024];
    fontinfo.name = font_name;
    fontinfo.properties = 0;
    fontinfo.universal_id = 0;
    // The width is 1023, not 1024: %Ns stores up to N characters plus a
    // terminating NUL, so 1024 would write one byte past the buffer.
    int fields = fscanf(fp, "%1023s %i %i %i %i %i\n", font_name,
                        &italic, &bold, &fixed, &serif, &fraktur);
    if (fields != 6) {
      // The name buffer was never handed to the table, so release it.
      delete [] font_name;
      // EOF also covers a read error, where feof() would never become true
      // and the loop would spin forever.
      if (fields == EOF)
        break;
      continue;
    }
    fontinfo.properties =
        (italic << 0) +
        (bold << 1) +
        (fixed << 2) +
        (serif << 3) +
        (fraktur << 4);
    if (!fontinfo_table_.contains(fontinfo)) {
      // The table's entry now refers to font_name.
      fontinfo_table_.push_back(fontinfo);
    } else {
      // Duplicate font: nothing refers to the buffer, so release it.
      delete [] font_name;
    }
  }
  fclose(fp);
  return true;
}
void tesseract::MasterTrainer::LoadPageImages ( const char *  filename)

Definition at line 209 of file mastertrainer.cpp.

                                                       {
  // Reads successive pages of the TIFF file |filename| and appends them to
  // page_images_, stopping at the first page that cannot be read.
  int page = 0;
  while (true) {
    Pix* image = pixReadTiff(filename, page);
    if (image == NULL) {
      break;
    }
    page_images_.push_back(image);
    ++page;
  }
  tprintf("Loaded %d page images from %s\n", page, filename);
}
void tesseract::MasterTrainer::LoadUnicharset ( const char *  filename)

Definition at line 114 of file mastertrainer.cpp.

                                                       {
  // Loads the unicharset from |filename|, or starts a new one containing
  // only " " if loading fails. It then allocates a zeroed fragments_ array
  // sized to the unicharset and has all three sample sets load the same file.
  const bool loaded = unicharset_.load_from_file(filename);
  if (!loaded) {
    tprintf("Failed to load unicharset from file %s\n"
            "Building unicharset for training from scratch...\n",
            filename);
    unicharset_.clear();
    // Space character needed to represent NIL_LIST classification.
    unicharset_.unichar_insert(" ");
  }
  charsetsize_ = unicharset_.size();

  // Replace any previous fragment map with a fresh all-zero one.
  delete [] fragments_;
  fragments_ = new int[charsetsize_];
  for (int i = 0; i < charsetsize_; ++i) {
    fragments_[i] = 0;
  }

  samples_.LoadUnicharset(filename);
  junk_samples_.LoadUnicharset(filename);
  verify_samples_.LoadUnicharset(filename);
}
bool tesseract::MasterTrainer::LoadXHeights ( const char *  filename)

Definition at line 377 of file mastertrainer.cpp.

                                                     {
  // Fills xheights_ (one entry per font in fontinfo_table_) from the
  // "<fontname> <xheight>" records in |filename|. Fonts with no record get
  // the rounded mean of the x-heights that were read.
  // A NULL filename leaves every entry at -1 and returns true.
  // Returns false if the file cannot be opened or has no record for any
  // known font.
  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
  xheights_.init_to_size(fontinfo_table_.size(), -1);
  if (filename == NULL) return true;
  FILE *f = fopen(filename, "rb");
  if (f == NULL) {
    fprintf(stderr, "Failed to load font xheights from %s\n", filename);
    return false;
  }
  tprintf("Reading x-heights from %s ...\n", filename);
  FontInfo fontinfo;
  fontinfo.properties = 0;  // Not used to lookup in the table.
  fontinfo.universal_id = 0;
  char buffer[1024];
  int xht;
  int total_xheight = 0;
  int xheight_count = 0;
  while (!feof(f)) {
    // The width is 1023, not 1024: %Ns stores up to N characters plus a
    // terminating NUL, so 1024 would write one byte past the buffer.
    int fields = fscanf(f, "%1023s %d\n", buffer, &xht);
    // EOF also covers a read error, where feof() would never become true
    // and the loop would spin forever.
    if (fields == EOF)
      break;
    if (fields != 2)
      continue;
    // The lookup key only borrows the stack buffer.
    fontinfo.name = buffer;
    if (!fontinfo_table_.contains(fontinfo)) continue;
    int fontinfo_id = fontinfo_table_.get_id(fontinfo);
    xheights_[fontinfo_id] = xht;
    total_xheight += xht;
    ++xheight_count;
  }
  // Reading is finished; close the file before either return below.
  fclose(f);
  if (xheight_count == 0) {
    fprintf(stderr, "No valid xheights in %s!\n", filename);
    return false;
  }
  int mean_xheight = DivRounded(total_xheight, xheight_count);
  for (int i = 0; i < fontinfo_table_.size(); ++i) {
    if (xheights_[i] < 0)
      xheights_[i] = mean_xheight;
  }
  return true;
}  // LoadXHeights
const ShapeTable& tesseract::MasterTrainer::master_shapes ( ) const [inline]

Definition at line 188 of file mastertrainer.h.

                                          {
    // Read-only access to the master shape table held by the trainer.
    return master_shapes_;
  }
void tesseract::MasterTrainer::PostLoadCleanup ( )

Definition at line 223 of file mastertrainer.cpp.

                                    {
  // Post-processing once all samples are loaded: optionally replaces
  // fragmented samples, normalizes and organizes the verification set, then
  // indexes and organizes the main set and computes its canonical samples.
  const bool verbose = debug_level_ > 0;
  if (verbose) {
    tprintf("PostLoadCleanup...\n");
  }
  if (enable_shape_anaylsis_) {
    ReplaceFragmentedSamples();
  }

  // Verification samples: normalize, then organize by font and class.
  SampleIterator verify_it;
  verify_it.Init(NULL, NULL, true, &verify_samples_);
  verify_it.NormalizeSamples();
  verify_samples_.OrganizeByFontAndClass();

  // Training samples.
  samples_.IndexFeatures(feature_space_);
  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
  // against current training.
  //  samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
  samples_.OrganizeByFontAndClass();
  if (verbose) {
    tprintf("ComputeCanonicalSamples...\n");
  }
  samples_.ComputeCanonicalSamples(feature_map_, verbose);
}
void tesseract::MasterTrainer::PreTrainingSetup ( )

Definition at line 246 of file mastertrainer.cpp.

                                     {
  // Prepares the main sample set for training: indexes features against the
  // feature space, then computes canonical features and cloud features.
  const bool verbose = debug_level_ > 0;
  if (verbose) {
    tprintf("PreTrainingSetup...\n");
  }
  samples_.IndexFeatures(feature_space_);
  samples_.ComputeCanonicalFeatures();
  if (verbose) {
    tprintf("ComputeCloudFeatures...\n");
  }
  samples_.ComputeCloudFeatures(feature_space_.Size());
}
void tesseract::MasterTrainer::ReadTrainingSamples ( FILE *  fp,
const FEATURE_DEFS_STRUCT &  feature_defs,
bool  verification 
)

Definition at line 136 of file mastertrainer.cpp.

                                                           {
  // Reads training samples from an open tr file. Each record starts with a
  // "<fontname> <box string>" line, followed by a character description that
  // ReadCharDescription reads from the same stream. Every parsed sample is
  // passed to AddSample. Badly formatted header lines are reported and
  // skipped.
  const int int_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
  const int micro_type = ShortNameToFeatureType(feature_defs,
                                                kMicroFeatureType);
  const int cn_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
  const int geo_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);

  char line[2048];
  while (fgets(line, sizeof(line), fp) != NULL) {
    // Skip empty lines.
    if (line[0] == '\n') {
      continue;
    }

    // Split the line at the first space: the font name comes before it and
    // the box description after it.
    char* separator = strchr(line, ' ');
    if (separator == NULL) {
      tprintf("Bad format in tr file, reading fontname, unichar\n");
      continue;
    }
    *separator = '\0';
    char* box_text = separator + 1;
    int font_id = GetFontInfoId(line);

    int page_number;
    STRING unichar;
    TBOX bounding_box;
    if (!ParseBoxFileStr(box_text, &page_number, &unichar, &bounding_box)) {
      tprintf("Bad format in tr file, reading box coords\n");
      continue;
    }

    CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
    TrainingSample* sample = new TrainingSample;
    sample->set_font_id(font_id);
    // Page numbers are offset by the number of page images loaded so far.
    sample->set_page_num(page_number + page_images_.size());
    sample->set_bounding_box(bounding_box);
    sample->ExtractCharDesc(int_type, micro_type, cn_type, geo_type,
                            char_desc);
    AddSample(verification, unichar.string(), sample);
    FreeCharDescription(char_desc);
  }
  charsetsize_ = unicharset_.size();
}
void tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired ( )

Definition at line 333 of file mastertrainer.cpp.

                                                           {
  // When replication is enabled, replicates and randomizes both the
  // verification and main sample sets, then re-indexes the main set's
  // features. Does nothing otherwise.
  if (!enable_replication_) {
    return;
  }
  if (debug_level_ > 0) {
    tprintf("ReplicateAndRandomize...\n");
  }
  verify_samples_.ReplicateAndRandomizeSamples();
  samples_.ReplicateAndRandomizeSamples();
  samples_.IndexFeatures(feature_space_);
}
bool tesseract::MasterTrainer::Serialize ( FILE *  fp) const

Definition at line 71 of file mastertrainer.cpp.

                                            {
  // Writes the trainer state to fp. The field order here is the one
  // DeSerialize reads back. Returns false as soon as any component fails to
  // write.
  const bool mode_written =
      fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) == 1;
  if (!mode_written) {
    return false;
  }

  // Short-circuit evaluation keeps the order and stops at the first failure.
  if (!unicharset_.save_to_file(fp) ||
      !feature_space_.Serialize(fp) ||
      !samples_.Serialize(fp) ||
      !junk_samples_.Serialize(fp) ||
      !verify_samples_.Serialize(fp) ||
      !master_shapes_.Serialize(fp) ||
      !flat_shapes_.Serialize(fp)) {
    return false;
  }

  // The font table is written in two passes: the font info, then the
  // spacing info.
  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info))) {
    return false;
  }
  if (!fontinfo_table_.write(fp,
                             NewPermanentTessCallback(write_spacing_info))) {
    return false;
  }

  if (!xheights_.Serialize(fp)) {
    return false;
  }
  return true;
}
void tesseract::MasterTrainer::SetFeatureSpace ( const IntFeatureSpace &  fs) [inline]

Definition at line 84 of file mastertrainer.h.

                                                  {
    // Store a copy of the feature space and initialise the feature map from
    // the same space.
    feature_space_ = fs;
    feature_map_.Init(fs);
  }
void tesseract::MasterTrainer::SetupFlatShapeTable ( ShapeTable *  shape_table)

Definition at line 504 of file mastertrainer.cpp.

                                                               {
  // Copies the shapes of flat_shapes_ into |shape_table| in a specific
  // order. To exactly mimic the results of the previous implementation, the
  // shapes must be clustered in the order the fonts arrived, and in reverse
  // order of the characters within each font.
  const int shape_count = flat_shapes_.NumShapes();

  // Collect the distinct fonts in order of first appearance.
  GenericVector<int> fonts_in_order;
  for (int shape = 0; shape < shape_count; ++shape) {
    const int font = flat_shapes_.GetShape(shape)[0].font_ids[0];
    bool already_listed = false;
    for (int i = 0; i < fonts_in_order.size(); ++i) {
      if (fonts_in_order[i] == font) {
        already_listed = true;
        break;
      }
    }
    if (!already_listed) {
      fonts_in_order.push_back(font);
    }
  }

  // For each font in order, add all the shapes with that font in reverse
  // order.
  const int font_count = fonts_in_order.size();
  for (int i = 0; i < font_count; ++i) {
    for (int shape = shape_count; shape-- > 0;) {
      if (flat_shapes_.GetShape(shape)[0].font_ids[0] != fonts_in_order[i]) {
        continue;
      }
      shape_table->AddShape(flat_shapes_.GetShape(shape));
    }
  }
}
CLUSTERER * tesseract::MasterTrainer::SetupForClustering ( const ShapeTable &  shape_table,
const FEATURE_DEFS_STRUCT &  feature_defs,
int  shape_id,
int *  num_samples 
)

Definition at line 535 of file mastertrainer.cpp.

                      {
  // Builds and returns a clusterer loaded with the micro-features of every
  // sample belonging to shape |shape_id|. The number of samples fed in is
  // written to *num_samples.
  const int micro_index = ShortNameToFeatureType(feature_defs,
                                                 kMicroFeatureType);
  const int param_count = feature_defs.FeatureDesc[micro_index]->NumParams;
  ASSERT_HOST(param_count == MFCount);
  CLUSTERER* clusterer = MakeClusterer(
      param_count, feature_defs.FeatureDesc[micro_index]->ParamDesc);

  // We want to iterate over the samples of just the one shape, so build a
  // map with only shape_id set.
  IndexMapBiDi single_shape_map;
  single_shape_map.Init(shape_table.NumShapes(), false);
  single_shape_map.SetMap(shape_id, true);
  single_shape_map.Setup();

  // Collect the samples first so they can be fed in reverse order, which
  // matches the previous behavior.
  GenericVector<const TrainingSample*> collected;
  SampleIterator sample_it;
  sample_it.Init(&single_shape_map, &shape_table, false, &samples_);
  sample_it.Begin();
  while (!sample_it.AtEnd()) {
    collected.push_back(&sample_it.GetSample());
    sample_it.Next();
  }

  int next_sample_id = 0;
  for (int i = collected.size(); i-- > 0;) {
    const TrainingSample* current = collected[i];
    const int feature_count = current->num_micro_features();
    for (int f = 0; f < feature_count; ++f) {
      MakeSample(clusterer, current->micro_features()[f], next_sample_id);
    }
    ++next_sample_id;
  }
  *num_samples = next_sample_id;
  return clusterer;
}
void tesseract::MasterTrainer::SetupMasterShapes ( )

Definition at line 258 of file mastertrainer.cpp.

                                      {
  // Builds master_shapes_. The fonts of each class are clustered on their
  // own first and collected into one of three tables (ordinary characters,
  // beginning fragments, ending fragments). Each table is then clustered
  // across classes, and everything is appended to master_shapes_.
  tprintf("Building master shape table\n");
  const int font_count = samples_.NumFonts();

  ShapeTable begin_fragment_shapes(samples_.unicharset());
  ShapeTable end_fragment_shapes(samples_.unicharset());
  ShapeTable char_shapes(samples_.unicharset());
  for (int class_id = 0; class_id < samples_.charsetsize(); ++class_id) {
    // One shape per font that has samples of this class.
    ShapeTable class_shapes(samples_.unicharset());
    for (int font = 0; font < font_count; ++font) {
      if (samples_.NumClassSamples(font, class_id, true) > 0) {
        class_shapes.AddShape(class_id, font);
      }
    }
    ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &class_shapes);

    // Pick the destination table. Non-fragments, and fragments that are
    // neither beginning nor ending, go to the ordinary character table.
    const CHAR_FRAGMENT* fragment =
        samples_.unicharset().get_fragment(class_id);
    ShapeTable* destination = &char_shapes;
    if (fragment != NULL) {
      if (fragment->is_beginning()) {
        destination = &begin_fragment_shapes;
      } else if (fragment->is_ending()) {
        destination = &end_fragment_shapes;
      }
    }
    destination->AppendMasterShapes(class_shapes);
  }

  // Cluster each fragment table and fold it into the character table, then
  // cluster the combined table.
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &begin_fragment_shapes);
  char_shapes.AppendMasterShapes(begin_fragment_shapes);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &end_fragment_shapes);
  char_shapes.AppendMasterShapes(end_fragment_shapes);
  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
                kFontMergeDistance, &char_shapes);

  master_shapes_.AppendMasterShapes(char_shapes);
  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
}
float tesseract::MasterTrainer::ShapeDistance ( const ShapeTable &  shapes,
int  s1,
int  s2 
)

Definition at line 797 of file mastertrainer.cpp.

                                                                           {
  // Returns the mean UnicharDistance between the members of shapes s1 and
  // s2 in |shapes|.
  const IntFeatureMap& feature_map = feature_map_;
  const Shape& first = shapes.GetShape(s1);
  const Shape& second = shapes.GetShape(s2);
  const int first_size = first.size();
  const int second_size = second.size();
  float total = 0.0f;
  int pair_count = 0;
  const bool single_unichars = first_size <= 1 && second_size <= 1;
  if (single_unichars) {
    // In the single unichar case, there is little alternative, but to
    // compute the squared-order distance between pairs of fonts.
    total = samples_.UnicharDistance(first[0], second[0],
                                     false, feature_map);
    ++pair_count;
  } else {
    // In the multi-char case try to optimize the calculation by computing
    // distances between characters of matching font where possible.
    for (int i = 0; i < first_size; ++i) {
      for (int j = 0; j < second_size; ++j) {
        total += samples_.UnicharDistance(first[i], second[j],
                                          true, feature_map);
        ++pair_count;
      }
    }
  }
  return total / pair_count;
}
double tesseract::MasterTrainer::TestClassifier ( int  report_level,
bool  replicate_samples,
TrainingSampleSet *  samples,
ShapeClassifier *  test_classifier,
STRING *  report_string 
)

Definition at line 770 of file mastertrainer.cpp.

                                                            {
  // Runs |test_classifier| over |samples| via ErrorCounter and returns the
  // unichar error rate it reports. With report_level > 0 it first prints
  // the iterator's charset sizes and the shape and sample counts.
  SampleIterator sample_it;
  sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples,
                 samples);
  if (report_level > 0) {
    // Count the samples by walking the iterator once.
    int sample_count = 0;
    sample_it.Begin();
    while (!sample_it.AtEnd()) {
      ++sample_count;
      sample_it.Next();
    }
    tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
            sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
            test_classifier->GetShapeTable()->NumShapes(), sample_count);
    tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
  }
  double error_rate = 0.0;
  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
                                 CT_SHAPE_TOP_ERR, fontinfo_table_,
                                 page_images_, &sample_it, &error_rate,
                                 NULL, report_string);
  return error_rate;
}
void tesseract::MasterTrainer::TestClassifierOnSamples ( int  report_level,
bool  replicate_samples,
ShapeClassifier *  test_classifier,
STRING *  report_string 
)

Definition at line 750 of file mastertrainer.cpp.

                                                                   {
  // Convenience wrapper: runs TestClassifier on the trainer's own main
  // sample set. The value returned by TestClassifier is discarded.
  TestClassifier(report_level, replicate_samples, &samples_,
                 test_classifier, report_string);
}
const UNICHARSET& tesseract::MasterTrainer::unicharset ( ) const [inline]

Definition at line 182 of file mastertrainer.h.

                                       {
    // Returns the unicharset of the main sample set (samples_), not the
    // trainer's own unicharset_ member.
    return samples_.unicharset();
  }
void tesseract::MasterTrainer::WriteInttempAndPFFMTable ( const UNICHARSET &  unicharset,
const UNICHARSET &  shape_set,
const ShapeTable &  shape_table,
CLASS_STRUCT *  float_classes,
const char *  inttemp_file,
const char *  pffmtable_file 
)

Definition at line 575 of file mastertrainer.cpp.

                                                                         {
  // Converts |float_classes| to integer templates and writes them to
  // |inttemp_file|, then writes the cutoff table to |pffmtable_file|.
  // The cutoff table holds one cutoff per shape class, followed by one text
  // line "<unichar> <cutoff>" per entry of |unicharset|.
  // NOTE: fontinfo_table_ is moved into a temporary Classify object here.
  // If either output file cannot be opened, an error is printed and that
  // file is skipped; the function returns void, so callers get no signal.
  tesseract::Classify *classify = new tesseract::Classify();
  // Move the fontinfo table to classify.
  classify->get_fontinfo_table().move(&fontinfo_table_);
  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
                                                             shape_set);
  FILE* fp = fopen(inttemp_file, "wb");
  if (fp == NULL) {
    tprintf("Failed to open %s for writing\n", inttemp_file);
  } else {
    classify->WriteIntTemplates(fp, int_templates, shape_set);
    fclose(fp);
  }
  // Now write pffmtable. This is complicated by the fact that the adaptive
  // classifier still wants one indexed by unichar-id, but the static
  // classifier needs one indexed by its shape class id.
  // We put the shapetable_cutoffs in a GenericVector, and compute the
  // unicharset cutoffs along the way.
  GenericVector<uinT16> shapetable_cutoffs;
  GenericVector<uinT16> unichar_cutoffs;
  for (int c = 0; c < unicharset.size(); ++c)
    unichar_cutoffs.push_back(0);
  /* then write out each class */
  for (int i = 0; i < int_templates->NumClasses; ++i) {
    INT_CLASS Class = ClassForClassId(int_templates, i);
    // Todo: Test with min instead of max
    // int MaxLength = LengthForConfigId(Class, 0);
    uinT16 max_length = 0;
    for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
      // Todo: Test with min instead of max
      // if (LengthForConfigId (Class, config_id) < MaxLength)
      uinT16 length = Class->ConfigLengths[config_id];
      if (length > max_length)
        max_length = length;
      // Every unichar in this config's shape gets a cutoff at least as
      // large as the config length.
      int shape_id = float_classes[i].font_set.get(config_id);
      const Shape& shape = shape_table.GetShape(shape_id);
      for (int c = 0; c < shape.size(); ++c) {
        int unichar_id = shape[c].unichar_id;
        if (length > unichar_cutoffs[unichar_id])
          unichar_cutoffs[unichar_id] = length;
      }
    }
    shapetable_cutoffs.push_back(max_length);
  }
  fp = fopen(pffmtable_file, "wb");
  if (fp == NULL) {
    tprintf("Failed to open %s for writing\n", pffmtable_file);
  } else {
    shapetable_cutoffs.Serialize(fp);
    for (int c = 0; c < unicharset.size(); ++c) {
      const char *unichar = unicharset.id_to_unichar(c);
      // The space unichar is written as the literal text "NULL".
      if (strcmp(unichar, " ") == 0) {
        unichar = "NULL";
      }
      fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
    }
    fclose(fp);
  }
  free_int_templates(int_templates);
  // The Classify object was allocated with new above and nothing else keeps
  // it, so release it here.
  delete classify;
}

The documentation for this class was generated from the following files: