Tesseract  3.02
tesseract-ocr/training/cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Defines

#define PROGRAM_FEATURE_TYPE   "cn"

Functions

 DECLARE_STRING_PARAM_FLAG (D)
int main (int argc, char **argv)
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
int main (int argc, char *argv[])

Variables

CLUSTERCONFIG CNConfig

Define Documentation

#define PROGRAM_FEATURE_TYPE   "cn"

---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------

Definition at line 41 of file cntraining.cpp.


Function Documentation

DECLARE_STRING_PARAM_FLAG ( )
int main ( int  argc,
char **  argv 
)

---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------

Definition at line 49 of file tesseractmain.cpp.

                                {
#ifdef USING_GETTEXT
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);
#endif
  if ((argc == 2 && strcmp(argv[1], "-v") == 0) ||
      (argc == 2 && strcmp(argv[1], "--version") == 0)) {
    char *versionStrP;

    fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
    
    versionStrP = getLeptonicaVersion();
    fprintf(stderr, " %s\n", versionStrP);
    lept_free(versionStrP);
    
    versionStrP = getImagelibVersions();
    fprintf(stderr, "  %s\n", versionStrP);
    lept_free(versionStrP);

    exit(0);
  }
  // Make the order of args a bit more forgiving than it used to be.
  const char* lang = "eng";
  const char* image = NULL;
  const char* output = NULL;
  tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
  int arg = 1;
  while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
    if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
      lang = argv[arg + 1];
      ++arg;
    } else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) {
      pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1]));
      ++arg;
    } else if (image == NULL) {
      image = argv[arg];
    } else if (output == NULL) {
      output = argv[arg];
    }
    ++arg;
  }
  if (output == NULL) {
    fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
                      "[-psm pagesegmode] [configfile...]\n"), argv[0]);
    fprintf(stderr,
            _("pagesegmode values are:\n"
              "0 = Orientation and script detection (OSD) only.\n"
              "1 = Automatic page segmentation with OSD.\n"
              "2 = Automatic page segmentation, but no OSD, or OCR\n"
              "3 = Fully automatic page segmentation, but no OSD. (Default)\n"
              "4 = Assume a single column of text of variable sizes.\n"
              "5 = Assume a single uniform block of vertically aligned text.\n"
              "6 = Assume a single uniform block of text.\n"
              "7 = Treat the image as a single text line.\n"
              "8 = Treat the image as a single word.\n"
              "9 = Treat the image as a single word in a circle.\n"
              "10 = Treat the image as a single character.\n"));
    fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
                      "configfile.\n"));
    exit(1);
  }

  tesseract::TessBaseAPI  api;

  api.SetOutputName(output);

  int rc = api.Init(argv[0], lang, tesseract::OEM_DEFAULT,
           &(argv[arg]), argc - arg, NULL, NULL, false);
  if (rc) {
    fprintf(stderr, "Could not initialize tesseract.\n");
    exit(1);
  }
   
  // We have 2 possible sources of pagesegmode: a config file and
  // the command line. For backwards compatability reasons, the
  // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
  // default for this program is tesseract::PSM_AUTO. We will let
  // the config file take priority, so the command-line default
  // can take priority over the tesseract default, so we use the
  // value from the command line only if the retrieved mode
  // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
  // in any config file. Therefore the only way to force
  // tesseract::PSM_SINGLE_BLOCK is from the command line.
  // It would be simpler if we could set the value before Init,
  // but that doesn't work.
  if (api.GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK)
    api.SetPageSegMode(pagesegmode);
  tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
           tesseract::TessBaseAPI::Version());


  FILE* fin = fopen(image, "rb");
  if (fin == NULL) {
    printf("Cannot open input file: %s\n", image);
    exit(2);
  }
  fclose(fin);

  PIX   *pixs;
  if ((pixs = pixRead(image)) == NULL) {
    printf("Unsupported image type.\n");
    exit(3);
  }
  pixDestroy(&pixs);

  STRING text_out;
  if (!api.ProcessPages(image, NULL, 0, &text_out)) {
    printf("Error during processing.\n");
  }
  bool output_hocr = false;
  api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
  bool output_box = false;
  api.GetBoolVariable("tessedit_create_boxfile", &output_box);
  STRING outfile = output;
  outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
  FILE* fout = fopen(outfile.string(), "wb");
  if (fout == NULL) {
    printf("Cannot create output file %s\n", outfile.string());
    exit(1);
  }
  fwrite(text_out.string(), 1, text_out.length(), fout);
  fclose(fout);

  return 0;                      // Normal exit
}
int main ( int  argc,
char *  argv[] 
)

---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------

Definition at line 89 of file cntraining.cpp.

{
  // Set the global Config parameters before parsing the command line.
  Config = CNConfig;

  const char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(&argc, &argv);
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "rb");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  iterate(pCharList) {
    //Cluster
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
  }
  FreeTrainingSamples(CharList);
  if (Clusterer == NULL) { // To avoid a SIGSEGV
    fprintf(stderr, "Error: NULL clusterer!\n");
    return 1;
  }
  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
  FreeNormProtoList(NormProtoList);
  FreeProtoList(&ProtoList);
  FreeClusterer(Clusterer);
  printf ("\n");
  return 0;
}  // main
void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
CLUSTERER Clusterer 
)

---------------------------------------------------------------------------- Private Function Prototypes ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------

Definition at line 215 of file cntraining.cpp.

{
  FILE    *File;
  STRING Filename;
  LABELEDLIST LabeledProto;
  int N;

  Filename = "";
  if (Directory != NULL && Directory[0] != '\0')
  {
    Filename += Directory;
    Filename += "/";
  }
  Filename += "normproto";
  printf ("\nWriting %s ...", Filename.string());
  File = Efopen (Filename.string(), "wb");
  fprintf(File,"%0d\n",Clusterer->SampleSize);
  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
  iterate(LabeledProtoList)
  {
    LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
    N = NumberOfProtos(LabeledProto->List, true, false);
    if (N < 1) {
      printf ("\nError! Not enough protos for %s: %d protos"
              " (%d significant protos"
              ", %d insignificant protos)\n",
              LabeledProto->Label, N,
              NumberOfProtos(LabeledProto->List, 1, 0),
              NumberOfProtos(LabeledProto->List, 0, 1));
      exit(1);
    }
    fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
    WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
  }
  fclose (File);

}  // WriteNormProtos
void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 270 of file cntraining.cpp.

{
  PROTOTYPE  *Proto;

  // write prototypes
  iterate(ProtoList)
  {
    Proto = (PROTOTYPE *) first_node ( ProtoList );
    if (( Proto->Significant && WriteSigProtos )  ||
      ( ! Proto->Significant && WriteInsigProtos ) )
      WritePrototype( File, N, Proto );
  }
}  // WriteProtos

Variable Documentation

Initial value:
{
  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

---------------------------------------------------------------------------- Global Data Definitions and Declarations ----------------------------------------------------------------------------

Definition at line 79 of file cntraining.cpp.