Tesseract  3.02
tesseract-ocr/training/classifier_tester.cpp
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 //  Filename: classifier_tester.cpp
00015 //  Purpose:  Tests a character classifier on data as formatted for training,
00016 //            but doesn't have to be the same as the training data.
00017 //  Author:   Ray Smith
00018 
00019 #ifndef USE_STD_NAMESPACE
00020 #include "base/commandlineflags.h"
00021 #endif
00022 #include "baseapi.h"
00023 #include "commontraining.h"
00024 #include "cubeclassifier.h"
00025 #include "mastertrainer.h"
00026 #include "params.h"
00027 #include "strngs.h"
00028 #include "tessclassifier.h"
00029 
00030 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
00031 STRING_PARAM_FLAG(lang, "eng", "Language to test");
00032 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
00033 
// Selects the classifier under test. The numeric values are used by main()
// as indices into names[], so the order here must stay in step with that
// table. CN_COUNT is the number of real classifiers and is also used by
// main() as the "no classifier matched" value.
enum ClassifierName {
  CN_PRUNER = 0,    // "pruner"
  CN_FULL = 1,      // "full"
  CN_CUBE = 2,      // "cube"
  CN_CUBETESS = 3,  // "cubetess"
  CN_COUNT = 4      // Number of classifiers; not a classifier itself.
};
00041 
// Command-line spellings of the classifiers, indexed by ClassifierName.
// The trailing NULL occupies the CN_COUNT slot and terminates the table.
const char* names[] = {
  "pruner",    // CN_PRUNER
  "full",      // CN_FULL
  "cube",      // CN_CUBE
  "cubetess",  // CN_CUBETESS
  NULL         // CN_COUNT
};
00043 
00044 // This program has complex setup requirements, so here is some help:
00045 // Two different modes, tr files and serialized mastertrainer.
00046 // From tr files:
00047 //   classifier_tester -U unicharset -F font_properties -X xheights
00048 //     -classifier x -lang lang [-output_trainer trainer] *.tr
00049 // From a serialized trainer:
00050 //  classifier_tester -input_trainer trainer [-lang lang] -classifier x
00051 //
00052 // In the first case, the unicharset must be the unicharset from within
00053 // the classifier under test, and the font_properties and xheights files must
00054 // match the files used during training.
00055 // In the second case, the trainer file must have been prepared from
00056 // some previous run of shapeclustering, mftraining, or classifier_tester
00057 // using the same conditions as above, ie matching unicharset/font_properties.
00058 //
00059 // Available values of classifier (x above) are:
00060 // pruner   : Tesseract class pruner only.
00061 // full     : Tesseract full classifier.
00062 // cube     : Cube classifier. (Not possible with an input trainer.)
00063 // cubetess : Tesseract class pruner with rescoring by Cube.  (Not possible
00064 //            with an input trainer.)
00065 int main(int argc, char **argv) {
00066   ParseArguments(&argc, &argv);
00067   // Decode the classifier string.
00068   ClassifierName classifier = CN_COUNT;
00069   for (int c = 0; c < CN_COUNT; ++c) {
00070     if (strcmp(FLAGS_classifier.c_str(), names[c]) == 0) {
00071       classifier = static_cast<ClassifierName>(c);
00072       break;
00073     }
00074   }
00075   if (classifier == CN_COUNT) {
00076     fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
00077     return 1;
00078   }
00079 
00080   STRING file_prefix;
00081   tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData(
00082       argc, argv, true, NULL, &file_prefix);
00083   // We want to test junk as well if it is available.
00084   trainer->IncludeJunk();
00085   // We want to test with replicated samples too.
00086   trainer->ReplicateAndRandomizeSamplesIfRequired();
00087 
00088   // We need to initialize tesseract to test.
00089   tesseract::TessBaseAPI api;
00090   tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;
00091   if (classifier == CN_CUBE || classifier == CN_CUBETESS)
00092     engine_mode = tesseract::OEM_TESSERACT_CUBE_COMBINED;
00093   if (api.Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
00094                engine_mode) < 0) {
00095     fprintf(stderr, "Tesseract initialization failed!\n");
00096     return 1;
00097   }
00098   tesseract::ShapeClassifier* shape_classifier = NULL;
00099   tesseract::Tesseract* tesseract =
00100       const_cast<tesseract::Tesseract*>(api.tesseract());
00101   tesseract::Classify* classify =
00102       reinterpret_cast<tesseract::Classify*>(tesseract);
00103   // Copy the shape_table from the classifier and add the space character if
00104   // not already present to count junk.
00105   tesseract::ShapeTable shape_table;
00106   shape_table.set_unicharset(classify->shape_table()->unicharset());
00107   shape_table.AppendMasterShapes(*classify->shape_table());
00108   if (shape_table.FindShape(0, -1) < 0)
00109     shape_table.AddShape(0, 0);
00110   if (classifier == CN_PRUNER) {
00111     shape_classifier = new tesseract::TessClassifier(true, classify);
00112   } else if (classifier == CN_FULL) {
00113     shape_classifier = new tesseract::TessClassifier(false, classify);
00114   } else if (classifier == CN_CUBE) {
00115     shape_classifier = new tesseract::CubeClassifier(tesseract);
00116   } else if (classifier == CN_CUBETESS) {
00117     shape_classifier = new tesseract::CubeTessClassifier(tesseract);
00118   } else {
00119     fprintf(stderr, "%s tester not yet implemented\n",
00120             FLAGS_classifier.c_str());
00121     return 1;
00122   }
00123   tprintf("Testing classifier %s:\n", FLAGS_classifier.c_str());
00124   trainer->TestClassifierOnSamples(3, false, shape_classifier, NULL);
00125   if (classifier != CN_CUBE && classifier != CN_CUBETESS) {
00126     // Test with replicated samples as well.
00127     trainer->TestClassifierOnSamples(3, true, shape_classifier, NULL);
00128   }
00129   delete shape_classifier;
00130   delete trainer;
00131 
00132   return 0;
00133 } /* main */
00134 
00135 
00136 
00137 
00138 
00139