Tesseract  3.02
tesseract-ocr/training/mftraining.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002 **  Filename:  mftraining.cpp
00003 **  Purpose:  Separates training pages into files for each character.
00004 **        Strips from files only the features and their parameters of
00005         the feature type mf.
00006 **  Author:    Dan Johnson
00007 **  Revisment:  Christy Russon
00008 **  Environment: HPUX 6.5
00009 **  Library:     HPUX 6.5
00010 **  History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
00011 **         5/25/90, DSJ, Adapted to multiple feature types.
00012 **        Tuesday, May 17, 1998 Changes made to make feature specific and
00013 **        simplify structures. First step in simplifying training process.
00014 **
00015  **  (c) Copyright Hewlett-Packard Company, 1988.
00016  ** Licensed under the Apache License, Version 2.0 (the "License");
00017  ** you may not use this file except in compliance with the License.
00018  ** You may obtain a copy of the License at
00019  ** http://www.apache.org/licenses/LICENSE-2.0
00020  ** Unless required by applicable law or agreed to in writing, software
00021  ** distributed under the License is distributed on an "AS IS" BASIS,
00022  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00023  ** See the License for the specific language governing permissions and
00024  ** limitations under the License.
00025 ******************************************************************************/
00029 #include <string.h>
00030 #include <stdio.h>
00031 #define _USE_MATH_DEFINES
00032 #include <math.h>
00033 #ifdef _WIN32
00034 #ifndef M_PI
00035 #define M_PI 3.14159265358979323846
00036 #endif
00037 #endif
00038 
00039 #include "classify.h"
00040 #include "cluster.h"
00041 #include "clusttool.h"
00042 #include "commontraining.h"
00043 #include "danerror.h"
00044 #include "efio.h"
00045 #include "emalloc.h"
00046 #include "featdefs.h"
00047 #include "fontinfo.h"
00048 #include "genericvector.h"
00049 #include "indexmapbidi.h"
00050 #include "intproto.h"
00051 #include "mastertrainer.h"
00052 #include "mergenf.h"
00053 #include "mf.h"
00054 #include "ndminx.h"
00055 #include "ocrfeatures.h"
00056 #include "oldlist.h"
00057 #include "protos.h"
00058 #include "shapetable.h"
00059 #include "tessopt.h"
00060 #include "tprintf.h"
00061 #include "unicity_table.h"
00062 
00063 using tesseract::Classify;
00064 using tesseract::FontInfo;
00065 using tesseract::FontSpacingInfo;
00066 using tesseract::IndexMapBiDi;
00067 using tesseract::MasterTrainer;
00068 using tesseract::Shape;
00069 using tesseract::ShapeTable;
00070 
// Name of the feature type this program handles (the file header describes
// it as "the feature type mf").
#define PROGRAM_FEATURE_TYPE "mf"

// Max length of a fake shape label.
// main() sizes its label buffer as kMaxShapeLabelLength + 1 and passes
// kMaxShapeLabelLength as the snprintf limit when formatting "sh%04d" labels.
const int kMaxShapeLabelLength = 10;

// Makes FLAGS_test_ch usable in this file; presumably the flag itself is
// defined in another translation unit - TODO confirm.
// ClusterOneConfig displays the protos of the class whose label equals this
// flag, and main() waits for a newline before exiting when it is non-empty.
DECLARE_STRING_PARAM_FLAG(test_ch);

// Forward declaration of the program entry point defined at the end of the
// file.
int main (
     int  argc,
     char  **argv);
00084 
00085 
00086 /*----------------------------------------------------------------------------
00087             Public Code
00088 -----------------------------------------------------------------------------*/
00089 #ifndef GRAPHICS_DISABLED
00090 static void DisplayProtoList(const char* ch, LIST protolist) {
00091   void* window = c_create_window("Char samples", 50, 200,
00092                                  520, 520, -130.0, 130.0, -130.0, 130.0);
00093   LIST proto = protolist;
00094   iterate(proto) {
00095     PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
00096     if (prototype->Significant)
00097       c_line_color_index(window, Green);
00098     else if (prototype->NumSamples == 0)
00099       c_line_color_index(window, Blue);
00100     else if (prototype->Merged)
00101       c_line_color_index(window, Magenta);
00102     else
00103       c_line_color_index(window, Red);
00104     float x = CenterX(prototype->Mean);
00105     float y = CenterY(prototype->Mean);
00106     double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
00107     float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
00108     float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
00109     c_move(window, (x - dx) * 256, (y - dy) * 256);
00110     c_draw(window, (x + dx) * 256, (y + dy) * 256);
00111     if (prototype->Significant)
00112       tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
00113               x, y, dx, dy, prototype->NumSamples);
00114     else if (prototype->NumSamples > 0 && !prototype->Merged)
00115       tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
00116               x, y, dx, dy, prototype->NumSamples);
00117   }
00118   c_make_current(window);
00119 }
00120 #endif  // GRAPHICS_DISABLED
00121 
00122 // Helper to run clustering on a single config.
00123 // Mostly copied from the old mftraining, but with renamed variables.
00124 static LIST ClusterOneConfig(int shape_id, const char* class_label,
00125                              LIST mf_classes,
00126                              const ShapeTable& shape_table,
00127                              MasterTrainer* trainer) {
00128   int num_samples;
00129   CLUSTERER  *clusterer = trainer->SetupForClustering(shape_table,
00130                                                       feature_defs,
00131                                                       shape_id,
00132                                                       &num_samples);
00133   Config.MagicSamples = num_samples;
00134   LIST proto_list = ClusterSamples(clusterer, &Config);
00135   CleanUpUnusedData(proto_list);
00136 
00137   // Merge protos where reasonable to make more of them significant by
00138   // representing almost all samples of the class/font.
00139   MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
00140   #ifndef GRAPHICS_DISABLED
00141   if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0)
00142     DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
00143   #endif  // GRAPHICS_DISABLED
00144   // Delete the protos that will not be used in the inttemp output file.
00145   proto_list = RemoveInsignificantProtos(proto_list, true,
00146                                          false,
00147                                          clusterer->SampleSize);
00148   FreeClusterer(clusterer);
00149   MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
00150   if (merge_class == NULL) {
00151     merge_class = NewLabeledClass(class_label);
00152     mf_classes = push(mf_classes, merge_class);
00153   }
00154   int config_id = AddConfigToClass(merge_class->Class);
00155   merge_class->Class->font_set.push_back(shape_id);
00156   LIST proto_it = proto_list;
00157   iterate(proto_it) {
00158     PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE*>(first_node(proto_it));
00159     // See if proto can be approximated by existing proto.
00160     int p_id = FindClosestExistingProto(merge_class->Class,
00161                                         merge_class->NumMerged, prototype);
00162     if (p_id == NO_PROTO) {
00163       // Need to make a new proto, as it doesn't match anything.
00164       p_id = AddProtoToClass(merge_class->Class);
00165       MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
00166       merge_class->NumMerged[p_id] = 1;
00167     } else {
00168       PROTO_STRUCT dummy_proto;
00169       MakeNewFromOld(&dummy_proto, prototype);
00170       // Merge with the similar proto.
00171       ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
00172                          static_cast<FLOAT32>(merge_class->NumMerged[p_id]),
00173                          1.0,
00174                          ProtoIn(merge_class->Class, p_id));
00175       merge_class->NumMerged[p_id]++;
00176     }
00177     AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
00178   }
00179   FreeProtoList(&proto_list);
00180   return mf_classes;
00181 }
00182 
00183 // Helper to setup the config map.
00184 // Setup an index mapping from the shapes in the shape table to the classes
00185 // that will be trained. In keeping with the original design, each shape
00186 // with the same list of unichars becomes a different class and the configs
00187 // represent the different combinations of fonts.
00188 static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) {
00189   int num_configs = shape_table->NumShapes();
00190   config_map->Init(num_configs, true);
00191   config_map->Setup();
00192   for (int c1 = 0; c1 < num_configs; ++c1) {
00193     // Only process ids that are not already merged.
00194     if (config_map->SparseToCompact(c1) == c1) {
00195       Shape* shape1 = shape_table->MutableShape(c1);
00196       // Find all the subsequent shapes that are equal.
00197       for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
00198         if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
00199           config_map->Merge(c1, c2);
00200         }
00201       }
00202     }
00203   }
00204   config_map->CompleteMerges();
00205 }
00206 
00207 /*---------------------------------------------------------------------------*/
00208 int main (int argc, char **argv) {
00209 /*
00210 **  Parameters:
00211 **    argc  number of command line arguments
00212 **    argv  array of command line arguments
00213 **  Globals: none
00214 **  Operation:
00215 **    This program reads in a text file consisting of feature
00216 **    samples from a training page in the following format:
00217 **
00218 **      FontName UTF8-char-str xmin ymin xmax ymax page-number
00219 **       NumberOfFeatureTypes(N)
00220 **         FeatureTypeName1 NumberOfFeatures(M)
00221 **            Feature1
00222 **            ...
00223 **            FeatureM
00224 **         FeatureTypeName2 NumberOfFeatures(M)
00225 **            Feature1
00226 **            ...
00227 **            FeatureM
00228 **         ...
00229 **         FeatureTypeNameN NumberOfFeatures(M)
00230 **            Feature1
00231 **            ...
00232 **            FeatureM
00233 **      FontName CharName ...
00234 **
00235 **    The result of this program is a binary inttemp file used by
00236 **    the OCR engine.
00237 **  Return: none
00238 **  Exceptions: none
00239 **  History:  Fri Aug 18 08:56:17 1989, DSJ, Created.
00240 **        Mon May 18 1998, Christy Russson, Revistion started.
00241 */
00242   ParseArguments(&argc, &argv);
00243 
00244   ShapeTable* shape_table = NULL;
00245   STRING file_prefix;
00246   // Load the training data.
00247   MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
00248                                                        false,
00249                                                        &shape_table,
00250                                                        &file_prefix);
00251   if (trainer == NULL)
00252     return 1;  // Failed.
00253 
00254   // Setup an index mapping from the shapes in the shape table to the classes
00255   // that will be trained. In keeping with the original design, each shape
00256   // with the same list of unichars becomes a different class and the configs
00257   // represent the different combinations of fonts.
00258   IndexMapBiDi config_map;
00259   SetupConfigMap(shape_table, &config_map);
00260 
00261   WriteShapeTable(file_prefix, *shape_table);
00262   // If the shape_table is flat, then either we didn't run shape clustering, or
00263   // it did nothing, so we just output the trainer's unicharset.
00264   // Otherwise shape_set will hold a fake unicharset with an entry for each
00265   // shape in the shape table, and we will output that instead.
00266   UNICHARSET shape_set;
00267   const UNICHARSET* unicharset = &trainer->unicharset();
00268   // If we ran shapeclustering (and it worked) then at least one shape will
00269   // have multiple unichars, so we have to build a fake unicharset.
00270   if (shape_table->AnyMultipleUnichars()) {
00271     unicharset = &shape_set;
00272     // Now build a fake unicharset for the compact shape space to keep the
00273     // output modules happy that we are doing things correctly.
00274     int num_shapes = config_map.CompactSize();
00275     for (int s = 0; s < num_shapes; ++s) {
00276       char shape_label[kMaxShapeLabelLength + 1];
00277       snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
00278       shape_set.unichar_insert(shape_label);
00279     }
00280   }
00281 
00282   // Now train each config separately.
00283   int num_configs = shape_table->NumShapes();
00284   LIST mf_classes = NIL_LIST;
00285   for (int s = 0; s < num_configs; ++s) {
00286     int unichar_id, font_id;
00287     if (unicharset == &shape_set) {
00288       // Using fake unichar_ids from the config_map/shape_set.
00289       unichar_id = config_map.SparseToCompact(s);
00290     } else {
00291       // Get the real unichar_id from the shape table/unicharset.
00292       shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
00293     }
00294     const char* class_label = unicharset->id_to_unichar(unichar_id);
00295     mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
00296                                   trainer);
00297   }
00298   STRING inttemp_file = file_prefix;
00299   inttemp_file += "inttemp";
00300   STRING pffmtable_file = file_prefix;
00301   pffmtable_file += "pffmtable";
00302   CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
00303   // Now write the inttemp and pffmtable.
00304   trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
00305                                     *shape_table, float_classes,
00306                                     inttemp_file.string(),
00307                                     pffmtable_file.string());
00308   delete [] float_classes;
00309   FreeLabeledClassList(mf_classes);
00310   delete trainer;
00311   delete shape_table;
00312   printf("Done!\n");
00313   if (!FLAGS_test_ch.empty()) {
00314     // If we are displaying debug window(s), wait for the user to look at them.
00315     printf("Hit return to exit...\n");
00316     while (getchar() != '\n');
00317   }
00318   return 0;
00319 }  /* main */