Tesseract  3.02
tesseract-ocr/training/commontraining.cpp
Go to the documentation of this file.
00001 // Copyright 2008 Google Inc. All Rights Reserved.
00002 // Author: scharron@google.com (Samuel Charron)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 #include "commontraining.h"
00015 
00016 #ifndef USE_STD_NAMESPACE
00017 #include "base/init_google.h"
00018 #include "base/commandlineflags.h"
00019 #endif
00020 #include "allheaders.h"
00021 #include "ccutil.h"
00022 #include "classify.h"
00023 #include "oldlist.h"
00024 #include "globals.h"
00025 #include "mf.h"
00026 #include "clusttool.h"
00027 #include "cluster.h"
00028 #include "tessopt.h"
00029 #include "efio.h"
00030 #include "emalloc.h"
00031 #include "featdefs.h"
00032 #include "fontinfo.h"
00033 #include "intfeaturespace.h"
00034 #include "mastertrainer.h"
00035 #include "tessdatamanager.h"
00036 #include "tprintf.h"
00037 #include "freelist.h"
00038 #include "params.h"
00039 #include "shapetable.h"
00040 #include "unicity_table.h"
00041 
00042 #include <math.h>
00043 
00044 using tesseract::CCUtil;
00045 using tesseract::FontInfo;
00046 using tesseract::IntFeatureSpace;
00047 using tesseract::ParamUtils;
00048 using tesseract::ShapeTable;
00049 
00050 // Global Variables.
00051 // global variable to hold configuration parameters to control clustering
00052 // -M 0.625   -B 0.05   -I 1.0   -C 1e-6.
00053 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
00054 
00055 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
00056 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
00057 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
00058 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
00059 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
00060 STRING_PARAM_FLAG(X, "", "File listing font xheights");
00061 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
00062 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
00063 STRING_PARAM_FLAG(input_trainer, "", "File to load trainer from");
00064 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
00065 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
00066 
00067 // The usage strings are different as the DEFINE_* flags are available on
00068 // the command line, but the *_VAR flags are set through a config file with
00069 // some of them available through special command-line args.
00070 #ifndef USE_STD_NAMESPACE
00071 const char* kUsage = "[flags] [ .tr files ... ]\n";
00072 #else
00073 const char* kUsage = "[-c configfile]\n"
00074     "\t[-D Directory]\n"
00075     "\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n"
00076     "\t[-U InputUnicharset]\n"
00077     "\t[-O OutputUnicharset]\n"
00078     "\t[-F FontInfoFile]\n"
00079     "\t[-X InputXHeightsFile]\n"
00080     "\t[-S InputShapeTable]\n"
00081     "\t[ .tr files ... ]\n";
00082 #endif
00083 
00084 FEATURE_DEFS_STRUCT feature_defs;
00085 CCUtil ccutil;
00086 
00087 /*---------------------------------------------------------------------------*/
00088 void ParseArguments(int* argc, char ***argv) {
00089 /*
00090  **     Parameters:
00091  **             argc    number of command line arguments to parse
00092  **             argv    command line arguments
00093  **     Globals:
00094  **             ShowSignificantProtos   flag controlling proto display
00095  **             ShowInsignificantProtos flag controlling proto display
00096  **             Config                  current clustering parameters
00097  **             tessoptarg, tessoptind          defined by tessopt sys call
00098  **             Argc, Argv              global copies of argc and argv
00099  **     Operation:
00100  **             This routine parses the command line arguments that were
00101  **             passed to the program.  The legal arguments are shown in the usage
00102  **             message below:
00103 
00104  **     Return: none
00105  **     Exceptions: Illegal options terminate the program.
00106  **     History: 7/24/89, DSJ, Created.
00107  */
00108 #ifndef USE_STD_NAMESPACE
00109   InitGoogle(kUsage, argc, argv, true);
00110   tessoptind = 1;
00111 #else
00112   int    Option;
00113   int    ParametersRead;
00114   BOOL8  Error;
00115 
00116   Error = FALSE;
00117   while ((Option = tessopt(*argc, *argv, "F:O:U:D:C:I:M:B:S:X:c:")) != EOF) {
00118     switch (Option) {
00119       case 'C':
00120         ParametersRead = sscanf(tessoptarg, "%lf", &(Config.Confidence) );
00121         if ( ParametersRead != 1 ) Error = TRUE;
00122         else if ( Config.Confidence > 1 ) Config.Confidence = 1;
00123         else if ( Config.Confidence < 0 ) Config.Confidence = 0;
00124         break;
00125       case 'I':
00126         ParametersRead = sscanf(tessoptarg, "%f", &(Config.Independence) );
00127         if ( ParametersRead != 1 ) Error = TRUE;
00128         else if ( Config.Independence > 1 ) Config.Independence = 1;
00129         else if ( Config.Independence < 0 ) Config.Independence = 0;
00130         break;
00131       case 'M':
00132         ParametersRead = sscanf(tessoptarg, "%f", &(Config.MinSamples) );
00133         if ( ParametersRead != 1 ) Error = TRUE;
00134         else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
00135         else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
00136         break;
00137       case 'B':
00138         ParametersRead = sscanf(tessoptarg, "%f", &(Config.MaxIllegal) );
00139         if ( ParametersRead != 1 ) Error = TRUE;
00140         else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
00141         else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
00142         break;
00143       case 'c':
00144         FLAGS_configfile.set_value(tessoptarg);
00145         break;
00146       case 'D':
00147         FLAGS_D.set_value(tessoptarg);
00148         break;
00149       case 'U':
00150         FLAGS_U.set_value(tessoptarg);
00151         break;
00152       case 'O':
00153         FLAGS_O.set_value(tessoptarg);
00154         break;
00155       case 'F':
00156         FLAGS_F.set_value(tessoptarg);
00157         break;
00158       case 'X':
00159         FLAGS_X.set_value(tessoptarg);
00160         break;
00161       case '?':
00162         Error = TRUE;
00163         break;
00164     }
00165     if (Error) {
00166       fprintf(stderr, "Usage: %s %s\n", (*argv)[0], kUsage);
00167       exit(2);
00168     }
00169   }
00170 #endif
00171   // Set additional parameters from config file if specified.
00172   if (!FLAGS_configfile.empty()) {
00173     tesseract::ParamUtils::ReadParamsFile(
00174         FLAGS_configfile.c_str(),
00175         tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
00176         ccutil.params());
00177   }
00178 }  // ParseArguments
00179 
00180 namespace tesseract {
00181 
00182 // Helper loads shape table from the given file.
00183 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
00184   ShapeTable* shape_table = NULL;
00185   STRING shape_table_file = file_prefix;
00186   shape_table_file += kShapeTableFileSuffix;
00187   FILE* shape_fp = fopen(shape_table_file.string(), "rb");
00188   if (shape_fp != NULL) {
00189     shape_table = new ShapeTable;
00190     if (!shape_table->DeSerialize(false, shape_fp)) {
00191       delete shape_table;
00192       shape_table = NULL;
00193       tprintf("Error: Failed to read shape table %s\n",
00194               shape_table_file.string());
00195     } else {
00196       int num_shapes = shape_table->NumShapes();
00197       tprintf("Read shape table %s of %d shapes\n",
00198               shape_table_file.string(), num_shapes);
00199     }
00200     fclose(shape_fp);
00201   } else {
00202     tprintf("Warning: No shape table file present: %s\n",
00203             shape_table_file.string());
00204   }
00205   return shape_table;
00206 }
00207 
00208 // Helper to write the shape_table.
00209 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
00210   STRING shape_table_file = file_prefix;
00211   shape_table_file += kShapeTableFileSuffix;
00212   FILE* fp = fopen(shape_table_file.string(), "wb");
00213   if (fp != NULL) {
00214     if (!shape_table.Serialize(fp)) {
00215       fprintf(stderr, "Error writing shape table: %s\n",
00216               shape_table_file.string());
00217     }
00218     fclose(fp);
00219   } else {
00220     fprintf(stderr, "Error creating shape table: %s\n",
00221             shape_table_file.string());
00222   }
00223 }
00224 
00225 // Creates a MasterTraininer and loads the training data into it:
00226 // Initializes feature_defs and IntegerFX.
00227 // Loads the shape_table if shape_table != NULL.
00228 // Loads initial unicharset from -U command-line option.
00229 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
00230 //   Loads font info from -F option.
00231 //   Loads xheights from -X option.
00232 //   Loads samples from .tr files in remaining command-line args.
00233 //   Deletes outliers and computes canonical samples.
00234 //   If FLAGS_output_trainer is set, saves the trainer for future use.
00235 // Computes canonical and cloud features.
00236 // If shape_table is not NULL, but failed to load, make a fake flat one,
00237 // as shape clustering was not run.
00238 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
00239                                 bool replication,
00240                                 ShapeTable** shape_table,
00241                                 STRING* file_prefix) {
00242   InitFeatureDefs(&feature_defs);
00243   InitIntegerFX();
00244   *file_prefix = "";
00245   if (!FLAGS_D.empty()) {
00246     *file_prefix += FLAGS_D.c_str();
00247     *file_prefix += "/";
00248   }
00249   // If we are shape clustering (NULL shape_table) or we successfully load
00250   // a shape_table written by a previous shape clustering, then
00251   // shape_analysis will be true, meaning that the MasterTrainer will replace
00252   // some members of the unicharset with their fragments.
00253   bool shape_analysis = false;
00254   if (shape_table != NULL) {
00255     *shape_table = LoadShapeTable(*file_prefix);
00256     if (*shape_table != NULL)
00257       shape_analysis = true;
00258   } else {
00259     shape_analysis = true;
00260   }
00261   MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
00262                                              shape_analysis,
00263                                              replication,
00264                                              FLAGS_debug_level);
00265   if (FLAGS_input_trainer.empty()) {
00266     trainer->LoadUnicharset(FLAGS_U.c_str());
00267     // Get basic font information from font_properties.
00268     if (!FLAGS_F.empty()) {
00269       if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
00270         delete trainer;
00271         return NULL;
00272       }
00273     }
00274     if (!FLAGS_X.empty()) {
00275       if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
00276         delete trainer;
00277         return NULL;
00278       }
00279     }
00280     IntFeatureSpace fs;
00281     fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
00282     trainer->SetFeatureSpace(fs);
00283     const char* page_name;
00284     // Load training data from .tr files on the command line.
00285     while ((page_name = GetNextFilename(argc, argv)) != NULL) {
00286       tprintf("Reading %s ...\n", page_name);
00287       FILE* fp = Efopen(page_name, "rb");
00288       trainer->ReadTrainingSamples(fp, feature_defs, false);
00289       fclose(fp);
00290 
00291       // If there is a file with [lang].[fontname].exp[num].fontinfo present,
00292       // read font spacing information in to fontinfo_table.
00293       int pagename_len = strlen(page_name);
00294       char *fontinfo_file_name = new char[pagename_len + 7];
00295       strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
00296       strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
00297       trainer->AddSpacingInfo(fontinfo_file_name);
00298       delete[] fontinfo_file_name;
00299 
00300       // Load the images into memory if required by the classifier.
00301       if (FLAGS_load_images) {
00302         STRING image_name = page_name;
00303         // Chop off the tr and replace with tif. Extension must be tif!
00304         image_name.truncate_at(image_name.length() - 2);
00305         image_name += "tif";
00306         trainer->LoadPageImages(image_name.string());
00307       }
00308     }
00309     trainer->PostLoadCleanup();
00310     // Write the master trainer if required.
00311     if (!FLAGS_output_trainer.empty()) {
00312       FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
00313       if (fp == NULL) {
00314         tprintf("Can't create saved trainer data!\n");
00315       } else {
00316         trainer->Serialize(fp);
00317         fclose(fp);
00318       }
00319     }
00320   } else {
00321     bool success = false;
00322     tprintf("Loading master trainer from file:%s\n",
00323             FLAGS_input_trainer.c_str());
00324     FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb");
00325     if (fp == NULL) {
00326       tprintf("Can't read file %s to initialize master trainer\n",
00327               FLAGS_input_trainer.c_str());
00328     } else {
00329       success = trainer->DeSerialize(false, fp);
00330       fclose(fp);
00331     }
00332     if (!success) {
00333       tprintf("Deserialize of master trainer failed!\n");
00334       delete trainer;
00335       return NULL;
00336     }
00337   }
00338   trainer->PreTrainingSetup();
00339   if (!FLAGS_O.empty() &&
00340       !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
00341     fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
00342     delete trainer;
00343     return NULL;
00344   }
00345   if (shape_table != NULL) {
00346     // If we previously failed to load a shapetable, then shape clustering
00347     // wasn't run so make a flat one now.
00348     if (*shape_table == NULL) {
00349       *shape_table = new ShapeTable;
00350       trainer->SetupFlatShapeTable(*shape_table);
00351       tprintf("Flat shape table summary: %s\n",
00352               (*shape_table)->SummaryStr().string());
00353     }
00354     (*shape_table)->set_unicharset(trainer->unicharset());
00355   }
00356   return trainer;
00357 }
00358 
00359 }  // namespace tesseract.
00360 
00361 /*---------------------------------------------------------------------------*/
00362 const char *GetNextFilename(int argc, const char* const * argv) {
00363   /*
00364    **   Parameters: none
00365    **   Globals:
00366    **           tessoptind                      defined by tessopt sys call
00367    **   Operation:
00368    **           This routine returns the next command line argument.  If
00369    **           there are no remaining command line arguments, it returns
00370    **           NULL.  This routine should only be called after all option
00371    **           arguments have been parsed and removed with ParseArguments.
00372    **   Return: Next command line argument or NULL.
00373    **   Exceptions: none
00374    **   History: Fri Aug 18 09:34:12 1989, DSJ, Created.
00375    */
00376   if (tessoptind < argc)
00377     return argv[tessoptind++];
00378   else
00379     return NULL;
00380 }       /* GetNextFilename */
00381 
00382 
00383 
00384 /*---------------------------------------------------------------------------*/
00385 LABELEDLIST FindList (
00386     LIST        List,
00387     char        *Label)
00388 
00389 /*
00390  **     Parameters:
00391  **             List            list to search
00392  **             Label           label to search for
00393  **     Globals: none
00394  **     Operation:
00395  **             This routine searches thru a list of labeled lists to find
00396  **             a list with the specified label.  If a matching labeled list
00397  **             cannot be found, NULL is returned.
00398  **     Return: Labeled list with the specified Label or NULL.
00399  **     Exceptions: none
00400  **     History: Fri Aug 18 15:57:41 1989, DSJ, Created.
00401  */
00402 
00403 {
00404   LABELEDLIST   LabeledList;
00405 
00406   iterate (List)
00407   {
00408     LabeledList = (LABELEDLIST) first_node (List);
00409     if (strcmp (LabeledList->Label, Label) == 0)
00410       return (LabeledList);
00411   }
00412   return (NULL);
00413 
00414 }       /* FindList */
00415 
00416 /*---------------------------------------------------------------------------*/
00417 LABELEDLIST NewLabeledList (
00418     const char  *Label)
00419 
00420 /*
00421  **     Parameters:
00422  **             Label   label for new list
00423  **     Globals: none
00424  **     Operation:
00425  **             This routine allocates a new, empty labeled list and gives
00426  **             it the specified label.
00427  **     Return: New, empty labeled list.
00428  **     Exceptions: none
00429  **     History: Fri Aug 18 16:08:46 1989, DSJ, Created.
00430  */
00431 
00432 {
00433   LABELEDLIST   LabeledList;
00434 
00435   LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
00436   LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
00437   strcpy (LabeledList->Label, Label);
00438   LabeledList->List = NIL_LIST;
00439   LabeledList->SampleCount = 0;
00440   LabeledList->font_sample_count = 0;
00441   return (LabeledList);
00442 
00443 }       /* NewLabeledList */
00444 
00445 /*---------------------------------------------------------------------------*/
00446 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
00447 // the new method or get rid of it entirely.
00448 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
00449                          const char *feature_name, int max_samples,
00450                          UNICHARSET* unicharset,
00451                          FILE* file, LIST* training_samples) {
00452 /*
00453 **  Parameters:
00454 **    file    open text file to read samples from
00455 **  Globals: none
00456 **  Operation:
00457 **    This routine reads training samples from a file and
00458 **    places them into a data structure which organizes the
00459 **    samples by FontName and CharName.  It then returns this
00460 **    data structure.
00461 **  Return: none
00462 **  Exceptions: none
00463 **  History: Fri Aug 18 13:11:39 1989, DSJ, Created.
00464 **       Tue May 17 1998 simplifications to structure, illiminated
00465 **        font, and feature specification levels of structure.
00466 */
00467   char    buffer[2048];
00468   char    unichar[UNICHAR_LEN + 1];
00469   LABELEDLIST char_sample;
00470   FEATURE_SET feature_samples;
00471   CHAR_DESC char_desc;
00472   int   i;
00473   int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
00474   // Zero out the font_sample_count for all the classes.
00475   LIST it = *training_samples;
00476   iterate(it) {
00477     char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
00478     char_sample->font_sample_count = 0;
00479   }
00480 
00481   while (fgets(buffer, 2048, file) != NULL) {
00482     if (buffer[0] == '\n')
00483       continue;
00484 
00485     sscanf(buffer, "%*s %s", unichar);
00486     if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
00487       unicharset->unichar_insert(unichar);
00488       if (unicharset->size() > MAX_NUM_CLASSES) {
00489         tprintf("Error: Size of unicharset in training is "
00490                 "greater than MAX_NUM_CLASSES\n");
00491         exit(1);
00492       }
00493     }
00494     char_sample = FindList(*training_samples, unichar);
00495     if (char_sample == NULL) {
00496       char_sample = NewLabeledList(unichar);
00497       *training_samples = push(*training_samples, char_sample);
00498     }
00499     char_desc = ReadCharDescription(feature_defs, file);
00500     feature_samples = char_desc->FeatureSets[feature_type];
00501     if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
00502       char_sample->List = push(char_sample->List, feature_samples);
00503       char_sample->SampleCount++;
00504       char_sample->font_sample_count++;
00505     } else {
00506       FreeFeatureSet(feature_samples);
00507     }
00508     for (i = 0; i < char_desc->NumFeatureSets; i++) {
00509       if (feature_type != i)
00510         FreeFeatureSet(char_desc->FeatureSets[i]);
00511     }
00512     free(char_desc);
00513   }
00514 }  // ReadTrainingSamples
00515 
00516 
00517 /*---------------------------------------------------------------------------*/
00518 void FreeTrainingSamples(LIST CharList) {
00519 /*
00520  **     Parameters:
00521  **             FontList        list of all fonts in document
00522  **     Globals: none
00523  **     Operation:
00524  **             This routine deallocates all of the space allocated to
00525  **             the specified list of training samples.
00526  **     Return: none
00527  **     Exceptions: none
00528  **     History: Fri Aug 18 17:44:27 1989, DSJ, Created.
00529  */
00530   LABELEDLIST char_sample;
00531   FEATURE_SET FeatureSet;
00532   LIST FeatureList;
00533 
00534 
00535   iterate(CharList) {  /* iterate thru all of the fonts */
00536     char_sample = (LABELEDLIST) first_node(CharList);
00537     FeatureList = char_sample->List;
00538     iterate(FeatureList) {  /* iterate thru all of the classes */
00539       FeatureSet = (FEATURE_SET) first_node(FeatureList);
00540       FreeFeatureSet(FeatureSet);
00541     }
00542     FreeLabeledList(char_sample);
00543   }
00544   destroy(CharList);
00545 }  /* FreeTrainingSamples */
00546 
00547 /*---------------------------------------------------------------------------*/
00548 void FreeLabeledList(LABELEDLIST LabeledList) {
00549 /*
00550  **     Parameters:
00551  **             LabeledList     labeled list to be freed
00552  **     Globals: none
00553  **     Operation:
00554  **             This routine deallocates all of the memory consumed by
00555  **             a labeled list.  It does not free any memory which may be
00556  **             consumed by the items in the list.
00557  **     Return: none
00558  **     Exceptions: none
00559  **     History: Fri Aug 18 17:52:45 1989, DSJ, Created.
00560  */
00561   destroy(LabeledList->List);
00562   free(LabeledList->Label);
00563   free(LabeledList);
00564 }  /* FreeLabeledList */
00565 
00566 /*---------------------------------------------------------------------------*/
00567 CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs,
00568                               LABELEDLIST char_sample,
00569                               const char* program_feature_type) {
00570 /*
00571  **     Parameters:
00572  **             char_sample: LABELEDLIST that holds all the feature information for a
00573  **             given character.
00574  **     Globals:
00575  **             None
00576  **     Operation:
00577  **             This routine reads samples from a LABELEDLIST and enters
00578  **             those samples into a clusterer data structure.  This
00579  **             data structure is then returned to the caller.
00580  **     Return:
00581  **             Pointer to new clusterer data structure.
00582  **     Exceptions:
00583  **             None
00584  **     History:
00585  **             8/16/89, DSJ, Created.
00586  */
00587   uinT16 N;
00588   int i, j;
00589   FLOAT32 *Sample = NULL;
00590   CLUSTERER *Clusterer;
00591   inT32 CharID;
00592   LIST FeatureList = NULL;
00593   FEATURE_SET FeatureSet = NULL;
00594 
00595   int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
00596   N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
00597   Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
00598 
00599   FeatureList = char_sample->List;
00600   CharID = 0;
00601   iterate(FeatureList) {
00602     FeatureSet = (FEATURE_SET) first_node(FeatureList);
00603     for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
00604       if (Sample == NULL)
00605         Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00606       for (j = 0; j < N; j++)
00607         Sample[j] = FeatureSet->Features[i]->Params[j];
00608       MakeSample (Clusterer, Sample, CharID);
00609     }
00610     CharID++;
00611   }
00612   if ( Sample != NULL ) free( Sample );
00613   return( Clusterer );
00614 
00615 }       /* SetUpForClustering */
00616 
00617 /*------------------------------------------------------------------------*/
00618 void MergeInsignificantProtos(LIST ProtoList, const char* label,
00619                               CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00620   PROTOTYPE     *Prototype;
00621   bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
00622 
00623   LIST pProtoList = ProtoList;
00624   iterate(pProtoList) {
00625     Prototype = (PROTOTYPE *) first_node (pProtoList);
00626     if (Prototype->Significant || Prototype->Merged)
00627       continue;
00628     FLOAT32 best_dist = 0.125;
00629     PROTOTYPE* best_match = NULL;
00630     // Find the nearest alive prototype.
00631     LIST list_it = ProtoList;
00632     iterate(list_it) {
00633       PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
00634       if (test_p != Prototype && !test_p->Merged) {
00635         FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
00636                                        Clusterer->ParamDesc,
00637                                        Prototype->Mean, test_p->Mean);
00638         if (dist < best_dist) {
00639           best_match = test_p;
00640           best_dist = dist;
00641         }
00642       }
00643     }
00644     if (best_match != NULL && !best_match->Significant) {
00645       if (debug)
00646         tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
00647                 best_match->NumSamples, Prototype->NumSamples,
00648                 best_match->Mean[0], best_match->Mean[1],
00649                 Prototype->Mean[0], Prototype->Mean[1]);
00650       best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
00651                                              Clusterer->ParamDesc,
00652                                              best_match->NumSamples,
00653                                              Prototype->NumSamples,
00654                                              best_match->Mean,
00655                                              best_match->Mean, Prototype->Mean);
00656       Prototype->NumSamples = 0;
00657       Prototype->Merged = 1;
00658     } else if (best_match != NULL) {
00659       if (debug)
00660         tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
00661                 Prototype->Mean[0], Prototype->Mean[1],
00662                 best_match->Mean[0], best_match->Mean[1]);
00663       Prototype->Merged = 1;
00664     }
00665   }
00666   // Mark significant those that now have enough samples.
00667   int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
00668   pProtoList = ProtoList;
00669   iterate(pProtoList) {
00670     Prototype = (PROTOTYPE *) first_node (pProtoList);
00671     // Process insignificant protos that do not match a green one
00672     if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
00673         !Prototype->Merged) {
00674       if (debug)
00675         tprintf("Red proto at %g,%g becoming green\n",
00676                 Prototype->Mean[0], Prototype->Mean[1]);
00677       Prototype->Significant = true;
00678     }
00679   }
00680 }       /* MergeInsignificantProtos */
00681 
00682 /*-----------------------------------------------------------------------------*/
00683 void CleanUpUnusedData(
00684     LIST ProtoList)
00685 {
00686   PROTOTYPE* Prototype;
00687 
00688   iterate(ProtoList)
00689   {
00690     Prototype = (PROTOTYPE *) first_node (ProtoList);
00691     if(Prototype->Variance.Elliptical != NULL)
00692     {
00693       memfree(Prototype->Variance.Elliptical);
00694       Prototype->Variance.Elliptical = NULL;
00695     }
00696     if(Prototype->Magnitude.Elliptical != NULL)
00697     {
00698       memfree(Prototype->Magnitude.Elliptical);
00699       Prototype->Magnitude.Elliptical = NULL;
00700     }
00701     if(Prototype->Weight.Elliptical != NULL)
00702     {
00703       memfree(Prototype->Weight.Elliptical);
00704       Prototype->Weight.Elliptical = NULL;
00705     }
00706   }
00707 }
00708 
00709 /*------------------------------------------------------------------------*/
00710 LIST RemoveInsignificantProtos(
00711     LIST ProtoList,
00712     BOOL8 KeepSigProtos,
00713     BOOL8 KeepInsigProtos,
00714     int N)
00715 
00716 {
00717   LIST NewProtoList = NIL_LIST;
00718   LIST pProtoList;
00719   PROTOTYPE* Proto;
00720   PROTOTYPE* NewProto;
00721   int i;
00722 
00723   pProtoList = ProtoList;
00724   iterate(pProtoList)
00725   {
00726     Proto = (PROTOTYPE *) first_node (pProtoList);
00727     if ((Proto->Significant && KeepSigProtos) ||
00728         (!Proto->Significant && KeepInsigProtos))
00729     {
00730       NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
00731 
00732       NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00733       NewProto->Significant = Proto->Significant;
00734       NewProto->Style = Proto->Style;
00735       NewProto->NumSamples = Proto->NumSamples;
00736       NewProto->Cluster = NULL;
00737       NewProto->Distrib = NULL;
00738 
00739       for (i=0; i < N; i++)
00740         NewProto->Mean[i] = Proto->Mean[i];
00741       if (Proto->Variance.Elliptical != NULL)
00742       {
00743         NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00744         for (i=0; i < N; i++)
00745           NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
00746       }
00747       else
00748         NewProto->Variance.Elliptical = NULL;
00749       //---------------------------------------------
00750       if (Proto->Magnitude.Elliptical != NULL)
00751       {
00752         NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00753         for (i=0; i < N; i++)
00754           NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
00755       }
00756       else
00757         NewProto->Magnitude.Elliptical = NULL;
00758       //------------------------------------------------
00759       if (Proto->Weight.Elliptical != NULL)
00760       {
00761         NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00762         for (i=0; i < N; i++)
00763           NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
00764       }
00765       else
00766         NewProto->Weight.Elliptical = NULL;
00767 
00768       NewProto->TotalMagnitude = Proto->TotalMagnitude;
00769       NewProto->LogMagnitude = Proto->LogMagnitude;
00770       NewProtoList = push_last(NewProtoList, NewProto);
00771     }
00772   }
00773   FreeProtoList(&ProtoList);
00774   return (NewProtoList);
00775 }       /* RemoveInsignificantProtos */
00776 
00777 /*----------------------------------------------------------------------------*/
00778 MERGE_CLASS FindClass (
00779     LIST        List,
00780     const char  *Label)
00781 {
00782   MERGE_CLASS   MergeClass;
00783 
00784   iterate (List)
00785   {
00786     MergeClass = (MERGE_CLASS) first_node (List);
00787     if (strcmp (MergeClass->Label, Label) == 0)
00788       return (MergeClass);
00789   }
00790   return (NULL);
00791 
00792 }       /* FindClass */
00793 
00794 /*---------------------------------------------------------------------------*/
00795 MERGE_CLASS NewLabeledClass (
00796     const char  *Label)
00797 {
00798   MERGE_CLASS   MergeClass;
00799 
00800   MergeClass = new MERGE_CLASS_NODE;
00801   MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
00802   strcpy (MergeClass->Label, Label);
00803   MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
00804   return (MergeClass);
00805 
00806 }       /* NewLabeledClass */
00807 
00808 /*-----------------------------------------------------------------------------*/
00809 void FreeLabeledClassList (
00810     LIST        ClassList)
00811 
00812 /*
00813  **     Parameters:
00814  **             FontList        list of all fonts in document
00815  **     Globals: none
00816  **     Operation:
00817  **             This routine deallocates all of the space allocated to
00818  **             the specified list of training samples.
00819  **     Return: none
00820  **     Exceptions: none
00821  **     History: Fri Aug 18 17:44:27 1989, DSJ, Created.
00822  */
00823 
00824 {
00825   MERGE_CLASS   MergeClass;
00826 
00827   iterate (ClassList)           /* iterate thru all of the fonts */
00828   {
00829     MergeClass = (MERGE_CLASS) first_node (ClassList);
00830     free (MergeClass->Label);
00831     FreeClass(MergeClass->Class);
00832     delete MergeClass;
00833   }
00834   destroy (ClassList);
00835 
00836 }       /* FreeLabeledClassList */
00837 
00839 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
00840                                 LIST LabeledClassList) {
00841   MERGE_CLASS   MergeClass;
00842   CLASS_TYPE            Class;
00843   int                           NumProtos;
00844   int                           NumConfigs;
00845   int                           NumWords;
00846   int                           i, j;
00847   float                 Values[3];
00848   PROTO                 NewProto;
00849   PROTO                 OldProto;
00850   BIT_VECTOR            NewConfig;
00851   BIT_VECTOR            OldConfig;
00852 
00853   //    printf("Float2Int ...\n");
00854 
00855   CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
00856   iterate(LabeledClassList)
00857   {
00858     UnicityTableEqEq<int>   font_set;
00859     MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
00860     Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
00861     NumProtos = MergeClass->Class->NumProtos;
00862     NumConfigs = MergeClass->Class->NumConfigs;
00863     font_set.move(&MergeClass->Class->font_set);
00864     Class->NumProtos = NumProtos;
00865     Class->MaxNumProtos = NumProtos;
00866     Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
00867     for(i=0; i < NumProtos; i++)
00868     {
00869       NewProto = ProtoIn(Class, i);
00870       OldProto = ProtoIn(MergeClass->Class, i);
00871       Values[0] = OldProto->X;
00872       Values[1] = OldProto->Y;
00873       Values[2] = OldProto->Angle;
00874       Normalize(Values);
00875       NewProto->X = OldProto->X;
00876       NewProto->Y = OldProto->Y;
00877       NewProto->Length = OldProto->Length;
00878       NewProto->Angle = OldProto->Angle;
00879       NewProto->A = Values[0];
00880       NewProto->B = Values[1];
00881       NewProto->C = Values[2];
00882     }
00883 
00884     Class->NumConfigs = NumConfigs;
00885     Class->MaxNumConfigs = NumConfigs;
00886     Class->font_set.move(&font_set);
00887     Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
00888     NumWords = WordsInVectorOfSize(NumProtos);
00889     for(i=0; i < NumConfigs; i++)
00890     {
00891       NewConfig = NewBitVector(NumProtos);
00892       OldConfig = MergeClass->Class->Configurations[i];
00893       for(j=0; j < NumWords; j++)
00894         NewConfig[j] = OldConfig[j];
00895       Class->Configurations[i] = NewConfig;
00896     }
00897   }
00898   return float_classes;
00899 } // SetUpForFloat2Int
00900 
00901 /*--------------------------------------------------------------------------*/
00902 void Normalize (
00903     float  *Values)
00904 {
00905   register float Slope;
00906   register float Intercept;
00907   register float Normalizer;
00908 
00909   Slope      = tan (Values [2] * 2 * PI);
00910   Intercept  = Values [1] - Slope * Values [0];
00911   Normalizer = 1 / sqrt (Slope * Slope + 1.0);
00912 
00913   Values [0] = Slope * Normalizer;
00914   Values [1] = - Normalizer;
00915   Values [2] = Intercept * Normalizer;
00916 } // Normalize
00917 
00918 /*-------------------------------------------------------------------------*/
00919 void FreeNormProtoList (
00920     LIST        CharList)
00921 
00922 {
00923   LABELEDLIST   char_sample;
00924 
00925   iterate (CharList)            /* iterate thru all of the fonts */
00926   {
00927     char_sample = (LABELEDLIST) first_node (CharList);
00928     FreeLabeledList (char_sample);
00929   }
00930   destroy (CharList);
00931 
00932 }       // FreeNormProtoList
00933 
00934 /*---------------------------------------------------------------------------*/
00935 void AddToNormProtosList(
00936     LIST* NormProtoList,
00937     LIST ProtoList,
00938     char* CharName)
00939 {
00940   PROTOTYPE* Proto;
00941   LABELEDLIST LabeledProtoList;
00942 
00943   LabeledProtoList = NewLabeledList(CharName);
00944   iterate(ProtoList)
00945   {
00946     Proto = (PROTOTYPE *) first_node (ProtoList);
00947     LabeledProtoList->List = push(LabeledProtoList->List, Proto);
00948   }
00949   *NormProtoList = push(*NormProtoList, LabeledProtoList);
00950 }
00951 
00952 /*---------------------------------------------------------------------------*/
00953 int NumberOfProtos(
00954     LIST ProtoList,
00955     BOOL8       CountSigProtos,
00956     BOOL8       CountInsigProtos)
00957 {
00958   int N = 0;
00959   PROTOTYPE     *Proto;
00960 
00961   iterate(ProtoList)
00962   {
00963     Proto = (PROTOTYPE *) first_node ( ProtoList );
00964     if (( Proto->Significant && CountSigProtos )        ||
00965         ( ! Proto->Significant && CountInsigProtos ) )
00966       N++;
00967   }
00968   return(N);
00969 }