Tesseract
3.02
|
00001 // Copyright 2008 Google Inc. All Rights Reserved. 00002 // Author: scharron@google.com (Samuel Charron) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 00014 #include "commontraining.h" 00015 00016 #ifndef USE_STD_NAMESPACE 00017 #include "base/init_google.h" 00018 #include "base/commandlineflags.h" 00019 #endif 00020 #include "allheaders.h" 00021 #include "ccutil.h" 00022 #include "classify.h" 00023 #include "oldlist.h" 00024 #include "globals.h" 00025 #include "mf.h" 00026 #include "clusttool.h" 00027 #include "cluster.h" 00028 #include "tessopt.h" 00029 #include "efio.h" 00030 #include "emalloc.h" 00031 #include "featdefs.h" 00032 #include "fontinfo.h" 00033 #include "intfeaturespace.h" 00034 #include "mastertrainer.h" 00035 #include "tessdatamanager.h" 00036 #include "tprintf.h" 00037 #include "freelist.h" 00038 #include "params.h" 00039 #include "shapetable.h" 00040 #include "unicity_table.h" 00041 00042 #include <math.h> 00043 00044 using tesseract::CCUtil; 00045 using tesseract::FontInfo; 00046 using tesseract::IntFeatureSpace; 00047 using tesseract::ParamUtils; 00048 using tesseract::ShapeTable; 00049 00050 // Global Variables. 00051 // global variable to hold configuration parameters to control clustering 00052 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6. 00053 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }; 00054 00055 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging"); 00056 INT_PARAM_FLAG(load_images, 0, "Load images with tr files"); 00057 STRING_PARAM_FLAG(configfile, "", "File to load more configs from"); 00058 STRING_PARAM_FLAG(D, "", "Directory to write output files to"); 00059 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties"); 00060 STRING_PARAM_FLAG(X, "", "File listing font xheights"); 00061 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from"); 00062 STRING_PARAM_FLAG(O, "", "File to write unicharset to"); 00063 STRING_PARAM_FLAG(input_trainer, "", "File to load trainer from"); 00064 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to"); 00065 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string"); 00066 00067 // The usage strings are different as the DEFINE_* flags are available on 00068 // the command line, but the *_VAR flags are set through a config file with 00069 // some of them available through special command-line args. 00070 #ifndef USE_STD_NAMESPACE 00071 const char* kUsage = "[flags] [ .tr files ... ]\n"; 00072 #else 00073 const char* kUsage = "[-c configfile]\n" 00074 "\t[-D Directory]\n" 00075 "\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n" 00076 "\t[-U InputUnicharset]\n" 00077 "\t[-O OutputUnicharset]\n" 00078 "\t[-F FontInfoFile]\n" 00079 "\t[-X InputXHeightsFile]\n" 00080 "\t[-S InputShapeTable]\n" 00081 "\t[ .tr files ... ]\n"; 00082 #endif 00083 00084 FEATURE_DEFS_STRUCT feature_defs; 00085 CCUtil ccutil; 00086 00087 /*---------------------------------------------------------------------------*/ 00088 void ParseArguments(int* argc, char ***argv) { 00089 /* 00090 ** Parameters: 00091 ** argc number of command line arguments to parse 00092 ** argv command line arguments 00093 ** Globals: 00094 ** ShowSignificantProtos flag controlling proto display 00095 ** ShowInsignificantProtos flag controlling proto display 00096 ** Config current clustering parameters 00097 ** tessoptarg, tessoptind defined by tessopt sys call 00098 ** Argc, Argv global copies of argc and argv 00099 ** Operation: 00100 ** This routine parses the command line arguments that were 00101 ** passed to the program. The legal arguments are shown in the usage 00102 ** message below: 00103 00104 ** Return: none 00105 ** Exceptions: Illegal options terminate the program. 00106 ** History: 7/24/89, DSJ, Created. 00107 */ 00108 #ifndef USE_STD_NAMESPACE 00109 InitGoogle(kUsage, argc, argv, true); 00110 tessoptind = 1; 00111 #else 00112 int Option; 00113 int ParametersRead; 00114 BOOL8 Error; 00115 00116 Error = FALSE; 00117 while ((Option = tessopt(*argc, *argv, "F:O:U:D:C:I:M:B:S:X:c:")) != EOF) { 00118 switch (Option) { 00119 case 'C': 00120 ParametersRead = sscanf(tessoptarg, "%lf", &(Config.Confidence) ); 00121 if ( ParametersRead != 1 ) Error = TRUE; 00122 else if ( Config.Confidence > 1 ) Config.Confidence = 1; 00123 else if ( Config.Confidence < 0 ) Config.Confidence = 0; 00124 break; 00125 case 'I': 00126 ParametersRead = sscanf(tessoptarg, "%f", &(Config.Independence) ); 00127 if ( ParametersRead != 1 ) Error = TRUE; 00128 else if ( Config.Independence > 1 ) Config.Independence = 1; 00129 else if ( Config.Independence < 0 ) Config.Independence = 0; 00130 break; 00131 case 'M': 00132 ParametersRead = sscanf(tessoptarg, "%f", &(Config.MinSamples) ); 00133 if ( ParametersRead != 1 ) Error = TRUE; 00134 else if ( Config.MinSamples > 1 ) Config.MinSamples = 1; 00135 else if ( Config.MinSamples < 0 ) Config.MinSamples = 0; 00136 break; 00137 case 'B': 00138 ParametersRead = sscanf(tessoptarg, "%f", &(Config.MaxIllegal) ); 00139 if ( ParametersRead != 1 ) Error = TRUE; 00140 else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1; 00141 else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0; 00142 break; 00143 case 'c': 00144 FLAGS_configfile.set_value(tessoptarg); 00145 break; 00146 case 'D': 00147 FLAGS_D.set_value(tessoptarg); 00148 break; 00149 case 'U': 00150 FLAGS_U.set_value(tessoptarg); 00151 break; 00152 case 'O': 00153 FLAGS_O.set_value(tessoptarg); 00154 break; 00155 case 'F': 00156 FLAGS_F.set_value(tessoptarg); 00157 break; 00158 case 'X': 00159 FLAGS_X.set_value(tessoptarg); 00160 break; 00161 case '?': 00162 Error = TRUE; 00163 break; 00164 } 00165 if (Error) { 00166 fprintf(stderr, "Usage: %s %s\n", (*argv)[0], kUsage); 00167 exit(2); 00168 } 00169 } 00170 #endif 00171 // Set additional parameters from config file if specified. 00172 if (!FLAGS_configfile.empty()) { 00173 tesseract::ParamUtils::ReadParamsFile( 00174 FLAGS_configfile.c_str(), 00175 tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, 00176 ccutil.params()); 00177 } 00178 } // ParseArguments 00179 00180 namespace tesseract { 00181 00182 // Helper loads shape table from the given file. 00183 ShapeTable* LoadShapeTable(const STRING& file_prefix) { 00184 ShapeTable* shape_table = NULL; 00185 STRING shape_table_file = file_prefix; 00186 shape_table_file += kShapeTableFileSuffix; 00187 FILE* shape_fp = fopen(shape_table_file.string(), "rb"); 00188 if (shape_fp != NULL) { 00189 shape_table = new ShapeTable; 00190 if (!shape_table->DeSerialize(false, shape_fp)) { 00191 delete shape_table; 00192 shape_table = NULL; 00193 tprintf("Error: Failed to read shape table %s\n", 00194 shape_table_file.string()); 00195 } else { 00196 int num_shapes = shape_table->NumShapes(); 00197 tprintf("Read shape table %s of %d shapes\n", 00198 shape_table_file.string(), num_shapes); 00199 } 00200 fclose(shape_fp); 00201 } else { 00202 tprintf("Warning: No shape table file present: %s\n", 00203 shape_table_file.string()); 00204 } 00205 return shape_table; 00206 } 00207 00208 // Helper to write the shape_table. 00209 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) { 00210 STRING shape_table_file = file_prefix; 00211 shape_table_file += kShapeTableFileSuffix; 00212 FILE* fp = fopen(shape_table_file.string(), "wb"); 00213 if (fp != NULL) { 00214 if (!shape_table.Serialize(fp)) { 00215 fprintf(stderr, "Error writing shape table: %s\n", 00216 shape_table_file.string()); 00217 } 00218 fclose(fp); 00219 } else { 00220 fprintf(stderr, "Error creating shape table: %s\n", 00221 shape_table_file.string()); 00222 } 00223 } 00224 00225 // Creates a MasterTraininer and loads the training data into it: 00226 // Initializes feature_defs and IntegerFX. 00227 // Loads the shape_table if shape_table != NULL. 00228 // Loads initial unicharset from -U command-line option. 00229 // If FLAGS_input_trainer is set, loads the majority of data from there, else: 00230 // Loads font info from -F option. 00231 // Loads xheights from -X option. 00232 // Loads samples from .tr files in remaining command-line args. 00233 // Deletes outliers and computes canonical samples. 00234 // If FLAGS_output_trainer is set, saves the trainer for future use. 00235 // Computes canonical and cloud features. 00236 // If shape_table is not NULL, but failed to load, make a fake flat one, 00237 // as shape clustering was not run. 00238 MasterTrainer* LoadTrainingData(int argc, const char* const * argv, 00239 bool replication, 00240 ShapeTable** shape_table, 00241 STRING* file_prefix) { 00242 InitFeatureDefs(&feature_defs); 00243 InitIntegerFX(); 00244 *file_prefix = ""; 00245 if (!FLAGS_D.empty()) { 00246 *file_prefix += FLAGS_D.c_str(); 00247 *file_prefix += "/"; 00248 } 00249 // If we are shape clustering (NULL shape_table) or we successfully load 00250 // a shape_table written by a previous shape clustering, then 00251 // shape_analysis will be true, meaning that the MasterTrainer will replace 00252 // some members of the unicharset with their fragments. 00253 bool shape_analysis = false; 00254 if (shape_table != NULL) { 00255 *shape_table = LoadShapeTable(*file_prefix); 00256 if (*shape_table != NULL) 00257 shape_analysis = true; 00258 } else { 00259 shape_analysis = true; 00260 } 00261 MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC, 00262 shape_analysis, 00263 replication, 00264 FLAGS_debug_level); 00265 if (FLAGS_input_trainer.empty()) { 00266 trainer->LoadUnicharset(FLAGS_U.c_str()); 00267 // Get basic font information from font_properties. 00268 if (!FLAGS_F.empty()) { 00269 if (!trainer->LoadFontInfo(FLAGS_F.c_str())) { 00270 delete trainer; 00271 return NULL; 00272 } 00273 } 00274 if (!FLAGS_X.empty()) { 00275 if (!trainer->LoadXHeights(FLAGS_X.c_str())) { 00276 delete trainer; 00277 return NULL; 00278 } 00279 } 00280 IntFeatureSpace fs; 00281 fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets); 00282 trainer->SetFeatureSpace(fs); 00283 const char* page_name; 00284 // Load training data from .tr files on the command line. 00285 while ((page_name = GetNextFilename(argc, argv)) != NULL) { 00286 tprintf("Reading %s ...\n", page_name); 00287 FILE* fp = Efopen(page_name, "rb"); 00288 trainer->ReadTrainingSamples(fp, feature_defs, false); 00289 fclose(fp); 00290 00291 // If there is a file with [lang].[fontname].exp[num].fontinfo present, 00292 // read font spacing information in to fontinfo_table. 00293 int pagename_len = strlen(page_name); 00294 char *fontinfo_file_name = new char[pagename_len + 7]; 00295 strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr" 00296 strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo" 00297 trainer->AddSpacingInfo(fontinfo_file_name); 00298 delete[] fontinfo_file_name; 00299 00300 // Load the images into memory if required by the classifier. 00301 if (FLAGS_load_images) { 00302 STRING image_name = page_name; 00303 // Chop off the tr and replace with tif. Extension must be tif! 00304 image_name.truncate_at(image_name.length() - 2); 00305 image_name += "tif"; 00306 trainer->LoadPageImages(image_name.string()); 00307 } 00308 } 00309 trainer->PostLoadCleanup(); 00310 // Write the master trainer if required. 00311 if (!FLAGS_output_trainer.empty()) { 00312 FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb"); 00313 if (fp == NULL) { 00314 tprintf("Can't create saved trainer data!\n"); 00315 } else { 00316 trainer->Serialize(fp); 00317 fclose(fp); 00318 } 00319 } 00320 } else { 00321 bool success = false; 00322 tprintf("Loading master trainer from file:%s\n", 00323 FLAGS_input_trainer.c_str()); 00324 FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb"); 00325 if (fp == NULL) { 00326 tprintf("Can't read file %s to initialize master trainer\n", 00327 FLAGS_input_trainer.c_str()); 00328 } else { 00329 success = trainer->DeSerialize(false, fp); 00330 fclose(fp); 00331 } 00332 if (!success) { 00333 tprintf("Deserialize of master trainer failed!\n"); 00334 delete trainer; 00335 return NULL; 00336 } 00337 } 00338 trainer->PreTrainingSetup(); 00339 if (!FLAGS_O.empty() && 00340 !trainer->unicharset().save_to_file(FLAGS_O.c_str())) { 00341 fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str()); 00342 delete trainer; 00343 return NULL; 00344 } 00345 if (shape_table != NULL) { 00346 // If we previously failed to load a shapetable, then shape clustering 00347 // wasn't run so make a flat one now. 00348 if (*shape_table == NULL) { 00349 *shape_table = new ShapeTable; 00350 trainer->SetupFlatShapeTable(*shape_table); 00351 tprintf("Flat shape table summary: %s\n", 00352 (*shape_table)->SummaryStr().string()); 00353 } 00354 (*shape_table)->set_unicharset(trainer->unicharset()); 00355 } 00356 return trainer; 00357 } 00358 00359 } // namespace tesseract. 00360 00361 /*---------------------------------------------------------------------------*/ 00362 const char *GetNextFilename(int argc, const char* const * argv) { 00363 /* 00364 ** Parameters: none 00365 ** Globals: 00366 ** tessoptind defined by tessopt sys call 00367 ** Operation: 00368 ** This routine returns the next command line argument. If 00369 ** there are no remaining command line arguments, it returns 00370 ** NULL. This routine should only be called after all option 00371 ** arguments have been parsed and removed with ParseArguments. 00372 ** Return: Next command line argument or NULL. 00373 ** Exceptions: none 00374 ** History: Fri Aug 18 09:34:12 1989, DSJ, Created. 00375 */ 00376 if (tessoptind < argc) 00377 return argv[tessoptind++]; 00378 else 00379 return NULL; 00380 } /* GetNextFilename */ 00381 00382 00383 00384 /*---------------------------------------------------------------------------*/ 00385 LABELEDLIST FindList ( 00386 LIST List, 00387 char *Label) 00388 00389 /* 00390 ** Parameters: 00391 ** List list to search 00392 ** Label label to search for 00393 ** Globals: none 00394 ** Operation: 00395 ** This routine searches thru a list of labeled lists to find 00396 ** a list with the specified label. If a matching labeled list 00397 ** cannot be found, NULL is returned. 00398 ** Return: Labeled list with the specified Label or NULL. 00399 ** Exceptions: none 00400 ** History: Fri Aug 18 15:57:41 1989, DSJ, Created. 00401 */ 00402 00403 { 00404 LABELEDLIST LabeledList; 00405 00406 iterate (List) 00407 { 00408 LabeledList = (LABELEDLIST) first_node (List); 00409 if (strcmp (LabeledList->Label, Label) == 0) 00410 return (LabeledList); 00411 } 00412 return (NULL); 00413 00414 } /* FindList */ 00415 00416 /*---------------------------------------------------------------------------*/ 00417 LABELEDLIST NewLabeledList ( 00418 const char *Label) 00419 00420 /* 00421 ** Parameters: 00422 ** Label label for new list 00423 ** Globals: none 00424 ** Operation: 00425 ** This routine allocates a new, empty labeled list and gives 00426 ** it the specified label. 00427 ** Return: New, empty labeled list. 00428 ** Exceptions: none 00429 ** History: Fri Aug 18 16:08:46 1989, DSJ, Created. 00430 */ 00431 00432 { 00433 LABELEDLIST LabeledList; 00434 00435 LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE)); 00436 LabeledList->Label = (char*)Emalloc (strlen (Label)+1); 00437 strcpy (LabeledList->Label, Label); 00438 LabeledList->List = NIL_LIST; 00439 LabeledList->SampleCount = 0; 00440 LabeledList->font_sample_count = 0; 00441 return (LabeledList); 00442 00443 } /* NewLabeledList */ 00444 00445 /*---------------------------------------------------------------------------*/ 00446 // TODO(rays) This is now used only by cntraining. Convert cntraining to use 00447 // the new method or get rid of it entirely. 00448 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, 00449 const char *feature_name, int max_samples, 00450 UNICHARSET* unicharset, 00451 FILE* file, LIST* training_samples) { 00452 /* 00453 ** Parameters: 00454 ** file open text file to read samples from 00455 ** Globals: none 00456 ** Operation: 00457 ** This routine reads training samples from a file and 00458 ** places them into a data structure which organizes the 00459 ** samples by FontName and CharName. It then returns this 00460 ** data structure. 00461 ** Return: none 00462 ** Exceptions: none 00463 ** History: Fri Aug 18 13:11:39 1989, DSJ, Created. 00464 ** Tue May 17 1998 simplifications to structure, illiminated 00465 ** font, and feature specification levels of structure. 00466 */ 00467 char buffer[2048]; 00468 char unichar[UNICHAR_LEN + 1]; 00469 LABELEDLIST char_sample; 00470 FEATURE_SET feature_samples; 00471 CHAR_DESC char_desc; 00472 int i; 00473 int feature_type = ShortNameToFeatureType(feature_defs, feature_name); 00474 // Zero out the font_sample_count for all the classes. 00475 LIST it = *training_samples; 00476 iterate(it) { 00477 char_sample = reinterpret_cast<LABELEDLIST>(first_node(it)); 00478 char_sample->font_sample_count = 0; 00479 } 00480 00481 while (fgets(buffer, 2048, file) != NULL) { 00482 if (buffer[0] == '\n') 00483 continue; 00484 00485 sscanf(buffer, "%*s %s", unichar); 00486 if (unicharset != NULL && !unicharset->contains_unichar(unichar)) { 00487 unicharset->unichar_insert(unichar); 00488 if (unicharset->size() > MAX_NUM_CLASSES) { 00489 tprintf("Error: Size of unicharset in training is " 00490 "greater than MAX_NUM_CLASSES\n"); 00491 exit(1); 00492 } 00493 } 00494 char_sample = FindList(*training_samples, unichar); 00495 if (char_sample == NULL) { 00496 char_sample = NewLabeledList(unichar); 00497 *training_samples = push(*training_samples, char_sample); 00498 } 00499 char_desc = ReadCharDescription(feature_defs, file); 00500 feature_samples = char_desc->FeatureSets[feature_type]; 00501 if (char_sample->font_sample_count < max_samples || max_samples <= 0) { 00502 char_sample->List = push(char_sample->List, feature_samples); 00503 char_sample->SampleCount++; 00504 char_sample->font_sample_count++; 00505 } else { 00506 FreeFeatureSet(feature_samples); 00507 } 00508 for (i = 0; i < char_desc->NumFeatureSets; i++) { 00509 if (feature_type != i) 00510 FreeFeatureSet(char_desc->FeatureSets[i]); 00511 } 00512 free(char_desc); 00513 } 00514 } // ReadTrainingSamples 00515 00516 00517 /*---------------------------------------------------------------------------*/ 00518 void FreeTrainingSamples(LIST CharList) { 00519 /* 00520 ** Parameters: 00521 ** FontList list of all fonts in document 00522 ** Globals: none 00523 ** Operation: 00524 ** This routine deallocates all of the space allocated to 00525 ** the specified list of training samples. 00526 ** Return: none 00527 ** Exceptions: none 00528 ** History: Fri Aug 18 17:44:27 1989, DSJ, Created. 00529 */ 00530 LABELEDLIST char_sample; 00531 FEATURE_SET FeatureSet; 00532 LIST FeatureList; 00533 00534 00535 iterate(CharList) { /* iterate thru all of the fonts */ 00536 char_sample = (LABELEDLIST) first_node(CharList); 00537 FeatureList = char_sample->List; 00538 iterate(FeatureList) { /* iterate thru all of the classes */ 00539 FeatureSet = (FEATURE_SET) first_node(FeatureList); 00540 FreeFeatureSet(FeatureSet); 00541 } 00542 FreeLabeledList(char_sample); 00543 } 00544 destroy(CharList); 00545 } /* FreeTrainingSamples */ 00546 00547 /*---------------------------------------------------------------------------*/ 00548 void FreeLabeledList(LABELEDLIST LabeledList) { 00549 /* 00550 ** Parameters: 00551 ** LabeledList labeled list to be freed 00552 ** Globals: none 00553 ** Operation: 00554 ** This routine deallocates all of the memory consumed by 00555 ** a labeled list. It does not free any memory which may be 00556 ** consumed by the items in the list. 00557 ** Return: none 00558 ** Exceptions: none 00559 ** History: Fri Aug 18 17:52:45 1989, DSJ, Created. 00560 */ 00561 destroy(LabeledList->List); 00562 free(LabeledList->Label); 00563 free(LabeledList); 00564 } /* FreeLabeledList */ 00565 00566 /*---------------------------------------------------------------------------*/ 00567 CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, 00568 LABELEDLIST char_sample, 00569 const char* program_feature_type) { 00570 /* 00571 ** Parameters: 00572 ** char_sample: LABELEDLIST that holds all the feature information for a 00573 ** given character. 00574 ** Globals: 00575 ** None 00576 ** Operation: 00577 ** This routine reads samples from a LABELEDLIST and enters 00578 ** those samples into a clusterer data structure. This 00579 ** data structure is then returned to the caller. 00580 ** Return: 00581 ** Pointer to new clusterer data structure. 00582 ** Exceptions: 00583 ** None 00584 ** History: 00585 ** 8/16/89, DSJ, Created. 00586 */ 00587 uinT16 N; 00588 int i, j; 00589 FLOAT32 *Sample = NULL; 00590 CLUSTERER *Clusterer; 00591 inT32 CharID; 00592 LIST FeatureList = NULL; 00593 FEATURE_SET FeatureSet = NULL; 00594 00595 int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type); 00596 N = FeatureDefs.FeatureDesc[desc_index]->NumParams; 00597 Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc); 00598 00599 FeatureList = char_sample->List; 00600 CharID = 0; 00601 iterate(FeatureList) { 00602 FeatureSet = (FEATURE_SET) first_node(FeatureList); 00603 for (i = 0; i < FeatureSet->MaxNumFeatures; i++) { 00604 if (Sample == NULL) 00605 Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); 00606 for (j = 0; j < N; j++) 00607 Sample[j] = FeatureSet->Features[i]->Params[j]; 00608 MakeSample (Clusterer, Sample, CharID); 00609 } 00610 CharID++; 00611 } 00612 if ( Sample != NULL ) free( Sample ); 00613 return( Clusterer ); 00614 00615 } /* SetUpForClustering */ 00616 00617 /*------------------------------------------------------------------------*/ 00618 void MergeInsignificantProtos(LIST ProtoList, const char* label, 00619 CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00620 PROTOTYPE *Prototype; 00621 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0; 00622 00623 LIST pProtoList = ProtoList; 00624 iterate(pProtoList) { 00625 Prototype = (PROTOTYPE *) first_node (pProtoList); 00626 if (Prototype->Significant || Prototype->Merged) 00627 continue; 00628 FLOAT32 best_dist = 0.125; 00629 PROTOTYPE* best_match = NULL; 00630 // Find the nearest alive prototype. 00631 LIST list_it = ProtoList; 00632 iterate(list_it) { 00633 PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it); 00634 if (test_p != Prototype && !test_p->Merged) { 00635 FLOAT32 dist = ComputeDistance(Clusterer->SampleSize, 00636 Clusterer->ParamDesc, 00637 Prototype->Mean, test_p->Mean); 00638 if (dist < best_dist) { 00639 best_match = test_p; 00640 best_dist = dist; 00641 } 00642 } 00643 } 00644 if (best_match != NULL && !best_match->Significant) { 00645 if (debug) 00646 tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", 00647 best_match->NumSamples, Prototype->NumSamples, 00648 best_match->Mean[0], best_match->Mean[1], 00649 Prototype->Mean[0], Prototype->Mean[1]); 00650 best_match->NumSamples = MergeClusters(Clusterer->SampleSize, 00651 Clusterer->ParamDesc, 00652 best_match->NumSamples, 00653 Prototype->NumSamples, 00654 best_match->Mean, 00655 best_match->Mean, Prototype->Mean); 00656 Prototype->NumSamples = 0; 00657 Prototype->Merged = 1; 00658 } else if (best_match != NULL) { 00659 if (debug) 00660 tprintf("Red proto at %g,%g matched a green one at %g,%g\n", 00661 Prototype->Mean[0], Prototype->Mean[1], 00662 best_match->Mean[0], best_match->Mean[1]); 00663 Prototype->Merged = 1; 00664 } 00665 } 00666 // Mark significant those that now have enough samples. 00667 int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar); 00668 pProtoList = ProtoList; 00669 iterate(pProtoList) { 00670 Prototype = (PROTOTYPE *) first_node (pProtoList); 00671 // Process insignificant protos that do not match a green one 00672 if (!Prototype->Significant && Prototype->NumSamples >= min_samples && 00673 !Prototype->Merged) { 00674 if (debug) 00675 tprintf("Red proto at %g,%g becoming green\n", 00676 Prototype->Mean[0], Prototype->Mean[1]); 00677 Prototype->Significant = true; 00678 } 00679 } 00680 } /* MergeInsignificantProtos */ 00681 00682 /*-----------------------------------------------------------------------------*/ 00683 void CleanUpUnusedData( 00684 LIST ProtoList) 00685 { 00686 PROTOTYPE* Prototype; 00687 00688 iterate(ProtoList) 00689 { 00690 Prototype = (PROTOTYPE *) first_node (ProtoList); 00691 if(Prototype->Variance.Elliptical != NULL) 00692 { 00693 memfree(Prototype->Variance.Elliptical); 00694 Prototype->Variance.Elliptical = NULL; 00695 } 00696 if(Prototype->Magnitude.Elliptical != NULL) 00697 { 00698 memfree(Prototype->Magnitude.Elliptical); 00699 Prototype->Magnitude.Elliptical = NULL; 00700 } 00701 if(Prototype->Weight.Elliptical != NULL) 00702 { 00703 memfree(Prototype->Weight.Elliptical); 00704 Prototype->Weight.Elliptical = NULL; 00705 } 00706 } 00707 } 00708 00709 /*------------------------------------------------------------------------*/ 00710 LIST RemoveInsignificantProtos( 00711 LIST ProtoList, 00712 BOOL8 KeepSigProtos, 00713 BOOL8 KeepInsigProtos, 00714 int N) 00715 00716 { 00717 LIST NewProtoList = NIL_LIST; 00718 LIST pProtoList; 00719 PROTOTYPE* Proto; 00720 PROTOTYPE* NewProto; 00721 int i; 00722 00723 pProtoList = ProtoList; 00724 iterate(pProtoList) 00725 { 00726 Proto = (PROTOTYPE *) first_node (pProtoList); 00727 if ((Proto->Significant && KeepSigProtos) || 00728 (!Proto->Significant && KeepInsigProtos)) 00729 { 00730 NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE)); 00731 00732 NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); 00733 NewProto->Significant = Proto->Significant; 00734 NewProto->Style = Proto->Style; 00735 NewProto->NumSamples = Proto->NumSamples; 00736 NewProto->Cluster = NULL; 00737 NewProto->Distrib = NULL; 00738 00739 for (i=0; i < N; i++) 00740 NewProto->Mean[i] = Proto->Mean[i]; 00741 if (Proto->Variance.Elliptical != NULL) 00742 { 00743 NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); 00744 for (i=0; i < N; i++) 00745 NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i]; 00746 } 00747 else 00748 NewProto->Variance.Elliptical = NULL; 00749 //--------------------------------------------- 00750 if (Proto->Magnitude.Elliptical != NULL) 00751 { 00752 NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); 00753 for (i=0; i < N; i++) 00754 NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i]; 00755 } 00756 else 00757 NewProto->Magnitude.Elliptical = NULL; 00758 //------------------------------------------------ 00759 if (Proto->Weight.Elliptical != NULL) 00760 { 00761 NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); 00762 for (i=0; i < N; i++) 00763 NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i]; 00764 } 00765 else 00766 NewProto->Weight.Elliptical = NULL; 00767 00768 NewProto->TotalMagnitude = Proto->TotalMagnitude; 00769 NewProto->LogMagnitude = Proto->LogMagnitude; 00770 NewProtoList = push_last(NewProtoList, NewProto); 00771 } 00772 } 00773 FreeProtoList(&ProtoList); 00774 return (NewProtoList); 00775 } /* RemoveInsignificantProtos */ 00776 00777 /*----------------------------------------------------------------------------*/ 00778 MERGE_CLASS FindClass ( 00779 LIST List, 00780 const char *Label) 00781 { 00782 MERGE_CLASS MergeClass; 00783 00784 iterate (List) 00785 { 00786 MergeClass = (MERGE_CLASS) first_node (List); 00787 if (strcmp (MergeClass->Label, Label) == 0) 00788 return (MergeClass); 00789 } 00790 return (NULL); 00791 00792 } /* FindClass */ 00793 00794 /*---------------------------------------------------------------------------*/ 00795 MERGE_CLASS NewLabeledClass ( 00796 const char *Label) 00797 { 00798 MERGE_CLASS MergeClass; 00799 00800 MergeClass = new MERGE_CLASS_NODE; 00801 MergeClass->Label = (char*)Emalloc (strlen (Label)+1); 00802 strcpy (MergeClass->Label, Label); 00803 MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS); 00804 return (MergeClass); 00805 00806 } /* NewLabeledClass */ 00807 00808 /*-----------------------------------------------------------------------------*/ 00809 void FreeLabeledClassList ( 00810 LIST ClassList) 00811 00812 /* 00813 ** Parameters: 00814 ** FontList list of all fonts in document 00815 ** Globals: none 00816 ** Operation: 00817 ** This routine deallocates all of the space allocated to 00818 ** the specified list of training samples. 00819 ** Return: none 00820 ** Exceptions: none 00821 ** History: Fri Aug 18 17:44:27 1989, DSJ, Created. 00822 */ 00823 00824 { 00825 MERGE_CLASS MergeClass; 00826 00827 iterate (ClassList) /* iterate thru all of the fonts */ 00828 { 00829 MergeClass = (MERGE_CLASS) first_node (ClassList); 00830 free (MergeClass->Label); 00831 FreeClass(MergeClass->Class); 00832 delete MergeClass; 00833 } 00834 destroy (ClassList); 00835 00836 } /* FreeLabeledClassList */ 00837 00839 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset, 00840 LIST LabeledClassList) { 00841 MERGE_CLASS MergeClass; 00842 CLASS_TYPE Class; 00843 int NumProtos; 00844 int NumConfigs; 00845 int NumWords; 00846 int i, j; 00847 float Values[3]; 00848 PROTO NewProto; 00849 PROTO OldProto; 00850 BIT_VECTOR NewConfig; 00851 BIT_VECTOR OldConfig; 00852 00853 // printf("Float2Int ...\n"); 00854 00855 CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()]; 00856 iterate(LabeledClassList) 00857 { 00858 UnicityTableEqEq<int> font_set; 00859 MergeClass = (MERGE_CLASS) first_node (LabeledClassList); 00860 Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)]; 00861 NumProtos = MergeClass->Class->NumProtos; 00862 NumConfigs = MergeClass->Class->NumConfigs; 00863 font_set.move(&MergeClass->Class->font_set); 00864 Class->NumProtos = NumProtos; 00865 Class->MaxNumProtos = NumProtos; 00866 Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos); 00867 for(i=0; i < NumProtos; i++) 00868 { 00869 NewProto = ProtoIn(Class, i); 00870 OldProto = ProtoIn(MergeClass->Class, i); 00871 Values[0] = OldProto->X; 00872 Values[1] = OldProto->Y; 00873 Values[2] = OldProto->Angle; 00874 Normalize(Values); 00875 NewProto->X = OldProto->X; 00876 NewProto->Y = OldProto->Y; 00877 NewProto->Length = OldProto->Length; 00878 NewProto->Angle = OldProto->Angle; 00879 NewProto->A = Values[0]; 00880 NewProto->B = Values[1]; 00881 NewProto->C = Values[2]; 00882 } 00883 00884 Class->NumConfigs = NumConfigs; 00885 Class->MaxNumConfigs = NumConfigs; 00886 Class->font_set.move(&font_set); 00887 Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs); 00888 NumWords = WordsInVectorOfSize(NumProtos); 00889 for(i=0; i < NumConfigs; i++) 00890 { 00891 NewConfig = NewBitVector(NumProtos); 00892 OldConfig = MergeClass->Class->Configurations[i]; 00893 for(j=0; j < NumWords; j++) 00894 NewConfig[j] = OldConfig[j]; 00895 Class->Configurations[i] = NewConfig; 00896 } 00897 } 00898 return float_classes; 00899 } // SetUpForFloat2Int 00900 00901 /*--------------------------------------------------------------------------*/ 00902 void Normalize ( 00903 float *Values) 00904 { 00905 register float Slope; 00906 register float Intercept; 00907 register float Normalizer; 00908 00909 Slope = tan (Values [2] * 2 * PI); 00910 Intercept = Values [1] - Slope * Values [0]; 00911 Normalizer = 1 / sqrt (Slope * Slope + 1.0); 00912 00913 Values [0] = Slope * Normalizer; 00914 Values [1] = - Normalizer; 00915 Values [2] = Intercept * Normalizer; 00916 } // Normalize 00917 00918 /*-------------------------------------------------------------------------*/ 00919 void FreeNormProtoList ( 00920 LIST CharList) 00921 00922 { 00923 LABELEDLIST char_sample; 00924 00925 iterate (CharList) /* iterate thru all of the fonts */ 00926 { 00927 char_sample = (LABELEDLIST) first_node (CharList); 00928 FreeLabeledList (char_sample); 00929 } 00930 destroy (CharList); 00931 00932 } // FreeNormProtoList 00933 00934 /*---------------------------------------------------------------------------*/ 00935 void AddToNormProtosList( 00936 LIST* NormProtoList, 00937 LIST ProtoList, 00938 char* CharName) 00939 { 00940 PROTOTYPE* Proto; 00941 LABELEDLIST LabeledProtoList; 00942 00943 LabeledProtoList = NewLabeledList(CharName); 00944 iterate(ProtoList) 00945 { 00946 Proto = (PROTOTYPE *) first_node (ProtoList); 00947 LabeledProtoList->List = push(LabeledProtoList->List, Proto); 00948 } 00949 *NormProtoList = push(*NormProtoList, LabeledProtoList); 00950 } 00951 00952 /*---------------------------------------------------------------------------*/ 00953 int NumberOfProtos( 00954 LIST ProtoList, 00955 BOOL8 CountSigProtos, 00956 BOOL8 CountInsigProtos) 00957 { 00958 int N = 0; 00959 PROTOTYPE *Proto; 00960 00961 iterate(ProtoList) 00962 { 00963 Proto = (PROTOTYPE *) first_node ( ProtoList ); 00964 if (( Proto->Significant && CountSigProtos ) || 00965 ( ! Proto->Significant && CountInsigProtos ) ) 00966 N++; 00967 } 00968 return(N); 00969 }