Tesseract
3.02
|
00001 /****************************************************************************** 00002 ** Filename: blobclass.c 00003 ** Purpose: High level blob classification and training routines. 00004 ** Author: Dan Johnson 00005 ** History: 7/21/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 00022 #include "blobclass.h" 00023 #include "extract.h" 00024 #include "efio.h" 00025 #include "featdefs.h" 00026 #include "callcpp.h" 00027 #include "chartoname.h" 00028 00029 #include <math.h> 00030 #include <stdio.h> 00031 #include <signal.h> 00032 00033 #define MAXFILENAME 80 00034 #define MAXMATCHES 10 00035 00036 static const char kUnknownFontName[] = "UnknownFont"; 00037 00038 STRING_VAR(classify_font_name, kUnknownFontName, 00039 "Default font name to be used in training"); 00040 00044 /* name of current image file being processed */ 00045 extern char imagefile[]; 00046 00051 /*---------------------------------------------------------------------------*/ 00052 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, 00053 TBLOB * Blob, const DENORM& denorm, const char* BlobText) { 00054 /* 00055 ** Parameters: 00056 ** Blob blob whose micro-features are to be learned 00057 ** Row row of text that blob came from 00058 ** BlobText text that corresponds to blob 00059 ** TextLength number of characters in blob 00060 ** Globals: 00061 ** imagefile base filename of the page being learned 00062 ** classify_font_name 00063 ** name of font currently being trained on 00064 ** Operation: 00065 ** Extract micro-features from the specified blob and append 00066 ** them to the appropriate file. 00067 ** Return: none 00068 ** Exceptions: none 00069 ** History: 7/28/89, DSJ, Created. 00070 */ 00071 #define TRAIN_SUFFIX ".tr" 00072 static FILE *FeatureFile = NULL; 00073 STRING Filename(filename); 00074 00075 // If no fontname was set, try to extract it from the filename 00076 STRING CurrFontName = classify_font_name; 00077 if (CurrFontName == kUnknownFontName) { 00078 // filename is expected to be of the form [lang].[fontname].exp[num] 00079 // The [lang], [fontname] and [num] fields should not have '.' characters. 00080 const char *basename = strrchr(filename.string(), '/'); 00081 const char *firstdot = strchr(basename ? basename : filename.string(), '.'); 00082 const char *lastdot = strrchr(filename.string(), '.'); 00083 if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { 00084 ++firstdot; 00085 CurrFontName = firstdot; 00086 CurrFontName[lastdot - firstdot] = '\0'; 00087 } 00088 } 00089 00090 // if a feature file is not yet open, open it 00091 // the name of the file is the name of the image plus TRAIN_SUFFIX 00092 if (FeatureFile == NULL) { 00093 Filename += TRAIN_SUFFIX; 00094 FeatureFile = Efopen(Filename.string(), "wb"); 00095 cprintf("TRAINING ... Font name = %s\n", CurrFontName.string()); 00096 } 00097 00098 LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText, 00099 CurrFontName.string()); 00100 } // LearnBlob 00101 00102 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile, 00103 TBLOB* Blob, const DENORM& denorm, 00104 const char* BlobText, const char* FontName) { 00105 CHAR_DESC CharDesc; 00106 00107 ASSERT_HOST(FeatureFile != NULL); 00108 00109 CharDesc = ExtractBlobFeatures(FeatureDefs, denorm, Blob); 00110 if (CharDesc == NULL) { 00111 cprintf("LearnBLob: CharDesc was NULL. Aborting.\n"); 00112 return; 00113 } 00114 00115 if (ValidCharDescription(FeatureDefs, CharDesc)) { 00116 // label the features with a class name and font name 00117 fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText); 00118 00119 // write micro-features to file and clean up 00120 WriteCharDescription(FeatureDefs, FeatureFile, CharDesc); 00121 } else { 00122 tprintf("Blob learned was invalid!\n"); 00123 } 00124 FreeCharDescription(CharDesc); 00125 00126 } // LearnBlob