Tesseract  3.02
tesseract-ocr/classify/blobclass.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **      Filename:       blobclass.cpp
00003  **      Purpose:        High level blob classification and training routines.
00004  **      Author:         Dan Johnson
00005  **      History:        7/21/89, DSJ, Created.
00006  **
00007  **      (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00022 #include "blobclass.h"
00023 #include "extract.h"
00024 #include "efio.h"
00025 #include "featdefs.h"
00026 #include "callcpp.h"
00027 #include "chartoname.h"
00028 
00029 #include <math.h>
00030 #include <stdio.h>
00031 #include <signal.h>
00032 
// Size limits; neither macro is referenced in the visible part of this file.
00033 #define MAXFILENAME             80
00034 #define MAXMATCHES              10
00035 
// Default value of classify_font_name. LearnBlob compares the parameter
// against this string to detect that no font name was configured.
00036 static const char kUnknownFontName[] = "UnknownFont";
00037 
// Font name parameter read by LearnBlob when labelling training data.
00038 STRING_VAR(classify_font_name, kUnknownFontName,
00039            "Default font name to be used in training");
00040 
00044 /* name of current image file being processed */
// NOTE(review): not referenced in the visible part of this file.
00045 extern char imagefile[];
00046 
00051 /*---------------------------------------------------------------------------*/
00052 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
00053                TBLOB * Blob, const DENORM& denorm, const char* BlobText) {
00054 /*
00055  **      Parameters:
00056  **              Blob            blob whose micro-features are to be learned
00057  **              Row             row of text that blob came from
00058  **              BlobText        text that corresponds to blob
00059  **              TextLength      number of characters in blob
00060  **      Globals:
00061  **              imagefile       base filename of the page being learned
00062  **              classify_font_name
00063  **                              name of font currently being trained on
00064  **      Operation:
00065  **              Extract micro-features from the specified blob and append
00066  **              them to the appropriate file.
00067  **      Return: none
00068  **      Exceptions: none
00069  **      History: 7/28/89, DSJ, Created.
00070  */
00071 #define TRAIN_SUFFIX    ".tr"
00072   static FILE *FeatureFile = NULL;
00073   STRING Filename(filename);
00074 
00075   // If no fontname was set, try to extract it from the filename
00076   STRING CurrFontName = classify_font_name;
00077   if (CurrFontName == kUnknownFontName) {
00078     // filename is expected to be of the form [lang].[fontname].exp[num]
00079     // The [lang], [fontname] and [num] fields should not have '.' characters.
00080     const char *basename = strrchr(filename.string(), '/');
00081     const char *firstdot = strchr(basename ? basename : filename.string(), '.');
00082     const char *lastdot  = strrchr(filename.string(), '.');
00083     if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
00084       ++firstdot;
00085       CurrFontName = firstdot;
00086       CurrFontName[lastdot - firstdot] = '\0';
00087     }
00088   }
00089 
00090   // if a feature file is not yet open, open it
00091   // the name of the file is the name of the image plus TRAIN_SUFFIX
00092   if (FeatureFile == NULL) {
00093     Filename += TRAIN_SUFFIX;
00094     FeatureFile = Efopen(Filename.string(), "wb");
00095     cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
00096   }
00097 
00098   LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText,
00099             CurrFontName.string());
00100 }                                // LearnBlob
00101 
00102 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
00103                TBLOB* Blob, const DENORM& denorm,
00104                const char* BlobText, const char* FontName) {
00105   CHAR_DESC CharDesc;
00106 
00107   ASSERT_HOST(FeatureFile != NULL);
00108 
00109   CharDesc = ExtractBlobFeatures(FeatureDefs, denorm, Blob);
00110   if (CharDesc == NULL) {
00111     cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
00112     return;
00113   }
00114 
00115   if (ValidCharDescription(FeatureDefs, CharDesc)) {
00116     // label the features with a class name and font name
00117     fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
00118 
00119     // write micro-features to file and clean up
00120     WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
00121   } else {
00122     tprintf("Blob learned was invalid!\n");
00123   }
00124   FreeCharDescription(CharDesc);
00125 
00126 }                                // LearnBlob