Tesseract  3.02
tesseract-ocr/wordrec/wordclass.cpp
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        wordclass.cpp  (Formerly wordclass.c)
00005  * Description:  Word classifier
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Tue Jan 30 14:03:25 1990
00008  * Modified:     Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C++
00010  * Package:      N/A
00011  * Status:       Experimental (Do Not Distribute)
00012  *
00013  * (c) Copyright 1990, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026           I N C L U D E S
00027 ----------------------------------------------------------------------*/
00028 #include <stdio.h>
00029 #ifdef __UNIX__
00030 #include <assert.h>
00031 #endif
00032 
00033 #include "wordclass.h"
00034 #include "associate.h"
00035 #include "render.h"
00036 #include "matchtab.h"
00037 #include "permute.h"
00038 #include "callcpp.h"
00039 #include <assert.h>
00040 #include "wordrec.h"
00041 
00042 // Include automatically generated configuration file if running autoconf.
00043 #ifdef HAVE_CONFIG_H
00044 #include "config_auto.h"
00045 #endif
00046 
00047 /*----------------------------------------------------------------------
00048           F u n c t i o n s
00049 ----------------------------------------------------------------------*/
00050 namespace tesseract {
00062 BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob, const DENORM& denorm,
00063                                          const char *string, C_COL color,
00064                                          BlamerBundle *blamer_bundle) {
00065   fflush(stdout);
00066   BLOB_CHOICE_LIST *choices = NULL;
00067 #ifndef GRAPHICS_DISABLED
00068   if (wordrec_display_all_blobs)
00069     display_blob(blob, color);
00070 #endif
00071   choices = blob_match_table.get_match(blob);
00072   if (choices == NULL) {
00073     choices = call_matcher(&denorm, blob);
00074     blob_match_table.put_match(blob, choices);
00075     // If a blob with the same bounding box as one of the truth character
00076     // bounding boxes is not classified as the corresponding truth character
00077     // blame character classifier for incorrect answer.
00078     if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes &&
00079         blamer_bundle->incorrect_result_reason == IRR_CORRECT) {
00080       for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
00081         const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b);
00082         const TBOX &blob_box = blob->bounding_box();
00083         // Note that we are more strict on the bounding box boundaries here
00084         // than in other places (chopper, segmentation search), since we do
00085         // not have the ability to check the previous and next bounding box.
00086         if (blob_box.x_almost_equal(truth_box,
00087                                     blamer_bundle->norm_box_tolerance/2)) {
00088           BLOB_CHOICE_IT choices_it(choices);
00089           bool found = false;
00090           bool incorrect_adapted = false;
00091           UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
00092           const char *truth_str = blamer_bundle->truth_text[b].string();
00093           for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
00094               choices_it.forward()) {
00095             if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar(
00096                 choices_it.data()->unichar_id())) == 0) {
00097               found = true;
00098               break;
00099             } else if (choices_it.data()->adapted()) {
00100               incorrect_adapted = true;
00101               incorrect_adapted_id = choices_it.data()->unichar_id();
00102             }
00103           }  // end choices_it for loop
00104           if (!found) {
00105             STRING debug = "unichar ";
00106             debug += truth_str;
00107             debug += " not found in classification list";
00108             blamer_bundle->SetBlame(IRR_CLASSIFIER, debug,
00109                                     NULL, wordrec_debug_blamer);
00110           } else if (incorrect_adapted) {
00111             STRING debug = "better rating for adapted ";
00112             debug += getDict().getUnicharset().id_to_unichar(
00113                 incorrect_adapted_id);
00114             debug += " than for correct ";
00115             debug += truth_str;
00116             blamer_bundle->SetBlame(IRR_ADAPTION, debug,
00117                                     NULL, wordrec_debug_blamer);
00118           }
00119           break;
00120         }
00121       }  // end iterating over blamer_bundle->norm_truth_word
00122     }
00123   }
00124 #ifndef GRAPHICS_DISABLED
00125   if (classify_debug_level && string)
00126     print_ratings_list(string, choices, getDict().getUnicharset());
00127 
00128   if (wordrec_blob_pause)
00129     window_wait(blob_window);
00130 #endif
00131 
00132   return (choices);
00133 }
00134 
00135 // Returns a valid BLOB_CHOICE_LIST representing the given result.
00136 BLOB_CHOICE_LIST *Wordrec::fake_classify_blob(UNICHAR_ID class_id,
00137                                               float rating, float certainty) {
00138   BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();  // matcher result
00139   BLOB_CHOICE *choice =
00140       new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false);
00141   BLOB_CHOICE_IT temp_it(ratings);
00142   temp_it.add_after_stay_put(choice);
00143   return ratings;
00144 }
00145 
00152 void Wordrec::update_blob_classifications(
00153     TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) {
00154   TBLOB *tblob = word->blobs;
00155   int index = 0;
00156   for (; tblob != NULL && index < choices.length();
00157        tblob = tblob->next, index++) {
00158     blob_match_table.add_to_match(tblob, choices.get(index));
00159   }
00160 }
00161 
00162 }  // namespace tesseract;