Tesseract
3.02
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: wordclass.c (Formerly wordclass.c) 00005 * Description: Word classifier 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Tue Jan 30 14:03:25 1990 00008 * Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Experimental (Do Not Distribute) 00012 * 00013 * (c) Copyright 1990, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I N C L U D E S 00027 ----------------------------------------------------------------------*/ 00028 #include <stdio.h> 00029 #ifdef __UNIX__ 00030 #include <assert.h> 00031 #endif 00032 00033 #include "wordclass.h" 00034 #include "associate.h" 00035 #include "render.h" 00036 #include "matchtab.h" 00037 #include "permute.h" 00038 #include "callcpp.h" 00039 #include <assert.h> 00040 #include "wordrec.h" 00041 00042 // Include automatically generated configuration file if running autoconf. 00043 #ifdef HAVE_CONFIG_H 00044 #include "config_auto.h" 00045 #endif 00046 00047 /*---------------------------------------------------------------------- 00048 F u n c t i o n s 00049 ----------------------------------------------------------------------*/ 00050 namespace tesseract { 00062 BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob, const DENORM& denorm, 00063 const char *string, C_COL color, 00064 BlamerBundle *blamer_bundle) { 00065 fflush(stdout); 00066 BLOB_CHOICE_LIST *choices = NULL; 00067 #ifndef GRAPHICS_DISABLED 00068 if (wordrec_display_all_blobs) 00069 display_blob(blob, color); 00070 #endif 00071 choices = blob_match_table.get_match(blob); 00072 if (choices == NULL) { 00073 choices = call_matcher(&denorm, blob); 00074 blob_match_table.put_match(blob, choices); 00075 // If a blob with the same bounding box as one of the truth character 00076 // bounding boxes is not classified as the corresponding truth character 00077 // blame character classifier for incorrect answer. 00078 if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes && 00079 blamer_bundle->incorrect_result_reason == IRR_CORRECT) { 00080 for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) { 00081 const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b); 00082 const TBOX &blob_box = blob->bounding_box(); 00083 // Note that we are more strict on the bounding box boundaries here 00084 // than in other places (chopper, segmentation search), since we do 00085 // not have the ability to check the previous and next bounding box. 00086 if (blob_box.x_almost_equal(truth_box, 00087 blamer_bundle->norm_box_tolerance/2)) { 00088 BLOB_CHOICE_IT choices_it(choices); 00089 bool found = false; 00090 bool incorrect_adapted = false; 00091 UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID; 00092 const char *truth_str = blamer_bundle->truth_text[b].string(); 00093 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00094 choices_it.forward()) { 00095 if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar( 00096 choices_it.data()->unichar_id())) == 0) { 00097 found = true; 00098 break; 00099 } else if (choices_it.data()->adapted()) { 00100 incorrect_adapted = true; 00101 incorrect_adapted_id = choices_it.data()->unichar_id(); 00102 } 00103 } // end choices_it for loop 00104 if (!found) { 00105 STRING debug = "unichar "; 00106 debug += truth_str; 00107 debug += " not found in classification list"; 00108 blamer_bundle->SetBlame(IRR_CLASSIFIER, debug, 00109 NULL, wordrec_debug_blamer); 00110 } else if (incorrect_adapted) { 00111 STRING debug = "better rating for adapted "; 00112 debug += getDict().getUnicharset().id_to_unichar( 00113 incorrect_adapted_id); 00114 debug += " than for correct "; 00115 debug += truth_str; 00116 blamer_bundle->SetBlame(IRR_ADAPTION, debug, 00117 NULL, wordrec_debug_blamer); 00118 } 00119 break; 00120 } 00121 } // end iterating over blamer_bundle->norm_truth_word 00122 } 00123 } 00124 #ifndef GRAPHICS_DISABLED 00125 if (classify_debug_level && string) 00126 print_ratings_list(string, choices, getDict().getUnicharset()); 00127 00128 if (wordrec_blob_pause) 00129 window_wait(blob_window); 00130 #endif 00131 00132 return (choices); 00133 } 00134 00135 // Returns a valid BLOB_CHOICE_LIST representing the given result. 00136 BLOB_CHOICE_LIST *Wordrec::fake_classify_blob(UNICHAR_ID class_id, 00137 float rating, float certainty) { 00138 BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result 00139 BLOB_CHOICE *choice = 00140 new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false); 00141 BLOB_CHOICE_IT temp_it(ratings); 00142 temp_it.add_after_stay_put(choice); 00143 return ratings; 00144 } 00145 00152 void Wordrec::update_blob_classifications( 00153 TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) { 00154 TBLOB *tblob = word->blobs; 00155 int index = 0; 00156 for (; tblob != NULL && index < choices.length(); 00157 tblob = tblob->next, index++) { 00158 blob_match_table.add_to_match(tblob, choices.get(index)); 00159 } 00160 } 00161 00162 } // namespace tesseract;