Tesseract
3.02
|
00001 /****************************************************************** 00002 * File: output.cpp (Formerly output.c) 00003 * Description: Output pass 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 4 10:56:08 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "mfcpch.h" 00025 #include <string.h> 00026 #include <ctype.h> 00027 #ifdef __UNIX__ 00028 #include <assert.h> 00029 #include <unistd.h> 00030 #include <errno.h> 00031 #endif 00032 #include "helpers.h" 00033 #include "tfacep.h" 00034 #include "tessvars.h" 00035 #include "control.h" 00036 #include "secname.h" 00037 #include "reject.h" 00038 #include "docqual.h" 00039 #include "output.h" 00040 #include "bestfirst.h" 00041 #include "globals.h" 00042 #include "tesseractclass.h" 00043 00044 #define EPAPER_EXT ".ep" 00045 #define PAGE_YSIZE 3508 00046 #define CTRL_INSET '\024' //dc4=text inset 00047 #define CTRL_FONT '\016' //so=font change 00048 #define CTRL_DEFAULT '\017' //si=default font 00049 #define CTRL_SHIFT '\022' //dc2=x shift 00050 #define CTRL_TAB '\011' //tab 00051 #define CTRL_NEWLINE '\012' //newline 00052 #define CTRL_HARDLINE '\015' //cr 00053 00054 /********************************************************************** 00055 * pixels_to_pts 00056 * 00057 * Convert an integer number of pixels to the nearest integer 00058 * number of points. 00059 **********************************************************************/ 00060 00061 inT32 pixels_to_pts( //convert coords 00062 inT32 pixels, 00063 inT32 pix_res //resolution 00064 ) { 00065 float pts; //converted value 00066 00067 pts = pixels * 72.0 / pix_res; 00068 return (inT32) (pts + 0.5); //round it 00069 } 00070 00071 namespace tesseract { 00072 void Tesseract::output_pass( //Tess output pass //send to api 00073 PAGE_RES_IT &page_res_it, 00074 const TBOX *target_word_box) { 00075 BLOCK_RES *block_of_last_word; 00076 inT16 block_id; 00077 BOOL8 force_eol; //During output 00078 BLOCK *nextblock; //block of next word 00079 WERD *nextword; //next word 00080 00081 page_res_it.restart_page (); 00082 block_of_last_word = NULL; 00083 while (page_res_it.word () != NULL) { 00084 check_debug_pt (page_res_it.word (), 120); 00085 00086 if (target_word_box) 00087 { 00088 00089 TBOX current_word_box=page_res_it.word ()->word->bounding_box(); 00090 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); 00091 if (!target_word_box->contains(center_pt)) 00092 { 00093 page_res_it.forward (); 00094 continue; 00095 } 00096 00097 } 00098 if (tessedit_write_block_separators && 00099 block_of_last_word != page_res_it.block ()) { 00100 block_of_last_word = page_res_it.block (); 00101 block_id = block_of_last_word->block->index(); 00102 } 00103 00104 force_eol = (tessedit_write_block_separators && 00105 (page_res_it.block () != page_res_it.next_block ())) || 00106 (page_res_it.next_word () == NULL); 00107 00108 if (page_res_it.next_word () != NULL) 00109 nextword = page_res_it.next_word ()->word; 00110 else 00111 nextword = NULL; 00112 if (page_res_it.next_block () != NULL) 00113 nextblock = page_res_it.next_block ()->block; 00114 else 00115 nextblock = NULL; 00116 //regardless of tilde crunching 00117 write_results(page_res_it, 00118 determine_newline_type(page_res_it.word()->word, 00119 page_res_it.block()->block, 00120 nextword, nextblock), force_eol); 00121 page_res_it.forward(); 00122 } 00123 } 00124 00125 00126 /************************************************************************* 00127 * write_results() 00128 * 00129 * All recognition and rejection has now been done. Generate the following: 00130 * .txt file - giving the final best choices with NO highlighting 00131 * .raw file - giving the tesseract top choice output for each word 00132 * .map file - showing how the .txt file has been rejected in the .ep file 00133 * epchoice list - a list of one element per word, containing the text for the 00134 * epaper. Reject strings are inserted. 00135 * inset list - a list of bounding boxes of reject insets - indexed by the 00136 * reject strings in the epchoice text. 00137 *************************************************************************/ 00138 void Tesseract::write_results(PAGE_RES_IT &page_res_it, 00139 char newline_type, // type of newline 00140 BOOL8 force_eol) { // override tilde crunch? 00141 WERD_RES *word = page_res_it.word(); 00142 const UNICHARSET &uchset = *word->uch_set; 00143 STRING repetition_code; 00144 const STRING *wordstr; 00145 STRING wordstr_lengths; 00146 int i; 00147 char unrecognised = STRING (unrecognised_char)[0]; 00148 char ep_chars[32]; //Only for unlv_tilde_crunch 00149 int ep_chars_index = 0; 00150 char txt_chs[32]; //Only for unlv_tilde_crunch 00151 char map_chs[32]; //Only for unlv_tilde_crunch 00152 int txt_index = 0; 00153 BOOL8 need_reject = FALSE; 00154 UNICHAR_ID space = uchset.unichar_to_id(" "); 00155 if ((word->unlv_crunch_mode != CR_NONE || 00156 word->best_choice->length() == 0) && 00157 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { 00158 if ((word->unlv_crunch_mode != CR_DELETE) && 00159 (!stats_.tilde_crunch_written || 00160 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && 00161 (word->word->space () > 0) && 00162 !word->word->flag (W_FUZZY_NON) && 00163 !word->word->flag (W_FUZZY_SP)))) { 00164 if (!word->word->flag (W_BOL) && 00165 (word->word->space () > 0) && 00166 !word->word->flag (W_FUZZY_NON) && 00167 !word->word->flag (W_FUZZY_SP)) { 00168 // Write a space to separate from preceeding good text. 00169 txt_chs[txt_index] = ' '; 00170 map_chs[txt_index++] = '1'; 00171 ep_chars[ep_chars_index++] = ' '; 00172 stats_.last_char_was_tilde = false; 00173 } 00174 need_reject = TRUE; 00175 } 00176 if ((need_reject && !stats_.last_char_was_tilde) || 00177 (force_eol && stats_.write_results_empty_block)) { 00178 /* Write a reject char - mark as rejected unless zero_rejection mode */ 00179 stats_.last_char_was_tilde = TRUE; 00180 txt_chs[txt_index] = unrecognised; 00181 if (tessedit_zero_rejection || (suspect_level == 0)) { 00182 map_chs[txt_index++] = '1'; 00183 ep_chars[ep_chars_index++] = unrecognised; 00184 } 00185 else { 00186 map_chs[txt_index++] = '0'; 00187 /* 00188 The ep_choice string is a faked reject to allow newdiff to sync the 00189 .etx with the .txt and .map files. 00190 */ 00191 ep_chars[ep_chars_index++] = CTRL_INSET; // escape code 00192 //dummy reject 00193 ep_chars[ep_chars_index++] = 1; 00194 //dummy reject 00195 ep_chars[ep_chars_index++] = 1; 00196 //type 00197 ep_chars[ep_chars_index++] = 2; 00198 //dummy reject 00199 ep_chars[ep_chars_index++] = 1; 00200 //dummy reject 00201 ep_chars[ep_chars_index++] = 1; 00202 } 00203 stats_.tilde_crunch_written = true; 00204 stats_.last_char_was_newline = false; 00205 stats_.write_results_empty_block = false; 00206 } 00207 00208 if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) { 00209 /* Add a new line output */ 00210 txt_chs[txt_index] = '\n'; 00211 map_chs[txt_index++] = '\n'; 00212 //end line 00213 ep_chars[ep_chars_index++] = newline_type; 00214 00215 //Cos of the real newline 00216 stats_.tilde_crunch_written = false; 00217 stats_.last_char_was_newline = true; 00218 stats_.last_char_was_tilde = false; 00219 } 00220 txt_chs[txt_index] = '\0'; 00221 map_chs[txt_index] = '\0'; 00222 ep_chars[ep_chars_index] = '\0'; // terminate string 00223 word->ep_choice = new WERD_CHOICE(ep_chars, uchset); 00224 00225 if (force_eol) 00226 stats_.write_results_empty_block = true; 00227 return; 00228 } 00229 00230 /* NORMAL PROCESSING of non tilde crunched words */ 00231 00232 stats_.tilde_crunch_written = false; 00233 if (newline_type) 00234 stats_.last_char_was_newline = true; 00235 else 00236 stats_.last_char_was_newline = false; 00237 stats_.write_results_empty_block = force_eol; // about to write a real word 00238 00239 if (unlv_tilde_crunching && 00240 stats_.last_char_was_tilde && 00241 (word->word->space() == 0) && 00242 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && 00243 (word->best_choice->unichar_id(0) == space)) { 00244 /* Prevent adjacent tilde across words - we know that adjacent tildes within 00245 words have been removed */ 00246 word->best_choice->remove_unichar_id(0); 00247 if (word->best_choice->blob_choices() != NULL) { 00248 BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); 00249 if (!blob_choices_it.empty()) delete blob_choices_it.extract(); 00250 } 00251 word->reject_map.remove_pos (0); 00252 word->box_word->DeleteBox(0); 00253 } 00254 if (newline_type || 00255 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) 00256 stats_.last_char_was_tilde = false; 00257 else { 00258 if (word->reject_map.length () > 0) { 00259 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) 00260 stats_.last_char_was_tilde = true; 00261 else 00262 stats_.last_char_was_tilde = false; 00263 } 00264 else if (word->word->space () > 0) 00265 stats_.last_char_was_tilde = false; 00266 /* else it is unchanged as there are no output chars */ 00267 } 00268 00269 ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); 00270 00271 set_unlv_suspects(word); 00272 check_debug_pt (word, 120); 00273 if (tessedit_rejection_debug) { 00274 tprintf ("Dict word: \"%s\": %d\n", 00275 word->best_choice->debug_string().string(), 00276 dict_word(*(word->best_choice))); 00277 } 00278 if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { 00279 repetition_code = "|^~R"; 00280 wordstr_lengths = "\001\001\001\001"; 00281 repetition_code += uchset.id_to_unichar(get_rep_char(word)); 00282 wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word))); 00283 wordstr = &repetition_code; 00284 } else { 00285 if (tessedit_zero_rejection) { 00286 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00287 for (i = 0; i < word->best_choice->length(); ++i) { 00288 if (word->reject_map[i].rejected()) 00289 word->reject_map[i].setrej_minimal_rej_accept(); 00290 } 00291 } 00292 if (tessedit_minimal_rejection) { 00293 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00294 for (i = 0; i < word->best_choice->length(); ++i) { 00295 if ((word->best_choice->unichar_id(i) != space) && 00296 word->reject_map[i].rejected()) 00297 word->reject_map[i].setrej_minimal_rej_accept(); 00298 } 00299 } 00300 } 00301 } 00302 } // namespace tesseract 00303 00304 /********************************************************************** 00305 * determine_newline_type 00306 * 00307 * Find whether we have a wrapping or hard newline. 00308 * Return FALSE if not at end of line. 00309 **********************************************************************/ 00310 00311 char determine_newline_type( //test line ends 00312 WERD *word, //word to do 00313 BLOCK *block, //current block 00314 WERD *next_word, //next word 00315 BLOCK *next_block //block of next word 00316 ) { 00317 inT16 end_gap; //to right edge 00318 inT16 width; //of next word 00319 TBOX word_box; //bounding 00320 TBOX next_box; //next word 00321 TBOX block_box; //block bounding 00322 00323 if (!word->flag (W_EOL)) 00324 return FALSE; //not end of line 00325 if (next_word == NULL || next_block == NULL || block != next_block) 00326 return CTRL_NEWLINE; 00327 if (next_word->space () > 0) 00328 return CTRL_HARDLINE; //it is tabbed 00329 word_box = word->bounding_box (); 00330 next_box = next_word->bounding_box (); 00331 block_box = block->bounding_box (); 00332 //gap to eol 00333 end_gap = block_box.right () - word_box.right (); 00334 end_gap -= (inT32) block->space (); 00335 width = next_box.right () - next_box.left (); 00336 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", 00337 // block_box.right(),word_box.right(),end_gap, 00338 // next_box.right(),next_box.left(),width, 00339 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); 00340 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; 00341 } 00342 00343 /************************************************************************* 00344 * get_rep_char() 00345 * Return the first accepted character from the repetition string. This is the 00346 * character which is repeated - as determined earlier by fix_rep_char() 00347 *************************************************************************/ 00348 namespace tesseract { 00349 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? 00350 int i; 00351 for (i = 0; ((i < word->reject_map.length()) && 00352 (word->reject_map[i].rejected())); ++i); 00353 00354 if (i < word->reject_map.length()) { 00355 return word->best_choice->unichar_id(i); 00356 } else { 00357 return word->uch_set->unichar_to_id(unrecognised_char.string()); 00358 } 00359 } 00360 00361 /************************************************************************* 00362 * SUSPECT LEVELS 00363 * 00364 * 0 - dont reject ANYTHING 00365 * 1,2 - partial rejection 00366 * 3 - BEST 00367 * 00368 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and 00369 * tessedit_minimal_rejection. 00370 *************************************************************************/ 00371 void Tesseract::set_unlv_suspects(WERD_RES *word_res) { 00372 int len = word_res->reject_map.length(); 00373 const WERD_CHOICE &word = *(word_res->best_choice); 00374 const UNICHARSET &uchset = *word.unicharset(); 00375 int i; 00376 float rating_per_ch; 00377 00378 if (suspect_level == 0) { 00379 for (i = 0; i < len; i++) { 00380 if (word_res->reject_map[i].rejected()) 00381 word_res->reject_map[i].setrej_minimal_rej_accept(); 00382 } 00383 return; 00384 } 00385 00386 if (suspect_level >= 3) 00387 return; //Use defaults 00388 00389 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ 00390 00391 if (safe_dict_word(word_res) && 00392 (count_alphas(word) > suspect_short_words)) { 00393 /* Unreject alphas in dictionary words */ 00394 for (i = 0; i < len; ++i) { 00395 if (word_res->reject_map[i].rejected() && 00396 uchset.get_isalpha(word.unichar_id(i))) 00397 word_res->reject_map[i].setrej_minimal_rej_accept(); 00398 } 00399 } 00400 00401 rating_per_ch = word.rating() / word_res->reject_map.length(); 00402 00403 if (rating_per_ch >= suspect_rating_per_ch) 00404 return; //Dont touch bad ratings 00405 00406 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { 00407 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ 00408 for (i = 0; i < len; ++i) { 00409 if (word_res->reject_map[i].rejected() && 00410 (!uchset.eq(word.unichar_id(i), " "))) 00411 word_res->reject_map[i].setrej_minimal_rej_accept(); 00412 } 00413 } 00414 00415 for (i = 0; i < len; i++) { 00416 if (word_res->reject_map[i].rejected()) { 00417 if (word_res->reject_map[i].flag(R_DOC_REJ)) 00418 word_res->reject_map[i].setrej_minimal_rej_accept(); 00419 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) 00420 word_res->reject_map[i].setrej_minimal_rej_accept(); 00421 if (word_res->reject_map[i].flag(R_ROW_REJ)) 00422 word_res->reject_map[i].setrej_minimal_rej_accept(); 00423 } 00424 } 00425 00426 if (suspect_level == 2) 00427 return; 00428 00429 if (!suspect_constrain_1Il || 00430 (word_res->reject_map.length() <= suspect_short_words)) { 00431 for (i = 0; i < len; i++) { 00432 if (word_res->reject_map[i].rejected()) { 00433 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || 00434 word_res->reject_map[i].flag(R_POSTNN_1IL))) 00435 word_res->reject_map[i].setrej_minimal_rej_accept(); 00436 00437 if (!suspect_constrain_1Il && 00438 word_res->reject_map[i].flag(R_MM_REJECT)) 00439 word_res->reject_map[i].setrej_minimal_rej_accept(); 00440 } 00441 } 00442 } 00443 00444 if (acceptable_word_string(*word_res->uch_set, 00445 word.unichar_string().string(), 00446 word.unichar_lengths().string()) != 00447 AC_UNACCEPTABLE || 00448 acceptable_number_string(word.unichar_string().string(), 00449 word.unichar_lengths().string())) { 00450 if (word_res->reject_map.length() > suspect_short_words) { 00451 for (i = 0; i < len; i++) { 00452 if (word_res->reject_map[i].rejected() && 00453 (!word_res->reject_map[i].perm_rejected() || 00454 word_res->reject_map[i].flag (R_1IL_CONFLICT) || 00455 word_res->reject_map[i].flag (R_POSTNN_1IL) || 00456 word_res->reject_map[i].flag (R_MM_REJECT))) { 00457 word_res->reject_map[i].setrej_minimal_rej_accept(); 00458 } 00459 } 00460 } 00461 } 00462 } 00463 00464 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { 00465 int count = 0; 00466 for (int i = 0; i < word.length(); ++i) { 00467 if (word.unicharset()->get_isalpha(word.unichar_id(i))) 00468 count++; 00469 } 00470 return count; 00471 } 00472 00473 00474 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { 00475 int count = 0; 00476 for (int i = 0; i < word.length(); ++i) { 00477 if (word.unicharset()->get_isalpha(word.unichar_id(i)) || 00478 word.unicharset()->get_isdigit(word.unichar_id(i))) 00479 count++; 00480 } 00481 return count; 00482 } 00483 00484 00485 BOOL8 Tesseract::acceptable_number_string(const char *s, 00486 const char *lengths) { 00487 BOOL8 prev_digit = FALSE; 00488 00489 if (*lengths == 1 && *s == '(') 00490 s++; 00491 00492 if (*lengths == 1 && 00493 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) 00494 s++; 00495 00496 for (; *s != '\0'; s += *(lengths++)) { 00497 if (unicharset.get_isdigit(s, *lengths)) 00498 prev_digit = TRUE; 00499 else if (prev_digit && 00500 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) 00501 prev_digit = FALSE; 00502 else if (prev_digit && *lengths == 1 && 00503 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) 00504 return TRUE; 00505 else if (prev_digit && 00506 *lengths == 1 && (*s == '%') && 00507 (*(lengths + 1) == 1 && *(s + *lengths) == ')') && 00508 (*(s + *lengths + *(lengths + 1)) == '\0')) 00509 return TRUE; 00510 else 00511 return FALSE; 00512 } 00513 return TRUE; 00514 } 00515 } // namespace tesseract