Tesseract
3.02
|
#include "mfcpch.h"
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "memry.h"
#include "reject.h"
#include "tfacep.h"
#include "imgs.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
#include "notdll.h"
Go to the source code of this file.
Namespaces | |
namespace | tesseract |
Functions | |
CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract | |
void | reject_blanks (WERD_RES *word) |
void | reject_poor_matches (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) |
float | compute_reject_threshold (BLOB_CHOICE_LIST_CLIST *blob_choices) |
CLISTIZEH | ( | STRING | ) |
Definition at line 55 of file reject.cpp.
{ void Tesseract::set_done( //set done flag WERD_RES *word, inT16 pass) { /* 0: Original heuristic used in Tesseract and Ray's prototype Resaljet */ if (tessedit_ok_mode == 0) { /* NOTE - done even if word contains some or all spaces !!! */ word->done = word->tess_accepted; } /* 1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts */ else if (tessedit_ok_mode == 1) { word->done = word->tess_accepted && (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) word->done = FALSE; } /* 2: as 1 + only accept dict words or numerics in pass 1 */ else if (tessedit_ok_mode == 2) { word->done = word->tess_accepted && (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) word->done = FALSE; if (word->done && (pass == 1) && (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && (word->best_choice->permuter () != FREQ_DAWG_PERM) && (word->best_choice->permuter () != USER_DAWG_PERM) && (word->best_choice->permuter () != NUMBER_PERM)) { #ifndef SECURE_NAMES if (tessedit_rejection_debug) tprintf ("\nVETO Tess accepting poor word \"%s\"\n", word->best_choice->unichar_string().string ()); #endif word->done = FALSE; } } /* 3: as 2 + only accept dict words or numerics in pass 2 as well */ else if (tessedit_ok_mode == 3) { word->done = word->tess_accepted && (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) word->done = FALSE; if (word->done && (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && (word->best_choice->permuter () != FREQ_DAWG_PERM) && (word->best_choice->permuter () != USER_DAWG_PERM) && (word->best_choice->permuter () != NUMBER_PERM)) { #ifndef SECURE_NAMES if (tessedit_rejection_debug) tprintf ("\nVETO Tess accepting poor word \"%s\"\n", word->best_choice->unichar_string().string ()); #endif word->done = 
FALSE; } } /* 4: as 2 + reject dict ambigs in pass 1 */ else if (tessedit_ok_mode == 4) { word->done = word->tess_accepted && (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) word->done = FALSE; if (word->done && (pass == 1) && (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && (word->best_choice->permuter () != FREQ_DAWG_PERM) && (word->best_choice->permuter () != USER_DAWG_PERM) && (word->best_choice->permuter () != NUMBER_PERM)) || (test_ambig_word (word)))) { #ifndef SECURE_NAMES if (tessedit_rejection_debug) tprintf ("\nVETO Tess accepting poor word \"%s\"\n", word->best_choice->unichar_string().string ()); #endif word->done = FALSE; } } /* 5: as 3 + reject dict ambigs in both passes */ else if (tessedit_ok_mode == 5) { word->done = word->tess_accepted && (strchr (word->best_choice->unichar_string().string (), ' ') == NULL); if (word->done && (pass == 1) && one_ell_conflict (word, FALSE)) word->done = FALSE; if (word->done && (((word->best_choice->permuter () != SYSTEM_DAWG_PERM) && (word->best_choice->permuter () != FREQ_DAWG_PERM) && (word->best_choice->permuter () != USER_DAWG_PERM) && (word->best_choice->permuter () != NUMBER_PERM)) || (test_ambig_word (word)))) { #ifndef SECURE_NAMES if (tessedit_rejection_debug) tprintf ("\nVETO Tess accepting poor word \"%s\"\n", word->best_choice->unichar_string().string ()); #endif word->done = FALSE; } } else { tprintf ("BAD tessedit_ok_mode\n"); err_exit(); } } /************************************************************************* * make_reject_map() * * Sets the done flag to indicate whether the resylt is acceptable. * * Sets a reject map for the word. *************************************************************************/ void Tesseract::make_reject_map( //make rej map for wd //detailed results WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass //1st or 2nd? 
) { int i; int offset; flip_0O(word); check_debug_pt(word, -1); // For trap only set_done(word, pass); // Set acceptance word->reject_map.initialise(word->best_choice->unichar_lengths().length()); reject_blanks(word); /* 0: Rays original heuristic - the baseline */ if (tessedit_reject_mode == 0) { if (!word->done) reject_poor_matches(word, blob_choices); } else if (tessedit_reject_mode == 5) { /* 5: Reject I/1/l from words where there is no strong contextual confirmation; the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); and the whole of any words which are very small */ if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { word->reject_map.rej_word_small_xht(); } else { one_ell_conflict(word, TRUE); /* Originally the code here just used the done flag. Now I have duplicated and unpacked the conditions for setting the done flag so that each mechanism can be turned on or off independently. This works WITHOUT affecting the done flag setting. */ if (rej_use_tess_accepted && !word->tess_accepted) word->reject_map.rej_word_not_tess_accepted (); if (rej_use_tess_blanks && (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) word->reject_map.rej_word_contains_blanks (); WERD_CHOICE* best_choice = word->best_choice; if (rej_use_good_perm) { if ((best_choice->permuter() == SYSTEM_DAWG_PERM || best_choice->permuter() == FREQ_DAWG_PERM || best_choice->permuter() == USER_DAWG_PERM) && (!rej_use_sensible_wd || acceptable_word_string(*word->uch_set, best_choice->unichar_string().string(), best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE)) { // PASSED TEST } else if (best_choice->permuter() == NUMBER_PERM) { if (rej_alphas_in_number_perm) { for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; offset += best_choice->unichar_lengths()[i++]) { if (word->reject_map[i].accepted() && word->uch_set->get_isalpha( best_choice->unichar_string().string() + offset, best_choice->unichar_lengths()[i])) 
word->reject_map[i].setrej_bad_permuter(); // rej alpha } } } else { word->reject_map.rej_word_bad_permuter(); } } /* Ambig word rejection was here once !!*/ } } else { tprintf("BAD tessedit_reject_mode\n"); err_exit(); } if (tessedit_image_border > -1) reject_edge_blobs(word); check_debug_pt (word, 10); if (tessedit_rejection_debug) { tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); } flip_hyphens(word); check_debug_pt(word, 20); } } // namespace tesseract
float compute_reject_threshold | ( | BLOB_CHOICE_LIST_CLIST * | blob_choices | ) |
Definition at line 370 of file reject.cpp.
{
  /*
   * Computes the certainty threshold below which a blob's top choice is
   * treated as a poor match.
   *
   * The certainty of the first choice of every blob that has at least one
   * choice is collected and sorted ascending. With three or more such
   * blobs, the threshold is placed in the middle of the widest gap between
   * neighbouring certainties. With fewer, it is set one below the lowest
   * certainty.
   *
   * blob_choices: one choice list per blob of the word.
   * Returns the threshold. If the list is empty, or no blob has any choice,
   * there is nothing to measure and a fixed 0 is returned; the original
   * code read uninitialised storage in that situation.
   */
  const float kNoRatedBlobsThreshold = 0.0f;

  inT16 index;                   //to ratings
  inT16 blob_count;              //no of blobs in word
  inT16 ok_blob_count = 0;       //non TESS rej blobs in word
  float *ratings;                //array of confidences
  float threshold;               //rejection threshold
  float bestgap;                 //biggest gap
  float gapstart;                //bottom of gap
                                 //super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;      //real iterator

  blob_count = blob_choices->length ();
  if (blob_count <= 0)
    return kNoRatedBlobsThreshold;  // avoid a zero-sized allocation and read

  ratings = (float *) alloc_mem (blob_count * sizeof (float));
  for (list_it.mark_cycle_pt (), index = 0;
       !list_it.cycled_list (); list_it.forward (), index++) {
    choice_it.set_to_list (list_it.data ());
    if (choice_it.length () > 0) {
      // Only blobs that actually have a choice contribute a certainty.
      ratings[ok_blob_count] = choice_it.data ()->certainty ();
      ok_blob_count++;
    }
  }
  ASSERT_HOST (index == blob_count);

  if (ok_blob_count == 0) {
    // Nothing was written to ratings[]; do not sort or read it.
    free_mem(ratings);
    return kNoRatedBlobsThreshold;
  }

  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
                                 //sort them
  bestgap = 0;
  gapstart = ratings[0] - 1;     //default: one below the lowest certainty
  if (ok_blob_count >= 3) {
    for (index = 0; index < ok_blob_count - 1; index++) {
      if (ratings[index + 1] - ratings[index] > bestgap) {
        bestgap = ratings[index + 1] - ratings[index];
                                 //find biggest
        gapstart = ratings[index];
      }
    }
  }
  threshold = gapstart + bestgap / 2;

  free_mem(ratings);
  return threshold;
}
void reject_blanks | ( | WERD_RES * | word | ) |
Definition at line 290 of file reject.cpp.
{
  /*
   * Marks every blob whose character in the best choice is a blank as a
   * Tess failure in the word's reject map.
   *
   * The best choice string is walked one unichar at a time: blob_index
   * counts unichars, while byte_offset is the position of that unichar's
   * first byte, advanced by the matching entry of unichar_lengths().
   */
  inT16 blob_index = 0;
  inT16 byte_offset = 0;

  while (word->best_choice->unichar_string()[byte_offset] != '\0') {
    if (word->best_choice->unichar_string()[byte_offset] == ' ') {
      // rej unrecognised blobs
      word->reject_map[blob_index].setrej_tess_failure ();
    }
    byte_offset += word->best_choice->unichar_lengths()[blob_index];
    blob_index += 1;
  }
}
void reject_poor_matches | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
Definition at line 319 of file reject.cpp.
{
  /*
   * Rejects individual blobs of a word that were matched poorly.
   *
   * A blob is flagged as a Tess failure when its character in the best
   * choice is a blank or its choice list is empty. Otherwise it is flagged
   * as a poor match when the certainty of its top choice is below the
   * threshold from compute_reject_threshold().
   *
   * word:         word whose reject map is updated.
   * blob_choices: one choice list per blob; must have as many entries as
   *               the best choice has unichars and as box_word has boxes
   *               (both are asserted).
   */
  float threshold;
  inT16 i = 0;                   // blob / unichar index
  inT16 offset = 0;              // byte offset of unichar i in the string
                                 //super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;      //real iterator

  // Number of unichars in the best choice, as a signed value so that it
  // compares and prints consistently with the list lengths.
  const int num_unichars =
      static_cast<int>(strlen(word->best_choice->unichar_lengths().string()));

#ifndef SECURE_NAMES
  if (num_unichars != list_it.length()) {
    tprintf
      ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
      word->best_choice->unichar_string().string(),
      num_unichars,
      list_it.length(), word->box_word->length());
  }
#endif
  ASSERT_HOST (num_unichars == list_it.length ());
  ASSERT_HOST(word->box_word->length() == list_it.length());

  threshold = compute_reject_threshold (blob_choices);

  // Advance offset by the length of the unichar just processed BEFORE
  // moving i on. The previous order (i++ first) stepped by the length of
  // the following unichar, which misplaces the blank test below whenever
  // unichars differ in byte length.
  for (list_it.mark_cycle_pt ();
       !list_it.cycled_list ();
       list_it.forward (),
           offset += word->best_choice->unichar_lengths()[i], i++) {
    /* NB - only compares the threshold against the TOP choice char in the
      choices list for a blob !! - the selected one may be below the threshold
    */
    choice_it.set_to_list (list_it.data ());
    if ((word->best_choice->unichar_string()[offset] == ' ') ||
        (choice_it.length () == 0)) {
      //rej unrecognised blobs
      word->reject_map[i].setrej_tess_failure ();
    } else if (choice_it.data ()->certainty () < threshold) {
      //rej poor score blob
      word->reject_map[i].setrej_poor_match ();
    }
  }
}