Tesseract  3.02
tesseract-ocr/ccmain/reject.cpp File Reference
#include "mfcpch.h"
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "memry.h"
#include "reject.h"
#include "tfacep.h"
#include "imgs.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
#include "notdll.h"

Go to the source code of this file.

Namespaces

namespace  tesseract

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
void reject_blanks (WERD_RES *word)
void reject_poor_matches (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
float compute_reject_threshold (BLOB_CHOICE_LIST_CLIST *blob_choices)

Function Documentation

CLISTIZEH ( STRING  )

Definition at line 55 of file reject.cpp.

                    {
/*************************************************************************
 * set_done()
 *
 * Records in word->done whether the current result for the word is to be
 * treated as final. The test applied is selected by tessedit_ok_mode:
 *
 *   0: done simply mirrors tess_accepted (blanks in the word are ignored).
 *   1: tess_accepted, no blanks in the best choice, and - on pass 1 only -
 *      no I/l/1 conflict.
 *   2: as 1, and on pass 1 the best choice must come from a system, frequent
 *      or user dawg permuter, or from the number permuter.
 *   3: as 2, but the permuter test is applied on every pass.
 *   4: as 2, and on pass 1 words flagged by test_ambig_word() are vetoed too.
 *   5: as 3, and words flagged by test_ambig_word() are vetoed on every pass.
 *
 * Any other mode is reported via tprintf and handed to err_exit(); in that
 * case word->done is left untouched.
 *
 * word: the word result to update.
 * pass: recognition pass number; 1 enables the pass-1-only checks.
 *************************************************************************/
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  const int ok_mode = tessedit_ok_mode;

  if (ok_mode < 0 || ok_mode > 5) {
    tprintf ("BAD tessedit_ok_mode\n");
    err_exit();
    return;
  }

  if (ok_mode == 0) {
    /* NOTE - done even if word contains some or all spaces !!! */
    word->done = word->tess_accepted;
    return;
  }

  // Modes 1-5 all start from the same base: accepted by Tess and free of
  // blanks. The blank search is only evaluated for accepted words.
  word->done = word->tess_accepted &&
    (strchr (word->best_choice->unichar_string().string (), ' ') == NULL);

  // I/l/1 conflicts only ever veto a word on the first pass.
  if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
    word->done = FALSE;

  if (ok_mode == 1)
    return;

  // Modes 3 and 5 apply the permuter test on every pass, modes 2 and 4 only
  // on pass 1. Modes 4 and 5 additionally veto ambiguous dictionary words.
  const bool check_every_pass = (ok_mode == 3) || (ok_mode == 5);
  const bool veto_ambig_words = (ok_mode == 4) || (ok_mode == 5);

  if (!word->done)
    return;
  if (!check_every_pass && (pass != 1))
    return;

  const int perm = word->best_choice->permuter ();
  const bool trusted_perm = (perm == SYSTEM_DAWG_PERM) ||
                            (perm == FREQ_DAWG_PERM) ||
                            (perm == USER_DAWG_PERM) ||
                            (perm == NUMBER_PERM);

  // The ambiguity test is only consulted for words whose permuter is trusted.
  if (!trusted_perm || (veto_ambig_words && test_ambig_word (word))) {
    #ifndef SECURE_NAMES
    if (tessedit_rejection_debug)
      tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
        word->best_choice->unichar_string().string ());
    #endif
    word->done = FALSE;
  }
}


/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the result is acceptable, then
 * builds the reject map for the word.
 *
 * The map is initialised to one entry per character of the best choice and
 * blanks are rejected first. What happens next depends on
 * tessedit_reject_mode:
 *
 *   0: if the word is not done, reject_poor_matches() is applied.
 *   5: words whose x-height is too small are rejected whole; otherwise
 *      I/1/l conflicts are marked, and the individually switchable
 *      rej_use_* tests (tess acceptance, blanks, permuter quality) are
 *      applied without touching the done flag.
 *
 * Any other mode is reported via tprintf and handed to err_exit().
 *
 * word:         word result whose done flag and reject map are set.
 * blob_choices: per-blob choice lists, only passed to reject_poor_matches().
 * row:          not used by this function.
 * pass:         recognition pass number, forwarded to set_done().
 *************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word,
                                BLOB_CHOICE_LIST_CLIST *blob_choices,
                                ROW *row,
                                inT16 pass) {
  flip_0O(word);
  check_debug_pt(word, -1);     // For trap only
  set_done(word, pass);         // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  const int reject_mode = tessedit_reject_mode;
  if (reject_mode == 0) {
    // Mode 0: the baseline heuristic - only words not marked done are examined.
    if (!word->done)
      reject_poor_matches(word, blob_choices);
  } else if (reject_mode == 5) {
    // Mode 5: reject I/1/l where there is no strong contextual confirmation,
    // the whole of any unacceptable word (including permuter rejection of
    // dubious 1/I/l), and the whole of any very small word.
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      WERD_CHOICE* best_choice = word->best_choice;

      one_ell_conflict(word, TRUE);

      // The conditions that make up the done flag are repeated here one by
      // one so that each can be switched on or off independently; the done
      // flag itself is not modified.
      if (rej_use_tess_accepted && !word->tess_accepted)
        word->reject_map.rej_word_not_tess_accepted ();

      if (rej_use_tess_blanks &&
          (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
        word->reject_map.rej_word_contains_blanks ();

      if (rej_use_good_perm) {
        const int perm = best_choice->permuter();
        const bool dict_perm = (perm == SYSTEM_DAWG_PERM) ||
                               (perm == FREQ_DAWG_PERM) ||
                               (perm == USER_DAWG_PERM);

        // A dictionary permuter passes outright unless the sensible-word
        // test is enabled and finds the string unacceptable.
        bool passed = false;
        if (dict_perm) {
          passed = !rej_use_sensible_wd ||
                   acceptable_word_string(
                       *word->uch_set,
                       best_choice->unichar_string().string(),
                       best_choice->unichar_lengths().string()) !=
                       AC_UNACCEPTABLE;
        }

        if (!passed) {
          if (perm != NUMBER_PERM) {
            word->reject_map.rej_word_bad_permuter();
          } else if (rej_alphas_in_number_perm) {
            // Walk the characters of the best choice; byte_pos tracks the
            // start of each character inside the string.
            int blob = 0;
            int byte_pos = 0;
            while (best_choice->unichar_string()[byte_pos] != '\0') {
              const int char_len = best_choice->unichar_lengths()[blob];
              if (word->reject_map[blob].accepted() &&
                  word->uch_set->get_isalpha(
                      best_choice->unichar_string().string() + byte_pos,
                      char_len)) {
                // rej alpha
                word->reject_map[blob].setrej_bad_permuter();
              }
              byte_pos += char_len;
              ++blob;
            }
          }
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1)
    reject_edge_blobs(word);

  check_debug_pt (word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
    tprintf("Certainty: %f     Rating: %f\n",
      word->best_choice->certainty (), word->best_choice->rating ());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
}  // namespace tesseract
float compute_reject_threshold ( BLOB_CHOICE_LIST_CLIST *  blob_choices)

Definition at line 370 of file reject.cpp.

                                                                     {
  // Computes the certainty threshold used to reject poorly matched blobs.
  //
  // The certainty of the first choice of every blob that has at least one
  // choice is collected and sorted with sort_floats (presumably ascending -
  // the gap search below relies on that; confirm against sort_floats).
  // With three or more such blobs the threshold is placed in the middle of
  // the largest positive gap between neighbouring values; otherwise, or if
  // no positive gap exists, it is ratings[0] - 1.
  //
  // blob_choices: one choice list per blob of the word.
  // Returns the threshold, or 0.0f when no blob has any choice. In that case
  // there is no certainty to base a threshold on; previously an
  // uninitialised array element was read.
  inT16 index;                   //position in blob_choices
  inT16 blob_count;              //no of blobs in word
  inT16 ok_blob_count = 0;       //non TESS rej blobs in word
  float *ratings;                //array of confidences
  float threshold;               //rejection threshold
  float bestgap;                 //biggest gap
  float gapstart;                //bottom of gap
                                 //super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;      //real iterator

  blob_count = blob_choices->length ();
  ratings = (float *) alloc_mem (blob_count * sizeof (float));
  for (list_it.mark_cycle_pt (), index = 0;
  !list_it.cycled_list (); list_it.forward (), index++) {
    choice_it.set_to_list (list_it.data ());
    if (choice_it.length () > 0) {
      // Despite the array's name, it is the certainty that is stored.
      ratings[ok_blob_count] = choice_it.data ()->certainty ();
      ok_blob_count++;
    }
  }
  ASSERT_HOST (index == blob_count);

  if (ok_blob_count == 0) {
    // Nothing was written to ratings, so ratings[0] must not be read.
    free_mem(ratings);
    return 0.0f;
  }

  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
  //sort them
  bestgap = 0;
  gapstart = ratings[0] - 1;     //all reject if none better
  if (ok_blob_count >= 3) {
    for (index = 0; index < ok_blob_count - 1; index++) {
      if (ratings[index + 1] - ratings[index] > bestgap) {
        bestgap = ratings[index + 1] - ratings[index];
        //find biggest
        gapstart = ratings[index];
      }
    }
  }
  threshold = gapstart + bestgap / 2;

  free_mem(ratings);
  return threshold;
}
void reject_blanks ( WERD_RES *  word)

Definition at line 290 of file reject.cpp.

                                   {
  // Marks as a Tess failure every reject-map entry whose character in the
  // best choice is a blank (an unrecognised blob).
  //
  // blob indexes the characters / reject-map entries; byte_pos is the start
  // of the current character inside the best-choice string and advances by
  // that character's entry in unichar_lengths().
  inT16 blob = 0;
  inT16 byte_pos = 0;

  while (word->best_choice->unichar_string()[byte_pos] != '\0') {
    if (word->best_choice->unichar_string()[byte_pos] == ' ') {
      //rej unrecognised blobs
      word->reject_map[blob].setrej_tess_failure ();
    }
    byte_pos += word->best_choice->unichar_lengths()[blob];
    blob += 1;
  }
}
void reject_poor_matches ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 319 of file reject.cpp.

                                                               {
  // Rejects individual blobs of a word whose match is poor.
  //
  // A threshold is obtained from compute_reject_threshold(). Each blob is
  // then marked as a Tess failure if its character in the best choice is a
  // blank or it has no choices at all, or as a poor match if the certainty
  // of the first entry in its choice list is below the threshold.
  //
  // word:         word result; its reject_map is updated in place.
  // blob_choices: one choice list per blob. Its length must equal both the
  //               number of characters in the best choice and the length of
  //               word->box_word (enforced with ASSERT_HOST).
  float threshold;
  inT16 i = 0;                   //character / reject-map index
  inT16 offset = 0;              //start of character i in the string
                                 //super iterator
  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
  BLOB_CHOICE_IT choice_it;      //real iterator

  #ifndef SECURE_NAMES
  if (strlen(word->best_choice->unichar_lengths().string()) !=
      list_it.length()) {
    // The values are cast to int so they match the %d conversions;
    // strlen() returns size_t.
    tprintf
      ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
      word->best_choice->unichar_string().string(),
      static_cast<int>(strlen (word->best_choice->unichar_lengths().string())),
      static_cast<int>(list_it.length()),
      static_cast<int>(word->box_word->length()));
  }
  #endif
  ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
    list_it.length ());
  ASSERT_HOST(word->box_word->length() == list_it.length());
  threshold = compute_reject_threshold (blob_choices);

  // The offset must advance by the length of the character just processed,
  // so it is updated BEFORE i is incremented. Incrementing i first would
  // step by the next character's length and misplace the offset for
  // multi-byte characters.
  for (list_it.mark_cycle_pt ();
       !list_it.cycled_list ();
       list_it.forward (),
       offset += word->best_choice->unichar_lengths()[i], i++) {
    /* NB - only compares the threshold against the TOP choice char in the
      choices list for a blob !! - the selected one may be below the threshold
    */
    choice_it.set_to_list (list_it.data ());
    if ((word->best_choice->unichar_string()[offset] == ' ') ||
      (choice_it.length () == 0))
                                 //rej unrecognised blobs
      word->reject_map[i].setrej_tess_failure ();
    else if (choice_it.data ()->certainty () < threshold)
                                 //rej poor score blob
      word->reject_map[i].setrej_poor_match ();
  }
}