Tesseract
3.02
|
#include <wordrec.h>
Public Member Functions | |||||||
Wordrec () | |||||||
virtual | ~Wordrec () | ||||||
void | CopyCharChoices (const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to) | ||||||
bool | ChoiceIsCorrect (const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text) | ||||||
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) | ||||||
void | FillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
void | CallFillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
void | update_ratings (const BLOB_CHOICE_LIST_VECTOR &new_choices, const CHUNKS_RECORD *chunks_record, const SEARCH_STATE search_state) | ||||||
void | SegSearch (CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle) | ||||||
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list) | ||||||
SEAM * | chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list) | ||||||
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list) | ||||||
void | junk_worst_seam (SEAM_QUEUE seams, SEAM *new_seam, float new_priority) | ||||||
void | choose_best_seam (SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob) | ||||||
void | combine_seam (SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam) | ||||||
inT16 | constrained_split (SPLIT *split, TBLOB *blob) | ||||||
void | delete_seam_pile (SEAM_PILE seam_pile) | ||||||
SEAM * | pick_good_seam (TBLOB *blob) | ||||||
PRIORITY | seam_priority (SEAM *seam, inT16 xmin, inT16 xmax) | ||||||
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob) | ||||||
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob) | ||||||
PRIORITY | full_split_priority (SPLIT *split, inT16 xmin, inT16 xmax) | ||||||
PRIORITY | grade_center_of_blob (register BOUNDS_RECT rect) | ||||||
PRIORITY | grade_overlap (register BOUNDS_RECT rect) | ||||||
PRIORITY | grade_split_length (register SPLIT *split) | ||||||
PRIORITY | grade_sharpness (register SPLIT *split) | ||||||
PRIORITY | grade_width_change (register BOUNDS_RECT rect) | ||||||
void | set_outline_bounds (register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect) | ||||||
int | crosses_outline (EDGEPT *p0, EDGEPT *p1, EDGEPT *outline) | ||||||
int | is_crossed (TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1) | ||||||
int | is_same_edgept (EDGEPT *p1, EDGEPT *p2) | ||||||
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) | ||||||
void | reverse_outline (EDGEPT *outline) | ||||||
virtual BLOB_CHOICE_LIST * | classify_piece (TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) | ||||||
void | merge_fragments (MATRIX *ratings, inT16 num_blobs) | ||||||
void | get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) | ||||||
void | merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) | ||||||
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) | ||||||
BLOB_CHOICE_LIST * | get_piece_rating (MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) | ||||||
TBOX * | record_blob_bounds (TBLOB *blobs) | ||||||
MATRIX * | record_piece_ratings (TBLOB *blobs) | ||||||
WIDTH_RECORD * | state_char_widths (WIDTH_RECORD *chunk_widths, STATE *state, int num_joints) | ||||||
FLOAT32 | get_width_variance (WIDTH_RECORD *wrec, float norm_height) | ||||||
FLOAT32 | get_gap_variance (WIDTH_RECORD *wrec, float norm_height) | ||||||
FLOAT32 | prioritize_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) | ||||||
FLOAT32 | width_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints) | ||||||
FLOAT32 | seamcut_priority (SEAMS seams, STATE *state, int num_joints) | ||||||
FLOAT32 | rating_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints) | ||||||
program_editup | |||||||
Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models. | |||||||
void | program_editup (const char *textbase, bool init_classifier, bool init_permute) | ||||||
cc_recog | |||||||
Recognize a word. | |||||||
BLOB_CHOICE_LIST_VECTOR * | cc_recog (WERD_RES *word) | ||||||
program_editdown | |||||||
This function holds any necessary post-processing for the Wise Owl program. | |||||||
void | program_editdown (inT32 elasped_time) | ||||||
set_pass1 | |||||||
Get ready to do some pass 1 stuff. | |||||||
void | set_pass1 () | ||||||
set_pass2 | |||||||
Get ready to do some pass 2 stuff. | |||||||
void | set_pass2 () | ||||||
end_recog | |||||||
Cleanup and exit the recog program. | |||||||
int | end_recog () | ||||||
call_matcher | |||||||
Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification. | |||||||
BLOB_CHOICE_LIST * | call_matcher (const DENORM *denorm, TBLOB *blob) | ||||||
dict_word() | |||||||
Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary. | |||||||
int | dict_word (const WERD_CHOICE &word) | ||||||
classify_blob | |||||||
Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.
| |||||||
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle) | ||||||
BLOB_CHOICE_LIST * | fake_classify_blob (UNICHAR_ID class_id, float rating, float certainty) | ||||||
update_blob_classifications | |||||||
For each blob in the given word update match_table with the corresponding BLOB_CHOICE_LIST from choices. | |||||||
void | update_blob_classifications (TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) | ||||||
best_first_search | |||||||
Find the best segmentation by doing a best first search of the solution space. | |||||||
BLOB_CHOICE_LIST_VECTOR * | evaluate_chunks (CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle) | ||||||
void | best_first_search (CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state) | ||||||
void | delete_search (SEARCH_RECORD *the_search) | ||||||
evaluate_state | |||||||
Evaluate the segmentation that is represented by this state in the best first search. Add this state to the "states_seen" list. | |||||||
inT16 | evaluate_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle) | ||||||
BLOB_CHOICE_LIST_VECTOR * | rebuild_current_state (WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings) | ||||||
new_search | |||||||
Create and initialize a new search record. | |||||||
SEARCH_RECORD * | new_search (CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state) | ||||||
expand_node | |||||||
Create the states that are attached to this one. Check to see that each one has not already been visited. If not add it to the priority queue. | |||||||
void | expand_node (FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) | ||||||
replace_char_widths | |||||||
Replace the value of the char_width field in the chunks_record with the updated width measurements from the last_segmentation. | |||||||
void | replace_char_widths (CHUNKS_RECORD *chunks_record, SEARCH_STATE state) | ||||||
BLOB_CHOICE * | rebuild_fragments (const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) | ||||||
BLOB_CHOICE_LIST * | join_blobs_and_classify (WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices) | ||||||
pop_queue | |||||||
Get this state from the priority queue. It should be the state that has the greatest urgency to be evaluated. | |||||||
STATE * | pop_queue (HEAP *queue) | ||||||
push_queue | |||||||
Add this state into the priority queue. | |||||||
void | push_queue (HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug) | ||||||
point_priority | |||||||
Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT. | |||||||
PRIORITY | point_priority (EDGEPT *point) | ||||||
add_point_to_list | |||||||
Add an edge point to a POINT_GROUP containing a list of other points. | |||||||
void | add_point_to_list (POINT_GROUP point_list, EDGEPT *point) | ||||||
angle_change | |||||||
Return the change in angle (degrees) of the line segments between points one and two, and two and three. | |||||||
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) | ||||||
is_little_chunk | |||||||
Return TRUE if one of the pieces resulting from this split would have fewer than some number of edge points. | |||||||
int | is_little_chunk (EDGEPT *point1, EDGEPT *point2) | ||||||
is_small_area | |||||||
Test the area defined by a split across this outline. | |||||||
int | is_small_area (EDGEPT *point1, EDGEPT *point2) | ||||||
pick_close_point | |||||||
Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point. | |||||||
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) | ||||||
prioritize_points | |||||||
Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order. | |||||||
void | prioritize_points (TESSLINE *outline, POINT_GROUP points) | ||||||
new_min_point | |||||||
Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to NULL. | |||||||
void | new_min_point (EDGEPT *local_min, POINT_GROUP points) | ||||||
new_max_point | |||||||
Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to NULL. | |||||||
void | new_max_point (EDGEPT *local_max, POINT_GROUP points) | ||||||
vertical_projection_point | |||||||
For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list. | |||||||
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) | ||||||
improve_one_blob | |||||||
Start with the current word of blobs and its classification. Find the worst blob and try to divide it up to improve the ratings. | |||||||
bool | improve_one_blob (WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle) | ||||||
modify_blob_choice | |||||||
Takes a blob and its chop index, converts that chop index to a unichar_id, and stores the chop index in place of the blob's original unichar_id. | |||||||
void | modify_blob_choice (BLOB_CHOICE_LIST *answer, int chop_index) | ||||||
chop_one_blob | |||||||
Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing the chopper. | |||||||
bool | chop_one_blob (TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index) | ||||||
bool | chop_one_blob2 (const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list) | ||||||
chop_word_main | |||||||
Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. Return the word level ratings. | |||||||
BLOB_CHOICE_LIST_VECTOR * | chop_word_main (WERD_RES *word) | ||||||
improve_by_chopping | |||||||
Start with the current word of blobs and its classification. Find the worst blobs and try to divide them up to improve the ratings. Repeat as long as better ratings are produced by the new blob splitting. When all the splitting has been accomplished, all the ratings memory is reclaimed. | |||||||
void | improve_by_chopping (WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice) | ||||||
MATRIX * | word_associator (bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state) | ||||||
inT16 | select_blob_to_split (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment) | ||||||
inT16 | select_blob_to_split_from_fixpt (DANGERR *fixpt) | ||||||
void | set_chopper_blame (WERD_RES *word) | ||||||
Public Attributes | |||||||
bool | merge_fragments_in_matrix = 1 | ||||||
bool | wordrec_no_block = 0 | ||||||
bool | wordrec_enable_assoc = 1 | ||||||
bool | force_word_assoc = 0 | ||||||
int | wordrec_num_seg_states = 30 | ||||||
double | wordrec_worst_state = 1 | ||||||
bool | fragments_guide_chopper = 0 | ||||||
int | repair_unchopped_blobs = 1 | ||||||
double | tessedit_certainty_threshold = -2.25 | ||||||
int | chop_debug = 0 | ||||||
bool | chop_enable = 1 | ||||||
bool | chop_vertical_creep = 0 | ||||||
int | chop_split_length = 10000 | ||||||
int | chop_same_distance = 2 | ||||||
int | chop_min_outline_points = 6 | ||||||
int | chop_inside_angle = -50 | ||||||
int | chop_min_outline_area = 2000 | ||||||
double | chop_split_dist_knob = 0.5 | ||||||
double | chop_overlap_knob = 0.9 | ||||||
double | chop_center_knob = 0.15 | ||||||
double | chop_sharpness_knob = 0.06 | ||||||
double | chop_width_change_knob = 5.0 | ||||||
double | chop_ok_split = 100.0 | ||||||
double | chop_good_split = 50.0 | ||||||
int | chop_x_y_weight = 3 | ||||||
int | segment_adjust_debug = 0 | ||||||
bool | assume_fixed_pitch_char_segment = 0 | ||||||
bool | use_new_state_cost = 0 | ||||||
double | heuristic_segcost_rating_base = 1.25 | ||||||
double | heuristic_weight_rating = 1 | ||||||
double | heuristic_weight_width = 0 | ||||||
double | heuristic_weight_seamcut = 0 | ||||||
double | heuristic_max_char_wh_ratio = 2.0 | ||||||
int | wordrec_debug_level = 0 | ||||||
bool | wordrec_debug_blamer = false | ||||||
bool | wordrec_run_blamer = false | ||||||
bool | enable_new_segsearch = false | ||||||
int | segsearch_debug_level = 0 | ||||||
int | segsearch_max_pain_points = 2000 | ||||||
int | segsearch_max_futile_classifications = 10 | ||||||
double | segsearch_max_char_wh_ratio = 2.0 | ||||||
double | segsearch_max_fixed_pitch_char_wh_ratio = 2.0 | ||||||
bool | save_alt_choices = false | ||||||
LanguageModel * | language_model_ | ||||||
PRIORITY | pass2_ok_split | ||||||
int | pass2_seg_states | ||||||
int | num_joints | ||||||
int | num_pushed | ||||||
int | num_popped | ||||||
BlobMatchTable | blob_match_table | ||||||
EVALUATION_ARRAY | last_segmentation | ||||||
WERD_CHOICE * | prev_word_best_choice_ | ||||||
GenericVector< int > | blame_reasons_ | ||||||
void(Wordrec::* | fill_lattice_ )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
Protected Member Functions | |||||||
bool | SegSearchDone (int num_futile_classifications) | ||||||
void | UpdateSegSearchNodes (int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | ||||||
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle) | ||||||
void | InitBlamerForSegSearch (const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug) | ||||||
void | FinishBlamerForSegSearch (const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug) |
tesseract::Wordrec::Wordrec | ( | ) |
Definition at line 26 of file wordrec.cpp.
: // control parameters BOOL_MEMBER(merge_fragments_in_matrix, TRUE, "Merge the fragments in the ratings matrix and delete them" " after merging", params()), BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information", params()), BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable", params()), BOOL_MEMBER(force_word_assoc, FALSE, "force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary.", CCUtil::params()), INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states", CCUtil::params()), double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state", params()), BOOL_MEMBER(fragments_guide_chopper, FALSE, "Use information from fragments to guide chopping process", params()), INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped", params()), double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit", params()), INT_MEMBER(chop_debug, 0, "Chop debug", params()), BOOL_MEMBER(chop_enable, 1, "Chop enable", params()), BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep", params()), INT_MEMBER(chop_split_length, 10000, "Split Length", params()), INT_MEMBER(chop_same_distance, 2, "Same distance", params()), INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline", params()), INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend", params()), INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area", params()), double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment", params()), double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment", params()), double_MEMBER(chop_center_knob, 0.15, "Split center adjustment", params()), double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment", params()), double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment", params()), double_MEMBER(chop_ok_split, 100.0, "OK split limit", params()), double_MEMBER(chop_good_split, 50.0, "Good split limit", params()), 
INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight", params()), INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug", params()), BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE, "include fixed-pitch heuristics in char segmentation", params()), BOOL_MEMBER(use_new_state_cost, FALSE, "use new state cost heuristics for segmentation state evaluation", params()), double_MEMBER(heuristic_segcost_rating_base, 1.25, "base factor for adding segmentation cost into word rating." "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost.", params()), double_MEMBER(heuristic_weight_rating, 1.0, "weight associated with char rating in combined cost of state", params()), double_MEMBER(heuristic_weight_width, 1000.0, "weight associated with width evidence in combined cost of" " state", params()), double_MEMBER(heuristic_weight_seamcut, 0.0, "weight associated with seam cut in combined cost of state", params()), double_MEMBER(heuristic_max_char_wh_ratio, 2.0, "max char width-to-height ratio allowed in segmentation", params()), INT_MEMBER(wordrec_debug_level, 0, "Debug level for wordrec", params()), BOOL_MEMBER(wordrec_debug_blamer, false, "Print blamer debug messages", params()), BOOL_MEMBER(wordrec_run_blamer, false, "Try to set the blame for errors", params()), BOOL_MEMBER(enable_new_segsearch, true, "Enable new segmentation search path.", params()), INT_MEMBER(segsearch_debug_level, 0, "SegSearch debug level", params()), INT_MEMBER(segsearch_max_pain_points, 2000, "Maximum number of pain points stored in the queue", params()), INT_MEMBER(segsearch_max_futile_classifications, 10, "Maximum number of pain point classifications per word that" "did not result in finding a better word choice.", params()), double_MEMBER(segsearch_max_char_wh_ratio, 2.0, "Maximum character width-to-height ratio", params()), double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, "Maximum character width-to-height ratio for" " 
fixed-pitch fonts", params()), BOOL_MEMBER(save_alt_choices, false, "Save alternative paths found during chopping" " and segmentation search", params()) { prev_word_best_choice_ = NULL; language_model_ = new LanguageModel(&get_fontinfo_table(), &(getDict())); pass2_seg_states = 0; num_joints = 0; num_pushed = 0; num_popped = 0; fill_lattice_ = NULL; }
tesseract::Wordrec::~Wordrec | ( | ) | [virtual] |
Definition at line 144 of file wordrec.cpp.
{ delete language_model_; }
void tesseract::Wordrec::add_point_to_list | ( | POINT_GROUP | point_list, |
EDGEPT * | point | ||
) |
Definition at line 65 of file chop.cpp.
{ HEAPENTRY data; if (SizeOfHeap (point_list) < MAX_NUM_POINTS - 2) { data.Data = (char *) point; data.Key = point_priority (point); HeapStore(point_list, &data); } #ifndef GRAPHICS_DISABLED if (chop_debug > 2) mark_outline(point); #endif }
Definition at line 87 of file chop.cpp.
{ VECTOR vector1; VECTOR vector2; int angle; float length; /* Compute angle */ vector1.x = point2->pos.x - point1->pos.x; vector1.y = point2->pos.y - point1->pos.y; vector2.x = point3->pos.x - point2->pos.x; vector2.y = point3->pos.y - point2->pos.y; /* Use cross product */ length = (float)sqrt((float)LENGTH(vector1) * LENGTH(vector2)); if ((int) length == 0) return (0); angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) / length) / PI * 180.0 + 0.5)); /* Use dot product */ if (SCALAR (vector1, vector2) < 0) angle = 180 - angle; /* Adjust angle */ if (angle > 180) angle -= 360; if (angle <= -180) angle += 360; return (angle); }
SEAM * tesseract::Wordrec::attempt_blob_chop | ( | TWERD * | word, |
TBLOB * | blob, | ||
inT32 | blob_number, | ||
bool | italic_blob, | ||
SEAMS | seam_list | ||
) |
Definition at line 146 of file chopper.cpp.
{ TBLOB *next_blob = blob->next; TBLOB *other_blob; SEAM *seam; if (repair_unchopped_blobs) preserve_outline_tree (blob->outlines); other_blob = new TBLOB; /* Make new blob */ other_blob->next = blob->next; other_blob->outlines = NULL; blob->next = other_blob; seam = NULL; if (prioritize_division) { TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { seam = new_seam(0.0f, location, NULL, NULL, NULL); } } if (seam == NULL) seam = pick_good_seam(blob); if (seam == NULL && word->latin_script) { // If the blob can simply be divided into outlines, then do that. TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { seam = new_seam(0.0f, location, NULL, NULL, NULL); } } if (chop_debug) { if (seam != NULL) { print_seam ("Good seam picked=", seam); } else cprintf ("\n** no seam picked *** \n"); } if (seam) { apply_seam(blob, other_blob, italic_blob, seam); } if ((seam == NULL) || (blob->outlines == NULL) || (other_blob->outlines == NULL) || total_containment (blob, other_blob) || check_blob (other_blob) || !(check_seam_order (blob, seam) && check_seam_order (other_blob, seam)) || any_shared_split_points (seam_list, seam) || !test_insert_seam(seam_list, blob_number, blob, word->blobs)) { blob->next = next_blob; if (seam) { undo_seam(blob, other_blob, seam); delete_seam(seam); #ifndef GRAPHICS_DISABLED if (chop_debug) { if (chop_debug >2) display_blob(blob, Red); cprintf ("\n** seam being removed ** \n"); } #endif } else { delete other_blob; } if (repair_unchopped_blobs) restore_outline_tree (blob->outlines); return (NULL); } return (seam); }
void tesseract::Wordrec::best_first_search | ( | CHUNKS_RECORD * | chunks_record, |
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_RES * | word, | ||
STATE * | state, | ||
DANGERR * | fixpt, | ||
STATE * | best_state | ||
) |
Definition at line 88 of file bestfirst.cpp.
{ SEARCH_RECORD *the_search; inT16 keep_going; STATE guided_state; // not used int num_joints = chunks_record->ratings->dimension() - 1; the_search = new_search(chunks_record, num_joints, best_char_choices, word->best_choice, word->raw_choice, state); // The default state is initialized as the best choice. In order to apply // segmentation adjustment, or any other contextual processing in permute, // we give the best choice a poor rating to force the processed raw choice // to be promoted to best choice. the_search->best_choice->set_rating(WERD_CHOICE::kBadRating); evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle); if (wordrec_debug_level > 1) { tprintf("\n\n\n =========== BestFirstSearch ==============\n"); word->best_choice->print("**Initial BestChoice**"); } FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search); if (worst_priority < wordrec_worst_state) worst_priority = wordrec_worst_state; if (wordrec_debug_level > 1) { log_state("BestFirstSearch", num_joints, best_state); } guided_state = *state; do { /* Look for answer */ STATE orig_state = *the_search->this_state; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { guided_state = *(the_search->this_state); keep_going = evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle); hash_add (the_search->closed_states, the_search->this_state); if (!keep_going || (the_search->num_states > wordrec_num_seg_states)) { if (wordrec_debug_level > 1) tprintf("Breaking best_first_search on keep_going %s numstates %d\n", ((keep_going) ? "T" :"F"), the_search->num_states); free_state (the_search->this_state); break; } FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record, the_search); if (new_worst_priority < worst_priority) { if (wordrec_debug_level > 1) tprintf("Lowering WorstPriority %f --> %f\n", worst_priority, new_worst_priority); // Tighten the threshold for admitting new paths as better search // candidates are found. 
After lowering this threshold, we can safely // popout everything that is worse than this score also. worst_priority = new_worst_priority; } expand_node(worst_priority, chunks_record, the_search); } if (wordrec_debug_level > 1) { log_state("Done with", the_search->num_joints, &orig_state); } free_state (the_search->this_state); num_popped++; the_search->this_state = pop_queue (the_search->open_states); if (wordrec_debug_level > 1 && !the_search->this_state) tprintf("No more states to evalaute after %d evals", num_popped); } while (the_search->this_state); state->part1 = the_search->best_state->part1; state->part2 = the_search->best_state->part2; if (wordrec_debug_level > 1) { tprintf("\n\n\n =========== BestFirstSearch ==============\n"); // best_choice->debug_string().string()); word->best_choice->print("**Final BestChoice**"); } // save the best_state stats delete_search(the_search); }
Definition at line 143 of file tface.cpp.
{ // Rotate the blob for classification if necessary. TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded(&denorm); if (rotated_blob == NULL) { rotated_blob = tessblob; } BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result AdaptiveClassifier(rotated_blob, *denorm, ratings, NULL); if (rotated_blob != tessblob) { delete rotated_blob; delete denorm; } return ratings; }
void tesseract::Wordrec::CallFillLattice | ( | const MATRIX & | ratings, |
const LIST & | best_choices, | ||
const UNICHARSET & | unicharset, | ||
BlamerBundle * | blamer_bundle | ||
) | [inline] |
Definition at line 187 of file wordrec.h.
{ (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); }
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::cc_recog | ( | WERD_RES * | word | ) |
Definition at line 117 of file tface.cpp.
{ getDict().InitChoiceAccum(); getDict().reset_hyphen_vars(word->word->flag(W_EOL)); blob_match_table.init_match_table(); BLOB_CHOICE_LIST_VECTOR *results = chop_word_main(word); getDict().DebugWordChoices(); return results; }
bool tesseract::Wordrec::ChoiceIsCorrect | ( | const UNICHARSET & | uni_set, |
const WERD_CHOICE * | choice, | ||
const GenericVector< STRING > & | truth_text | ||
) |
Definition at line 159 of file wordrec.cpp.
{ if (choice == NULL) return false; int i; STRING truth_str; for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; STRING normed_choice_str; for (i = 0; i < choice->length(); ++i) { normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); } return (truth_str == normed_choice_str); }
void tesseract::Wordrec::choose_best_seam | ( | SEAM_QUEUE | seam_queue, |
SEAM_PILE * | seam_pile, | ||
SPLIT * | split, | ||
PRIORITY | priority, | ||
SEAM ** | seam_result, | ||
TBLOB * | blob | ||
) |
Definition at line 178 of file findseam.cpp.
{ SEAM *seam; char str[80]; float my_priority; /* Add seam of split */ my_priority = priority; if (split != NULL) { TPOINT split_point = split->point1->pos; split_point += split->point2->pos; split_point /= 2; seam = new_seam(my_priority, split_point, split, NULL, NULL); if (chop_debug > 1) print_seam ("Partial priority ", seam); add_seam_to_queue (seam_queue, seam, (float) my_priority); if (my_priority > chop_good_split) return; } TBOX bbox = blob->bounding_box(); /* Queue loop */ while (pop_next_seam (seam_queue, seam, my_priority)) { /* Set full priority */ my_priority = seam_priority (seam, bbox.left(), bbox.right()); if (chop_debug) { sprintf (str, "Full my_priority %0.0f, ", my_priority); print_seam(str, seam); } if ((*seam_result == NULL || /* Replace answer */ (*seam_result)->priority > my_priority) && my_priority < chop_ok_split) { /* No crossing */ if (constrained_split (seam->split1, blob)) { delete_seam(*seam_result); clone_seam(*seam_result, seam); (*seam_result)->priority = my_priority; } else { delete_seam(seam); seam = NULL; my_priority = BAD_PRIORITY; } } if (my_priority < chop_good_split) { if (seam) delete_seam(seam); return; /* Made good answer */ } if (seam) { /* Combine with others */ if (array_count (*seam_pile) < MAX_NUM_SEAMS /*|| tessedit_truncate_chopper==0 */ ) { combine_seam(seam_queue, *seam_pile, seam); *seam_pile = array_push (*seam_pile, seam); } else delete_seam(seam); } my_priority = best_seam_priority (seam_queue); if ((my_priority > chop_ok_split) || (my_priority > chop_good_split && split)) return; } }
SEAM * tesseract::Wordrec::chop_numbered_blob | ( | TWERD * | word, |
inT32 | blob_number, | ||
bool | italic_blob, | ||
SEAMS | seam_list | ||
) |
Definition at line 219 of file chopper.cpp.
{ TBLOB *blob; inT16 x; blob = word->blobs; for (x = 0; x < blob_number; x++) blob = blob->next; return attempt_blob_chop(word, blob, blob_number, italic_blob, seam_list); }
bool tesseract::Wordrec::chop_one_blob | ( | TWERD * | word, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
inT32 * | blob_number, | ||
SEAMS * | seam_list, | ||
int * | right_chop_index | ||
) |
Definition at line 441 of file chopper.cpp.
/*
 * Chops one blob of `word` and updates char_choices accordingly.
 * Repeatedly picks the worst-rated splittable blob via
 * select_blob_to_split() and tries chop_numbered_blob() (italic_blob =
 * true) until a seam is produced; returns false if no blob can be chopped.
 * On success the seam is spliced into *seam_list and the chopped blob's
 * choice entry is replaced by two fake_classify_blob() results whose
 * "choice" encodes chop indices via modify_blob_choice(): the left index
 * is parsed with atoi() from the original unichar string (NOTE(review):
 * this assumes the unichar text is numeric — presumably the apply-box /
 * training path; confirm against chopper.cpp callers), the right one is
 * ++*right_chop_index. The rating is scaled by 1/e to combine confidence
 * with the serial number.
 */
{ TBLOB *blob; inT16 x = 0; float rating_ceiling = MAX_FLOAT32; BLOB_CHOICE_LIST *answer; BLOB_CHOICE_IT answer_it; SEAM *seam; UNICHAR_ID unichar_id = 0; int left_chop_index = 0; do { *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false); if (chop_debug) cprintf("blob_number = %d\n", *blob_number); if (*blob_number == -1) return false; seam = chop_numbered_blob(word, *blob_number, true, *seam_list); if (seam != NULL) break; /* Must split null blobs */ answer = char_choices->get(*blob_number); if (answer == NULL) return false; answer_it.set_to_list(answer); rating_ceiling = answer_it.data()->rating(); // try a different blob } while (true); /* Split OK */ for (blob = word->blobs; x < *blob_number; x++) { blob = blob->next; } if (chop_debug) { tprintf("Chop made blob1:"); blob->bounding_box().print(); tprintf("and blob2:"); blob->next->bounding_box().print(); } *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs); answer = char_choices->get(*blob_number); answer_it.set_to_list(answer); unichar_id = answer_it.data()->unichar_id(); float rating = answer_it.data()->rating() / exp(1.0); left_chop_index = atoi(unicharset.id_to_unichar(unichar_id)); delete char_choices->get(*blob_number); // combine confidence w/ serial # answer = fake_classify_blob(0, rating, -rating); modify_blob_choice(answer, left_chop_index); char_choices->insert(answer, *blob_number); answer = fake_classify_blob(0, rating - 0.125f, -rating); modify_blob_choice(answer, ++*right_chop_index); char_choices->set(answer, *blob_number + 1); return true; }
bool tesseract::Wordrec::chop_one_blob2 | ( | const GenericVector< TBOX > & | boxes, |
WERD_RES * | word_res, | ||
SEAMS * | seam_list | ||
) |
Definition at line 502 of file chopper.cpp.
{ inT32 blob_number; inT16 x = 0; TBLOB *blob; SEAM *seam; seam = chop_overlapping_blob(boxes, word_res, &blob_number, true, *seam_list); if (seam == NULL) return false; /* Split OK */ for (blob = word_res->chopped_word->blobs; x < blob_number; x++) { blob = blob->next; } if (chop_debug) { tprintf("Chop made blob1:"); blob->bounding_box().print(); tprintf("and blob2:"); blob->next->bounding_box().print(); } *seam_list = insert_seam(*seam_list, blob_number, seam, blob, word_res->chopped_word->blobs); return true; }
// Searches word_res->chopped_word for a blob worth chopping: either one
// that divisible_blob() reports as splittable, or one whose denormalized
// bounding box significantly overlaps (> 0.125) more than one truth box
// while matching none of them almost exactly. Returns the first seam that
// attempt_blob_chop() produces and sets *blob_number to that blob's index;
// returns NULL with *blob_number = -1 if no chop succeeds.
SEAM *tesseract::Wordrec::chop_overlapping_blob(
    const GenericVector<TBOX> &boxes, WERD_RES *word_res,
    inT32 *blob_number, bool italic_blob, SEAMS seam_list) {
  TWERD *word = word_res->chopped_word;
  *blob_number = 0;
  for (TBLOB *blob = word->blobs; blob != NULL; blob = blob->next) {
    // Map the blob's normalized bounding box back to image coordinates.
    TPOINT topleft, botright;
    topleft.x = blob->bounding_box().left();
    topleft.y = blob->bounding_box().top();
    botright.x = blob->bounding_box().right();
    botright.y = blob->bounding_box().bottom();
    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(topleft, &original_topleft);
    word_res->denorm.DenormTransform(botright, &original_botright);
    TBOX original_box = TBOX(original_topleft.x, original_botright.y,
                             original_botright.x, original_topleft.y);

    // Count the truth boxes this blob overlaps significantly, and note
    // whether it already matches one of them almost exactly.
    bool almost_equal_box = false;
    int num_overlap = 0;
    for (int i = 0; i < boxes.size(); i++) {
      if (original_box.overlap_fraction(boxes[i]) > 0.125)
        num_overlap++;
      if (original_box.almost_equal(boxes[i], 3))
        almost_equal_box = true;
    }

    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location) ||
        (!almost_equal_box && num_overlap > 1)) {
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                     italic_blob, seam_list);
      if (seam != NULL)
        return seam;
    }
    *blob_number = *blob_number + 1;
  }
  *blob_number = -1;
  return NULL;
}
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::chop_word_main | ( | WERD_RES * | word | ) |
Definition at line 583 of file chopper.cpp.
/*
 * Top-level chopper entry point. Classifies every blob of
 * word->chopped_word, permutes the choices into word->best_choice /
 * raw_choice, and — if the result is not acceptable — improves it by
 * chopping (improve_by_chopping) and, when enabled or forced, by running
 * the associator (word_associator). Also performs blamer bookkeeping
 * (blaming the classifier when the incorrect best choice is a dictionary
 * top choice) and fills the lattice via CallFillLattice when requested.
 * Returns best_char_choices (caller owns); char_choices and the ratings
 * matrix are freed here.
 */
{ TBLOB *blob; int index; int did_chopping; STATE state; BLOB_CHOICE_LIST *match_result; MATRIX *ratings = NULL; DANGERR fixpt; /*dangerous ambig */ inT32 bit_count; //no of bits BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR(); did_chopping = 0; for (blob = word->chopped_word->blobs, index = 0; blob != NULL; blob = blob->next, index++) { match_result = classify_blob(blob, word->denorm, "chop_word:", Green, word->blamer_bundle); if (match_result == NULL) cprintf("Null classifier output!\n"); *char_choices += match_result; } bit_count = index - 1; set_n_ones(&state, char_choices->length() - 1); bool acceptable = false; bool replaced = false; bool best_choice_updated = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (best_choice_updated && getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt, CHOPPER_CALLER, &replaced)) { acceptable = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); CopyCharChoices(*char_choices, best_char_choices); if (!acceptable) { // do more work to find a better choice did_chopping = 1; bool best_choice_acceptable = false; if (chop_enable) improve_by_chopping(word, char_choices, &state, best_char_choices, &fixpt, &best_choice_acceptable); if (chop_debug) print_seams ("Final seam list:", word->seam_array); if (word->blamer_bundle != NULL && !ChoiceIsCorrect(*word->uch_set, word->best_choice, word->blamer_bundle->truth_text)) { set_chopper_blame(word); } // The force_word_assoc is almost redundant to enable_assoc. However, // it is not conditioned on the dict behavior. For CJK, we need to force // the associator to be invoked. When we figure out the exact behavior // of dict on CJK, we can remove the flag if it turns out to be redundant. 
if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) { ratings = word_associator(false, word, &state, best_char_choices, &fixpt, &state); } } best_char_choices = rebuild_current_state(word, &state, best_char_choices, ratings); // If after running only the chopper best_choice is incorrect and no blame // has been yet set, blame the classifier if best_choice is classifier's // top choice and is a dictionary word (i.e. language model could not have // helped). Otherwise blame the tradeoff between the classifier and // the old language model (permuters). if (word->blamer_bundle != NULL && word->blamer_bundle->incorrect_result_reason == IRR_CORRECT && ratings == NULL && // only the chopper was run !ChoiceIsCorrect(*word->uch_set, word->best_choice, word->blamer_bundle->truth_text)) { if (word->best_choice != NULL && Dict::valid_word_permuter(word->best_choice->permuter(), false)) { // Find out whether best choice is a top choice. word->blamer_bundle->best_choice_is_dict_and_top_choice = true; for (int i = 0; i < word->best_choice->length(); ++i) { BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i)); ASSERT_HOST(!blob_choice_it.empty()); BLOB_CHOICE *first_choice = NULL; for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { // find first non-fragment choice if (!(getDict().getUnicharset().get_fragment( blob_choice_it.data()->unichar_id()))) { first_choice = blob_choice_it.data(); break; } } ASSERT_HOST(first_choice != NULL); if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) { word->blamer_bundle->best_choice_is_dict_and_top_choice = false; break; } } } STRING debug; if (word->blamer_bundle->best_choice_is_dict_and_top_choice) { debug = "Best choice is: incorrect, top choice, dictionary word"; debug += " with permuter "; debug += word->best_choice->permuter_name(); } else { debug = "Classifier/Old LM tradeoff is to blame"; } word->blamer_bundle->SetBlame( 
word->blamer_bundle->best_choice_is_dict_and_top_choice ? IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF, debug, word->best_choice, wordrec_debug_blamer); } if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) { if (ratings == NULL) { ratings = word_associator(true, word, NULL, NULL, NULL, NULL); } CallFillLattice(*ratings, getDict().getBestChoices(), *word->uch_set, word->blamer_bundle); } if (ratings != NULL) { if (wordrec_debug_level > 0) { tprintf("Final Ratings Matrix:\n"); ratings->print(getDict().getUnicharset()); } ratings->delete_matrix_pointers(); delete ratings; } getDict().FilterWordChoices(); // TODO(antonova, eger): check that FilterWordChoices() does not filter // out anything useful for word bigram or phrase search. // TODO(antonova, eger): when implementing word bigram and phrase search // we will need to think carefully about how to replace a word with its // alternative choice. // In particular it might be required to save the segmentation state // associated with the word, so that best_char_choices could be updated // by rebuild_current_state() correctly. if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word); char_choices->delete_data_pointers(); delete char_choices; return best_char_choices; }
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob | ( | TBLOB * | blob, |
const DENORM & | denorm, | ||
const char * | string, | ||
C_COL | color, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 62 of file wordclass.cpp.
/*
 * Classifies a single blob, using blob_match_table as a cache: on a miss
 * the matcher is called and the result stored. On a fresh classification,
 * when the blamer has per-character truth boxes and no blame has been
 * assigned yet, a blob whose box x-aligns with a truth box is checked
 * against the truth text: classifier blame is set if the truth unichar is
 * absent from the choice list, adaption blame if it is present but an
 * adapted choice also appears. NOTE(review): the debug string below is
 * split across two rendered lines ("... than for correct ") — this is an
 * artifact of the documentation rendering, not two separate literals;
 * confirm against wordclass.cpp before editing.
 */
{ fflush(stdout); BLOB_CHOICE_LIST *choices = NULL; #ifndef GRAPHICS_DISABLED if (wordrec_display_all_blobs) display_blob(blob, color); #endif choices = blob_match_table.get_match(blob); if (choices == NULL) { choices = call_matcher(&denorm, blob); blob_match_table.put_match(blob, choices); // If a blob with the same bounding box as one of the truth character // bounding boxes is not classified as the corresponding truth character // blame character classifier for incorrect answer. if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes && blamer_bundle->incorrect_result_reason == IRR_CORRECT) { for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) { const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b); const TBOX &blob_box = blob->bounding_box(); // Note that we are more strict on the bounding box boundaries here // than in other places (chopper, segmentation search), since we do // not have the ability to check the previous and next bounding box. if (blob_box.x_almost_equal(truth_box, blamer_bundle->norm_box_tolerance/2)) { BLOB_CHOICE_IT choices_it(choices); bool found = false; bool incorrect_adapted = false; UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID; const char *truth_str = blamer_bundle->truth_text[b].string(); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar( choices_it.data()->unichar_id())) == 0) { found = true; break; } else if (choices_it.data()->adapted()) { incorrect_adapted = true; incorrect_adapted_id = choices_it.data()->unichar_id(); } } // end choices_it for loop if (!found) { STRING debug = "unichar "; debug += truth_str; debug += " not found in classification list"; blamer_bundle->SetBlame(IRR_CLASSIFIER, debug, NULL, wordrec_debug_blamer); } else if (incorrect_adapted) { STRING debug = "better rating for adapted "; debug += getDict().getUnicharset().id_to_unichar( incorrect_adapted_id); debug += " 
than for correct "; debug += truth_str; blamer_bundle->SetBlame(IRR_ADAPTION, debug, NULL, wordrec_debug_blamer); } break; } } // end iterating over blamer_bundle->norm_truth_word } } #ifndef GRAPHICS_DISABLED if (classify_debug_level && string) print_ratings_list(string, choices, getDict().getUnicharset()); if (wordrec_blob_pause) window_wait(blob_window); #endif return (choices); }
// Classifies the span of blob pieces [start, end] as a single character.
// The pieces are temporarily joined with join_pieces(), the merged blob at
// index `start` is classified, and then the pieces are broken apart again
// with break_pieces(), leaving the word structure as it was found.
// When segmentation display is enabled, the current segmentation is also
// rendered in the segmentation window.
//
// Fix: the documentation rendering had corrupted "&current_state" into
// "¤t_state" (the HTML entity "&curren;" swallowed the "&curr" prefix) in
// the set_n_ones() and bin_to_chunks() calls; restored the address-of
// operator on current_state.
//
// Returns the (newly cached) BLOB_CHOICE_LIST for the merged piece.
BLOB_CHOICE_LIST *tesseract::Wordrec::classify_piece(TBLOB *pieces,
                                                     const DENORM &denorm,
                                                     SEAMS seams,
                                                     inT16 start,
                                                     inT16 end,
                                                     BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices;
  TBLOB *blob;
  inT16 x;

  // Temporarily merge the pieces into one blob.
  join_pieces(pieces, seams, start, end);
  for (blob = pieces, x = 0; x < start; x++) {
    blob = blob->next;
  }
  choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle);

  // Restore the original (fully chopped) piece structure.
  break_pieces(blob, seams, start, end);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations > 2) {
    STATE current_state;
    SEARCH_STATE chunk_groups;
    set_n_ones(&current_state, array_count(seams));
    chunk_groups = bin_to_chunks(&current_state, array_count(seams));
    display_segmentation(pieces, chunk_groups);
    window_wait(segm_window);
    memfree(chunk_groups);
  }
#endif
  return (choices);
}
void tesseract::Wordrec::combine_seam | ( | SEAM_QUEUE | seam_queue, |
SEAM_PILE | seam_pile, | ||
SEAM * | seam | ||
) |
tessedit_fix_sideways_chops ||
Definition at line 259 of file findseam.cpp.
/*
 * Tries to combine the new seam with every seam already in the pile.
 * First computes the vertical span [bottom, top] of each of the new
 * seam's splits (split2's span defaults to split1's when absent). A piled
 * seam is a combination candidate when it is horizontally within
 * SPLIT_CLOSENESS of the new seam and the summed priorities stay under
 * chop_ok_split; its splits must also lie entirely above or entirely
 * below BOTH vertical spans of the new seam (so the joined seam's splits
 * cannot cross). Each successful join_two_seams() result is pushed onto
 * the seam queue with its own priority.
 * NOTE(review): the stray "tessedit_fix_sideways_chops ||" fragment above
 * is a documentation-rendering artifact, not part of this function.
 */
{ register inT16 x; register inT16 dist; inT16 bottom1, top1; inT16 bottom2, top2; SEAM *new_one; SEAM *this_one; bottom1 = seam->split1->point1->pos.y; if (seam->split1->point2->pos.y >= bottom1) top1 = seam->split1->point2->pos.y; else { top1 = bottom1; bottom1 = seam->split1->point2->pos.y; } if (seam->split2 != NULL) { bottom2 = seam->split2->point1->pos.y; if (seam->split2->point2->pos.y >= bottom2) top2 = seam->split2->point2->pos.y; else { top2 = bottom2; bottom2 = seam->split2->point2->pos.y; } } else { bottom2 = bottom1; top2 = top1; } array_loop(seam_pile, x) { this_one = (SEAM *) array_value (seam_pile, x); dist = seam->location.x - this_one->location.x; if (-SPLIT_CLOSENESS < dist && dist < SPLIT_CLOSENESS && seam->priority + this_one->priority < chop_ok_split) { inT16 split1_point1_y = this_one->split1->point1->pos.y; inT16 split1_point2_y = this_one->split1->point2->pos.y; inT16 split2_point1_y = 0; inT16 split2_point2_y = 0; if (this_one->split2) { split2_point1_y = this_one->split2->point1->pos.y; split2_point2_y = this_one->split2->point2->pos.y; } if ( ( /* this_one->split1 always exists */ ( ((split1_point1_y >= top1 && split1_point2_y >= top1) || (split1_point1_y <= bottom1 && split1_point2_y <= bottom1)) && ((split1_point1_y >= top2 && split1_point2_y >= top2) || (split1_point1_y <= bottom2 && split1_point2_y <= bottom2)) ) ) && ( this_one->split2 == NULL || ( ((split2_point1_y >= top1 && split2_point2_y >= top1) || (split2_point1_y <= bottom1 && split2_point2_y <= bottom1)) && ((split2_point1_y >= top2 && split2_point2_y >= top2) || (split2_point1_y <= bottom2 && split2_point2_y <= bottom2)) ) ) ) { new_one = join_two_seams (seam, this_one); if (chop_debug > 1) print_seam ("Combo priority ", new_one); add_seam_to_queue (seam_queue, new_one, new_one->priority); } } } }
Definition at line 343 of file findseam.cpp.
{ TESSLINE *outline; if (is_little_chunk (split->point1, split->point2)) return (FALSE); for (outline = blob->outlines; outline; outline = outline->next) { if (split_bounds_overlap (split, outline) && crosses_outline (split->point1, split->point2, outline->loop)) { return (FALSE); } } return (TRUE); }
void tesseract::Wordrec::CopyCharChoices | ( | const BLOB_CHOICE_LIST_VECTOR & | from, |
BLOB_CHOICE_LIST_VECTOR * | to | ||
) |
Definition at line 148 of file wordrec.cpp.
{ to->delete_data_pointers(); to->clear(); for (int i = 0; i < from.size(); ++i) { BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST(); cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy); to->push_back(cc_list); } }
void tesseract::Wordrec::delete_seam_pile | ( | SEAM_PILE | seam_pile | ) |
Definition at line 365 of file findseam.cpp.
{ inT16 x; array_loop(seam_pile, x) { delete_seam ((SEAM *) array_value (seam_pile, x)); } array_free(seam_pile); }
void tesseract::Wordrec::delete_search | ( | SEARCH_RECORD * | the_search | ) |
delete_search
Terminate the current search and free all the memory involved.
Definition at line 179 of file bestfirst.cpp.
/*
 * Frees, in order: the first and best states, the closed-state hash
 * table, the open-state heap (each entry via free_state), and finally
 * the SEARCH_RECORD itself.
 * NOTE(review): `closeness` (normalized hamming distance between first
 * and best state) is computed but never used afterwards — apparently
 * leftover instrumentation; confirm against bestfirst.cpp before
 * removing.
 */
{ float closeness; closeness = (the_search->num_joints ? (hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state), reinterpret_cast<uinT32*>(the_search->best_state), 2) / (float) the_search->num_joints) : 0.0f); free_state (the_search->first_state); free_state (the_search->best_state); free_hash_table(the_search->closed_states); FreeHeapData (the_search->open_states, (void_dest) free_state); memfree(the_search); }
int tesseract::Wordrec::dict_word | ( | const WERD_CHOICE & | word | ) |
int tesseract::Wordrec::end_recog | ( | ) |
Definition at line 67 of file tface.cpp.
{ program_editdown (0); return (0); }
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::evaluate_chunks | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_STATE | search_state, | ||
BlamerBundle * | blamer_bundle | ||
) |
evaluate_chunks
A particular word level segmentation has been chosen. Evaluate this to find the word list that corresponds to it.
Definition at line 203 of file bestfirst.cpp.
/*
 * Evaluates a chosen word segmentation. search_state[0] holds the number
 * of interior chunk groups; each subsequent entry is a group width. For
 * every group [x, y] the piece is classified (or fetched from the cache)
 * via get_piece_rating(); the top choice's certainty/rating plus the
 * group's width and gap are recorded into last_segmentation[], and the
 * choice list is appended to the returned vector. Returns NULL (freeing
 * the partial vector) if any piece fails to classify; otherwise the
 * caller owns the returned BLOB_CHOICE_LIST_VECTOR.
 */
{ BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); BLOB_CHOICE_LIST *blob_choices; BLOB_CHOICE_IT blob_choice_it; int i; int x = 0; int y; // Iterate sub-paths. for (i = 1; i <= search_state[0] + 1; i++) { if (i > search_state[0]) y = count_blobs (chunks_record->chunks) - 1; else y = x + search_state[i]; // Process one square. // Classify if needed. blob_choices = get_piece_rating(chunks_record->ratings, chunks_record->chunks, chunks_record->word_res->denorm, chunks_record->splits, x, y, blamer_bundle); if (blob_choices == NULL) { delete char_choices; return (NULL); } // Add permuted ratings. blob_choice_it.set_to_list(blob_choices); last_segmentation[i - 1].certainty = blob_choice_it.data()->certainty(); last_segmentation[i - 1].match = blob_choice_it.data()->rating(); last_segmentation[i - 1].width = AssociateUtils::GetChunksWidth(chunks_record->chunk_widths, x, y); last_segmentation[i - 1].gap = AssociateUtils::GetChunksGap(chunks_record->chunk_widths, y); *char_choices += blob_choices; x = y + 1; } return (char_choices); }
inT16 tesseract::Wordrec::evaluate_state | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_RECORD * | the_search, | ||
DANGERR * | fixpt, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 256 of file bestfirst.cpp.
/*
 * Evaluates the search's current segmentation state: decodes the joint
 * bits into chunk groups and piece widths, classifies the resulting
 * pieces via evaluate_chunks(), then lets the dictionary permute the
 * character choices into best/raw choices (with a segmentation-cost
 * adjustment from prioritize_state()). When the best choice improves,
 * best_state and the char widths are updated; otherwise fixpt is
 * cleared. Returns keep_going — false once an acceptable choice is
 * found, which terminates the best-first search.
 */
{ BLOB_CHOICE_LIST_VECTOR *char_choices; SEARCH_STATE chunk_groups; float rating_limit = the_search->best_choice->rating(); bool keep_going = true; PIECES_STATE widths; the_search->num_states++; chunk_groups = bin_to_chunks(the_search->this_state, the_search->num_joints); bin_to_pieces (the_search->this_state, the_search->num_joints, widths); if (wordrec_debug_level > 1) { log_state("Evaluating state", the_search->num_joints, the_search->this_state); } getDict().LogNewSegmentation(widths); char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle); getDict().SetWordsegRatingAdjustFactor(-1.0f); bool updated_best_choice = false; if (char_choices != NULL && char_choices->length() > 0) { // Compute the segmentation cost and include the cost in word rating. // TODO(dsl): We should change the SEARCH_RECORD to store this cost // from state evaluation and avoid recomputing it here. prioritize_state(chunks_record, the_search); getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias); updated_best_choice = getDict().permute_characters(*char_choices, the_search->best_choice, the_search->raw_choice); bool replaced = false; if (updated_best_choice) { if (getDict().AcceptableChoice(char_choices, the_search->best_choice, NULL, ASSOCIATOR_CALLER, &replaced)) { keep_going = false; } CopyCharChoices(*char_choices, the_search->best_char_choices); } } getDict().SetWordsegRatingAdjustFactor(-1.0f); #ifndef GRAPHICS_DISABLED if (wordrec_display_segmentations) { display_segmentation (chunks_record->chunks, chunk_groups); if (wordrec_display_segmentations > 1) window_wait(segm_window); } #endif if (rating_limit != the_search->best_choice->rating()) { ASSERT_HOST(updated_best_choice); the_search->before_best = the_search->num_states; the_search->best_state->part1 = the_search->this_state->part1; the_search->best_state->part2 = the_search->this_state->part2; replace_char_widths(chunks_record, chunk_groups); } else { ASSERT_HOST(!updated_best_choice); if 
(char_choices != NULL) fixpt->clear(); } if (char_choices != NULL) delete char_choices; memfree(chunk_groups); return (keep_going); }
void tesseract::Wordrec::expand_node | ( | FLOAT32 | worst_priority, |
CHUNKS_RECORD * | chunks_record, | ||
SEARCH_RECORD * | the_search | ||
) |
Definition at line 499 of file bestfirst.cpp.
/*
 * Expands the current segmentation state by generating every neighbor
 * that differs in exactly one joint bit. The state is a two-word bitset:
 * part1 carries the joints above 32 (first loop, x from num_joints down
 * to 33), part2 the low 32 joints (second loop). Each unvisited neighbor
 * is prioritized and pushed onto the open-state queue only when its
 * merit is within worst_priority; otherwise it is logged and skipped.
 * NOTE(review): the initial mask `1 << (num_joints - 1 - 32)` implies a
 * negative shift when num_joints <= 32; the first loop then executes
 * zero iterations so the value is unused, but the computation itself
 * looks dubious — confirm against bestfirst.cpp.
 */
{ STATE old_state; int x; uinT32 mask = 1 << (the_search->num_joints - 1 - 32); old_state.part1 = the_search->this_state->part1; old_state.part2 = the_search->this_state->part2; // We need to expand the search more intelligently, or we get stuck // with a bad starting segmentation in a long word sequence as in CJK. // Expand a child node only if it is within the global bound, and no // worse than 2x of its parent. // TODO(dsl): There is some redudency here in recomputing the priority, // and in filtering of old_merit and worst_priority. the_search->this_state->part2 = old_state.part2; for (x = the_search->num_joints; x > 32; x--) { the_search->this_state->part1 = mask ^ old_state.part1; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { FLOAT32 new_merit = prioritize_state(chunks_record, the_search); if (new_merit < worst_priority) { if (wordrec_debug_level > 1) log_state("Pushing segstate", the_search->num_joints, the_search->this_state, new_merit); push_queue(the_search->open_states, the_search->this_state, worst_priority, new_merit, wordrec_debug_level > 1); } else { if (wordrec_debug_level > 1) log_state("Ignore weak segstate", the_search->num_joints, the_search->this_state, new_merit); } } mask >>= 1; } if (the_search->num_joints > 32) { mask = 1 << 31; } else { mask = 1 << (the_search->num_joints - 1); } the_search->this_state->part1 = old_state.part1; while (x--) { the_search->this_state->part2 = mask ^ old_state.part2; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { FLOAT32 new_merit = prioritize_state(chunks_record, the_search); if (new_merit < worst_priority) { if (wordrec_debug_level > 1) log_state("Pushing segstate", the_search->num_joints, the_search->this_state, new_merit); push_queue(the_search->open_states, the_search->this_state, worst_priority, new_merit, wordrec_debug_level > 1); } else { if (wordrec_debug_level > 1) log_state("Ignoring weak segstate", the_search->num_joints, the_search->this_state, 
new_merit); } } mask >>= 1; } }
// Builds a one-element rating list holding a synthetic BLOB_CHOICE with
// the given class id, rating and certainty (all other fields zeroed /
// defaulted). The caller owns the returned list.
BLOB_CHOICE_LIST *tesseract::Wordrec::fake_classify_blob(UNICHAR_ID class_id,
                                                         float rating,
                                                         float certainty) {
  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();  // matcher result
  BLOB_CHOICE *choice =
      new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false);
  BLOB_CHOICE_IT inserter(ratings);
  inserter.add_after_stay_put(choice);
  return ratings;
}
void tesseract::Wordrec::fill_filtered_fragment_list | ( | BLOB_CHOICE_LIST * | choices, |
int | fragment_pos, | ||
int | num_frag_parts, | ||
BLOB_CHOICE_LIST * | filtered_choices | ||
) |
Definition at line 136 of file pieces.cpp.
{ BLOB_CHOICE_IT filtered_choices_it(filtered_choices); BLOB_CHOICE_IT choices_it(choices); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); if (frag != NULL && frag->get_pos() == fragment_pos && frag->get_total() == num_frag_parts) { // Recover the unichar_id of the unichar that this fragment is // a part of BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data()); int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); b->set_unichar_id(original_unichar); filtered_choices_it.add_to_end(b); } } filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>); }
void tesseract::Wordrec::FillLattice | ( | const MATRIX & | ratings, |
const LIST & | best_choices, | ||
const UNICHARSET & | unicharset, | ||
BlamerBundle * | blamer_bundle | ||
) |
void tesseract::Wordrec::FinishBlamerForSegSearch | ( | const WERD_CHOICE * | best_choice, |
BlamerBundle * | blamer_bundle, | ||
STRING * | blamer_debug | ||
) | [protected] |
Definition at line 376 of file segsearch.cpp.
/*
 * Finalizes blame assignment after segmentation search (the long comment
 * inside the body explains the policy in detail). In short: classifier
 * blame when the incorrect best choice is a dictionary top choice;
 * pain-point-prioritization blame when the correctly-segmented path rated
 * better than best_choice; otherwise the classifier/LM tradeoff.
 * NOTE(review): the " vs. / best choice rating " debug string is split
 * across two rendered lines — a documentation-rendering artifact, a
 * single literal in segsearch.cpp; confirm before editing.
 */
{ // If we are still looking for blame (i.e. best_choice is incorrect, but a // path representing the correct segmentation could be constructed), we can // blame segmentation search pain point prioritization if the rating of the // path corresponding to the correct segmentation is better than that of // best_choice (i.e. language model would have done the correct thing, but // because of poor pain point prioritization the correct segmentation was // never explored). Otherwise we blame the tradeoff between the language model // and the classifier, since even after exploring the path corresponding to // the correct segmentation incorrect best_choice would have been chosen. // One special case when we blame the classifier instead is when best choice // is incorrect, but it is a dictionary word and it classifier's top choice. if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) { blamer_bundle->segsearch_is_looking_for_blame = false; if (blamer_bundle->best_choice_is_dict_and_top_choice) { *blamer_debug = "Best choice is: incorrect, top choice, dictionary word"; *blamer_debug += " with permuter "; *blamer_debug += best_choice->permuter_name(); blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice, wordrec_debug_blamer); } else if (blamer_bundle->best_correctly_segmented_rating < best_choice->rating()) { *blamer_debug += "Correct segmentation state was not explored"; blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice, wordrec_debug_blamer); } else { if (blamer_bundle->best_correctly_segmented_rating >= WERD_CHOICE::kBadRating) { *blamer_debug += "Correct segmentation paths were pruned by LM\n"; } else { char debug_buffer[256]; *blamer_debug += "Best correct segmentation rating "; sprintf(debug_buffer, "%g", blamer_bundle->best_correctly_segmented_rating); *blamer_debug += debug_buffer; *blamer_debug += " vs. 
best choice rating "; sprintf(debug_buffer, "%g", best_choice->rating()); *blamer_debug += debug_buffer; } blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice, wordrec_debug_blamer); } } }
Definition at line 74 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (params split, xmin, xmax) this is presumably
 * full_split_priority(SPLIT*, inT16, inT16) of gradechop.cpp — confirm.
 * Computes the outline bounds of the split and returns 999.0 (a very bad
 * priority) when those bounds lie strictly inside (xmin, xmax); otherwise
 * returns the sum of the overlap, center-of-blob and width-change grades.
 */
{ BOUNDS_RECT rect; set_outline_bounds (split->point1, split->point2, rect); if (xmin < MIN (rect[0], rect[2]) && xmax > MAX (rect[1], rect[3])) return (999.0); return (grade_overlap (rect) + grade_center_of_blob (rect) + grade_width_change (rect)); }
void tesseract::Wordrec::get_fragment_lists | ( | inT16 | current_frag, |
inT16 | current_row, | ||
inT16 | start, | ||
inT16 | num_frag_parts, | ||
inT16 | num_blobs, | ||
MATRIX * | ratings, | ||
BLOB_CHOICE_LIST * | choice_lists | ||
) |
Definition at line 292 of file pieces.cpp.
{ if (current_frag == num_frag_parts) { merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, choice_lists, ratings); return; } for (inT16 x = current_row; x < num_blobs; x++) { BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); if (choices == NULL) continue; fill_filtered_fragment_list(choices, current_frag, num_frag_parts, &choice_lists[current_frag]); if (!choice_lists[current_frag].empty()) { get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, num_blobs, ratings, choice_lists); choice_lists[current_frag].clear(); } } }
// Returns the variance of the per-gap samples in wrec, each sample being
// the sum of two adjacent entries of the interleaved widths array scaled
// by 1/norm_height. (Presumably widths[2x] is a character width and
// widths[2x+1] the following gap — matches the WIDTH_RECORD layout used
// elsewhere; confirm against heuristic.cpp.)
FLOAT32 tesseract::Wordrec::get_gap_variance(WIDTH_RECORD *wrec,
                                             float norm_height) {
  MEASUREMENT stats;
  new_measurement(stats);
  for (int i = 0; i < wrec->num_chars - 1; i++) {
    FLOAT32 gap_ratio =
        (wrec->widths[2 * i] + wrec->widths[2 * i + 1]) * 1.0 / norm_height;
    ADD_SAMPLE(stats, gap_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Gap Mean=%g Var=%g\n", MEAN(stats), VARIANCE(stats));
  return VARIANCE(stats);
}
// Returns the classification of the blob span [start, end], using the
// ratings matrix as a cache: on a miss (NOT_CLASSIFIED) the joined span
// is classified via classify_piece() and the fresh result is stored back
// into the matrix.
BLOB_CHOICE_LIST *tesseract::Wordrec::get_piece_rating(
    MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams,
    inT16 start, inT16 end, BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs, denorm, seams, start, end, blamer_bundle);
    ratings->put(start, end, choices);
    if (wordrec_debug_level > 1) {
      tprintf("get_piece_rating(): updated ratings matrix\n");
      ratings->print(getDict().getUnicharset());
    }
  }
  return (choices);
}
// Returns the variance of the character width / norm_height ratios in
// wrec. The last entry is skipped when its ratio exceeds 0.3 so that
// trailing punctuation does not skew the statistics.
FLOAT32 tesseract::Wordrec::get_width_variance(WIDTH_RECORD *wrec,
                                               float norm_height) {
  MEASUREMENT stats;
  new_measurement(stats);
  for (int i = 0; i < wrec->num_chars; i++) {
    FLOAT32 wh_ratio = wrec->widths[2 * i] * 1.0f / norm_height;
    // Exclude trailing punctuation from the statistics.
    if (i == wrec->num_chars - 1 && wh_ratio > 0.3)
      continue;
    ADD_SAMPLE(stats, wh_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Width Mean=%g Var=%g\n", MEAN(stats), VARIANCE(stats));
  return VARIANCE(stats);
}
// Grades how far the split is from the blob's center: the penalty is the
// absolute difference between the widths of the two bound intervals in
// rect, scaled by chop_center_knob and capped at CENTER_GRADE_CAP.
// The result is never negative.
PRIORITY tesseract::Wordrec::grade_center_of_blob(register BOUNDS_RECT rect) {
  register PRIORITY penalty;

  penalty = (rect[1] - rect[0]) - (rect[3] - rect[2]);
  if (penalty < 0)
    penalty = -penalty;

  penalty *= chop_center_knob;
  penalty = MIN(CENTER_GRADE_CAP, penalty);
  return (MAX(0.0, penalty));
}
// Grades the horizontal overlap of the two bound intervals in rect.
// Complete containment of the narrower interval yields a fixed penalty
// of 100; otherwise the penalty grows with the overlap — with an extra
// charge once the overlap exceeds half the narrower width — scaled by
// chop_overlap_knob. The result is never negative.
PRIORITY tesseract::Wordrec::grade_overlap(register BOUNDS_RECT rect) {
  register PRIORITY penalty;
  register inT16 width1;
  register inT16 width2;
  register inT16 overlap;

  width1 = rect[3] - rect[2];
  width2 = rect[1] - rect[0];
  overlap = MIN(rect[1], rect[3]) - MAX(rect[0], rect[2]);
  width1 = MIN(width1, width2);
  if (overlap == width1)
    return (100.0);                /* Total overlap */

  width1 = 2 * overlap - width1;   /* Extra penalty for too */
  overlap += MAX(0, width1);       /* much overlap */

  penalty = overlap * chop_overlap_knob;
  return (MAX(0.0, penalty));
}
Definition at line 168 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (a SPLIT* graded by point_priority and chop_sharpness_knob)
 * this is presumably grade_sharpness(SPLIT*) of gradechop.cpp — confirm.
 * Sums the point priorities of the two split endpoints (values range
 * 0 to -360), shifts the sum into a non-negative range clamped at 0,
 * and scales by chop_sharpness_knob.
 */
{ register PRIORITY grade; grade = point_priority (split->point1) + point_priority (split->point2); if (grade < -360.0) grade = 0; else grade += 360.0; grade *= chop_sharpness_knob; /* Values 0 to -360 */ return (grade); }
Definition at line 145 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (weighted_edgept_dist with chop_x_y_weight, scaled by
 * chop_split_dist_knob) this is presumably grade_split_length(SPLIT*) of
 * gradechop.cpp — confirm.
 * Grades the length of the split: zero for non-positive weighted length,
 * otherwise sqrt(length) * chop_split_dist_knob, never negative.
 */
{ register PRIORITY grade; register float split_length; split_length = weighted_edgept_dist (split->point1, split->point2, chop_x_y_weight); if (split_length <= 0) grade = 0; else grade = sqrt (split_length) * chop_split_dist_knob; return (MAX (0.0, grade)); }
PRIORITY tesseract::Wordrec::grade_width_change | ( | register BOUNDS_RECT | rect | ) |
Definition at line 191 of file gradechop.cpp.
void tesseract::Wordrec::improve_by_chopping | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
STATE * | best_state, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
DANGERR * | fixpt, | ||
bool * | updated_best_choice | ||
) |
Definition at line 741 of file chopper.cpp.
/*
 * Iteratively improves the word by chopping: each pass calls
 * improve_one_blob() and re-permutes the character choices; best_state
 * is reset to all-ones when the best rating improved, otherwise the new
 * chunk is inserted into it. The loop stops when no blob can be chopped,
 * when the dictionary accepts the best choice, or when char_choices
 * reaches MAX_NUM_CHUNKS.
 * NOTE(review): the rendered body writes *best_choice_acceptable while
 * the rendered signature names the out-parameter updated_best_choice
 * (which the body also declares as a local) — likely a documentation-
 * rendering mismatch with the real parameter name; confirm against
 * chopper.cpp before relying on either name.
 */
{ inT32 blob_number; float old_best; bool updated_best_choice = false; while (1) { // improvement loop old_best = word->best_choice->rating(); if (improve_one_blob(word, char_choices, &blob_number, &word->seam_array, fixpt, (fragments_guide_chopper && word->best_choice->fragment_mark()), word->blamer_bundle)) { getDict().LogNewSplit(blob_number); updated_best_choice = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (old_best > word->best_choice->rating()) { set_n_ones(best_state, char_choices->length() - 1); } else { insert_new_chunk(best_state, blob_number, char_choices->length() - 2); fixpt->clear(); } if (chop_debug) print_state("best state = ", best_state, count_blobs(word->chopped_word->blobs) - 1); } else { break; } // Check if we should break from the loop. bool done = false; bool replaced = false; if ((updated_best_choice && (*best_choice_acceptable = getDict().AcceptableChoice(char_choices, word->best_choice, fixpt, CHOPPER_CALLER, &replaced))) || char_choices->length() >= MAX_NUM_CHUNKS) { done = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices); if (done) break; } }
bool tesseract::Wordrec::improve_one_blob | ( | WERD_RES * | word_res, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
inT32 * | blob_number, | ||
SEAMS * | seam_list, | ||
DANGERR * | fixpt, | ||
bool | split_next_to_fragment, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 332 of file chopper.cpp.
{
  // Picks one blob (from the dictionary fix points if available, otherwise
  // the worst-rated blob) and chops it in two, updating the seam list and
  // the per-blob choice lists. Returns false if no blob could be chopped.
  TWERD* word = word_res->chopped_word;
  TBLOB *blob;
  inT16 x = 0;
  float rating_ceiling = MAX_FLOAT32;  // only consider blobs rated below this
  BLOB_CHOICE_LIST *answer;
  BLOB_CHOICE_IT answer_it;
  SEAM *seam;

  do {
    // Prefer a split point suggested by the dictionary (fixpt).
    *blob_number = select_blob_to_split_from_fixpt(fixpt);
    bool split_point_from_dict = (*blob_number != -1);
    if (split_point_from_dict) {
      fixpt->clear();
    } else {
      *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
                                          split_next_to_fragment);
    }
    if (chop_debug)
      cprintf("blob_number = %d\n", *blob_number);
    if (*blob_number == -1)
      return false;  // nothing left to try

    // TODO(rays) it may eventually help to allow italic_blob to be true,
    seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
    if (seam != NULL)
      break;  /* Must split null blobs */

    answer = char_choices->get(*blob_number);
    if (answer == NULL)
      return false;
    answer_it.set_to_list(answer);
    if (!split_point_from_dict) {
      // We chopped the worst rated blob, try something else next time.
      rating_ceiling = answer_it.data()->rating();
    }
  } while (true);

  /* Split OK */
  // Walk to the blob that was split.
  for (blob = word->blobs; x < *blob_number; x++) {
    blob = blob->next;
  }
  *seam_list = insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);

  // Replace the old choice list with fresh classifications of the two halves.
  delete char_choices->get(*blob_number);

  answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
                         blamer_bundle);
  char_choices->insert(answer, *blob_number);

  answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
                         blamer_bundle);
  char_choices->set(answer, *blob_number + 1);
  return true;
}
void tesseract::Wordrec::InitBlamerForSegSearch | ( | const WERD_CHOICE * | best_choice, |
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BlamerBundle * | blamer_bundle, | ||
STRING * | blamer_debug | ||
) | [protected] |
Definition at line 331 of file segsearch.cpp.
{
  // Prepares the blamer for segmentation search: clears the pain-points heap
  // and seeds it with pain points covering every unclassified matrix cell of
  // the known-correct segmentation, so SegSearch is forced to evaluate it.
  blamer_bundle->segsearch_is_looking_for_blame = true;
  if (wordrec_debug_blamer) {
    tprintf("segsearch starting to look for blame\n");
  }

  // Clear pain points heap.
  int pop;
  float pain_point_priority;
  MATRIX_COORD *pain_point;
  while ((pop = HeapPop(pain_points, &pain_point_priority,
                        &pain_point)) != EMPTY) {
    delete pain_point;
  }

  // Fill pain points for any unclassified blob corresponding to the
  // correct segmentation state.
  *blamer_debug += "Correct segmentation:\n";
  for (int idx = 0;
       idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
    blamer_debug->add_str_int(
        "col=", blamer_bundle->correct_segmentation_cols[idx]);
    blamer_debug->add_str_int(
        " row=", blamer_bundle->correct_segmentation_rows[idx]);
    *blamer_debug += "\n";
    if (chunks_record->ratings->get(
        blamer_bundle->correct_segmentation_cols[idx],
        blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
      if (!language_model_->GeneratePainPoint(
          blamer_bundle->correct_segmentation_cols[idx],
          blamer_bundle->correct_segmentation_rows[idx],
          false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
          NULL, NULL, chunks_record, pain_points)) {
        // Could not insert a required pain point: give up on blaming the
        // segmentation search and record the reason.
        blamer_bundle->segsearch_is_looking_for_blame = false;
        *blamer_debug += "\nFailed to insert pain point\n";
        blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
                                wordrec_debug_blamer);
        break;
      }
    }
  }  // end for blamer_bundle->correct_segmentation_cols/rows
}
Definition at line 70 of file outlines.cpp.
{
  // Strict segment-intersection test for segments a0-a1 and b0-b1 using
  // cross-product sign checks: each segment's endpoints must lie strictly
  // on opposite sides of the other segment's line.
  int b0a1xb0b1, b0b1xb0a0;
  int a1b1xa1a0, a1a0xa1b0;
  TPOINT b0a1, b0a0, a1b1, b0b1, a1a0;

  // Difference vectors between the endpoints.
  b0a1.x = a1.x - b0.x;
  b0a0.x = a0.x - b0.x;
  a1b1.x = b1.x - a1.x;
  b0b1.x = b1.x - b0.x;
  a1a0.x = a0.x - a1.x;
  b0a1.y = a1.y - b0.y;
  b0a0.y = a0.y - b0.y;
  a1b1.y = b1.y - a1.y;
  b0b1.y = b1.y - b0.y;
  a1a0.y = a0.y - a1.y;

  b0a1xb0b1 = CROSS (b0a1, b0b1);
  b0b1xb0a0 = CROSS (b0b1, b0a0);
  a1b1xa1a0 = CROSS (a1b1, a1a0);
  // Since a1b0 == -(b0a1), CROSS(a1a0, a1b0) == -CROSS(a1a0, b0a1);
  // the negated form below avoids computing a1b0 explicitly.
  /*a1a0xa1b0=CROSS(a1a0,a1b0); */
  a1a0xa1b0 = -CROSS (a1a0, b0a1);

  // Both straddle tests must hold with strictly matching signs; collinear
  // (zero cross product) cases are treated as not crossed.
  return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) ||
          (b0a1xb0b1 < 0 && b0b1xb0a0 < 0))
    && ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) ||
        (a1b1xa1a0 < 0 && a1a0xa1b0 < 0));
}
Definition at line 123 of file chop.cpp.
{
  // Checks whether point1 and point2 delimit only a small chunk of outline:
  // walks at most chop_min_outline_points steps in each direction looking
  // for the other endpoint, and if found tests the enclosed area.
  EDGEPT *p = point1;           /* Iterator */
  int counter = 0;

  do {                          /* Go from P1 to P2 */
    if (is_same_edgept (point2, p)) {
      if (is_small_area (point1, point2))
        return (TRUE);
      else
        break;  // reachable but not small; try the other direction
    }
    p = p->next;
  }
  while ((p != point1) && (counter++ < chop_min_outline_points));

  /* Go from P2 to P1 */
  p = point2;
  counter = 0;
  do {
    if (is_same_edgept (point1, p)) {
      return (is_small_area (point2, point1));
    }
    p = p->next;
  }
  while ((p != point2) && (counter++ < chop_min_outline_points));

  return (FALSE);
}
Definition at line 104 of file outlines.cpp.
{
  // Two edge points are "the same" exactly when they are the same object.
  return p1 == p2;
}
BLOB_CHOICE_LIST * tesseract::Wordrec::join_blobs_and_classify | ( | WERD_RES * | word, |
int | x, | ||
int | y, | ||
int | choice_index, | ||
MATRIX * | ratings, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices | ||
) |
Definition at line 730 of file bestfirst.cpp.
{
  // Temporarily joins blobs x..y of the chopped word into one blob, copies
  // it into rebuild_word, and returns a choice list for it — reusing
  // old_choices or the ratings matrix when possible, classifying otherwise.
  // Join parts to make the blob if needed.
  if (x != y)
    join_pieces(word->chopped_word->blobs, word->seam_array, x, y);
  TBLOB *blob = word->chopped_word->blobs;
  for (int i = 0; i < x; i++) {
    blob = blob->next;
  }
  // Deep copy this blob into the output word (prepended to rebuild_word).
  TBLOB* copy_blob = new TBLOB(*blob);
  copy_blob->next = word->rebuild_word->blobs;
  word->rebuild_word->blobs = copy_blob;

  BLOB_CHOICE_LIST *choices = NULL;
  // First check to see if we can look up the classification
  // in old_choices (if there is no need to merge blobs).
  if (choice_index >= 0 && old_choices != NULL) {
    choices = old_choices->get(choice_index);
    old_choices->set(NULL, choice_index);  // take ownership of the list
  }
  // The ratings matrix filled in by the associator will contain the next most
  // up-to-date classification info. Thus we look up the classification there
  // next, and only call classify_blob() if the classification is not found.
  if (choices == NULL && ratings != NULL) {
    choices = ratings->get(x, y);
    if (choices != NOT_CLASSIFIED) {
      ratings->put(x, y, NULL);  // take ownership out of the matrix
    }
  }
  // Get the choices for the blob by classification if necessary.
  if (choices == NULL) {
    choices = classify_blob(blob, word->denorm, "rebuild", Orange,
                            word->blamer_bundle);
  }
  // Undo join_pieces to restore the chopped word to its fully chopped state.
  if (x != y)
    break_pieces(blob, word->seam_array, x, y);
  return choices;
}
void tesseract::Wordrec::junk_worst_seam | ( | SEAM_QUEUE | seams, |
SEAM * | new_seam, | ||
float | new_priority | ||
) |
Definition at line 148 of file findseam.cpp.
{
  // The queue is full: pop the worst seam and keep whichever of it and the
  // new seam has the lower (better) priority; the loser is freed.
  float worst_priority;
  SEAM *worst_seam;

  HeapPopWorst(seams, &worst_priority, &worst_seam);
  if (new_priority < worst_priority) {
    // New seam is better: replace the popped one.
    delete_seam(worst_seam);
    HeapPush (seams, new_priority, (char *) new_seam);
  } else {
    // Popped seam is at least as good: put it back and drop the new one.
    delete_seam(new_seam);
    HeapPush (seams, worst_priority, (char *) worst_seam);
  }
}
void tesseract::Wordrec::merge_and_put_fragment_lists | ( | inT16 | row, |
inT16 | column, | ||
inT16 | num_frag_parts, | ||
BLOB_CHOICE_LIST * | choice_lists, | ||
MATRIX * | ratings | ||
) |
Definition at line 169 of file pieces.cpp.
{
  // Merge-walks num_frag_parts sorted fragment choice lists in parallel;
  // whenever all iterators point at fragments of the same unichar, a merged
  // BLOB_CHOICE is built and appended to the (row, column) ratings entry.
  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
  for (int i = 0; i < num_frag_parts; i++) {
    choice_lists_it[i].set_to_list(&choice_lists[i]);
    choice_lists_it[i].mark_cycle_pt();
  }

  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
  if (merged_choice == NULL)
    merged_choice = new BLOB_CHOICE_LIST;

  bool end_of_list = false;
  BLOB_CHOICE_IT merged_choice_it(merged_choice);
  while (!end_of_list) {
    // Find the maximum unichar_id of the current entry the iterators
    // are pointing at.
    UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
    int max_list = 0;
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (max_unichar_id < unichar_id) {
        max_unichar_id = unichar_id;
        max_list = i;
      }
    }

    // Move each iterator until it gets to an entry that has a
    // value greater than or equal to max_unichar_id.
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      while (!choice_lists_it[i].cycled_list() &&
             unichar_id < max_unichar_id) {
        choice_lists_it[i].forward();
        unichar_id = choice_lists_it[i].data()->unichar_id();
      }
      if (choice_lists_it[i].cycled_list()) {
        end_of_list = true;  // one list is exhausted: stop merging
        break;
      }
    }
    if (end_of_list) break;

    // Checks if the fragments are parts of the same character.
    UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
    bool same_unichar = true;
    for (int i = 1; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (unichar_id != first_unichar_id) {
        same_unichar = false;
        break;
      }
    }

    if (same_unichar) {
      // Add the merged character to the result: sum the ratings, take the
      // worst certainty, intersect the xheight ranges, and copy the other
      // attributes from the first fragment.
      UNICHAR_ID merged_unichar_id = first_unichar_id;
      inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
      inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
      inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight();
      inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight();
      int merged_script_id = choice_lists_it[0].data()->script_id();
      bool merged_adapted = choice_lists_it[0].data()->adapted();
      float merged_rating = 0, merged_certainty = 0;
      for (int i = 0; i < num_frag_parts; i++) {
        float rating = choice_lists_it[i].data()->rating();
        float certainty = choice_lists_it[i].data()->certainty();
        if (i == 0 || certainty < merged_certainty)
          merged_certainty = certainty;
        merged_rating += rating;
        choice_lists_it[i].forward();
        if (choice_lists_it[i].cycled_list())
          end_of_list = true;
        IntersectRange(choice_lists_it[i].data()->min_xheight(),
                       choice_lists_it[i].data()->max_xheight(),
                       &merged_min_xheight, &merged_max_xheight);
      }
      merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
                                                  merged_rating,
                                                  merged_certainty,
                                                  merged_fontinfo_id,
                                                  merged_fontinfo_id2,
                                                  merged_script_id,
                                                  merged_min_xheight,
                                                  merged_max_xheight,
                                                  merged_adapted));
    }
  }

  if (classify_debug_level)
    print_ratings_list("Merged Fragments", merged_choice, unicharset);

  // Only store a non-empty merged list in the ratings matrix.
  if (merged_choice->empty())
    delete merged_choice;
  else
    ratings->put(row, column, merged_choice);

  delete [] choice_lists_it;
}
Definition at line 324 of file pieces.cpp.
{
  // Merges character-fragment classifications in the ratings matrix into
  // whole-character entries, then strips all remaining fragment choices.
  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
  for (inT16 start = 0; start < num_blobs; start++) {
    for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
         frag_parts++) {
      // Collect and merge fragment runs of length frag_parts beginning
      // at blob index start.
      get_fragment_lists(0, start, start, frag_parts, num_blobs,
                         ratings, choice_lists);
    }
  }

  // Delete fragments from the rating matrix.
  for (inT16 x = 0; x < num_blobs; x++) {
    for (inT16 y = x; y < num_blobs; y++) {
      BLOB_CHOICE_LIST *choices = ratings->get(x, y);
      if (choices != NULL) {
        BLOB_CHOICE_IT choices_it(choices);
        for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
             choices_it.forward()) {
          UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
          const CHAR_FRAGMENT *frag =
              unicharset.get_fragment(choice_unichar_id);
          if (frag != NULL)
            delete choices_it.extract();  // remove fragment choice
        }
      }
    }
  }
}
void tesseract::Wordrec::modify_blob_choice | ( | BLOB_CHOICE_LIST * | answer, |
int | chop_index | ||
) |
Definition at line 403 of file chopper.cpp.
{
  // Replaces the whole choice list with a single synthetic choice whose
  // unichar encodes chop_index (digits 0-9, then 'A'+ for 10 and up), but
  // keeps the rating/certainty/etc. of the original top choice.
  // NOTE(review): the 2-byte buffer assumes chop_index >= 0; a negative
  // index would be truncated by snprintf — presumably guarded by callers.
  char chop_index_string[2];
  if (chop_index <= 9) {
    snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
  } else {
    chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
    chop_index_string[1] = '\0';
  }
  UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
  if (unichar_id == INVALID_UNICHAR_ID) {
    // If the word is very long, we might exhaust the possibilities.
    unichar_id = 1;
  }
  BLOB_CHOICE_IT answer_it(answer);
  // Clone the attributes of the current best choice under the new unichar.
  BLOB_CHOICE *modified_blob =
      new BLOB_CHOICE(unichar_id,
                      answer_it.data()->rating(),
                      answer_it.data()->certainty(),
                      answer_it.data()->fontinfo_id(),
                      answer_it.data()->fontinfo_id2(),
                      answer_it.data()->script_id(),
                      answer_it.data()->min_xheight(),
                      answer_it.data()->max_xheight(),
                      answer_it.data()->adapted());
  answer->clear();
  answer_it.set_to_list(answer);
  answer_it.add_after_then_move(modified_blob);
}
bool tesseract::Wordrec::near_point | ( | EDGEPT * | point, |
EDGEPT * | line_pt_0, | ||
EDGEPT * | line_pt_1, | ||
EDGEPT ** | near_pt | ||
) |
Definition at line 116 of file outlines.cpp.
{
  // Projects point perpendicularly onto the line through line_pt_0 and
  // line_pt_1. If the foot of the perpendicular falls strictly inside the
  // segment, a new edge point is created there (*near_pt) and true is
  // returned; otherwise *near_pt is set to the closer existing endpoint
  // and false is returned.
  TPOINT p;
  float slope;
  float intercept;

  float x0 = line_pt_0->pos.x;
  float x1 = line_pt_1->pos.x;
  float y0 = line_pt_0->pos.y;
  float y1 = line_pt_1->pos.y;

  if (x0 == x1) {
    /* Handle vertical line */
    p.x = (inT16) x0;
    p.y = point->pos.y;
  } else {
    /* Slope and intercept */
    slope = (y0 - y1) / (x0 - x1);
    intercept = y1 - x1 * slope;

    /* Find perpendicular */
    p.x = (inT16) ((point->pos.x + (point->pos.y - intercept) * slope) /
                   (slope * slope + 1));
    p.y = (inT16) (slope * p.x + intercept);
  }

  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
      (!same_point (p, line_pt_0->pos)) &&
      (!same_point (p, line_pt_1->pos))) {
    /* Intersection on line */
    *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
    return true;
  } else {
    /* Intersection not on line */
    *near_pt = closest(point, line_pt_0, line_pt_1);
    return false;
  }
}
void tesseract::Wordrec::new_max_point | ( | EDGEPT * | local_max, |
POINT_GROUP | points | ||
) |
Definition at line 303 of file chop.cpp.
{
  // Accept this local maximum as a candidate chop point when the curvature
  // is convex (dir > 0), or flat (dir == 0) with a negative point priority.
  // point_priority() is only evaluated in the flat case, as before.
  inT16 dir = direction (local_max);

  if (dir > 0 || (dir == 0 && point_priority (local_max) < 0))
    add_point_to_list(points, local_max);
}
void tesseract::Wordrec::new_min_point | ( | EDGEPT * | local_min, |
POINT_GROUP | points | ||
) |
Definition at line 279 of file chop.cpp.
{
  // Accept this local minimum as a candidate chop point when the curvature
  // is concave (dir < 0), or flat (dir == 0) with a negative point priority.
  // point_priority() is only evaluated in the flat case, as before.
  inT16 dir = direction (local_min);

  if (dir < 0 || (dir == 0 && point_priority (local_min) < 0))
    add_point_to_list(points, local_min);
}
SEARCH_RECORD * tesseract::Wordrec::new_search | ( | CHUNKS_RECORD * | chunks_record, |
int | num_joints, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_CHOICE * | best_choice, | ||
WERD_CHOICE * | raw_choice, | ||
STATE * | state | ||
) |
Definition at line 568 of file bestfirst.cpp.
{
  // Allocates and initializes a SEARCH_RECORD for best-first segmentation
  // search, seeding this/first/best states from the supplied initial state.
  SEARCH_RECORD *this_search;

  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));

  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
  this_search->closed_states = new_hash_table();

  if (state)
    this_search->this_state = new_state (state);
  else
    // NOTE(review): when state is NULL, this_state is left uninitialized yet
    // is copied into first_state/best_state below — callers presumably never
    // pass NULL; confirm.
    cprintf ("error: bad initial state in new_search\n");

  this_search->first_state = new_state (this_search->this_state);
  this_search->best_state = new_state (this_search->this_state);

  this_search->best_choice = best_choice;
  this_search->raw_choice = raw_choice;
  this_search->best_char_choices = best_char_choices;

  this_search->num_joints = num_joints;
  this_search->num_states = 0;
  this_search->before_best = 0;
  this_search->segcost_bias = 0;

  return (this_search);
}
EDGEPT * tesseract::Wordrec::pick_close_point | ( | EDGEPT * | critical_point, |
EDGEPT * | vertical_point, | ||
int * | best_dist | ||
) |
Definition at line 182 of file chop.cpp.
{
  // Creeps along the outline from vertical_point looking for the closest
  // acceptable point to critical_point within *best_dist; updates
  // *best_dist and returns the chosen point (NULL if none qualifies).
  EDGEPT *best_point = NULL;
  int this_distance;
  int found_better;

  do {
    found_better = FALSE;

    this_distance = edgept_dist (critical_point, vertical_point);
    if (this_distance <= *best_dist) {
      // Reject degenerate pairs (coincident points) and exterior points.
      if (!(same_point (critical_point->pos, vertical_point->pos) ||
            same_point (critical_point->pos, vertical_point->next->pos) ||
            (best_point && same_point (best_point->pos,
                                       vertical_point->pos)) ||
            is_exterior_point (critical_point, vertical_point))) {
        *best_dist = this_distance;
        best_point = vertical_point;
        // Only keep creeping if vertical creep is enabled.
        if (chop_vertical_creep)
          found_better = TRUE;
      }
    }
    vertical_point = vertical_point->next;
  }
  while (found_better == TRUE);

  return (best_point);
}
Definition at line 380 of file findseam.cpp.
{
  // Tries to find a good seam (split) for this blob: collects the highest
  // priority candidate points, tries point pairs and vertical splits, and
  // falls back to a full best-seam search. Returns NULL if the best seam
  // found is worse than chop_ok_split.
  SEAM_QUEUE seam_queue;
  SEAM_PILE seam_pile;
  POINT_GROUP point_heap;
  PRIORITY priority;
  EDGEPT *edge;
  EDGEPT *points[MAX_NUM_POINTS];
  EDGEPT_CLIST new_points;
  SEAM *seam = NULL;
  TESSLINE *outline;
  inT16 num_points = 0;

#ifndef GRAPHICS_DISABLED
  if (chop_debug > 2)
    wordrec_display_splits.set_value(true);

  draw_blob_edges(blob);
#endif

  // Gather candidate split points from every outline into a priority heap,
  // then drain the best MAX_NUM_POINTS of them into the points array.
  point_heap = MakeHeap (MAX_NUM_POINTS);
  for (outline = blob->outlines; outline; outline = outline->next)
    prioritize_points(outline, point_heap);

  while (HeapPop (point_heap, &priority, &edge) == TESS_HEAP_OK) {
    if (num_points < MAX_NUM_POINTS)
      points[num_points++] = (EDGEPT *) edge;
  }
  FreeHeap(point_heap);

  /* Initialize queue & pile */
  create_seam_pile(seam_pile);
  create_seam_queue(seam_queue);

  try_point_pairs(points, num_points, seam_queue, &seam_pile, &seam, blob);

  try_vertical_splits(points, num_points, &new_points,
                      seam_queue, &seam_pile, &seam, blob);

  // If nothing good yet (or only a mediocre seam), run the full search.
  if (seam == NULL) {
    choose_best_seam(seam_queue, &seam_pile, NULL, BAD_PRIORITY, &seam, blob);
  } else if (seam->priority > chop_good_split) {
    choose_best_seam (seam_queue, &seam_pile, NULL, seam->priority,
                      &seam, blob);
  }

  // Remove vertical-split points that the chosen seam does not use.
  EDGEPT_C_IT it(&new_points);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    EDGEPT *inserted_point = it.data();
    if (!point_used_by_seam(seam, inserted_point)) {
      remove_edgept(inserted_point);
    }
  }

  delete_seam_queue(seam_queue);
  delete_seam_pile(seam_pile);

  if (seam) {
    // Reject seams worse than the acceptance threshold.
    if (seam->priority > chop_ok_split) {
      delete_seam(seam);
      seam = NULL;
    }
#ifndef GRAPHICS_DISABLED
    else if (wordrec_display_splits) {
      if (seam->split1)
        mark_split (seam->split1);
      if (seam->split2)
        mark_split (seam->split2);
      if (seam->split3)
        mark_split (seam->split3);
      if (chop_debug > 2) {
        update_edge_window();
        edge_window_wait();
      }
    }
#endif
  }

  if (chop_debug)
    wordrec_display_splits.set_value(false);

  return (seam);
}
Definition at line 607 of file bestfirst.cpp.
{
  // Returns the best (top) state from the open-states heap, or NULL when
  // the heap is empty. Optionally traces the popped state.
  HEAPENTRY entry;

  if (GetTopOfHeap (queue, &entry) != TESS_HEAP_OK)
    return (NULL);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    cprintf ("eval state: %8.3f ", entry.Key);
    print_state ("", (STATE *) entry.Data, num_joints);
  }
#endif
  return ((STATE *) entry.Data);
}
void tesseract::Wordrec::prioritize_points | ( | TESSLINE * | outline, |
POINT_GROUP | points | ||
) |
Definition at line 220 of file chop.cpp.
{
  // Scans one outline loop tracking runs of rising/falling y to detect local
  // minima and maxima, handing each candidate to new_min_point/new_max_point
  // (or directly to the list for inside angles).
  EDGEPT *this_point;
  EDGEPT *local_min = NULL;
  EDGEPT *local_max = NULL;

  this_point = outline->loop;
  local_min = this_point;
  local_max = this_point;
  do {
    if (this_point->vec.y < 0) {
      /* Look for minima */
      if (local_max != NULL)
        new_max_point(local_max, points);
      else if (is_inside_angle (this_point))
        add_point_to_list(points, this_point);
      local_max = NULL;
      local_min = this_point->next;
    } else if (this_point->vec.y > 0) {
      /* Look for maxima */
      if (local_min != NULL)
        new_min_point(local_min, points);
      else if (is_inside_angle (this_point))
        add_point_to_list(points, this_point);
      local_min = NULL;
      local_max = this_point->next;
    } else {
      /* Flat area */
      // NOTE(review): in this branch local_min is dereferenced when
      // local_max == NULL; local_min appears never NULL here because both
      // start non-NULL and each reset of one assigns the other — confirm.
      if (local_max != NULL) {
        if (local_max->prev->vec.y != 0) {
          new_max_point(local_max, points);
        }
        local_max = this_point->next;
        local_min = NULL;
      } else {
        if (local_min->prev->vec.y != 0) {
          new_min_point(local_min, points);
        }
        local_min = this_point->next;
        local_max = NULL;
      }
    }

    /* Next point */
    this_point = this_point->next;
  }
  while (this_point != outline->loop);
}
FLOAT32 tesseract::Wordrec::prioritize_state | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_RECORD * | the_search | ||
) |
Definition at line 289 of file heuristic.cpp.
{
  // Computes the search priority of a segmentation state by combining
  // rating (shape), width and seam-cut heuristics; optionally computes a
  // fixed-pitch segmentation bias stored in the_search->segcost_bias.
  FLOAT32 shape_cost;
  FLOAT32 width_cost;
  FLOAT32 seam_cost;

  shape_cost = rating_priority(chunks_record,
                               the_search->this_state,
                               the_search->num_joints);

  width_cost = width_priority(chunks_record,
                              the_search->this_state,
                              the_search->num_joints);

  // The rating_priority is the same as the original, and the width_priority
  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
  // So this would return the original state priority.
  if (!use_new_state_cost)
    return width_cost * 1000 + shape_cost;

  seam_cost = seamcut_priority(chunks_record->splits,
                               the_search->this_state,
                               the_search->num_joints);

  // TODO(dsl): how do we normalize the scores for these separate evidence?
  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
                       width_cost * heuristic_weight_width +
                       seam_cost * heuristic_weight_seamcut;

  // We don't have an adjustment model for variable pitch segmentation cost
  // into word rating
  if (assume_fixed_pitch_char_segment) {
    // Each heuristic above its comfort range multiplies the bias.
    float seg_bias = 1.0;
    if (width_cost < 1) seg_bias *= 0.85;
    if (width_cost > 3)
      seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
    if (seam_cost > 10)
      seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
    if (shape_cost > 5)
      seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
    if (segment_adjust_debug) {
      tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n",
              total_cost, seg_bias, shape_cost, width_cost, seam_cost);
    }
    the_search->segcost_bias = seg_bias;
  } else {
    the_search->segcost_bias = 0;
  }

  return total_cost;
}
void tesseract::Wordrec::ProcessSegSearchPainPoint | ( | float | pain_point_priority, |
const MATRIX_COORD & | pain_point, | ||
const WERD_CHOICE * | best_choice, | ||
SEG_SEARCH_PENDING_LIST * | pending[], | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BlamerBundle * | blamer_bundle | ||
) | [protected] |
Definition at line 257 of file segsearch.cpp.
{
  // Classifies the blob span given by a popped pain point, stores the result
  // in the ratings matrix, seeds neighbor pain points, and records pending
  // entries so the language model revisits the updated column.
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
            pain_point_priority, pain_point.col, pain_point.row);
  }
  MATRIX *ratings = chunks_record->ratings;
  BLOB_CHOICE_LIST *classified = classify_piece(
      chunks_record->chunks, chunks_record->word_res->denorm,
      chunks_record->splits, pain_point.col, pain_point.row, blamer_bundle);
  ratings->put(pain_point.col, pain_point.row, classified);

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (!classified->empty()) {
    float worst_piece_cert;
    bool fragmented;
    if (pain_point.col > 0) {
      // Left neighbor.
      language_model_->GetWorstPieceCertainty(
          pain_point.col-1, pain_point.row, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col-1, pain_point.row, false,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
    if (pain_point.row+1 < ratings->dimension()) {
      // Right neighbor.
      language_model_->GetWorstPieceCertainty(
          pain_point.col, pain_point.row+1, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col, pain_point.row+1, true,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
  }

  // Record a pending entry with the pain_point and each of its parents.
  // NOTE(review): parent_row is derived from pain_point.col — this matches
  // the matrix convention where a parent ends at column-1; confirm against
  // the MATRIX layout.
  int parent_row = pain_point.col - 1;
  if (parent_row < 0) {  // this node has no parents
    (*pending)[pain_point.col].add_sorted(
        SEG_SEARCH_PENDING::compare, true,
        new SEG_SEARCH_PENDING(pain_point.row, NULL,
                               LanguageModel::kAllChangedFlag));
  } else {
    for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
      if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
        (*pending)[pain_point.col].add_sorted(
            SEG_SEARCH_PENDING::compare, true,
            new SEG_SEARCH_PENDING(pain_point.row,
                                   ratings->get(parent_col, parent_row),
                                   LanguageModel::kAllChangedFlag));
      }
    }
  }
}
void tesseract::Wordrec::program_editdown | ( | inT32 | elasped_time | ) |
Definition at line 80 of file tface.cpp.
{
  // Shuts down word recognition: closes the adaptive classifier and the
  // blob match table, then resets and ends the dictionary.
  // NOTE(review): the elapsed-time parameter is not used in this body.
  EndAdaptiveClassifier();
  blob_match_table.end_match_table();
  getDict().InitChoiceAccum();
  getDict().End();
}
void tesseract::Wordrec::program_editup | ( | const char * | textbase, |
bool | init_classifier, | ||
bool | init_permute | ||
) |
Definition at line 50 of file tface.cpp.
{
  // Initializes word recognition: feature definitions, extractors, the
  // adaptive classifier, and (optionally) the dictionary; then snapshots
  // the pass-2 chopper/search parameters.
  if (textbase != NULL)
    imagefile = textbase;
  InitFeatureDefs(&feature_defs_);
  SetupExtractors(&feature_defs_);
  InitAdaptiveClassifier(init_classifier);
  // NOTE(review): the documented parameter name is init_permute but the
  // body tests init_dict — presumably the declared name; confirm.
  if (init_dict) getDict().Load();
  pass2_ok_split = chop_ok_split;
  pass2_seg_states = wordrec_num_seg_states;
}
void tesseract::Wordrec::push_queue | ( | HEAP * | queue, |
STATE * | state, | ||
FLOAT32 | worst_priority, | ||
FLOAT32 | priority, | ||
bool | debug | ||
) |
Definition at line 629 of file bestfirst.cpp.
{
  // Pushes a copy of state onto the open-states heap, but only if its
  // priority beats worst_priority and the heap has room.
  HEAPENTRY entry;

  if (!(priority < worst_priority))
    return;  // not promising enough to enqueue

  if (SizeOfHeap (queue) >= MaxSizeOfHeap(queue)) {
    if (debug) tprintf("Heap is Full\n");
    return;
  }

  entry.Data = (char *) new_state (state);
  num_pushed++;
  entry.Key = priority;
  HeapStore(queue, &entry);
}
FLOAT32 tesseract::Wordrec::rating_priority | ( | CHUNKS_RECORD * | chunks_record, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 175 of file heuristic.cpp.
{
  // Heuristic shape cost of a segmentation state: sum of the top-choice
  // ratings of each piece divided by the summed chunk weights (lower is
  // better). Unclassified/empty pieces contribute nothing.
  BLOB_CHOICE_LIST *blob_choices;
  BLOB_CHOICE_IT blob_choice_it;
  inT16 first_chunk = 0;
  inT16 last_chunk;
  inT16 ratings = 0;
  inT16 weights = 0;
  PIECES_STATE blob_chunks;

  // Decode the joint bit-vector into per-piece chunk counts.
  bin_to_pieces(state, num_joints, blob_chunks);

  for (int x = 0; blob_chunks[x]; x++) {
    last_chunk = first_chunk + blob_chunks[x];

    blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
    if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
      blob_choice_it.set_to_list(blob_choices);
      // Top (first) choice rating for this piece.
      ratings += (inT16) blob_choice_it.data()->rating();
      for (int y = first_chunk; y < last_chunk; y++) {
        weights += (inT16) (chunks_record->weights[y]);
      }
    }
    first_chunk = last_chunk;
  }
  if (weights <= 0)
    weights = 1;  // guard against division by zero
  FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
                        static_cast<FLOAT32>(weights);
  // BUG FIX: ratings and weights are integers, but the format uses %f;
  // passing an int where a double is expected is undefined behavior and
  // printed garbage. Cast them explicitly so %f receives doubles.
  if (segment_adjust_debug > 2)
    tprintf("rating_cost: r%f / w%f = %f\n",
            static_cast<float>(ratings), static_cast<float>(weights),
            rating_cost);
  return rating_cost;
}
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::rebuild_current_state | ( | WERD_RES * | word, |
STATE * | state, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices, | ||
MATRIX * | ratings | ||
) |
rebuild_current_state
Transfers the given state to the word's output fields: rebuild_word, best_state, box_word, and returns the corresponding blob choices.
Definition at line 332 of file bestfirst.cpp.
{
  // Rebuilds the word (rebuild_word, best_state) from a search STATE and
  // returns the per-character choice lists, re-joining and re-classifying
  // blobs as needed and specially handling character fragments.
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed form original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list of for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      // Extract the unichar text of the character being reassembled.
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward());
      choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);
  return char_choices;
}
BLOB_CHOICE * tesseract::Wordrec::rebuild_fragments | ( | const char * | unichar, |
const char * | expanded_fragment_lengths, | ||
int | choice_index, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices | ||
) |
Definition at line 680 of file bestfirst.cpp.
{
  // Reassembles a character from its fragment choices: finds the matching
  // fragment in each chunk's choice list, sums ratings, takes the worst
  // certainty and the intersected xheight range, and returns a new
  // BLOB_CHOICE for the whole character.
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      // Dump diagnostics before asserting below.
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}
Definition at line 393 of file pieces.cpp.
Definition at line 414 of file pieces.cpp.
{
  // Builds a ratings MATRIX for all contiguous blob pieces, filling in any
  // classifications already cached in the blob match table by bounding box.
  inT16 num_blobs = count_blobs(blobs);
  TBOX *bounds = record_blob_bounds(blobs);
  MATRIX *ratings = new MATRIX(num_blobs);

  for (int x = 0; x < num_blobs; x++) {
    for (int y = x; y < num_blobs; y++) {
      // Look up a cached classification for the piece spanning blobs x..y.
      TBOX piecebox = bounds_of_piece(bounds, x, y);
      BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox);
      if (choices != NULL) {
        ratings->put(x, y, choices);
      }
    }
  }
  if (merge_fragments_in_matrix)
    merge_fragments(ratings, num_blobs);
  delete []bounds;
  return ratings;
}
void tesseract::Wordrec::replace_char_widths | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_STATE | state | ||
) |
Definition at line 651 of file bestfirst.cpp.
{
  // Replace chunks_record->char_widths with a WIDTH_RECORD built from the
  // per-character widths and gaps recorded in last_segmentation.
  WIDTH_RECORD *width_record;
  int num_blobs;
  int i;

  free_widths (chunks_record->char_widths);
  num_blobs = state[0] + 1;  // state[0] is the number of segmentation joints
  // Packed int layout: { num_chars, w0, g0, w1, g1, ..., w(n-1) },
  // i.e. 2*num_blobs ints in total (the last char has no trailing gap).
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;
  for (i = 0; i < num_blobs; i++) {
    width_record->widths[2 * i] = last_segmentation[i].width;
    // No gap entry after the final character.
    if (i + 1 < num_blobs)
      width_record->widths[2 * i + 1] = last_segmentation[i].gap;
  }
  chunks_record->char_widths = width_record;
}
void tesseract::Wordrec::reverse_outline | ( | EDGEPT * | outline | ) |
Definition at line 164 of file outlines.cpp.
{
  // Reverse the direction of a closed outline in place by exchanging the
  // next/prev links of every edge point, and recompute each point's
  // direction vector so it again points at its (new) successor.
  EDGEPT *pt = outline;
  do {
    /* Exchange the two links of this point. */
    EDGEPT *saved_prev = pt->prev;
    pt->prev = pt->next;
    pt->next = saved_prev;

    /* The vec field always points toward the successor. */
    pt->vec.x = pt->next->pos.x - pt->pos.x;
    pt->vec.y = pt->next->pos.y - pt->pos.y;

    /* prev now holds the old next, so this advances in the original
       traversal order until we come back around to the start. */
    pt = pt->prev;
  } while (pt != outline);
}
Definition at line 173 of file wordrec.cpp.
{
  // Convert each viable choice produced by the search into an alternative
  // WERD_CHOICE on the word, recording alongside it the per-character chunk
  // counts (the segmentation state) that produced that choice.
  // Precondition: the word holds no alternatives yet.
  ASSERT_HOST(word->alt_choices.empty());
  ASSERT_HOST(word->alt_states.empty());
  LIST list_it;
  iterate_list(list_it, best_choices) {
    VIABLE_CHOICE choice =
        reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
    CHAR_CHOICE *char_choice = &(choice->Blob[0]);
    WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
    // Push the state vector first so alt_state can be filled in place.
    word->alt_states.push_back(GenericVector<int>(choice->Length));
    GenericVector<int> &alt_state = word->alt_states.back();
    for (int i = 0; i < choice->Length; char_choice++, i++) {
      alt_choice->append_unichar_id_space_allocated(
          char_choice->Class, 1, 0, 0);
      alt_state.push_back(char_choice->NumChunks);
    }
    alt_choice->set_rating(choice->Rating);
    alt_choice->set_certainty(choice->Certainty);
    word->alt_choices.push_back(alt_choice);
    if (wordrec_debug_level > 0) {
      tprintf("SaveAltChoices: %s %g\n",
              alt_choice->unichar_string().string(), alt_choice->rating());
    }
  }
}
Definition at line 469 of file findseam.cpp.
{
  // Grade a seam: its stored priority plus the full-split priority of
  // split1 within [xmin, xmax].  When split2/split3 exist they are
  // temporarily applied to the outlines so split1 is graded in their
  // context, then undone in reverse order.
  PRIORITY priority;

  if (seam->split1 == NULL)
    // A seam with no splits costs nothing.
    priority = 0;

  else if (seam->split2 == NULL) {
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
  }

  else if (seam->split3 == NULL) {
    // Apply split2 while grading split1, then undo it.
    split_outline (seam->split2->point1, seam->split2->point2);
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
    unsplit_outlines (seam->split2->point1, seam->split2->point2);
  }

  else {
    // Apply split2 and split3 while grading split1; undo in reverse order.
    split_outline (seam->split2->point1, seam->split2->point2);
    split_outline (seam->split3->point1, seam->split3->point2);
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
    unsplit_outlines (seam->split3->point1, seam->split3->point2);
    unsplit_outlines (seam->split2->point1, seam->split2->point2);
  }

  return (priority);
}
Definition at line 142 of file heuristic.cpp.
{
  // Sum the priorities of the seams that are "on" in the given search
  // state.  State bits are packed into two 32-bit words: joints >= 32 live
  // in state->part1, the lower 32 in state->part2.
  int x;
  // Start the mask at the bit for the highest-numbered joint within its
  // 32-bit word.  NOTE(review): for num_joints > 64 the shift would exceed
  // the bit width (undefined behavior) — presumably num_joints <= 64 is an
  // invariant upstream; confirm against callers.
  unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32))
                                        : (1 << (num_joints - 1));
  float seam_cost = 0.0f;
  for (x = num_joints - 1; x >= 0; x--) {
    int i = num_joints - 1 - x;
    // Joints 0..31 are read from part2, higher joints from part1.
    uinT32 value = (x < 32) ? state->part2 : state->part1;
    bool state_on = value & mask;
    if (state_on) {
      SEAM* seam = (SEAM *) array_value(seams, i);
      seam_cost += seam->priority;
    }
    // When the mask reaches bit 0, wrap to bit 31 for the other word.
    if (mask == 1)
      mask = 1 << 31;
    else
      mask >>= 1;
  }
  if (segment_adjust_debug > 2)
    tprintf("seam_cost: %f\n", seam_cost);
  return seam_cost;
}
void tesseract::Wordrec::SegSearch | ( | CHUNKS_RECORD * | chunks_record, |
WERD_CHOICE * | best_choice, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_CHOICE * | raw_choice, | ||
STATE * | output_best_state, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 35 of file segsearch.cpp.
{
  // Run the segmentation search over the ratings matrix: repeatedly pop
  // "pain points" (unclassified cells the language model wants evaluated),
  // classify them, and propagate language-model state until an acceptable
  // choice is found or too many classifications prove futile.
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // TODO(antonova): Due to the fact that we currently do not re-start the
  // segmentation search from the best choice the chopper found, sometimes
  // the segmentation search does not find the best path (that chopper
  // did discover) and does not have a chance to adapt to it. As soon as we
  // transition to using new-style language model penalties in the chopper
  // this issue will be resolved. But for now we are forced to clear the
  // accumulator choices.
  //
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;

  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               rating_cert_scale,
                               pain_points, chunks_record,
                               blamer_bundle, wordrec_debug_blamer);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column,
  // since this guarantees that all the parents would be up to date before
  // an update of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle, blamer_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (!SegSearchDone(num_futile_classifications) ||
         (blamer_bundle != NULL &&
          blamer_bundle->segsearch_is_looking_for_blame)) {
    // Get the next valid "pain point": skip (and free) entries whose
    // coordinates became invalid or that were classified in the meantime.
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
                              best_choice_bundle.best_choice, &pending,
                              chunks_record, pain_points, blamer_bundle);

    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle.updated = false;  // reset updated
    delete pain_point;  // done using this pain point

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
        blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
        !blamer_bundle->segsearch_is_looking_for_blame &&
        blamer_bundle->truth_has_char_boxes &&
        !ChoiceIsCorrect(getDict().getUnicharset(),
                         best_choice, blamer_bundle->truth_text)) {
      InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
                             pain_points, blamer_bundle, &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
                           blamer_bundle, &blamer_debug);

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
bool tesseract::Wordrec::SegSearchDone | ( | int | num_futile_classifications | ) | [inline, protected] |
Definition at line 520 of file wordrec.h.
{
  // The search terminates either because the language model accepted a
  // choice, or because too many classifications in a row failed to improve
  // the best path.
  if (language_model_->AcceptableChoiceFound())
    return true;
  return num_futile_classifications >= segsearch_max_futile_classifications;
}
inT16 tesseract::Wordrec::select_blob_to_split | ( | const BLOB_CHOICE_LIST_VECTOR & | char_choices, |
float | rating_ceiling, | ||
bool | split_next_to_fragment | ||
) |
Definition at line 801 of file chopper.cpp.
{
  // Return the index of the blob to chop next: the blob whose best choice
  // has the worst (highest) rating below rating_ceiling and certainty below
  // tessedit_certainty_threshold.  An unclassified position wins outright.
  // When split_next_to_fragment is set, a blob adjacent to an incomplete
  // character fragment is preferred over the plain worst blob.
  BLOB_CHOICE_IT blob_choice_it;
  BLOB_CHOICE *blob_choice;
  BLOB_CHOICE_IT temp_it;
  int x;
  float worst = -MAX_FLOAT32;
  int worst_index = -1;
  float worst_near_fragment = -MAX_FLOAT32;
  int worst_index_near_fragment = -1;
  // fragments[i] caches the fragment info of position i's top choice;
  // only allocated when split_next_to_fragment is requested.
  const CHAR_FRAGMENT **fragments = NULL;

  if (chop_debug) {
    if (rating_ceiling < MAX_FLOAT32)
      cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    else
      cprintf("rating_ceiling = No Limit\n");
  }

  if (split_next_to_fragment && char_choices.length() > 0) {
    fragments = new const CHAR_FRAGMENT *[char_choices.length()];
    if (char_choices.get(0) != NULL) {
      temp_it.set_to_list(char_choices.get(0));
      fragments[0] = getDict().getUnicharset().get_fragment(
          temp_it.data()->unichar_id());
    } else {
      fragments[0] = NULL;
    }
  }

  for (x = 0; x < char_choices.length(); ++x) {
    if (char_choices.get(x) == NULL) {
      // An unclassified position is always chosen immediately.
      if (fragments != NULL) {
        delete[] fragments;
      }
      return x;
    } else {
      blob_choice_it.set_to_list(char_choices.get(x));
      blob_choice = blob_choice_it.data();
      // Populate fragments for the following position.
      if (split_next_to_fragment && x+1 < char_choices.length()) {
        if (char_choices.get(x+1) != NULL) {
          temp_it.set_to_list(char_choices.get(x+1));
          fragments[x+1] = getDict().getUnicharset().get_fragment(
              temp_it.data()->unichar_id());
        } else {
          fragments[x+1] = NULL;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          // A neighbor fragment that is not a beginning (following) or not
          // an ending (preceding) suggests a chop is missing near x.
          bool expand_following_fragment =
            (x + 1 < char_choices.length() &&
             fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
          bool expand_preceding_fragment =
            (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              cprintf("worst_index_near_fragment=%d"
                      " expand_following_fragment=%d"
                      " expand_preceding_fragment=%d\n",
                      worst_index_near_fragment,
                      expand_following_fragment,
                      expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  if (fragments != NULL) {
    delete[] fragments;
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ?
    worst_index_near_fragment : worst_index;
}
Definition at line 898 of file chopper.cpp.
{
  // Scan the danger points for a position that covers exactly one blob
  // (begin == end), is flagged dangerous, and whose correct interpretation
  // is an ngram; return its blob index.  Returns -1 when no fixpt is
  // supplied or no such position exists.
  if (!fixpt)
    return -1;
  int idx = 0;
  while (idx < fixpt->size()) {
    bool single_blob = (*fixpt)[idx].begin == (*fixpt)[idx].end;
    if (single_blob &&
        (*fixpt)[idx].dangerous &&
        (*fixpt)[idx].correct_is_ngram) {
      return (*fixpt)[idx].begin;
    }
    ++idx;
  }
  return -1;
}
void tesseract::Wordrec::set_chopper_blame | ( | WERD_RES * | word | ) |
Definition at line 917 of file chopper.cpp.
{
  // Compare the maximally chopped word against the truth boxes and, if the
  // chopper failed to produce a chop at a truth boundary, blame the chopper
  // (IRR_CHOPPER) with a detailed debug string.
  BlamerBundle *blamer_bundle = word->blamer_bundle;
  assert(blamer_bundle != NULL);
  // Nothing to do without truth boxes or without any chopped blobs.
  if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
      word->chopped_word->blobs == NULL) {
    return;
  }
  // NOTE(review): this outer `debug` is never used — it is shadowed by the
  // inner `STRING debug` declared in the if-block below.
  STRING debug;
  bool missing_chop = false;
  TBLOB * curr_blob = word->chopped_word->blobs;
  int b = 0;
  inT16 truth_x;
  // Walk blobs and truth boxes in parallel, matching right edges within
  // norm_box_tolerance.
  while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
    truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
    if (curr_blob->bounding_box().right() <
        (truth_x - blamer_bundle->norm_box_tolerance)) {
      curr_blob = curr_blob->next;
      continue;  // encountered an extra chop, keep looking
    } else if (curr_blob->bounding_box().right() >
                (truth_x + blamer_bundle->norm_box_tolerance)) {
      // The blob extends past the truth boundary: a chop is missing here.
      missing_chop = true;
      break;
    } else {
      curr_blob = curr_blob->next;
      ++b;
    }
  }
  if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
    STRING debug;
    char debug_buffer[256];
    if (missing_chop) {
      sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
              blamer_bundle->norm_box_tolerance);
      debug += debug_buffer;
      curr_blob->bounding_box().append_debug(&debug);
      debug.add_str_int("\nNo chop for truth at x=", truth_x);
    } else {
      debug.add_str_int("Missing chops for last ",
                        blamer_bundle->norm_truth_word.length()-b);
      debug += " truth box(es)";
    }
    // Dump both box sequences to make the mismatch inspectable.
    debug += "\nMaximally chopped word boxes:\n";
    for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
         curr_blob = curr_blob->next) {
      const TBOX &tbox = curr_blob->bounding_box();
      sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
              tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
      debug += debug_buffer;
    }
    debug += "Truth bounding boxes:\n";
    for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
      const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
      sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
              tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
      debug += debug_buffer;
    }
    blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
                            wordrec_debug_blamer);
  }
}
void tesseract::Wordrec::set_outline_bounds | ( | register EDGEPT * | point1, |
register EDGEPT * | point2, | ||
BOUNDS_RECT | rect | ||
) |
Definition at line 213 of file gradechop.cpp.
{
  // Fill rect with the x-extents of the two outline sections delimited by
  // point1 and point2: rect[0..1] = {min,max} of the point1->point2 walk,
  // rect[2..3] = {min,max} of the point2->point1 walk.
  // Fix: the original declared an unused local `this_point`; removed.
  // (`register` is also dropped from locals — it is a no-op and was
  // removed from the language in C++17.)
  inT16 x_min;
  inT16 x_max;

  find_bounds_loop(point1, point2, x_min, x_max);
  rect[0] = x_min;
  rect[1] = x_max;

  find_bounds_loop(point2, point1, x_min, x_max);
  rect[2] = x_min;
  rect[3] = x_max;
}
void tesseract::Wordrec::set_pass1 | ( | ) |
Definition at line 93 of file tface.cpp.
{
  // Configure recognition pass 1: set the segmentation-state budget and
  // the pass-1 split-acceptance threshold, then run the pass-1 setup.
  // (The two parameter assignments are independent of each other.)
  wordrec_num_seg_states.set_value(15);
  chop_ok_split.set_value(70.0);
  SettupPass1();
}
void tesseract::Wordrec::set_pass2 | ( | ) |
Definition at line 105 of file tface.cpp.
{
  // Configure recognition pass 2 from the pass-2 tuning parameters, then
  // run the pass-2 setup.  (The two assignments are order-independent.)
  wordrec_num_seg_states.set_value(pass2_seg_states);
  chop_ok_split.set_value(pass2_ok_split);
  SettupPass2();
}
WIDTH_RECORD * tesseract::Wordrec::state_char_widths | ( | WIDTH_RECORD * | chunk_widths, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 58 of file heuristic.cpp.
{
  // Build a WIDTH_RECORD of per-character widths and inter-character gaps
  // for the segmentation described by state, aggregating the underlying
  // chunk widths/gaps over each character's chunk range.
  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
  int num_chars = chunks[0] + 1;

  // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a
  // struct { num_chars, widths[2*n+1]; }
  WIDTH_RECORD *char_widths =
    (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
  char_widths->num_chars = num_chars;

  int first_blob = 0;
  int last_blob;
  for (int i = 1; i <= num_chars; i++) {
    // The last character runs to the final joint; earlier ones span
    // chunks[i] additional chunks past first_blob.
    last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];

    char_widths->widths[2*i-2] =
      AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob);
    // No gap after the final character.
    if (i <= chunks[0]) {
      char_widths->widths[2*i-1] =
        AssociateUtils::GetChunksGap(chunk_widths, last_blob);
    }

    if (segment_adjust_debug > 3)
      tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
              i-1, first_blob, last_blob, chunks[i],
              char_widths->widths[2*i-2], char_widths->widths[2*i-1],
              chunk_widths->widths[2*last_blob+1]);
    first_blob = last_blob + 1;
  }

  memfree(chunks);
  return char_widths;
}
void tesseract::Wordrec::try_point_pairs | ( | EDGEPT * | points[MAX_NUM_POINTS], |
inT16 | num_points, | ||
SEAM_QUEUE | seam_queue, | ||
SEAM_PILE * | seam_pile, | ||
SEAM ** | seam, | ||
TBLOB * | blob | ||
) |
Definition at line 507 of file findseam.cpp.
{
  // Try every admissible pair of the candidate points as a split: the two
  // points must be within chop_split_length (weighted distance), must not
  // be adjacent on the outline, and must pass the is_exterior_point test
  // in both directions.  Each admissible split is offered to
  // choose_best_seam, which keeps the best seam found so far.
  inT16 x;
  inT16 y;
  SPLIT *split;
  PRIORITY priority;

  for (x = 0; x < num_points; x++) {
    for (y = x + 1; y < num_points; y++) {
      if (points[y] &&
          weighted_edgept_dist(points[x], points[y],
                               chop_x_y_weight) < chop_split_length &&
          points[x] != points[y]->next &&
          points[y] != points[x]->next &&
          !is_exterior_point(points[x], points[y]) &&
          !is_exterior_point(points[y], points[x])) {
        split = new_split (points[x], points[y]);
        priority = partial_split_priority (split);

        choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
      }
    }
  }
}
void tesseract::Wordrec::try_vertical_splits | ( | EDGEPT * | points[MAX_NUM_POINTS], |
inT16 | num_points, | ||
EDGEPT_CLIST * | new_points, | ||
SEAM_QUEUE | seam_queue, | ||
SEAM_PILE * | seam_pile, | ||
SEAM ** | seam, | ||
TBLOB * | blob | ||
) |
Definition at line 549 of file findseam.cpp.
{
  // For each candidate point, find the closest vertically projected point
  // on any outline of the blob and, when it is close enough and not
  // adjacent on the outline, offer the pair as a split to choose_best_seam.
  EDGEPT *vertical_point = NULL;
  SPLIT *split;
  inT16 x;
  PRIORITY priority;
  TESSLINE *outline;

  for (x = 0; x < num_points; x++) {
    vertical_point = NULL;
    // Accumulate the best vertical projection across all outlines;
    // any newly created points are collected in new_points.
    for (outline = blob->outlines; outline; outline = outline->next) {
      vertical_projection_point(points[x], outline->loop,
                                &vertical_point, new_points);
    }

    if (vertical_point &&
        points[x] != vertical_point->next &&
        vertical_point != points[x]->next &&
        weighted_edgept_dist(points[x], vertical_point,
                             chop_x_y_weight) < chop_split_length) {
      split = new_split (points[x], vertical_point);
      priority = partial_split_priority (split);

      choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
    }
  }
}
void tesseract::Wordrec::update_blob_classifications | ( | TWERD * | word, |
const BLOB_CHOICE_LIST_VECTOR & | choices | ||
) |
Definition at line 152 of file wordclass.cpp.
void tesseract::Wordrec::update_ratings | ( | const BLOB_CHOICE_LIST_VECTOR & | new_choices, |
const CHUNKS_RECORD * | chunks_record, | ||
const SEARCH_STATE | search_state | ||
) |
void tesseract::Wordrec::UpdateSegSearchNodes | ( | int | starting_col, |
SEG_SEARCH_PENDING_LIST * | pending[], | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) | [protected] |
Definition at line 186 of file segsearch.cpp.
{
  // Propagate language model state through the ratings matrix, starting at
  // starting_col and sweeping right.  Columns must be processed in
  // non-decreasing order so every parent is up to date before its children.
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      // Record each distinct child row (the pending list is sorted, so a
      // duplicate can only be the immediately preceding entry).
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
        language_model_->UpdateState(p->changed, col, p->child_row,
                                     current_node, p->parent, pain_points,
                                     best_path_by_column, chunks_record,
                                     best_choice_bundle, blamer_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add all the
        // pairs with it as a parent and each of its children to pending, so
        // that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
              new SEG_SEARCH_PENDING(child_row, current_node, 0);
            // add_sorted_and_find returns the existing entry when one with
            // the same key is already present; then free the duplicate.
            SEG_SEARCH_PENDING *actual_new_pending =
              reinterpret_cast<SEG_SEARCH_PENDING *>(
                  (*pending)[child_col].add_sorted_and_find(
                      SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
              tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                      " changed=0x%x to pending\n", child_col,
                      actual_new_pending->child_row, col, p->child_row,
                      actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    }  // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
        col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
        pain_points, best_path_by_column, chunks_record);
  }  // end for col
  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }
  language_model_->CleanUp();
}
void tesseract::Wordrec::vertical_projection_point | ( | EDGEPT * | split_point, |
EDGEPT * | target_point, | ||
EDGEPT ** | best_point, | ||
EDGEPT_CLIST * | new_points | ||
) |
Definition at line 332 of file chop.cpp.
{
  // Find the point on target_point's outline closest (by edgept_dist) to
  // the vertical line through split_point, updating *best_point; points
  // created along the way are appended to new_points.
  EDGEPT *p;                     /* Iterator */
  EDGEPT *this_edgept;           /* Iterator */
  EDGEPT_C_IT new_point_it(new_points);
  int x = split_point->pos.x;    /* X value of vertical */
  int best_dist = LARGE_DISTANCE;/* Best point found */

  if (*best_point != NULL)
    best_dist = edgept_dist(split_point, *best_point);

  p = target_point;
  /* Look at each edge point */
  do {
    // Consider edges whose x-span straddles the vertical line, excluding
    // degenerate matches with split_point or the current best point.
    if ((((p->pos.x <= x) && (x <= p->next->pos.x)) ||
         ((p->next->pos.x <= x) && (x <= p->pos.x))) &&
        !same_point (split_point->pos, p->pos) &&
        !same_point (split_point->pos, p->next->pos) &&
        (*best_point == NULL ||
         !same_point ((*best_point)->pos, p->pos))) {
      // NOTE(review): this_edgept is read below even when near_point
      // returns false — presumably near_point always sets it; confirm.
      if (near_point(split_point, p, p->next, &this_edgept)) {
        new_point_it.add_before_then_move(this_edgept);
      }

      if (*best_point == NULL)
        best_dist = edgept_dist (split_point, this_edgept);

      this_edgept = pick_close_point(split_point, this_edgept, &best_dist);
      if (this_edgept)
        *best_point = this_edgept;
    }
    p = p->next;
  } while (p != target_point);
}
FLOAT32 tesseract::Wordrec::width_priority | ( | CHUNKS_RECORD * | chunks_record, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 222 of file heuristic.cpp.
{
  // Compute a segmentation penalty based on character widths and gaps:
  // squat (width/height) beyond the allowed ratio is penalized, with
  // additional fixed-pitch penalties when fixed pitch is assumed.
  FLOAT32 penalty = 0.0;
  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
                                              state, num_joints);
  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (assume_fixed_pitch_char_segment) {
    // For fixed pitch language like CJK, we use the full text height as the
    // normalizing factor so we are not dependent on xheight calculation.
    // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
    // so add proportionally scaled ascender zone to get full text height.
    const DENORM& denorm = chunks_record->word_res->denorm;
    normalizing_height = denorm.y_scale() *
        (denorm.row()->x_height() + denorm.row()->ascenders());
    if (segment_adjust_debug > 1)
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              denorm.row()->x_height(), denorm.row()->ascenders(),
              normalizing_height);
    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  for (int x = 0; x < width_rec->num_chars; x++) {
    FLOAT32 squat = width_rec->widths[2*x];
    // The last character has no trailing gap.
    FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
    squat /= normalizing_height;
    gap /= normalizing_height;
    if (assume_fixed_pitch_char_segment) {
      penalty += AssociateUtils::FixedPitchWidthCost(
          squat, 0.0f, x == 0 || x == width_rec->num_chars -1,
          heuristic_max_char_wh_ratio);
      penalty += AssociateUtils::FixedPitchGapCost(
          gap, x == width_rec->num_chars - 1);
      // A single over-wide "character" is almost certainly unsplit text.
      if (width_rec->num_chars == 1 &&
          squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) {
        penalty += 10;
      }
    } else {
      // Original equation when
      // heuristic_max_char_ratio == AssociateUtils::kMaxSquat
      if (squat > heuristic_max_char_wh_ratio)
        penalty += squat - heuristic_max_char_wh_ratio;
    }
  }
  free_widths(width_rec);
  return (penalty);
}
MATRIX * tesseract::Wordrec::word_associator | ( | bool | only_create_ratings_matrix, |
WERD_RES * | word, | ||
STATE * | state, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
DANGERR * | fixpt, | ||
STATE * | best_state | ||
) |
Definition at line 984 of file chopper.cpp.
{
  // Build the CHUNKS_RECORD (piece ratings, widths, weights) for the
  // chopped word and, unless only the ratings matrix was requested, run
  // either the new segmentation search or the old best-first search.
  // The caller takes ownership of the returned ratings matrix.
  CHUNKS_RECORD chunks_record;
  BLOB_WEIGHTS blob_weights;
  int x;
  int num_chunks;
  BLOB_CHOICE_IT blob_choice_it;

  num_chunks = array_count(word->seam_array) + 1;

  TBLOB* blobs = word->chopped_word->blobs;
  chunks_record.ratings = record_piece_ratings(blobs);
  chunks_record.chunks = blobs;
  chunks_record.word_res = word;
  chunks_record.splits = word->seam_array;
  chunks_record.chunk_widths = blobs_widths(blobs);
  chunks_record.char_widths = blobs_widths(blobs);
  /* Save chunk weights */
  for (x = 0; x < num_chunks; x++) {
    BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings,
                                                 blobs,
                                                 chunks_record.word_res->denorm,
                                                 word->seam_array,
                                                 x, x,
                                                 word->blamer_bundle);
    blob_choice_it.set_to_list(choices);
    //This is done by Jetsoft. Divide by zero is possible.
    if (blob_choice_it.data()->certainty() == 0) {
      blob_weights[x]=0;
    } else {
      blob_weights[x] =
        -(inT16) (10 * blob_choice_it.data()->rating() /
                  blob_choice_it.data()->certainty());
    }
  }
  chunks_record.weights = blob_weights;

  if (chop_debug)
    chunks_record.ratings->print(getDict().getUnicharset());

  if (!only_create_ratings_matrix) {
    if (enable_new_segsearch) {
      SegSearch(&chunks_record, word->best_choice,
                best_char_choices, word->raw_choice,
                state, word->blamer_bundle);
    } else {
      best_first_search(&chunks_record, best_char_choices, word,
                        state, fixpt, best_state);
    }
  }

  // The widths are owned here; the ratings matrix is returned to caller.
  free_widths(chunks_record.chunk_widths);
  free_widths(chunks_record.char_widths);
  return chunks_record.ratings;
}
double tesseract::Wordrec::chop_center_knob = 0.15 |
int tesseract::Wordrec::chop_debug = 0 |
bool tesseract::Wordrec::chop_enable = 1 |
double tesseract::Wordrec::chop_good_split = 50.0 |
int tesseract::Wordrec::chop_inside_angle = -50 |
int tesseract::Wordrec::chop_min_outline_area = 2000 |
double tesseract::Wordrec::chop_ok_split = 100.0 |
double tesseract::Wordrec::chop_overlap_knob = 0.9 |
double tesseract::Wordrec::chop_sharpness_knob = 0.06 |
double tesseract::Wordrec::chop_split_dist_knob = 0.5 |
int tesseract::Wordrec::chop_split_length = 10000 |
double tesseract::Wordrec::chop_width_change_knob = 5.0 |
bool tesseract::Wordrec::enable_new_segsearch = false |
void(Wordrec::* tesseract::Wordrec::fill_lattice_)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
bool tesseract::Wordrec::force_word_assoc = 0 |
double tesseract::Wordrec::heuristic_max_char_wh_ratio = 2.0 |
double tesseract::Wordrec::heuristic_segcost_rating_base = 1.25 |
double tesseract::Wordrec::heuristic_weight_rating = 1 |
double tesseract::Wordrec::heuristic_weight_seamcut = 0 |
double tesseract::Wordrec::heuristic_weight_width = 0 |
bool tesseract::Wordrec::save_alt_choices = false |
double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0 |
double tesseract::Wordrec::tessedit_certainty_threshold = -2.25 |
bool tesseract::Wordrec::wordrec_debug_blamer = false |
bool tesseract::Wordrec::wordrec_no_block = 0 |
bool tesseract::Wordrec::wordrec_run_blamer = false |
double tesseract::Wordrec::wordrec_worst_state = 1 |