Tesseract
3.02
|
#include <wordrec.h>
Public Member Functions | |||||||
Wordrec () | |||||||
virtual | ~Wordrec () | ||||||
void | CopyCharChoices (const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to) | ||||||
bool | ChoiceIsCorrect (const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector< STRING > &truth_text) | ||||||
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) | ||||||
void | FillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
void | CallFillLattice (const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
void | update_ratings (const BLOB_CHOICE_LIST_VECTOR &new_choices, const CHUNKS_RECORD *chunks_record, const SEARCH_STATE search_state) | ||||||
void | SegSearch (CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state, BlamerBundle *blamer_bundle) | ||||||
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, SEAMS seam_list) | ||||||
SEAM * | chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, SEAMS seam_list) | ||||||
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, WERD_RES *word_res, inT32 *blob_number, bool italic_blob, SEAMS seam_list) | ||||||
void | junk_worst_seam (SEAM_QUEUE seams, SEAM *new_seam, float new_priority) | ||||||
void | choose_best_seam (SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob) | ||||||
void | combine_seam (SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam) | ||||||
inT16 | constrained_split (SPLIT *split, TBLOB *blob) | ||||||
void | delete_seam_pile (SEAM_PILE seam_pile) | ||||||
SEAM * | pick_good_seam (TBLOB *blob) | ||||||
PRIORITY | seam_priority (SEAM *seam, inT16 xmin, inT16 xmax) | ||||||
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob) | ||||||
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SEAM_QUEUE seam_queue, SEAM_PILE *seam_pile, SEAM **seam, TBLOB *blob) | ||||||
PRIORITY | full_split_priority (SPLIT *split, inT16 xmin, inT16 xmax) | ||||||
PRIORITY | grade_center_of_blob (register BOUNDS_RECT rect) | ||||||
PRIORITY | grade_overlap (register BOUNDS_RECT rect) | ||||||
PRIORITY | grade_split_length (register SPLIT *split) | ||||||
PRIORITY | grade_sharpness (register SPLIT *split) | ||||||
PRIORITY | grade_width_change (register BOUNDS_RECT rect) | ||||||
void | set_outline_bounds (register EDGEPT *point1, register EDGEPT *point2, BOUNDS_RECT rect) | ||||||
int | crosses_outline (EDGEPT *p0, EDGEPT *p1, EDGEPT *outline) | ||||||
int | is_crossed (TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1) | ||||||
int | is_same_edgept (EDGEPT *p1, EDGEPT *p2) | ||||||
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) | ||||||
void | reverse_outline (EDGEPT *outline) | ||||||
virtual BLOB_CHOICE_LIST * | classify_piece (TBLOB *pieces, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) | ||||||
void | merge_fragments (MATRIX *ratings, inT16 num_blobs) | ||||||
void | get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) | ||||||
void | merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) | ||||||
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) | ||||||
BLOB_CHOICE_LIST * | get_piece_rating (MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams, inT16 start, inT16 end, BlamerBundle *blamer_bundle) | ||||||
TBOX * | record_blob_bounds (TBLOB *blobs) | ||||||
MATRIX * | record_piece_ratings (TBLOB *blobs) | ||||||
WIDTH_RECORD * | state_char_widths (WIDTH_RECORD *chunk_widths, STATE *state, int num_joints) | ||||||
FLOAT32 | get_width_variance (WIDTH_RECORD *wrec, float norm_height) | ||||||
FLOAT32 | get_gap_variance (WIDTH_RECORD *wrec, float norm_height) | ||||||
FLOAT32 | prioritize_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) | ||||||
FLOAT32 | width_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints) | ||||||
FLOAT32 | seamcut_priority (SEAMS seams, STATE *state, int num_joints) | ||||||
FLOAT32 | rating_priority (CHUNKS_RECORD *chunks_record, STATE *state, int num_joints) | ||||||
program_editup | |||||||
Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models. | |||||||
void | program_editup (const char *textbase, bool init_classifier, bool init_permute) | ||||||
cc_recog | |||||||
Recognize a word. | |||||||
BLOB_CHOICE_LIST_VECTOR * | cc_recog (WERD_RES *word) | ||||||
program_editdown | |||||||
This function holds any necessary post-processing for the Wise Owl program. | |||||||
void | program_editdown (inT32 elasped_time) | ||||||
set_pass1 | |||||||
Get ready to do some pass 1 stuff. | |||||||
void | set_pass1 () | ||||||
set_pass2 | |||||||
Get ready to do some pass 2 stuff. | |||||||
void | set_pass2 () | ||||||
end_recog | |||||||
Cleanup and exit the recog program. | |||||||
int | end_recog () | ||||||
call_matcher | |||||||
Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification. | |||||||
BLOB_CHOICE_LIST * | call_matcher (const DENORM *denorm, TBLOB *blob) | ||||||
dict_word() | |||||||
Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary. | |||||||
int | dict_word (const WERD_CHOICE &word) | ||||||
classify_blob | |||||||
Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.
| |||||||
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const DENORM &denorm, const char *string, C_COL color, BlamerBundle *blamer_bundle) | ||||||
BLOB_CHOICE_LIST * | fake_classify_blob (UNICHAR_ID class_id, float rating, float certainty) | ||||||
update_blob_classifications | |||||||
For each blob in the given word update match_table with the corresponding BLOB_CHOICE_LIST from choices. | |||||||
void | update_blob_classifications (TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) | ||||||
best_first_search | |||||||
Find the best segmentation by doing a best first search of the solution space. | |||||||
BLOB_CHOICE_LIST_VECTOR * | evaluate_chunks (CHUNKS_RECORD *chunks_record, SEARCH_STATE search_state, BlamerBundle *blamer_bundle) | ||||||
void | best_first_search (CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_RES *word, STATE *state, DANGERR *fixpt, STATE *best_state) | ||||||
void | delete_search (SEARCH_RECORD *the_search) | ||||||
evaluate_state | |||||||
Evaluate the segmentation that is represented by this state in the best first search. Add this state to the "states_seen" list. | |||||||
inT16 | evaluate_state (CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle) | ||||||
BLOB_CHOICE_LIST_VECTOR * | rebuild_current_state (WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *char_choices, MATRIX *ratings) | ||||||
new_search | |||||||
Create and initialize a new search record. | |||||||
SEARCH_RECORD * | new_search (CHUNKS_RECORD *chunks_record, int num_joints, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *best_choice, WERD_CHOICE *raw_choice, STATE *state) | ||||||
expand_node | |||||||
Create the states that are attached to this one. Check to see that each one has not already been visited. If not add it to the priority queue. | |||||||
void | expand_node (FLOAT32 worst_priority, CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search) | ||||||
replace_char_widths | |||||||
Replace the value of the char_width field in the chunks_record with the updated width measurements from the last_segmentation. | |||||||
void | replace_char_widths (CHUNKS_RECORD *chunks_record, SEARCH_STATE state) | ||||||
BLOB_CHOICE * | rebuild_fragments (const char *unichar, const char *expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) | ||||||
BLOB_CHOICE_LIST * | join_blobs_and_classify (WERD_RES *word, int x, int y, int choice_index, MATRIX *ratings, BLOB_CHOICE_LIST_VECTOR *old_choices) | ||||||
pop_queue | |||||||
Get this state from the priority queue. It should be the state that has the greatest urgency to be evaluated. | |||||||
STATE * | pop_queue (HEAP *queue) | ||||||
push_queue | |||||||
Add this state into the priority queue. | |||||||
void | push_queue (HEAP *queue, STATE *state, FLOAT32 worst_priority, FLOAT32 priority, bool debug) | ||||||
point_priority | |||||||
Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT. | |||||||
PRIORITY | point_priority (EDGEPT *point) | ||||||
add_point_to_list | |||||||
Add an edge point to a POINT_GROUP containing a list of other points. | |||||||
void | add_point_to_list (POINT_GROUP point_list, EDGEPT *point) | ||||||
angle_change | |||||||
Return the change in angle (degrees) of the line segments between points one and two, and two and three. | |||||||
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) | ||||||
is_little_chunk | |||||||
Return TRUE if one of the pieces resulting from this split would have fewer than some number of edge points. | |||||||
int | is_little_chunk (EDGEPT *point1, EDGEPT *point2) | ||||||
is_small_area | |||||||
Test the area defined by a split across this outline. | |||||||
int | is_small_area (EDGEPT *point1, EDGEPT *point2) | ||||||
pick_close_point | |||||||
Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point. | |||||||
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) | ||||||
prioritize_points | |||||||
Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order. | |||||||
void | prioritize_points (TESSLINE *outline, POINT_GROUP points) | ||||||
new_min_point | |||||||
Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to NULL. | |||||||
void | new_min_point (EDGEPT *local_min, POINT_GROUP points) | ||||||
new_max_point | |||||||
Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to NULL. | |||||||
void | new_max_point (EDGEPT *local_max, POINT_GROUP points) | ||||||
vertical_projection_point | |||||||
For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list. | |||||||
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) | ||||||
improve_one_blob | |||||||
Start with the current word of blobs and its classification. Find the worst blob and try to divide it up to improve the ratings. | |||||||
bool | improve_one_blob (WERD_RES *word_res, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, DANGERR *fixpt, bool split_next_to_fragment, BlamerBundle *blamer_bundle) | ||||||
modify_blob_choice | |||||||
Takes a blob and its chop index, converts that chop index to a unichar_id, and stores the chop index in place of the blob's original unichar_id. | |||||||
void | modify_blob_choice (BLOB_CHOICE_LIST *answer, int chop_index) | ||||||
chop_one_blob | |||||||
Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing the chopper. | |||||||
bool | chop_one_blob (TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index) | ||||||
bool | chop_one_blob2 (const GenericVector< TBOX > &boxes, WERD_RES *word_res, SEAMS *seam_list) | ||||||
chop_word_main | |||||||
Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. Return the word level ratings. | |||||||
BLOB_CHOICE_LIST_VECTOR * | chop_word_main (WERD_RES *word) | ||||||
improve_by_chopping | |||||||
Start with the current word of blobs and its classification. Find the worst blobs and try to divide them up to improve the ratings. Repeat as long as better ratings are produced by the new blob splitting. When all the splitting has been accomplished, all the ratings memory is reclaimed. | |||||||
void | improve_by_chopping (WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *updated_best_choice) | ||||||
MATRIX * | word_associator (bool only_create_ratings_matrtix, WERD_RES *word, STATE *state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, STATE *best_state) | ||||||
inT16 | select_blob_to_split (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_ceiling, bool split_next_to_fragment) | ||||||
inT16 | select_blob_to_split_from_fixpt (DANGERR *fixpt) | ||||||
void | set_chopper_blame (WERD_RES *word) | ||||||
Public Attributes | |||||||
bool | merge_fragments_in_matrix = 1 | ||||||
bool | wordrec_no_block = 0 | ||||||
bool | wordrec_enable_assoc = 1 | ||||||
bool | force_word_assoc = 0 | ||||||
int | wordrec_num_seg_states = 30 | ||||||
double | wordrec_worst_state = 1 | ||||||
bool | fragments_guide_chopper = 0 | ||||||
int | repair_unchopped_blobs = 1 | ||||||
double | tessedit_certainty_threshold = -2.25 | ||||||
int | chop_debug = 0 | ||||||
bool | chop_enable = 1 | ||||||
bool | chop_vertical_creep = 0 | ||||||
int | chop_split_length = 10000 | ||||||
int | chop_same_distance = 2 | ||||||
int | chop_min_outline_points = 6 | ||||||
int | chop_inside_angle = -50 | ||||||
int | chop_min_outline_area = 2000 | ||||||
double | chop_split_dist_knob = 0.5 | ||||||
double | chop_overlap_knob = 0.9 | ||||||
double | chop_center_knob = 0.15 | ||||||
double | chop_sharpness_knob = 0.06 | ||||||
double | chop_width_change_knob = 5.0 | ||||||
double | chop_ok_split = 100.0 | ||||||
double | chop_good_split = 50.0 | ||||||
int | chop_x_y_weight = 3 | ||||||
int | segment_adjust_debug = 0 | ||||||
bool | assume_fixed_pitch_char_segment = 0 | ||||||
bool | use_new_state_cost = 0 | ||||||
double | heuristic_segcost_rating_base = 1.25 | ||||||
double | heuristic_weight_rating = 1 | ||||||
double | heuristic_weight_width = 0 | ||||||
double | heuristic_weight_seamcut = 0 | ||||||
double | heuristic_max_char_wh_ratio = 2.0 | ||||||
int | wordrec_debug_level = 0 | ||||||
bool | wordrec_debug_blamer = false | ||||||
bool | wordrec_run_blamer = false | ||||||
bool | enable_new_segsearch = false | ||||||
int | segsearch_debug_level = 0 | ||||||
int | segsearch_max_pain_points = 2000 | ||||||
int | segsearch_max_futile_classifications = 10 | ||||||
double | segsearch_max_char_wh_ratio = 2.0 | ||||||
double | segsearch_max_fixed_pitch_char_wh_ratio = 2.0 | ||||||
bool | save_alt_choices = false | ||||||
LanguageModel * | language_model_ | ||||||
PRIORITY | pass2_ok_split | ||||||
int | pass2_seg_states | ||||||
int | num_joints | ||||||
int | num_pushed | ||||||
int | num_popped | ||||||
BlobMatchTable | blob_match_table | ||||||
EVALUATION_ARRAY | last_segmentation | ||||||
WERD_CHOICE * | prev_word_best_choice_ | ||||||
GenericVector< int > | blame_reasons_ | ||||||
void(Wordrec::* | fill_lattice_ )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | ||||||
Protected Member Functions | |||||||
bool | SegSearchDone (int num_futile_classifications) | ||||||
void | UpdateSegSearchNodes (int starting_col, SEG_SEARCH_PENDING_LIST *pending[], BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | ||||||
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const WERD_CHOICE *best_choice, SEG_SEARCH_PENDING_LIST *pending[], CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle) | ||||||
void | InitBlamerForSegSearch (const WERD_CHOICE *best_choice, CHUNKS_RECORD *chunks_record, HEAP *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug) | ||||||
void | FinishBlamerForSegSearch (const WERD_CHOICE *best_choice, BlamerBundle *blamer_bundle, STRING *blamer_debug) |
tesseract::Wordrec::Wordrec | ( | ) |
Definition at line 26 of file wordrec.cpp.
: // control parameters BOOL_MEMBER(merge_fragments_in_matrix, TRUE, "Merge the fragments in the ratings matrix and delete them" " after merging", params()), BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information", params()), BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable", params()), BOOL_MEMBER(force_word_assoc, FALSE, "force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary.", CCUtil::params()), INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states", CCUtil::params()), double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state", params()), BOOL_MEMBER(fragments_guide_chopper, FALSE, "Use information from fragments to guide chopping process", params()), INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped", params()), double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit", params()), INT_MEMBER(chop_debug, 0, "Chop debug", params()), BOOL_MEMBER(chop_enable, 1, "Chop enable", params()), BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep", params()), INT_MEMBER(chop_split_length, 10000, "Split Length", params()), INT_MEMBER(chop_same_distance, 2, "Same distance", params()), INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline", params()), INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend", params()), INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area", params()), double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment", params()), double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment", params()), double_MEMBER(chop_center_knob, 0.15, "Split center adjustment", params()), double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment", params()), double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment", params()), double_MEMBER(chop_ok_split, 100.0, "OK split limit", params()), double_MEMBER(chop_good_split, 50.0, "Good split limit", params()), 
INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight", params()), INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug", params()), BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE, "include fixed-pitch heuristics in char segmentation", params()), BOOL_MEMBER(use_new_state_cost, FALSE, "use new state cost heuristics for segmentation state evaluation", params()), double_MEMBER(heuristic_segcost_rating_base, 1.25, "base factor for adding segmentation cost into word rating." "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost.", params()), double_MEMBER(heuristic_weight_rating, 1.0, "weight associated with char rating in combined cost of state", params()), double_MEMBER(heuristic_weight_width, 1000.0, "weight associated with width evidence in combined cost of" " state", params()), double_MEMBER(heuristic_weight_seamcut, 0.0, "weight associated with seam cut in combined cost of state", params()), double_MEMBER(heuristic_max_char_wh_ratio, 2.0, "max char width-to-height ratio allowed in segmentation", params()), INT_MEMBER(wordrec_debug_level, 0, "Debug level for wordrec", params()), BOOL_MEMBER(wordrec_debug_blamer, false, "Print blamer debug messages", params()), BOOL_MEMBER(wordrec_run_blamer, false, "Try to set the blame for errors", params()), BOOL_MEMBER(enable_new_segsearch, true, "Enable new segmentation search path.", params()), INT_MEMBER(segsearch_debug_level, 0, "SegSearch debug level", params()), INT_MEMBER(segsearch_max_pain_points, 2000, "Maximum number of pain points stored in the queue", params()), INT_MEMBER(segsearch_max_futile_classifications, 10, "Maximum number of pain point classifications per word that" "did not result in finding a better word choice.", params()), double_MEMBER(segsearch_max_char_wh_ratio, 2.0, "Maximum character width-to-height ratio", params()), double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, "Maximum character width-to-height ratio for" " 
fixed-pitch fonts", params()), BOOL_MEMBER(save_alt_choices, false, "Save alternative paths found during chopping" " and segmentation search", params()) { prev_word_best_choice_ = NULL; language_model_ = new LanguageModel(&get_fontinfo_table(), &(getDict())); pass2_seg_states = 0; num_joints = 0; num_pushed = 0; num_popped = 0; fill_lattice_ = NULL; }
tesseract::Wordrec::~Wordrec | ( | ) | [virtual] |
Definition at line 144 of file wordrec.cpp.
{ delete language_model_; }
void tesseract::Wordrec::add_point_to_list | ( | POINT_GROUP | point_list, |
EDGEPT * | point | ||
) |
Definition at line 65 of file chop.cpp.
{ HEAPENTRY data; if (SizeOfHeap (point_list) < MAX_NUM_POINTS - 2) { data.Data = (char *) point; data.Key = point_priority (point); HeapStore(point_list, &data); } #ifndef GRAPHICS_DISABLED if (chop_debug > 2) mark_outline(point); #endif }
Definition at line 87 of file chop.cpp.
{ VECTOR vector1; VECTOR vector2; int angle; float length; /* Compute angle */ vector1.x = point2->pos.x - point1->pos.x; vector1.y = point2->pos.y - point1->pos.y; vector2.x = point3->pos.x - point2->pos.x; vector2.y = point3->pos.y - point2->pos.y; /* Use cross product */ length = (float)sqrt((float)LENGTH(vector1) * LENGTH(vector2)); if ((int) length == 0) return (0); angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) / length) / PI * 180.0 + 0.5)); /* Use dot product */ if (SCALAR (vector1, vector2) < 0) angle = 180 - angle; /* Adjust angle */ if (angle > 180) angle -= 360; if (angle <= -180) angle += 360; return (angle); }
SEAM * tesseract::Wordrec::attempt_blob_chop | ( | TWERD * | word, |
TBLOB * | blob, | ||
inT32 | blob_number, | ||
bool | italic_blob, | ||
SEAMS | seam_list | ||
) |
Definition at line 146 of file chopper.cpp.
{ TBLOB *next_blob = blob->next; TBLOB *other_blob; SEAM *seam; if (repair_unchopped_blobs) preserve_outline_tree (blob->outlines); other_blob = new TBLOB; /* Make new blob */ other_blob->next = blob->next; other_blob->outlines = NULL; blob->next = other_blob; seam = NULL; if (prioritize_division) { TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { seam = new_seam(0.0f, location, NULL, NULL, NULL); } } if (seam == NULL) seam = pick_good_seam(blob); if (seam == NULL && word->latin_script) { // If the blob can simply be divided into outlines, then do that. TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { seam = new_seam(0.0f, location, NULL, NULL, NULL); } } if (chop_debug) { if (seam != NULL) { print_seam ("Good seam picked=", seam); } else cprintf ("\n** no seam picked *** \n"); } if (seam) { apply_seam(blob, other_blob, italic_blob, seam); } if ((seam == NULL) || (blob->outlines == NULL) || (other_blob->outlines == NULL) || total_containment (blob, other_blob) || check_blob (other_blob) || !(check_seam_order (blob, seam) && check_seam_order (other_blob, seam)) || any_shared_split_points (seam_list, seam) || !test_insert_seam(seam_list, blob_number, blob, word->blobs)) { blob->next = next_blob; if (seam) { undo_seam(blob, other_blob, seam); delete_seam(seam); #ifndef GRAPHICS_DISABLED if (chop_debug) { if (chop_debug >2) display_blob(blob, Red); cprintf ("\n** seam being removed ** \n"); } #endif } else { delete other_blob; } if (repair_unchopped_blobs) restore_outline_tree (blob->outlines); return (NULL); } return (seam); }
void tesseract::Wordrec::best_first_search | ( | CHUNKS_RECORD * | chunks_record, |
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_RES * | word, | ||
STATE * | state, | ||
DANGERR * | fixpt, | ||
STATE * | best_state | ||
) |
Definition at line 88 of file bestfirst.cpp.
{ SEARCH_RECORD *the_search; inT16 keep_going; STATE guided_state; // not used int num_joints = chunks_record->ratings->dimension() - 1; the_search = new_search(chunks_record, num_joints, best_char_choices, word->best_choice, word->raw_choice, state); // The default state is initialized as the best choice. In order to apply // segmentation adjustment, or any other contextual processing in permute, // we give the best choice a poor rating to force the processed raw choice // to be promoted to best choice. the_search->best_choice->set_rating(WERD_CHOICE::kBadRating); evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle); if (wordrec_debug_level > 1) { tprintf("\n\n\n =========== BestFirstSearch ==============\n"); word->best_choice->print("**Initial BestChoice**"); } FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search); if (worst_priority < wordrec_worst_state) worst_priority = wordrec_worst_state; if (wordrec_debug_level > 1) { log_state("BestFirstSearch", num_joints, best_state); } guided_state = *state; do { /* Look for answer */ STATE orig_state = *the_search->this_state; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { guided_state = *(the_search->this_state); keep_going = evaluate_state(chunks_record, the_search, fixpt, word->blamer_bundle); hash_add (the_search->closed_states, the_search->this_state); if (!keep_going || (the_search->num_states > wordrec_num_seg_states)) { if (wordrec_debug_level > 1) tprintf("Breaking best_first_search on keep_going %s numstates %d\n", ((keep_going) ? "T" :"F"), the_search->num_states); free_state (the_search->this_state); break; } FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record, the_search); if (new_worst_priority < worst_priority) { if (wordrec_debug_level > 1) tprintf("Lowering WorstPriority %f --> %f\n", worst_priority, new_worst_priority); // Tighten the threshold for admitting new paths as better search // candidates are found. 
After lowering this threshold, we can safely // popout everything that is worse than this score also. worst_priority = new_worst_priority; } expand_node(worst_priority, chunks_record, the_search); } if (wordrec_debug_level > 1) { log_state("Done with", the_search->num_joints, &orig_state); } free_state (the_search->this_state); num_popped++; the_search->this_state = pop_queue (the_search->open_states); if (wordrec_debug_level > 1 && !the_search->this_state) tprintf("No more states to evalaute after %d evals", num_popped); } while (the_search->this_state); state->part1 = the_search->best_state->part1; state->part2 = the_search->best_state->part2; if (wordrec_debug_level > 1) { tprintf("\n\n\n =========== BestFirstSearch ==============\n"); // best_choice->debug_string().string()); word->best_choice->print("**Final BestChoice**"); } // save the best_state stats delete_search(the_search); }
Definition at line 143 of file tface.cpp.
{ // Rotate the blob for classification if necessary. TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded(&denorm); if (rotated_blob == NULL) { rotated_blob = tessblob; } BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result AdaptiveClassifier(rotated_blob, *denorm, ratings, NULL); if (rotated_blob != tessblob) { delete rotated_blob; delete denorm; } return ratings; }
void tesseract::Wordrec::CallFillLattice | ( | const MATRIX & | ratings, |
const LIST & | best_choices, | ||
const UNICHARSET & | unicharset, | ||
BlamerBundle * | blamer_bundle | ||
) | [inline] |
Definition at line 187 of file wordrec.h.
{ (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); }
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::cc_recog | ( | WERD_RES * | word | ) |
Definition at line 117 of file tface.cpp.
{ getDict().InitChoiceAccum(); getDict().reset_hyphen_vars(word->word->flag(W_EOL)); blob_match_table.init_match_table(); BLOB_CHOICE_LIST_VECTOR *results = chop_word_main(word); getDict().DebugWordChoices(); return results; }
bool tesseract::Wordrec::ChoiceIsCorrect | ( | const UNICHARSET & | uni_set, |
const WERD_CHOICE * | choice, | ||
const GenericVector< STRING > & | truth_text | ||
) |
Definition at line 159 of file wordrec.cpp.
{ if (choice == NULL) return false; int i; STRING truth_str; for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; STRING normed_choice_str; for (i = 0; i < choice->length(); ++i) { normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); } return (truth_str == normed_choice_str); }
void tesseract::Wordrec::choose_best_seam | ( | SEAM_QUEUE | seam_queue, |
SEAM_PILE * | seam_pile, | ||
SPLIT * | split, | ||
PRIORITY | priority, | ||
SEAM ** | seam_result, | ||
TBLOB * | blob | ||
) |
Definition at line 178 of file findseam.cpp.
{ SEAM *seam; char str[80]; float my_priority; /* Add seam of split */ my_priority = priority; if (split != NULL) { TPOINT split_point = split->point1->pos; split_point += split->point2->pos; split_point /= 2; seam = new_seam(my_priority, split_point, split, NULL, NULL); if (chop_debug > 1) print_seam ("Partial priority ", seam); add_seam_to_queue (seam_queue, seam, (float) my_priority); if (my_priority > chop_good_split) return; } TBOX bbox = blob->bounding_box(); /* Queue loop */ while (pop_next_seam (seam_queue, seam, my_priority)) { /* Set full priority */ my_priority = seam_priority (seam, bbox.left(), bbox.right()); if (chop_debug) { sprintf (str, "Full my_priority %0.0f, ", my_priority); print_seam(str, seam); } if ((*seam_result == NULL || /* Replace answer */ (*seam_result)->priority > my_priority) && my_priority < chop_ok_split) { /* No crossing */ if (constrained_split (seam->split1, blob)) { delete_seam(*seam_result); clone_seam(*seam_result, seam); (*seam_result)->priority = my_priority; } else { delete_seam(seam); seam = NULL; my_priority = BAD_PRIORITY; } } if (my_priority < chop_good_split) { if (seam) delete_seam(seam); return; /* Made good answer */ } if (seam) { /* Combine with others */ if (array_count (*seam_pile) < MAX_NUM_SEAMS /*|| tessedit_truncate_chopper==0 */ ) { combine_seam(seam_queue, *seam_pile, seam); *seam_pile = array_push (*seam_pile, seam); } else delete_seam(seam); } my_priority = best_seam_priority (seam_queue); if ((my_priority > chop_ok_split) || (my_priority > chop_good_split && split)) return; } }
SEAM * tesseract::Wordrec::chop_numbered_blob | ( | TWERD * | word, |
inT32 | blob_number, | ||
bool | italic_blob, | ||
SEAMS | seam_list | ||
) |
Definition at line 219 of file chopper.cpp.
{ TBLOB *blob; inT16 x; blob = word->blobs; for (x = 0; x < blob_number; x++) blob = blob->next; return attempt_blob_chop(word, blob, blob_number, italic_blob, seam_list); }
bool tesseract::Wordrec::chop_one_blob | ( | TWERD * | word, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
inT32 * | blob_number, | ||
SEAMS * | seam_list, | ||
int * | right_chop_index | ||
) |
Definition at line 441 of file chopper.cpp.
/*
 * Chops one blob of `word` and updates char_choices accordingly.
 * Repeatedly picks the worst-rated splittable blob via
 * select_blob_to_split() and tries chop_numbered_blob() (italic_blob =
 * true) until a seam is produced; returns false if no blob can be chopped.
 * On success the seam is spliced into *seam_list and the chopped blob's
 * choice entry is replaced by two fake_classify_blob() results whose
 * "choice" encodes chop indices via modify_blob_choice(): the left index
 * is parsed with atoi() from the original unichar string (NOTE(review):
 * this assumes the unichar text is numeric — presumably the apply-box /
 * training path; confirm against chopper.cpp callers), the right one is
 * ++*right_chop_index. The rating is scaled by 1/e to combine confidence
 * with the serial number.
 */
{ TBLOB *blob; inT16 x = 0; float rating_ceiling = MAX_FLOAT32; BLOB_CHOICE_LIST *answer; BLOB_CHOICE_IT answer_it; SEAM *seam; UNICHAR_ID unichar_id = 0; int left_chop_index = 0; do { *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false); if (chop_debug) cprintf("blob_number = %d\n", *blob_number); if (*blob_number == -1) return false; seam = chop_numbered_blob(word, *blob_number, true, *seam_list); if (seam != NULL) break; /* Must split null blobs */ answer = char_choices->get(*blob_number); if (answer == NULL) return false; answer_it.set_to_list(answer); rating_ceiling = answer_it.data()->rating(); // try a different blob } while (true); /* Split OK */ for (blob = word->blobs; x < *blob_number; x++) { blob = blob->next; } if (chop_debug) { tprintf("Chop made blob1:"); blob->bounding_box().print(); tprintf("and blob2:"); blob->next->bounding_box().print(); } *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs); answer = char_choices->get(*blob_number); answer_it.set_to_list(answer); unichar_id = answer_it.data()->unichar_id(); float rating = answer_it.data()->rating() / exp(1.0); left_chop_index = atoi(unicharset.id_to_unichar(unichar_id)); delete char_choices->get(*blob_number); // combine confidence w/ serial # answer = fake_classify_blob(0, rating, -rating); modify_blob_choice(answer, left_chop_index); char_choices->insert(answer, *blob_number); answer = fake_classify_blob(0, rating - 0.125f, -rating); modify_blob_choice(answer, ++*right_chop_index); char_choices->set(answer, *blob_number + 1); return true; }
bool tesseract::Wordrec::chop_one_blob2 | ( | const GenericVector< TBOX > & | boxes, |
WERD_RES * | word_res, | ||
SEAMS * | seam_list | ||
) |
Definition at line 502 of file chopper.cpp.
{ inT32 blob_number; inT16 x = 0; TBLOB *blob; SEAM *seam; seam = chop_overlapping_blob(boxes, word_res, &blob_number, true, *seam_list); if (seam == NULL) return false; /* Split OK */ for (blob = word_res->chopped_word->blobs; x < blob_number; x++) { blob = blob->next; } if (chop_debug) { tprintf("Chop made blob1:"); blob->bounding_box().print(); tprintf("and blob2:"); blob->next->bounding_box().print(); } *seam_list = insert_seam(*seam_list, blob_number, seam, blob, word_res->chopped_word->blobs); return true; }
// Searches word_res->chopped_word for a blob worth chopping: either one
// that divisible_blob() reports as splittable, or one whose denormalized
// bounding box significantly overlaps (> 0.125) more than one truth box
// while matching none of them almost exactly. Returns the first seam that
// attempt_blob_chop() produces and sets *blob_number to that blob's index;
// returns NULL with *blob_number = -1 if no chop succeeds.
SEAM *tesseract::Wordrec::chop_overlapping_blob(
    const GenericVector<TBOX> &boxes, WERD_RES *word_res,
    inT32 *blob_number, bool italic_blob, SEAMS seam_list) {
  TWERD *word = word_res->chopped_word;
  *blob_number = 0;
  for (TBLOB *blob = word->blobs; blob != NULL; blob = blob->next) {
    // Map the blob's normalized bounding box back to image coordinates.
    TPOINT topleft, botright;
    topleft.x = blob->bounding_box().left();
    topleft.y = blob->bounding_box().top();
    botright.x = blob->bounding_box().right();
    botright.y = blob->bounding_box().bottom();
    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(topleft, &original_topleft);
    word_res->denorm.DenormTransform(botright, &original_botright);
    TBOX original_box = TBOX(original_topleft.x, original_botright.y,
                             original_botright.x, original_topleft.y);

    // Count the truth boxes this blob overlaps significantly, and note
    // whether it already matches one of them almost exactly.
    bool almost_equal_box = false;
    int num_overlap = 0;
    for (int i = 0; i < boxes.size(); i++) {
      if (original_box.overlap_fraction(boxes[i]) > 0.125)
        num_overlap++;
      if (original_box.almost_equal(boxes[i], 3))
        almost_equal_box = true;
    }

    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location) ||
        (!almost_equal_box && num_overlap > 1)) {
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                     italic_blob, seam_list);
      if (seam != NULL)
        return seam;
    }
    *blob_number = *blob_number + 1;
  }
  *blob_number = -1;
  return NULL;
}
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::chop_word_main | ( | WERD_RES * | word | ) |
Definition at line 583 of file chopper.cpp.
/*
 * Top-level chopper entry point. Classifies every blob of
 * word->chopped_word, permutes the choices into word->best_choice /
 * raw_choice, and — if the result is not acceptable — improves it by
 * chopping (improve_by_chopping) and, when enabled or forced, by running
 * the associator (word_associator). Also performs blamer bookkeeping
 * (blaming the classifier when the incorrect best choice is a dictionary
 * top choice) and fills the lattice via CallFillLattice when requested.
 * Returns best_char_choices (caller owns); char_choices and the ratings
 * matrix are freed here.
 */
{ TBLOB *blob; int index; int did_chopping; STATE state; BLOB_CHOICE_LIST *match_result; MATRIX *ratings = NULL; DANGERR fixpt; /*dangerous ambig */ inT32 bit_count; //no of bits BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR(); did_chopping = 0; for (blob = word->chopped_word->blobs, index = 0; blob != NULL; blob = blob->next, index++) { match_result = classify_blob(blob, word->denorm, "chop_word:", Green, word->blamer_bundle); if (match_result == NULL) cprintf("Null classifier output!\n"); *char_choices += match_result; } bit_count = index - 1; set_n_ones(&state, char_choices->length() - 1); bool acceptable = false; bool replaced = false; bool best_choice_updated = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (best_choice_updated && getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt, CHOPPER_CALLER, &replaced)) { acceptable = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); CopyCharChoices(*char_choices, best_char_choices); if (!acceptable) { // do more work to find a better choice did_chopping = 1; bool best_choice_acceptable = false; if (chop_enable) improve_by_chopping(word, char_choices, &state, best_char_choices, &fixpt, &best_choice_acceptable); if (chop_debug) print_seams ("Final seam list:", word->seam_array); if (word->blamer_bundle != NULL && !ChoiceIsCorrect(*word->uch_set, word->best_choice, word->blamer_bundle->truth_text)) { set_chopper_blame(word); } // The force_word_assoc is almost redundant to enable_assoc. However, // it is not conditioned on the dict behavior. For CJK, we need to force // the associator to be invoked. When we figure out the exact behavior // of dict on CJK, we can remove the flag if it turns out to be redundant. 
if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) { ratings = word_associator(false, word, &state, best_char_choices, &fixpt, &state); } } best_char_choices = rebuild_current_state(word, &state, best_char_choices, ratings); // If after running only the chopper best_choice is incorrect and no blame // has been yet set, blame the classifier if best_choice is classifier's // top choice and is a dictionary word (i.e. language model could not have // helped). Otherwise blame the tradeoff between the classifier and // the old language model (permuters). if (word->blamer_bundle != NULL && word->blamer_bundle->incorrect_result_reason == IRR_CORRECT && ratings == NULL && // only the chopper was run !ChoiceIsCorrect(*word->uch_set, word->best_choice, word->blamer_bundle->truth_text)) { if (word->best_choice != NULL && Dict::valid_word_permuter(word->best_choice->permuter(), false)) { // Find out whether best choice is a top choice. word->blamer_bundle->best_choice_is_dict_and_top_choice = true; for (int i = 0; i < word->best_choice->length(); ++i) { BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i)); ASSERT_HOST(!blob_choice_it.empty()); BLOB_CHOICE *first_choice = NULL; for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { // find first non-fragment choice if (!(getDict().getUnicharset().get_fragment( blob_choice_it.data()->unichar_id()))) { first_choice = blob_choice_it.data(); break; } } ASSERT_HOST(first_choice != NULL); if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) { word->blamer_bundle->best_choice_is_dict_and_top_choice = false; break; } } } STRING debug; if (word->blamer_bundle->best_choice_is_dict_and_top_choice) { debug = "Best choice is: incorrect, top choice, dictionary word"; debug += " with permuter "; debug += word->best_choice->permuter_name(); } else { debug = "Classifier/Old LM tradeoff is to blame"; } word->blamer_bundle->SetBlame( 
word->blamer_bundle->best_choice_is_dict_and_top_choice ? IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF, debug, word->best_choice, wordrec_debug_blamer); } if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) { if (ratings == NULL) { ratings = word_associator(true, word, NULL, NULL, NULL, NULL); } CallFillLattice(*ratings, getDict().getBestChoices(), *word->uch_set, word->blamer_bundle); } if (ratings != NULL) { if (wordrec_debug_level > 0) { tprintf("Final Ratings Matrix:\n"); ratings->print(getDict().getUnicharset()); } ratings->delete_matrix_pointers(); delete ratings; } getDict().FilterWordChoices(); // TODO(antonova, eger): check that FilterWordChoices() does not filter // out anything useful for word bigram or phrase search. // TODO(antonova, eger): when implementing word bigram and phrase search // we will need to think carefully about how to replace a word with its // alternative choice. // In particular it might be required to save the segmentation state // associated with the word, so that best_char_choices could be updated // by rebuild_current_state() correctly. if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word); char_choices->delete_data_pointers(); delete char_choices; return best_char_choices; }
BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob | ( | TBLOB * | blob, |
const DENORM & | denorm, | ||
const char * | string, | ||
C_COL | color, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 62 of file wordclass.cpp.
/*
 * Classifies a single blob, using blob_match_table as a cache: on a miss
 * the matcher is called and the result stored. On a fresh classification,
 * when the blamer has per-character truth boxes and no blame has been
 * assigned yet, a blob whose box x-aligns with a truth box is checked
 * against the truth text: classifier blame is set if the truth unichar is
 * absent from the choice list, adaption blame if it is present but an
 * adapted choice also appears. NOTE(review): the debug string below is
 * split across two rendered lines ("... than for correct ") — this is an
 * artifact of the documentation rendering, not two separate literals;
 * confirm against wordclass.cpp before editing.
 */
{ fflush(stdout); BLOB_CHOICE_LIST *choices = NULL; #ifndef GRAPHICS_DISABLED if (wordrec_display_all_blobs) display_blob(blob, color); #endif choices = blob_match_table.get_match(blob); if (choices == NULL) { choices = call_matcher(&denorm, blob); blob_match_table.put_match(blob, choices); // If a blob with the same bounding box as one of the truth character // bounding boxes is not classified as the corresponding truth character // blame character classifier for incorrect answer. if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes && blamer_bundle->incorrect_result_reason == IRR_CORRECT) { for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) { const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b); const TBOX &blob_box = blob->bounding_box(); // Note that we are more strict on the bounding box boundaries here // than in other places (chopper, segmentation search), since we do // not have the ability to check the previous and next bounding box. if (blob_box.x_almost_equal(truth_box, blamer_bundle->norm_box_tolerance/2)) { BLOB_CHOICE_IT choices_it(choices); bool found = false; bool incorrect_adapted = false; UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID; const char *truth_str = blamer_bundle->truth_text[b].string(); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar( choices_it.data()->unichar_id())) == 0) { found = true; break; } else if (choices_it.data()->adapted()) { incorrect_adapted = true; incorrect_adapted_id = choices_it.data()->unichar_id(); } } // end choices_it for loop if (!found) { STRING debug = "unichar "; debug += truth_str; debug += " not found in classification list"; blamer_bundle->SetBlame(IRR_CLASSIFIER, debug, NULL, wordrec_debug_blamer); } else if (incorrect_adapted) { STRING debug = "better rating for adapted "; debug += getDict().getUnicharset().id_to_unichar( incorrect_adapted_id); debug += " 
than for correct "; debug += truth_str; blamer_bundle->SetBlame(IRR_ADAPTION, debug, NULL, wordrec_debug_blamer); } break; } } // end iterating over blamer_bundle->norm_truth_word } } #ifndef GRAPHICS_DISABLED if (classify_debug_level && string) print_ratings_list(string, choices, getDict().getUnicharset()); if (wordrec_blob_pause) window_wait(blob_window); #endif return (choices); }
// Classifies the span of blob pieces [start, end] as a single character.
// The pieces are temporarily joined with join_pieces(), the merged blob at
// index `start` is classified, and then the pieces are broken apart again
// with break_pieces(), leaving the word structure as it was found.
// When segmentation display is enabled, the current segmentation is also
// rendered in the segmentation window.
//
// Fix: the documentation rendering had corrupted "&current_state" into
// "¤t_state" (the HTML entity "&curren;" swallowed the "&curr" prefix) in
// the set_n_ones() and bin_to_chunks() calls; restored the address-of
// operator on current_state.
//
// Returns the (newly cached) BLOB_CHOICE_LIST for the merged piece.
BLOB_CHOICE_LIST *tesseract::Wordrec::classify_piece(TBLOB *pieces,
                                                     const DENORM &denorm,
                                                     SEAMS seams,
                                                     inT16 start,
                                                     inT16 end,
                                                     BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices;
  TBLOB *blob;
  inT16 x;

  // Temporarily merge the pieces into one blob.
  join_pieces(pieces, seams, start, end);
  for (blob = pieces, x = 0; x < start; x++) {
    blob = blob->next;
  }
  choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle);

  // Restore the original (fully chopped) piece structure.
  break_pieces(blob, seams, start, end);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations > 2) {
    STATE current_state;
    SEARCH_STATE chunk_groups;
    set_n_ones(&current_state, array_count(seams));
    chunk_groups = bin_to_chunks(&current_state, array_count(seams));
    display_segmentation(pieces, chunk_groups);
    window_wait(segm_window);
    memfree(chunk_groups);
  }
#endif
  return (choices);
}
void tesseract::Wordrec::combine_seam | ( | SEAM_QUEUE | seam_queue, |
SEAM_PILE | seam_pile, | ||
SEAM * | seam | ||
) |
tessedit_fix_sideways_chops ||
Definition at line 259 of file findseam.cpp.
/*
 * Tries to combine the new seam with every seam already in the pile.
 * First computes the vertical span [bottom, top] of each of the new
 * seam's splits (split2's span defaults to split1's when absent). A piled
 * seam is a combination candidate when it is horizontally within
 * SPLIT_CLOSENESS of the new seam and the summed priorities stay under
 * chop_ok_split; its splits must also lie entirely above or entirely
 * below BOTH vertical spans of the new seam (so the joined seam's splits
 * cannot cross). Each successful join_two_seams() result is pushed onto
 * the seam queue with its own priority.
 * NOTE(review): the stray "tessedit_fix_sideways_chops ||" fragment above
 * is a documentation-rendering artifact, not part of this function.
 */
{ register inT16 x; register inT16 dist; inT16 bottom1, top1; inT16 bottom2, top2; SEAM *new_one; SEAM *this_one; bottom1 = seam->split1->point1->pos.y; if (seam->split1->point2->pos.y >= bottom1) top1 = seam->split1->point2->pos.y; else { top1 = bottom1; bottom1 = seam->split1->point2->pos.y; } if (seam->split2 != NULL) { bottom2 = seam->split2->point1->pos.y; if (seam->split2->point2->pos.y >= bottom2) top2 = seam->split2->point2->pos.y; else { top2 = bottom2; bottom2 = seam->split2->point2->pos.y; } } else { bottom2 = bottom1; top2 = top1; } array_loop(seam_pile, x) { this_one = (SEAM *) array_value (seam_pile, x); dist = seam->location.x - this_one->location.x; if (-SPLIT_CLOSENESS < dist && dist < SPLIT_CLOSENESS && seam->priority + this_one->priority < chop_ok_split) { inT16 split1_point1_y = this_one->split1->point1->pos.y; inT16 split1_point2_y = this_one->split1->point2->pos.y; inT16 split2_point1_y = 0; inT16 split2_point2_y = 0; if (this_one->split2) { split2_point1_y = this_one->split2->point1->pos.y; split2_point2_y = this_one->split2->point2->pos.y; } if ( ( /* this_one->split1 always exists */ ( ((split1_point1_y >= top1 && split1_point2_y >= top1) || (split1_point1_y <= bottom1 && split1_point2_y <= bottom1)) && ((split1_point1_y >= top2 && split1_point2_y >= top2) || (split1_point1_y <= bottom2 && split1_point2_y <= bottom2)) ) ) && ( this_one->split2 == NULL || ( ((split2_point1_y >= top1 && split2_point2_y >= top1) || (split2_point1_y <= bottom1 && split2_point2_y <= bottom1)) && ((split2_point1_y >= top2 && split2_point2_y >= top2) || (split2_point1_y <= bottom2 && split2_point2_y <= bottom2)) ) ) ) { new_one = join_two_seams (seam, this_one); if (chop_debug > 1) print_seam ("Combo priority ", new_one); add_seam_to_queue (seam_queue, new_one, new_one->priority); } } } }
Definition at line 343 of file findseam.cpp.
{ TESSLINE *outline; if (is_little_chunk (split->point1, split->point2)) return (FALSE); for (outline = blob->outlines; outline; outline = outline->next) { if (split_bounds_overlap (split, outline) && crosses_outline (split->point1, split->point2, outline->loop)) { return (FALSE); } } return (TRUE); }
void tesseract::Wordrec::CopyCharChoices | ( | const BLOB_CHOICE_LIST_VECTOR & | from, |
BLOB_CHOICE_LIST_VECTOR * | to | ||
) |
Definition at line 148 of file wordrec.cpp.
{ to->delete_data_pointers(); to->clear(); for (int i = 0; i < from.size(); ++i) { BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST(); cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy); to->push_back(cc_list); } }
void tesseract::Wordrec::delete_seam_pile | ( | SEAM_PILE | seam_pile | ) |
Definition at line 365 of file findseam.cpp.
{ inT16 x; array_loop(seam_pile, x) { delete_seam ((SEAM *) array_value (seam_pile, x)); } array_free(seam_pile); }
void tesseract::Wordrec::delete_search | ( | SEARCH_RECORD * | the_search | ) |
delete_search
Terminate the current search and free all the memory involved.
Definition at line 179 of file bestfirst.cpp.
/*
 * Frees, in order: the first and best states, the closed-state hash
 * table, the open-state heap (each entry via free_state), and finally
 * the SEARCH_RECORD itself.
 * NOTE(review): `closeness` (normalized hamming distance between first
 * and best state) is computed but never used afterwards — apparently
 * leftover instrumentation; confirm against bestfirst.cpp before
 * removing.
 */
{ float closeness; closeness = (the_search->num_joints ? (hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state), reinterpret_cast<uinT32*>(the_search->best_state), 2) / (float) the_search->num_joints) : 0.0f); free_state (the_search->first_state); free_state (the_search->best_state); free_hash_table(the_search->closed_states); FreeHeapData (the_search->open_states, (void_dest) free_state); memfree(the_search); }
int tesseract::Wordrec::dict_word | ( | const WERD_CHOICE & | word | ) |
int tesseract::Wordrec::end_recog | ( | ) |
Definition at line 67 of file tface.cpp.
{ program_editdown (0); return (0); }
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::evaluate_chunks | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_STATE | search_state, | ||
BlamerBundle * | blamer_bundle | ||
) |
evaluate_chunks
A particular word level segmentation has been chosen. Evaluate this to find the word list that corresponds to it.
Definition at line 203 of file bestfirst.cpp.
/*
 * Evaluates a chosen word segmentation. search_state[0] holds the number
 * of interior chunk groups; each subsequent entry is a group width. For
 * every group [x, y] the piece is classified (or fetched from the cache)
 * via get_piece_rating(); the top choice's certainty/rating plus the
 * group's width and gap are recorded into last_segmentation[], and the
 * choice list is appended to the returned vector. Returns NULL (freeing
 * the partial vector) if any piece fails to classify; otherwise the
 * caller owns the returned BLOB_CHOICE_LIST_VECTOR.
 */
{ BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); BLOB_CHOICE_LIST *blob_choices; BLOB_CHOICE_IT blob_choice_it; int i; int x = 0; int y; // Iterate sub-paths. for (i = 1; i <= search_state[0] + 1; i++) { if (i > search_state[0]) y = count_blobs (chunks_record->chunks) - 1; else y = x + search_state[i]; // Process one square. // Classify if needed. blob_choices = get_piece_rating(chunks_record->ratings, chunks_record->chunks, chunks_record->word_res->denorm, chunks_record->splits, x, y, blamer_bundle); if (blob_choices == NULL) { delete char_choices; return (NULL); } // Add permuted ratings. blob_choice_it.set_to_list(blob_choices); last_segmentation[i - 1].certainty = blob_choice_it.data()->certainty(); last_segmentation[i - 1].match = blob_choice_it.data()->rating(); last_segmentation[i - 1].width = AssociateUtils::GetChunksWidth(chunks_record->chunk_widths, x, y); last_segmentation[i - 1].gap = AssociateUtils::GetChunksGap(chunks_record->chunk_widths, y); *char_choices += blob_choices; x = y + 1; } return (char_choices); }
inT16 tesseract::Wordrec::evaluate_state | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_RECORD * | the_search, | ||
DANGERR * | fixpt, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 256 of file bestfirst.cpp.
/*
 * Evaluates the search's current segmentation state: decodes the joint
 * bits into chunk groups and piece widths, classifies the resulting
 * pieces via evaluate_chunks(), then lets the dictionary permute the
 * character choices into best/raw choices (with a segmentation-cost
 * adjustment from prioritize_state()). When the best choice improves,
 * best_state and the char widths are updated; otherwise fixpt is
 * cleared. Returns keep_going — false once an acceptable choice is
 * found, which terminates the best-first search.
 */
{ BLOB_CHOICE_LIST_VECTOR *char_choices; SEARCH_STATE chunk_groups; float rating_limit = the_search->best_choice->rating(); bool keep_going = true; PIECES_STATE widths; the_search->num_states++; chunk_groups = bin_to_chunks(the_search->this_state, the_search->num_joints); bin_to_pieces (the_search->this_state, the_search->num_joints, widths); if (wordrec_debug_level > 1) { log_state("Evaluating state", the_search->num_joints, the_search->this_state); } getDict().LogNewSegmentation(widths); char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle); getDict().SetWordsegRatingAdjustFactor(-1.0f); bool updated_best_choice = false; if (char_choices != NULL && char_choices->length() > 0) { // Compute the segmentation cost and include the cost in word rating. // TODO(dsl): We should change the SEARCH_RECORD to store this cost // from state evaluation and avoid recomputing it here. prioritize_state(chunks_record, the_search); getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias); updated_best_choice = getDict().permute_characters(*char_choices, the_search->best_choice, the_search->raw_choice); bool replaced = false; if (updated_best_choice) { if (getDict().AcceptableChoice(char_choices, the_search->best_choice, NULL, ASSOCIATOR_CALLER, &replaced)) { keep_going = false; } CopyCharChoices(*char_choices, the_search->best_char_choices); } } getDict().SetWordsegRatingAdjustFactor(-1.0f); #ifndef GRAPHICS_DISABLED if (wordrec_display_segmentations) { display_segmentation (chunks_record->chunks, chunk_groups); if (wordrec_display_segmentations > 1) window_wait(segm_window); } #endif if (rating_limit != the_search->best_choice->rating()) { ASSERT_HOST(updated_best_choice); the_search->before_best = the_search->num_states; the_search->best_state->part1 = the_search->this_state->part1; the_search->best_state->part2 = the_search->this_state->part2; replace_char_widths(chunks_record, chunk_groups); } else { ASSERT_HOST(!updated_best_choice); if 
(char_choices != NULL) fixpt->clear(); } if (char_choices != NULL) delete char_choices; memfree(chunk_groups); return (keep_going); }
void tesseract::Wordrec::expand_node | ( | FLOAT32 | worst_priority, |
CHUNKS_RECORD * | chunks_record, | ||
SEARCH_RECORD * | the_search | ||
) |
Definition at line 499 of file bestfirst.cpp.
/*
 * Expands the current segmentation state by generating every neighbor
 * that differs in exactly one joint bit. The state is a two-word bitset:
 * part1 carries the joints above 32 (first loop, x from num_joints down
 * to 33), part2 the low 32 joints (second loop). Each unvisited neighbor
 * is prioritized and pushed onto the open-state queue only when its
 * merit is within worst_priority; otherwise it is logged and skipped.
 * NOTE(review): the initial mask `1 << (num_joints - 1 - 32)` implies a
 * negative shift when num_joints <= 32; the first loop then executes
 * zero iterations so the value is unused, but the computation itself
 * looks dubious — confirm against bestfirst.cpp.
 */
{ STATE old_state; int x; uinT32 mask = 1 << (the_search->num_joints - 1 - 32); old_state.part1 = the_search->this_state->part1; old_state.part2 = the_search->this_state->part2; // We need to expand the search more intelligently, or we get stuck // with a bad starting segmentation in a long word sequence as in CJK. // Expand a child node only if it is within the global bound, and no // worse than 2x of its parent. // TODO(dsl): There is some redudency here in recomputing the priority, // and in filtering of old_merit and worst_priority. the_search->this_state->part2 = old_state.part2; for (x = the_search->num_joints; x > 32; x--) { the_search->this_state->part1 = mask ^ old_state.part1; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { FLOAT32 new_merit = prioritize_state(chunks_record, the_search); if (new_merit < worst_priority) { if (wordrec_debug_level > 1) log_state("Pushing segstate", the_search->num_joints, the_search->this_state, new_merit); push_queue(the_search->open_states, the_search->this_state, worst_priority, new_merit, wordrec_debug_level > 1); } else { if (wordrec_debug_level > 1) log_state("Ignore weak segstate", the_search->num_joints, the_search->this_state, new_merit); } } mask >>= 1; } if (the_search->num_joints > 32) { mask = 1 << 31; } else { mask = 1 << (the_search->num_joints - 1); } the_search->this_state->part1 = old_state.part1; while (x--) { the_search->this_state->part2 = mask ^ old_state.part2; if (!hash_lookup (the_search->closed_states, the_search->this_state)) { FLOAT32 new_merit = prioritize_state(chunks_record, the_search); if (new_merit < worst_priority) { if (wordrec_debug_level > 1) log_state("Pushing segstate", the_search->num_joints, the_search->this_state, new_merit); push_queue(the_search->open_states, the_search->this_state, worst_priority, new_merit, wordrec_debug_level > 1); } else { if (wordrec_debug_level > 1) log_state("Ignoring weak segstate", the_search->num_joints, the_search->this_state, 
new_merit); } } mask >>= 1; } }
// Builds a one-element rating list holding a synthetic BLOB_CHOICE with
// the given class id, rating and certainty (all other fields zeroed /
// defaulted). The caller owns the returned list.
BLOB_CHOICE_LIST *tesseract::Wordrec::fake_classify_blob(UNICHAR_ID class_id,
                                                         float rating,
                                                         float certainty) {
  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();  // matcher result
  BLOB_CHOICE *choice =
      new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false);
  BLOB_CHOICE_IT inserter(ratings);
  inserter.add_after_stay_put(choice);
  return ratings;
}
void tesseract::Wordrec::fill_filtered_fragment_list | ( | BLOB_CHOICE_LIST * | choices, |
int | fragment_pos, | ||
int | num_frag_parts, | ||
BLOB_CHOICE_LIST * | filtered_choices | ||
) |
Definition at line 136 of file pieces.cpp.
{ BLOB_CHOICE_IT filtered_choices_it(filtered_choices); BLOB_CHOICE_IT choices_it(choices); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); if (frag != NULL && frag->get_pos() == fragment_pos && frag->get_total() == num_frag_parts) { // Recover the unichar_id of the unichar that this fragment is // a part of BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data()); int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); b->set_unichar_id(original_unichar); filtered_choices_it.add_to_end(b); } } filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>); }
void tesseract::Wordrec::FillLattice | ( | const MATRIX & | ratings, |
const LIST & | best_choices, | ||
const UNICHARSET & | unicharset, | ||
BlamerBundle * | blamer_bundle | ||
) |
void tesseract::Wordrec::FinishBlamerForSegSearch | ( | const WERD_CHOICE * | best_choice, |
BlamerBundle * | blamer_bundle, | ||
STRING * | blamer_debug | ||
) | [protected] |
Definition at line 376 of file segsearch.cpp.
/*
 * Finalizes blame assignment after segmentation search (the long comment
 * inside the body explains the policy in detail). In short: classifier
 * blame when the incorrect best choice is a dictionary top choice;
 * pain-point-prioritization blame when the correctly-segmented path rated
 * better than best_choice; otherwise the classifier/LM tradeoff.
 * NOTE(review): the " vs. / best choice rating " debug string is split
 * across two rendered lines — a documentation-rendering artifact, a
 * single literal in segsearch.cpp; confirm before editing.
 */
{ // If we are still looking for blame (i.e. best_choice is incorrect, but a // path representing the correct segmentation could be constructed), we can // blame segmentation search pain point prioritization if the rating of the // path corresponding to the correct segmentation is better than that of // best_choice (i.e. language model would have done the correct thing, but // because of poor pain point prioritization the correct segmentation was // never explored). Otherwise we blame the tradeoff between the language model // and the classifier, since even after exploring the path corresponding to // the correct segmentation incorrect best_choice would have been chosen. // One special case when we blame the classifier instead is when best choice // is incorrect, but it is a dictionary word and it classifier's top choice. if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) { blamer_bundle->segsearch_is_looking_for_blame = false; if (blamer_bundle->best_choice_is_dict_and_top_choice) { *blamer_debug = "Best choice is: incorrect, top choice, dictionary word"; *blamer_debug += " with permuter "; *blamer_debug += best_choice->permuter_name(); blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice, wordrec_debug_blamer); } else if (blamer_bundle->best_correctly_segmented_rating < best_choice->rating()) { *blamer_debug += "Correct segmentation state was not explored"; blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice, wordrec_debug_blamer); } else { if (blamer_bundle->best_correctly_segmented_rating >= WERD_CHOICE::kBadRating) { *blamer_debug += "Correct segmentation paths were pruned by LM\n"; } else { char debug_buffer[256]; *blamer_debug += "Best correct segmentation rating "; sprintf(debug_buffer, "%g", blamer_bundle->best_correctly_segmented_rating); *blamer_debug += debug_buffer; *blamer_debug += " vs. 
best choice rating "; sprintf(debug_buffer, "%g", best_choice->rating()); *blamer_debug += debug_buffer; } blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice, wordrec_debug_blamer); } } }
Definition at line 74 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (params split, xmin, xmax) this is presumably
 * full_split_priority(SPLIT*, inT16, inT16) of gradechop.cpp — confirm.
 * Computes the outline bounds of the split and returns 999.0 (a very bad
 * priority) when those bounds lie strictly inside (xmin, xmax); otherwise
 * returns the sum of the overlap, center-of-blob and width-change grades.
 */
{ BOUNDS_RECT rect; set_outline_bounds (split->point1, split->point2, rect); if (xmin < MIN (rect[0], rect[2]) && xmax > MAX (rect[1], rect[3])) return (999.0); return (grade_overlap (rect) + grade_center_of_blob (rect) + grade_width_change (rect)); }
void tesseract::Wordrec::get_fragment_lists | ( | inT16 | current_frag, |
inT16 | current_row, | ||
inT16 | start, | ||
inT16 | num_frag_parts, | ||
inT16 | num_blobs, | ||
MATRIX * | ratings, | ||
BLOB_CHOICE_LIST * | choice_lists | ||
) |
Definition at line 292 of file pieces.cpp.
{ if (current_frag == num_frag_parts) { merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, choice_lists, ratings); return; } for (inT16 x = current_row; x < num_blobs; x++) { BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); if (choices == NULL) continue; fill_filtered_fragment_list(choices, current_frag, num_frag_parts, &choice_lists[current_frag]); if (!choice_lists[current_frag].empty()) { get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, num_blobs, ratings, choice_lists); choice_lists[current_frag].clear(); } } }
// Returns the variance of the per-gap samples in wrec, each sample being
// the sum of two adjacent entries of the interleaved widths array scaled
// by 1/norm_height. (Presumably widths[2x] is a character width and
// widths[2x+1] the following gap — matches the WIDTH_RECORD layout used
// elsewhere; confirm against heuristic.cpp.)
FLOAT32 tesseract::Wordrec::get_gap_variance(WIDTH_RECORD *wrec,
                                             float norm_height) {
  MEASUREMENT stats;
  new_measurement(stats);
  for (int i = 0; i < wrec->num_chars - 1; i++) {
    FLOAT32 gap_ratio =
        (wrec->widths[2 * i] + wrec->widths[2 * i + 1]) * 1.0 / norm_height;
    ADD_SAMPLE(stats, gap_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Gap Mean=%g Var=%g\n", MEAN(stats), VARIANCE(stats));
  return VARIANCE(stats);
}
// Returns the classification of the blob span [start, end], using the
// ratings matrix as a cache: on a miss (NOT_CLASSIFIED) the joined span
// is classified via classify_piece() and the fresh result is stored back
// into the matrix.
BLOB_CHOICE_LIST *tesseract::Wordrec::get_piece_rating(
    MATRIX *ratings, TBLOB *blobs, const DENORM &denorm, SEAMS seams,
    inT16 start, inT16 end, BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs, denorm, seams, start, end, blamer_bundle);
    ratings->put(start, end, choices);
    if (wordrec_debug_level > 1) {
      tprintf("get_piece_rating(): updated ratings matrix\n");
      ratings->print(getDict().getUnicharset());
    }
  }
  return (choices);
}
// Returns the variance of the character width / norm_height ratios in
// wrec. The last entry is skipped when its ratio exceeds 0.3 so that
// trailing punctuation does not skew the statistics.
FLOAT32 tesseract::Wordrec::get_width_variance(WIDTH_RECORD *wrec,
                                               float norm_height) {
  MEASUREMENT stats;
  new_measurement(stats);
  for (int i = 0; i < wrec->num_chars; i++) {
    FLOAT32 wh_ratio = wrec->widths[2 * i] * 1.0f / norm_height;
    // Exclude trailing punctuation from the statistics.
    if (i == wrec->num_chars - 1 && wh_ratio > 0.3)
      continue;
    ADD_SAMPLE(stats, wh_ratio);
  }
  if (segment_adjust_debug > 2)
    tprintf("Width Mean=%g Var=%g\n", MEAN(stats), VARIANCE(stats));
  return VARIANCE(stats);
}
// Grades how far the split is from the blob's center: the penalty is the
// absolute difference between the widths of the two bound intervals in
// rect, scaled by chop_center_knob and capped at CENTER_GRADE_CAP.
// The result is never negative.
PRIORITY tesseract::Wordrec::grade_center_of_blob(register BOUNDS_RECT rect) {
  register PRIORITY penalty;

  penalty = (rect[1] - rect[0]) - (rect[3] - rect[2]);
  if (penalty < 0)
    penalty = -penalty;

  penalty *= chop_center_knob;
  penalty = MIN(CENTER_GRADE_CAP, penalty);
  return (MAX(0.0, penalty));
}
// Grades the horizontal overlap of the two bound intervals in rect.
// Complete containment of the narrower interval yields a fixed penalty
// of 100; otherwise the penalty grows with the overlap — with an extra
// charge once the overlap exceeds half the narrower width — scaled by
// chop_overlap_knob. The result is never negative.
PRIORITY tesseract::Wordrec::grade_overlap(register BOUNDS_RECT rect) {
  register PRIORITY penalty;
  register inT16 width1;
  register inT16 width2;
  register inT16 overlap;

  width1 = rect[3] - rect[2];
  width2 = rect[1] - rect[0];
  overlap = MIN(rect[1], rect[3]) - MAX(rect[0], rect[2]);
  width1 = MIN(width1, width2);
  if (overlap == width1)
    return (100.0);                /* Total overlap */

  width1 = 2 * overlap - width1;   /* Extra penalty for too */
  overlap += MAX(0, width1);       /* much overlap */

  penalty = overlap * chop_overlap_knob;
  return (MAX(0.0, penalty));
}
Definition at line 168 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (a SPLIT* graded by point_priority and chop_sharpness_knob)
 * this is presumably grade_sharpness(SPLIT*) of gradechop.cpp — confirm.
 * Sums the point priorities of the two split endpoints (values range
 * 0 to -360), shifts the sum into a non-negative range clamped at 0,
 * and scales by chop_sharpness_knob.
 */
{ register PRIORITY grade; grade = point_priority (split->point1) + point_priority (split->point2); if (grade < -360.0) grade = 0; else grade += 360.0; grade *= chop_sharpness_knob; /* Values 0 to -360 */ return (grade); }
Definition at line 145 of file gradechop.cpp.
/*
 * NOTE(review): the rendered signature for this entry is missing; from
 * the body (weighted_edgept_dist with chop_x_y_weight, scaled by
 * chop_split_dist_knob) this is presumably grade_split_length(SPLIT*) of
 * gradechop.cpp — confirm.
 * Grades the length of the split: zero for non-positive weighted length,
 * otherwise sqrt(length) * chop_split_dist_knob, never negative.
 */
{ register PRIORITY grade; register float split_length; split_length = weighted_edgept_dist (split->point1, split->point2, chop_x_y_weight); if (split_length <= 0) grade = 0; else grade = sqrt (split_length) * chop_split_dist_knob; return (MAX (0.0, grade)); }
PRIORITY tesseract::Wordrec::grade_width_change | ( | register BOUNDS_RECT | rect | ) |
Definition at line 191 of file gradechop.cpp.
void tesseract::Wordrec::improve_by_chopping | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
STATE * | best_state, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
DANGERR * | fixpt, | ||
bool * | updated_best_choice | ||
) |
Definition at line 741 of file chopper.cpp.
/*
 * Iteratively improves the word by chopping: each pass calls
 * improve_one_blob() and re-permutes the character choices; best_state
 * is reset to all-ones when the best rating improved, otherwise the new
 * chunk is inserted into it. The loop stops when no blob can be chopped,
 * when the dictionary accepts the best choice, or when char_choices
 * reaches MAX_NUM_CHUNKS.
 * NOTE(review): the rendered body writes *best_choice_acceptable while
 * the rendered signature names the out-parameter updated_best_choice
 * (which the body also declares as a local) — likely a documentation-
 * rendering mismatch with the real parameter name; confirm against
 * chopper.cpp before relying on either name.
 */
{ inT32 blob_number; float old_best; bool updated_best_choice = false; while (1) { // improvement loop old_best = word->best_choice->rating(); if (improve_one_blob(word, char_choices, &blob_number, &word->seam_array, fixpt, (fragments_guide_chopper && word->best_choice->fragment_mark()), word->blamer_bundle)) { getDict().LogNewSplit(blob_number); updated_best_choice = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (old_best > word->best_choice->rating()) { set_n_ones(best_state, char_choices->length() - 1); } else { insert_new_chunk(best_state, blob_number, char_choices->length() - 2); fixpt->clear(); } if (chop_debug) print_state("best state = ", best_state, count_blobs(word->chopped_word->blobs) - 1); } else { break; } // Check if we should break from the loop. bool done = false; bool replaced = false; if ((updated_best_choice && (*best_choice_acceptable = getDict().AcceptableChoice(char_choices, word->best_choice, fixpt, CHOPPER_CALLER, &replaced))) || char_choices->length() >= MAX_NUM_CHUNKS) { done = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices); if (done) break; } }
bool tesseract::Wordrec::improve_one_blob | ( | WERD_RES * | word_res, |
BLOB_CHOICE_LIST_VECTOR * | char_choices, | ||
inT32 * | blob_number, | ||
SEAMS * | seam_list, | ||
DANGERR * | fixpt, | ||
bool | split_next_to_fragment, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 332 of file chopper.cpp.
{
  // Picks one blob (from the dictionary fix points if available, otherwise
  // the worst-rated blob) and chops it in two, updating the seam list and
  // the per-blob choice lists. Returns false if no blob could be chopped.
  TWERD* word = word_res->chopped_word;
  TBLOB *blob;
  inT16 x = 0;
  float rating_ceiling = MAX_FLOAT32;  // only consider blobs rated below this
  BLOB_CHOICE_LIST *answer;
  BLOB_CHOICE_IT answer_it;
  SEAM *seam;

  do {
    // Prefer a split point suggested by the dictionary (fixpt).
    *blob_number = select_blob_to_split_from_fixpt(fixpt);
    bool split_point_from_dict = (*blob_number != -1);
    if (split_point_from_dict) {
      fixpt->clear();
    } else {
      *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
                                          split_next_to_fragment);
    }
    if (chop_debug)
      cprintf("blob_number = %d\n", *blob_number);
    if (*blob_number == -1)
      return false;  // nothing left to try

    // TODO(rays) it may eventually help to allow italic_blob to be true,
    seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
    if (seam != NULL)
      break;  /* Must split null blobs */

    answer = char_choices->get(*blob_number);
    if (answer == NULL)
      return false;
    answer_it.set_to_list(answer);
    if (!split_point_from_dict) {
      // We chopped the worst rated blob, try something else next time.
      rating_ceiling = answer_it.data()->rating();
    }
  } while (true);

  /* Split OK */
  // Walk to the blob that was split.
  for (blob = word->blobs; x < *blob_number; x++) {
    blob = blob->next;
  }
  *seam_list = insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);

  // Replace the old choice list with fresh classifications of the two halves.
  delete char_choices->get(*blob_number);

  answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
                         blamer_bundle);
  char_choices->insert(answer, *blob_number);

  answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
                         blamer_bundle);
  char_choices->set(answer, *blob_number + 1);
  return true;
}
void tesseract::Wordrec::InitBlamerForSegSearch | ( | const WERD_CHOICE * | best_choice, |
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BlamerBundle * | blamer_bundle, | ||
STRING * | blamer_debug | ||
) | [protected] |
Definition at line 331 of file segsearch.cpp.
{
  // Prepares the blamer for segmentation search: clears the pain-points heap
  // and seeds it with pain points covering every unclassified matrix cell of
  // the known-correct segmentation, so SegSearch is forced to evaluate it.
  blamer_bundle->segsearch_is_looking_for_blame = true;
  if (wordrec_debug_blamer) {
    tprintf("segsearch starting to look for blame\n");
  }

  // Clear pain points heap.
  int pop;
  float pain_point_priority;
  MATRIX_COORD *pain_point;
  while ((pop = HeapPop(pain_points, &pain_point_priority,
                        &pain_point)) != EMPTY) {
    delete pain_point;
  }

  // Fill pain points for any unclassified blob corresponding to the
  // correct segmentation state.
  *blamer_debug += "Correct segmentation:\n";
  for (int idx = 0;
       idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
    blamer_debug->add_str_int(
        "col=", blamer_bundle->correct_segmentation_cols[idx]);
    blamer_debug->add_str_int(
        " row=", blamer_bundle->correct_segmentation_rows[idx]);
    *blamer_debug += "\n";
    if (chunks_record->ratings->get(
        blamer_bundle->correct_segmentation_cols[idx],
        blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
      if (!language_model_->GeneratePainPoint(
          blamer_bundle->correct_segmentation_cols[idx],
          blamer_bundle->correct_segmentation_rows[idx],
          false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
          NULL, NULL, chunks_record, pain_points)) {
        // Could not insert a required pain point: give up on blaming the
        // segmentation search and record the reason.
        blamer_bundle->segsearch_is_looking_for_blame = false;
        *blamer_debug += "\nFailed to insert pain point\n";
        blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
                                wordrec_debug_blamer);
        break;
      }
    }
  }  // end for blamer_bundle->correct_segmentation_cols/rows
}
Definition at line 70 of file outlines.cpp.
{
  // Strict segment-intersection test for segments a0-a1 and b0-b1 using
  // cross-product sign checks: each segment's endpoints must lie strictly
  // on opposite sides of the other segment's line.
  int b0a1xb0b1, b0b1xb0a0;
  int a1b1xa1a0, a1a0xa1b0;
  TPOINT b0a1, b0a0, a1b1, b0b1, a1a0;

  // Difference vectors between the endpoints.
  b0a1.x = a1.x - b0.x;
  b0a0.x = a0.x - b0.x;
  a1b1.x = b1.x - a1.x;
  b0b1.x = b1.x - b0.x;
  a1a0.x = a0.x - a1.x;
  b0a1.y = a1.y - b0.y;
  b0a0.y = a0.y - b0.y;
  a1b1.y = b1.y - a1.y;
  b0b1.y = b1.y - b0.y;
  a1a0.y = a0.y - a1.y;

  b0a1xb0b1 = CROSS (b0a1, b0b1);
  b0b1xb0a0 = CROSS (b0b1, b0a0);
  a1b1xa1a0 = CROSS (a1b1, a1a0);
  // Since a1b0 == -(b0a1), CROSS(a1a0, a1b0) == -CROSS(a1a0, b0a1);
  // the negated form below avoids computing a1b0 explicitly.
  /*a1a0xa1b0=CROSS(a1a0,a1b0); */
  a1a0xa1b0 = -CROSS (a1a0, b0a1);

  // Both straddle tests must hold with strictly matching signs; collinear
  // (zero cross product) cases are treated as not crossed.
  return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) ||
          (b0a1xb0b1 < 0 && b0b1xb0a0 < 0))
    && ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) ||
        (a1b1xa1a0 < 0 && a1a0xa1b0 < 0));
}
Definition at line 123 of file chop.cpp.
{
  // Checks whether point1 and point2 delimit only a small chunk of outline:
  // walks at most chop_min_outline_points steps in each direction looking
  // for the other endpoint, and if found tests the enclosed area.
  EDGEPT *p = point1;           /* Iterator */
  int counter = 0;

  do {                          /* Go from P1 to P2 */
    if (is_same_edgept (point2, p)) {
      if (is_small_area (point1, point2))
        return (TRUE);
      else
        break;  // reachable but not small; try the other direction
    }
    p = p->next;
  }
  while ((p != point1) && (counter++ < chop_min_outline_points));

  /* Go from P2 to P1 */
  p = point2;
  counter = 0;
  do {
    if (is_same_edgept (point1, p)) {
      return (is_small_area (point2, point1));
    }
    p = p->next;
  }
  while ((p != point2) && (counter++ < chop_min_outline_points));

  return (FALSE);
}
Definition at line 104 of file outlines.cpp.
{
  // Two edge points are "the same" exactly when they are the same object.
  return p1 == p2;
}
BLOB_CHOICE_LIST * tesseract::Wordrec::join_blobs_and_classify | ( | WERD_RES * | word, |
int | x, | ||
int | y, | ||
int | choice_index, | ||
MATRIX * | ratings, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices | ||
) |
Definition at line 730 of file bestfirst.cpp.
{
  // Temporarily joins blobs x..y of the chopped word into one blob, copies
  // it into rebuild_word, and returns a choice list for it — reusing
  // old_choices or the ratings matrix when possible, classifying otherwise.
  // Join parts to make the blob if needed.
  if (x != y)
    join_pieces(word->chopped_word->blobs, word->seam_array, x, y);
  TBLOB *blob = word->chopped_word->blobs;
  for (int i = 0; i < x; i++) {
    blob = blob->next;
  }
  // Deep copy this blob into the output word (prepended to rebuild_word).
  TBLOB* copy_blob = new TBLOB(*blob);
  copy_blob->next = word->rebuild_word->blobs;
  word->rebuild_word->blobs = copy_blob;

  BLOB_CHOICE_LIST *choices = NULL;
  // First check to see if we can look up the classification
  // in old_choices (if there is no need to merge blobs).
  if (choice_index >= 0 && old_choices != NULL) {
    choices = old_choices->get(choice_index);
    old_choices->set(NULL, choice_index);  // take ownership of the list
  }
  // The ratings matrix filled in by the associator will contain the next most
  // up-to-date classification info. Thus we look up the classification there
  // next, and only call classify_blob() if the classification is not found.
  if (choices == NULL && ratings != NULL) {
    choices = ratings->get(x, y);
    if (choices != NOT_CLASSIFIED) {
      ratings->put(x, y, NULL);  // take ownership out of the matrix
    }
  }
  // Get the choices for the blob by classification if necessary.
  if (choices == NULL) {
    choices = classify_blob(blob, word->denorm, "rebuild", Orange,
                            word->blamer_bundle);
  }
  // Undo join_pieces to restore the chopped word to its fully chopped state.
  if (x != y)
    break_pieces(blob, word->seam_array, x, y);
  return choices;
}
void tesseract::Wordrec::junk_worst_seam | ( | SEAM_QUEUE | seams, |
SEAM * | new_seam, | ||
float | new_priority | ||
) |
Definition at line 148 of file findseam.cpp.
{
  // The queue is full: pop the worst seam and keep whichever of it and the
  // new seam has the lower (better) priority; the loser is freed.
  float worst_priority;
  SEAM *worst_seam;

  HeapPopWorst(seams, &worst_priority, &worst_seam);
  if (new_priority < worst_priority) {
    // New seam is better: replace the popped one.
    delete_seam(worst_seam);
    HeapPush (seams, new_priority, (char *) new_seam);
  } else {
    // Popped seam is at least as good: put it back and drop the new one.
    delete_seam(new_seam);
    HeapPush (seams, worst_priority, (char *) worst_seam);
  }
}
void tesseract::Wordrec::merge_and_put_fragment_lists | ( | inT16 | row, |
inT16 | column, | ||
inT16 | num_frag_parts, | ||
BLOB_CHOICE_LIST * | choice_lists, | ||
MATRIX * | ratings | ||
) |
Definition at line 169 of file pieces.cpp.
{
  // Merge-walks num_frag_parts sorted fragment choice lists in parallel;
  // whenever all iterators point at fragments of the same unichar, a merged
  // BLOB_CHOICE is built and appended to the (row, column) ratings entry.
  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
  for (int i = 0; i < num_frag_parts; i++) {
    choice_lists_it[i].set_to_list(&choice_lists[i]);
    choice_lists_it[i].mark_cycle_pt();
  }

  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
  if (merged_choice == NULL)
    merged_choice = new BLOB_CHOICE_LIST;

  bool end_of_list = false;
  BLOB_CHOICE_IT merged_choice_it(merged_choice);
  while (!end_of_list) {
    // Find the maximum unichar_id of the current entry the iterators
    // are pointing at.
    UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
    int max_list = 0;
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (max_unichar_id < unichar_id) {
        max_unichar_id = unichar_id;
        max_list = i;
      }
    }

    // Move each iterator until it gets to an entry that has a
    // value greater than or equal to max_unichar_id.
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      while (!choice_lists_it[i].cycled_list() &&
             unichar_id < max_unichar_id) {
        choice_lists_it[i].forward();
        unichar_id = choice_lists_it[i].data()->unichar_id();
      }
      if (choice_lists_it[i].cycled_list()) {
        end_of_list = true;  // one list is exhausted: stop merging
        break;
      }
    }
    if (end_of_list) break;

    // Checks if the fragments are parts of the same character.
    UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
    bool same_unichar = true;
    for (int i = 1; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (unichar_id != first_unichar_id) {
        same_unichar = false;
        break;
      }
    }

    if (same_unichar) {
      // Add the merged character to the result: sum the ratings, take the
      // worst certainty, intersect the xheight ranges, and copy the other
      // attributes from the first fragment.
      UNICHAR_ID merged_unichar_id = first_unichar_id;
      inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
      inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
      inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight();
      inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight();
      int merged_script_id = choice_lists_it[0].data()->script_id();
      bool merged_adapted = choice_lists_it[0].data()->adapted();
      float merged_rating = 0, merged_certainty = 0;
      for (int i = 0; i < num_frag_parts; i++) {
        float rating = choice_lists_it[i].data()->rating();
        float certainty = choice_lists_it[i].data()->certainty();
        if (i == 0 || certainty < merged_certainty)
          merged_certainty = certainty;
        merged_rating += rating;
        choice_lists_it[i].forward();
        if (choice_lists_it[i].cycled_list())
          end_of_list = true;
        IntersectRange(choice_lists_it[i].data()->min_xheight(),
                       choice_lists_it[i].data()->max_xheight(),
                       &merged_min_xheight, &merged_max_xheight);
      }
      merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
                                                  merged_rating,
                                                  merged_certainty,
                                                  merged_fontinfo_id,
                                                  merged_fontinfo_id2,
                                                  merged_script_id,
                                                  merged_min_xheight,
                                                  merged_max_xheight,
                                                  merged_adapted));
    }
  }

  if (classify_debug_level)
    print_ratings_list("Merged Fragments", merged_choice, unicharset);

  // Only store a non-empty merged list in the ratings matrix.
  if (merged_choice->empty())
    delete merged_choice;
  else
    ratings->put(row, column, merged_choice);

  delete [] choice_lists_it;
}
Definition at line 324 of file pieces.cpp.
{
  // Merges character-fragment classifications in the ratings matrix into
  // whole-character entries, then strips all remaining fragment choices.
  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
  for (inT16 start = 0; start < num_blobs; start++) {
    for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
         frag_parts++) {
      // Collect and merge fragment runs of length frag_parts beginning
      // at blob index start.
      get_fragment_lists(0, start, start, frag_parts, num_blobs,
                         ratings, choice_lists);
    }
  }

  // Delete fragments from the rating matrix.
  for (inT16 x = 0; x < num_blobs; x++) {
    for (inT16 y = x; y < num_blobs; y++) {
      BLOB_CHOICE_LIST *choices = ratings->get(x, y);
      if (choices != NULL) {
        BLOB_CHOICE_IT choices_it(choices);
        for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
             choices_it.forward()) {
          UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
          const CHAR_FRAGMENT *frag =
              unicharset.get_fragment(choice_unichar_id);
          if (frag != NULL)
            delete choices_it.extract();  // remove fragment choice
        }
      }
    }
  }
}
void tesseract::Wordrec::modify_blob_choice | ( | BLOB_CHOICE_LIST * | answer, |
int | chop_index | ||
) |
Definition at line 403 of file chopper.cpp.
{
  // Replaces the whole choice list with a single synthetic choice whose
  // unichar encodes chop_index (digits 0-9, then 'A'+ for 10 and up), but
  // keeps the rating/certainty/etc. of the original top choice.
  // NOTE(review): the 2-byte buffer assumes chop_index >= 0; a negative
  // index would be truncated by snprintf — presumably guarded by callers.
  char chop_index_string[2];
  if (chop_index <= 9) {
    snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
  } else {
    chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
    chop_index_string[1] = '\0';
  }
  UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
  if (unichar_id == INVALID_UNICHAR_ID) {
    // If the word is very long, we might exhaust the possibilities.
    unichar_id = 1;
  }
  BLOB_CHOICE_IT answer_it(answer);
  // Clone the attributes of the current best choice under the new unichar.
  BLOB_CHOICE *modified_blob =
      new BLOB_CHOICE(unichar_id,
                      answer_it.data()->rating(),
                      answer_it.data()->certainty(),
                      answer_it.data()->fontinfo_id(),
                      answer_it.data()->fontinfo_id2(),
                      answer_it.data()->script_id(),
                      answer_it.data()->min_xheight(),
                      answer_it.data()->max_xheight(),
                      answer_it.data()->adapted());
  answer->clear();
  answer_it.set_to_list(answer);
  answer_it.add_after_then_move(modified_blob);
}
bool tesseract::Wordrec::near_point | ( | EDGEPT * | point, |
EDGEPT * | line_pt_0, | ||
EDGEPT * | line_pt_1, | ||
EDGEPT ** | near_pt | ||
) |
Definition at line 116 of file outlines.cpp.
{
  // Projects point perpendicularly onto the line through line_pt_0 and
  // line_pt_1. If the foot of the perpendicular falls strictly inside the
  // segment, a new edge point is created there (*near_pt) and true is
  // returned; otherwise *near_pt is set to the closer existing endpoint
  // and false is returned.
  TPOINT p;
  float slope;
  float intercept;

  float x0 = line_pt_0->pos.x;
  float x1 = line_pt_1->pos.x;
  float y0 = line_pt_0->pos.y;
  float y1 = line_pt_1->pos.y;

  if (x0 == x1) {
    /* Handle vertical line */
    p.x = (inT16) x0;
    p.y = point->pos.y;
  } else {
    /* Slope and intercept */
    slope = (y0 - y1) / (x0 - x1);
    intercept = y1 - x1 * slope;

    /* Find perpendicular */
    p.x = (inT16) ((point->pos.x + (point->pos.y - intercept) * slope) /
                   (slope * slope + 1));
    p.y = (inT16) (slope * p.x + intercept);
  }

  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
      (!same_point (p, line_pt_0->pos)) &&
      (!same_point (p, line_pt_1->pos))) {
    /* Intersection on line */
    *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
    return true;
  } else {
    /* Intersection not on line */
    *near_pt = closest(point, line_pt_0, line_pt_1);
    return false;
  }
}
void tesseract::Wordrec::new_max_point | ( | EDGEPT * | local_max, |
POINT_GROUP | points | ||
) |
Definition at line 303 of file chop.cpp.
{
  // Accept this local maximum as a candidate chop point when the curvature
  // is convex (dir > 0), or flat (dir == 0) with a negative point priority.
  // point_priority() is only evaluated in the flat case, as before.
  inT16 dir = direction (local_max);

  if (dir > 0 || (dir == 0 && point_priority (local_max) < 0))
    add_point_to_list(points, local_max);
}
void tesseract::Wordrec::new_min_point | ( | EDGEPT * | local_min, |
POINT_GROUP | points | ||
) |
Definition at line 279 of file chop.cpp.
{
  // Accept this local minimum as a candidate chop point when the curvature
  // is concave (dir < 0), or flat (dir == 0) with a negative point priority.
  // point_priority() is only evaluated in the flat case, as before.
  inT16 dir = direction (local_min);

  if (dir < 0 || (dir == 0 && point_priority (local_min) < 0))
    add_point_to_list(points, local_min);
}
SEARCH_RECORD * tesseract::Wordrec::new_search | ( | CHUNKS_RECORD * | chunks_record, |
int | num_joints, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_CHOICE * | best_choice, | ||
WERD_CHOICE * | raw_choice, | ||
STATE * | state | ||
) |
Definition at line 568 of file bestfirst.cpp.
{
  // Allocates and initializes a SEARCH_RECORD for best-first segmentation
  // search, seeding this/first/best states from the supplied initial state.
  SEARCH_RECORD *this_search;

  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));

  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
  this_search->closed_states = new_hash_table();

  if (state)
    this_search->this_state = new_state (state);
  else
    // NOTE(review): when state is NULL, this_state is left uninitialized yet
    // is copied into first_state/best_state below — callers presumably never
    // pass NULL; confirm.
    cprintf ("error: bad initial state in new_search\n");

  this_search->first_state = new_state (this_search->this_state);
  this_search->best_state = new_state (this_search->this_state);

  this_search->best_choice = best_choice;
  this_search->raw_choice = raw_choice;
  this_search->best_char_choices = best_char_choices;

  this_search->num_joints = num_joints;
  this_search->num_states = 0;
  this_search->before_best = 0;
  this_search->segcost_bias = 0;

  return (this_search);
}
EDGEPT * tesseract::Wordrec::pick_close_point | ( | EDGEPT * | critical_point, |
EDGEPT * | vertical_point, | ||
int * | best_dist | ||
) |
Definition at line 182 of file chop.cpp.
{
  // Creeps along the outline from vertical_point looking for the closest
  // acceptable point to critical_point within *best_dist; updates
  // *best_dist and returns the chosen point (NULL if none qualifies).
  EDGEPT *best_point = NULL;
  int this_distance;
  int found_better;

  do {
    found_better = FALSE;

    this_distance = edgept_dist (critical_point, vertical_point);
    if (this_distance <= *best_dist) {
      // Reject degenerate pairs (coincident points) and exterior points.
      if (!(same_point (critical_point->pos, vertical_point->pos) ||
            same_point (critical_point->pos, vertical_point->next->pos) ||
            (best_point && same_point (best_point->pos,
                                       vertical_point->pos)) ||
            is_exterior_point (critical_point, vertical_point))) {
        *best_dist = this_distance;
        best_point = vertical_point;
        // Only keep creeping if vertical creep is enabled.
        if (chop_vertical_creep)
          found_better = TRUE;
      }
    }
    vertical_point = vertical_point->next;
  }
  while (found_better == TRUE);

  return (best_point);
}
Definition at line 380 of file findseam.cpp.
{
  // Tries to find a good seam (split) for this blob: collects the highest
  // priority candidate points, tries point pairs and vertical splits, and
  // falls back to a full best-seam search. Returns NULL if the best seam
  // found is worse than chop_ok_split.
  SEAM_QUEUE seam_queue;
  SEAM_PILE seam_pile;
  POINT_GROUP point_heap;
  PRIORITY priority;
  EDGEPT *edge;
  EDGEPT *points[MAX_NUM_POINTS];
  EDGEPT_CLIST new_points;
  SEAM *seam = NULL;
  TESSLINE *outline;
  inT16 num_points = 0;

#ifndef GRAPHICS_DISABLED
  if (chop_debug > 2)
    wordrec_display_splits.set_value(true);

  draw_blob_edges(blob);
#endif

  // Gather candidate split points from every outline into a priority heap,
  // then drain the best MAX_NUM_POINTS of them into the points array.
  point_heap = MakeHeap (MAX_NUM_POINTS);
  for (outline = blob->outlines; outline; outline = outline->next)
    prioritize_points(outline, point_heap);

  while (HeapPop (point_heap, &priority, &edge) == TESS_HEAP_OK) {
    if (num_points < MAX_NUM_POINTS)
      points[num_points++] = (EDGEPT *) edge;
  }
  FreeHeap(point_heap);

  /* Initialize queue & pile */
  create_seam_pile(seam_pile);
  create_seam_queue(seam_queue);

  try_point_pairs(points, num_points, seam_queue, &seam_pile, &seam, blob);

  try_vertical_splits(points, num_points, &new_points,
                      seam_queue, &seam_pile, &seam, blob);

  // If nothing good yet (or only a mediocre seam), run the full search.
  if (seam == NULL) {
    choose_best_seam(seam_queue, &seam_pile, NULL, BAD_PRIORITY, &seam, blob);
  } else if (seam->priority > chop_good_split) {
    choose_best_seam (seam_queue, &seam_pile, NULL, seam->priority,
                      &seam, blob);
  }

  // Remove vertical-split points that the chosen seam does not use.
  EDGEPT_C_IT it(&new_points);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    EDGEPT *inserted_point = it.data();
    if (!point_used_by_seam(seam, inserted_point)) {
      remove_edgept(inserted_point);
    }
  }

  delete_seam_queue(seam_queue);
  delete_seam_pile(seam_pile);

  if (seam) {
    // Reject seams worse than the acceptance threshold.
    if (seam->priority > chop_ok_split) {
      delete_seam(seam);
      seam = NULL;
    }
#ifndef GRAPHICS_DISABLED
    else if (wordrec_display_splits) {
      if (seam->split1)
        mark_split (seam->split1);
      if (seam->split2)
        mark_split (seam->split2);
      if (seam->split3)
        mark_split (seam->split3);
      if (chop_debug > 2) {
        update_edge_window();
        edge_window_wait();
      }
    }
#endif
  }

  if (chop_debug)
    wordrec_display_splits.set_value(false);

  return (seam);
}
Definition at line 607 of file bestfirst.cpp.
{
  // Returns the best (top) state from the open-states heap, or NULL when
  // the heap is empty. Optionally traces the popped state.
  HEAPENTRY entry;

  if (GetTopOfHeap (queue, &entry) != TESS_HEAP_OK)
    return (NULL);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    cprintf ("eval state: %8.3f ", entry.Key);
    print_state ("", (STATE *) entry.Data, num_joints);
  }
#endif
  return ((STATE *) entry.Data);
}
void tesseract::Wordrec::prioritize_points | ( | TESSLINE * | outline, |
POINT_GROUP | points | ||
) |
Definition at line 220 of file chop.cpp.
{
  // Scans one outline loop tracking runs of rising/falling y to detect local
  // minima and maxima, handing each candidate to new_min_point/new_max_point
  // (or directly to the list for inside angles).
  EDGEPT *this_point;
  EDGEPT *local_min = NULL;
  EDGEPT *local_max = NULL;

  this_point = outline->loop;
  local_min = this_point;
  local_max = this_point;
  do {
    if (this_point->vec.y < 0) {
      /* Look for minima */
      if (local_max != NULL)
        new_max_point(local_max, points);
      else if (is_inside_angle (this_point))
        add_point_to_list(points, this_point);
      local_max = NULL;
      local_min = this_point->next;
    } else if (this_point->vec.y > 0) {
      /* Look for maxima */
      if (local_min != NULL)
        new_min_point(local_min, points);
      else if (is_inside_angle (this_point))
        add_point_to_list(points, this_point);
      local_min = NULL;
      local_max = this_point->next;
    } else {
      /* Flat area */
      // NOTE(review): in this branch local_min is dereferenced when
      // local_max == NULL; local_min appears never NULL here because both
      // start non-NULL and each reset of one assigns the other — confirm.
      if (local_max != NULL) {
        if (local_max->prev->vec.y != 0) {
          new_max_point(local_max, points);
        }
        local_max = this_point->next;
        local_min = NULL;
      } else {
        if (local_min->prev->vec.y != 0) {
          new_min_point(local_min, points);
        }
        local_min = this_point->next;
        local_max = NULL;
      }
    }

    /* Next point */
    this_point = this_point->next;
  }
  while (this_point != outline->loop);
}
FLOAT32 tesseract::Wordrec::prioritize_state | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_RECORD * | the_search | ||
) |
Definition at line 289 of file heuristic.cpp.
{
  // Computes the search priority of a segmentation state by combining
  // rating (shape), width and seam-cut heuristics; optionally computes a
  // fixed-pitch segmentation bias stored in the_search->segcost_bias.
  FLOAT32 shape_cost;
  FLOAT32 width_cost;
  FLOAT32 seam_cost;

  shape_cost = rating_priority(chunks_record,
                               the_search->this_state,
                               the_search->num_joints);

  width_cost = width_priority(chunks_record,
                              the_search->this_state,
                              the_search->num_joints);

  // The rating_priority is the same as the original, and the width_priority
  // is the same as before if assume_fixed_pitch_char_segment == FALSE.
  // So this would return the original state priority.
  if (!use_new_state_cost)
    return width_cost * 1000 + shape_cost;

  seam_cost = seamcut_priority(chunks_record->splits,
                               the_search->this_state,
                               the_search->num_joints);

  // TODO(dsl): how do we normalize the scores for these separate evidence?
  // FLOAT32 total_cost = shape_cost + width_cost * 0.01 + seam_cost * 0.001;
  FLOAT32 total_cost = shape_cost * heuristic_weight_rating +
                       width_cost * heuristic_weight_width +
                       seam_cost * heuristic_weight_seamcut;

  // We don't have an adjustment model for variable pitch segmentation cost
  // into word rating
  if (assume_fixed_pitch_char_segment) {
    // Each heuristic above its comfort range multiplies the bias.
    float seg_bias = 1.0;
    if (width_cost < 1) seg_bias *= 0.85;
    if (width_cost > 3)
      seg_bias *= pow(heuristic_segcost_rating_base, width_cost/3.0);
    if (seam_cost > 10)
      seg_bias *= pow(heuristic_segcost_rating_base, log(seam_cost)/log(10.0));
    if (shape_cost > 5)
      seg_bias *= pow(heuristic_segcost_rating_base, shape_cost/5.0);
    if (segment_adjust_debug) {
      tprintf("SegCost: %g Weight: %g rating: %g width: %g seam: %g\n",
              total_cost, seg_bias, shape_cost, width_cost, seam_cost);
    }
    the_search->segcost_bias = seg_bias;
  } else {
    the_search->segcost_bias = 0;
  }

  return total_cost;
}
void tesseract::Wordrec::ProcessSegSearchPainPoint | ( | float | pain_point_priority, |
const MATRIX_COORD & | pain_point, | ||
const WERD_CHOICE * | best_choice, | ||
SEG_SEARCH_PENDING_LIST * | pending[], | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BlamerBundle * | blamer_bundle | ||
) | [protected] |
Definition at line 257 of file segsearch.cpp.
{
  // Classifies the blob span given by a popped pain point, stores the result
  // in the ratings matrix, seeds neighbor pain points, and records pending
  // entries so the language model revisits the updated column.
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
            pain_point_priority, pain_point.col, pain_point.row);
  }
  MATRIX *ratings = chunks_record->ratings;
  BLOB_CHOICE_LIST *classified = classify_piece(
      chunks_record->chunks, chunks_record->word_res->denorm,
      chunks_record->splits, pain_point.col, pain_point.row, blamer_bundle);
  ratings->put(pain_point.col, pain_point.row, classified);

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (!classified->empty()) {
    float worst_piece_cert;
    bool fragmented;
    if (pain_point.col > 0) {
      // Left neighbor.
      language_model_->GetWorstPieceCertainty(
          pain_point.col-1, pain_point.row, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col-1, pain_point.row, false,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
    if (pain_point.row+1 < ratings->dimension()) {
      // Right neighbor.
      language_model_->GetWorstPieceCertainty(
          pain_point.col, pain_point.row+1, chunks_record->ratings,
          &worst_piece_cert, &fragmented);
      language_model_->GeneratePainPoint(
          pain_point.col, pain_point.row+1, true,
          LanguageModel::kInitialPainPointPriorityAdjustment,
          worst_piece_cert, fragmented, best_choice->certainty(),
          segsearch_max_char_wh_ratio, NULL, NULL,
          chunks_record, pain_points);
    }
  }

  // Record a pending entry with the pain_point and each of its parents.
  // NOTE(review): parent_row is derived from pain_point.col — this matches
  // the matrix convention where a parent ends at column-1; confirm against
  // the MATRIX layout.
  int parent_row = pain_point.col - 1;
  if (parent_row < 0) {  // this node has no parents
    (*pending)[pain_point.col].add_sorted(
        SEG_SEARCH_PENDING::compare, true,
        new SEG_SEARCH_PENDING(pain_point.row, NULL,
                               LanguageModel::kAllChangedFlag));
  } else {
    for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
      if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
        (*pending)[pain_point.col].add_sorted(
            SEG_SEARCH_PENDING::compare, true,
            new SEG_SEARCH_PENDING(pain_point.row,
                                   ratings->get(parent_col, parent_row),
                                   LanguageModel::kAllChangedFlag));
      }
    }
  }
}
void tesseract::Wordrec::program_editdown | ( | inT32 | elasped_time | ) |
Definition at line 80 of file tface.cpp.
{
  // Shuts down word recognition: closes the adaptive classifier and the
  // blob match table, then resets and ends the dictionary.
  // NOTE(review): the elapsed-time parameter is not used in this body.
  EndAdaptiveClassifier();
  blob_match_table.end_match_table();
  getDict().InitChoiceAccum();
  getDict().End();
}
void tesseract::Wordrec::program_editup | ( | const char * | textbase, |
bool | init_classifier, | ||
bool | init_permute | ||
) |
Definition at line 50 of file tface.cpp.
{
  // Initializes word recognition: feature definitions, extractors, the
  // adaptive classifier, and (optionally) the dictionary; then snapshots
  // the pass-2 chopper/search parameters.
  if (textbase != NULL)
    imagefile = textbase;
  InitFeatureDefs(&feature_defs_);
  SetupExtractors(&feature_defs_);
  InitAdaptiveClassifier(init_classifier);
  // NOTE(review): the documented parameter name is init_permute but the
  // body tests init_dict — presumably the declared name; confirm.
  if (init_dict) getDict().Load();
  pass2_ok_split = chop_ok_split;
  pass2_seg_states = wordrec_num_seg_states;
}
void tesseract::Wordrec::push_queue | ( | HEAP * | queue, |
STATE * | state, | ||
FLOAT32 | worst_priority, | ||
FLOAT32 | priority, | ||
bool | debug | ||
) |
Definition at line 629 of file bestfirst.cpp.
{
  // Pushes a copy of state onto the open-states heap, but only if its
  // priority beats worst_priority and the heap has room.
  HEAPENTRY entry;

  if (!(priority < worst_priority))
    return;  // not promising enough to enqueue

  if (SizeOfHeap (queue) >= MaxSizeOfHeap(queue)) {
    if (debug) tprintf("Heap is Full\n");
    return;
  }

  entry.Data = (char *) new_state (state);
  num_pushed++;
  entry.Key = priority;
  HeapStore(queue, &entry);
}
FLOAT32 tesseract::Wordrec::rating_priority | ( | CHUNKS_RECORD * | chunks_record, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 175 of file heuristic.cpp.
{
  // Heuristic shape cost of a segmentation state: sum of the top-choice
  // ratings of each piece divided by the summed chunk weights (lower is
  // better). Unclassified/empty pieces contribute nothing.
  BLOB_CHOICE_LIST *blob_choices;
  BLOB_CHOICE_IT blob_choice_it;
  inT16 first_chunk = 0;
  inT16 last_chunk;
  inT16 ratings = 0;
  inT16 weights = 0;
  PIECES_STATE blob_chunks;

  // Decode the joint bit-vector into per-piece chunk counts.
  bin_to_pieces(state, num_joints, blob_chunks);

  for (int x = 0; blob_chunks[x]; x++) {
    last_chunk = first_chunk + blob_chunks[x];

    blob_choices = chunks_record->ratings->get(first_chunk, last_chunk - 1);
    if (blob_choices != NOT_CLASSIFIED && blob_choices->length() > 0) {
      blob_choice_it.set_to_list(blob_choices);
      // Top (first) choice rating for this piece.
      ratings += (inT16) blob_choice_it.data()->rating();
      for (int y = first_chunk; y < last_chunk; y++) {
        weights += (inT16) (chunks_record->weights[y]);
      }
    }
    first_chunk = last_chunk;
  }
  if (weights <= 0)
    weights = 1;  // guard against division by zero
  FLOAT32 rating_cost = static_cast<FLOAT32>(ratings) /
                        static_cast<FLOAT32>(weights);
  // BUG FIX: ratings and weights are integers, but the format uses %f;
  // passing an int where a double is expected is undefined behavior and
  // printed garbage. Cast them explicitly so %f receives doubles.
  if (segment_adjust_debug > 2)
    tprintf("rating_cost: r%f / w%f = %f\n",
            static_cast<float>(ratings), static_cast<float>(weights),
            rating_cost);
  return rating_cost;
}
BLOB_CHOICE_LIST_VECTOR * tesseract::Wordrec::rebuild_current_state | ( | WERD_RES * | word, |
STATE * | state, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices, | ||
MATRIX * | ratings | ||
) |
rebuild_current_state
Transfers the given state to the word's output fields: rebuild_word, best_state, box_word, and returns the corresponding blob choices.
Definition at line 332 of file bestfirst.cpp.
{
  // Rebuilds the word (rebuild_word, best_state) from a search STATE and
  // returns the per-character choice lists, re-joining and re-classifying
  // blobs as needed and specially handling character fragments.
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed form original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list of for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      // Extract the unichar text of the character being reassembled.
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward());
      choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);
  return char_choices;
}
BLOB_CHOICE * tesseract::Wordrec::rebuild_fragments | ( | const char * | unichar, |
const char * | expanded_fragment_lengths, | ||
int | choice_index, | ||
BLOB_CHOICE_LIST_VECTOR * | old_choices | ||
) |
Definition at line 680 of file bestfirst.cpp.
{
  // Reassembles a character from its fragment choices: finds the matching
  // fragment in each chunk's choice list, sums ratings, takes the worst
  // certainty and the intersected xheight range, and returns a new
  // BLOB_CHOICE for the whole character.
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      // Dump diagnostics before asserting below.
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}
Definition at line 393 of file pieces.cpp.
Definition at line 414 of file pieces.cpp.
{
  // Builds a ratings MATRIX for all contiguous blob pieces, filling in any
  // classifications already cached in the blob match table by bounding box.
  inT16 num_blobs = count_blobs(blobs);
  TBOX *bounds = record_blob_bounds(blobs);
  MATRIX *ratings = new MATRIX(num_blobs);

  for (int x = 0; x < num_blobs; x++) {
    for (int y = x; y < num_blobs; y++) {
      // Look up a cached classification for the piece spanning blobs x..y.
      TBOX piecebox = bounds_of_piece(bounds, x, y);
      BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox);
      if (choices != NULL) {
        ratings->put(x, y, choices);
      }
    }
  }
  if (merge_fragments_in_matrix)
    merge_fragments(ratings, num_blobs);
  delete []bounds;
  return ratings;
}
void tesseract::Wordrec::replace_char_widths | ( | CHUNKS_RECORD * | chunks_record, |
SEARCH_STATE | state | ||
) |
Definition at line 651 of file bestfirst.cpp.
{
  // Replace chunks_record->char_widths with a WIDTH_RECORD built from the
  // per-character widths and gaps recorded in last_segmentation.
  WIDTH_RECORD *width_record;
  int num_blobs;
  int i;

  free_widths (chunks_record->char_widths);
  num_blobs = state[0] + 1;  // state[0] is the number of segmentation joints
  // Packed int layout: { num_chars, w0, g0, w1, g1, ..., w(n-1) },
  // i.e. 2*num_blobs ints in total (the last char has no trailing gap).
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;
  for (i = 0; i < num_blobs; i++) {
    width_record->widths[2 * i] = last_segmentation[i].width;
    // No gap entry after the final character.
    if (i + 1 < num_blobs)
      width_record->widths[2 * i + 1] = last_segmentation[i].gap;
  }
  chunks_record->char_widths = width_record;
}
void tesseract::Wordrec::reverse_outline | ( | EDGEPT * | outline | ) |
Definition at line 164 of file outlines.cpp.
{
  // Reverse the direction of a closed outline in place by exchanging the
  // next/prev links of every edge point, and recompute each point's
  // direction vector so it again points at its (new) successor.
  EDGEPT *pt = outline;
  do {
    /* Exchange the two links of this point. */
    EDGEPT *saved_prev = pt->prev;
    pt->prev = pt->next;
    pt->next = saved_prev;

    /* The vec field always points toward the successor. */
    pt->vec.x = pt->next->pos.x - pt->pos.x;
    pt->vec.y = pt->next->pos.y - pt->pos.y;

    /* prev now holds the old next, so this advances in the original
       traversal order until we come back around to the start. */
    pt = pt->prev;
  } while (pt != outline);
}
Definition at line 173 of file wordrec.cpp.
{
  // Convert each viable choice produced by the search into an alternative
  // WERD_CHOICE on the word, recording alongside it the per-character chunk
  // counts (the segmentation state) that produced that choice.
  // Precondition: the word holds no alternatives yet.
  ASSERT_HOST(word->alt_choices.empty());
  ASSERT_HOST(word->alt_states.empty());
  LIST list_it;
  iterate_list(list_it, best_choices) {
    VIABLE_CHOICE choice =
        reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
    CHAR_CHOICE *char_choice = &(choice->Blob[0]);
    WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
    // Push the state vector first so alt_state can be filled in place.
    word->alt_states.push_back(GenericVector<int>(choice->Length));
    GenericVector<int> &alt_state = word->alt_states.back();
    for (int i = 0; i < choice->Length; char_choice++, i++) {
      alt_choice->append_unichar_id_space_allocated(
          char_choice->Class, 1, 0, 0);
      alt_state.push_back(char_choice->NumChunks);
    }
    alt_choice->set_rating(choice->Rating);
    alt_choice->set_certainty(choice->Certainty);
    word->alt_choices.push_back(alt_choice);
    if (wordrec_debug_level > 0) {
      tprintf("SaveAltChoices: %s %g\n",
              alt_choice->unichar_string().string(), alt_choice->rating());
    }
  }
}
Definition at line 469 of file findseam.cpp.
{
  // Grade a seam: its stored priority plus the full-split priority of
  // split1 within [xmin, xmax].  When split2/split3 exist they are
  // temporarily applied to the outlines so split1 is graded in their
  // context, then undone in reverse order.
  PRIORITY priority;

  if (seam->split1 == NULL)
    // A seam with no splits costs nothing.
    priority = 0;

  else if (seam->split2 == NULL) {
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
  }

  else if (seam->split3 == NULL) {
    // Apply split2 while grading split1, then undo it.
    split_outline (seam->split2->point1, seam->split2->point2);
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
    unsplit_outlines (seam->split2->point1, seam->split2->point2);
  }

  else {
    // Apply split2 and split3 while grading split1; undo in reverse order.
    split_outline (seam->split2->point1, seam->split2->point2);
    split_outline (seam->split3->point1, seam->split3->point2);
    priority = (seam->priority +
                full_split_priority (seam->split1, xmin, xmax));
    unsplit_outlines (seam->split3->point1, seam->split3->point2);
    unsplit_outlines (seam->split2->point1, seam->split2->point2);
  }

  return (priority);
}
Definition at line 142 of file heuristic.cpp.
{
  // Sum the priorities of the seams that are "on" in the given search
  // state.  State bits are packed into two 32-bit words: joints >= 32 live
  // in state->part1, the lower 32 in state->part2.
  int x;
  // Start the mask at the bit for the highest-numbered joint within its
  // 32-bit word.  NOTE(review): for num_joints > 64 the shift would exceed
  // the bit width (undefined behavior) — presumably num_joints <= 64 is an
  // invariant upstream; confirm against callers.
  unsigned int mask = (num_joints > 32) ? (1 << (num_joints - 1 - 32))
                                        : (1 << (num_joints - 1));
  float seam_cost = 0.0f;
  for (x = num_joints - 1; x >= 0; x--) {
    int i = num_joints - 1 - x;
    // Joints 0..31 are read from part2, higher joints from part1.
    uinT32 value = (x < 32) ? state->part2 : state->part1;
    bool state_on = value & mask;
    if (state_on) {
      SEAM* seam = (SEAM *) array_value(seams, i);
      seam_cost += seam->priority;
    }
    // When the mask reaches bit 0, wrap to bit 31 for the other word.
    if (mask == 1)
      mask = 1 << 31;
    else
      mask >>= 1;
  }
  if (segment_adjust_debug > 2)
    tprintf("seam_cost: %f\n", seam_cost);
  return seam_cost;
}
void tesseract::Wordrec::SegSearch | ( | CHUNKS_RECORD * | chunks_record, |
WERD_CHOICE * | best_choice, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
WERD_CHOICE * | raw_choice, | ||
STATE * | output_best_state, | ||
BlamerBundle * | blamer_bundle | ||
) |
Definition at line 35 of file segsearch.cpp.
{
  // Run the segmentation search over the ratings matrix: repeatedly pop
  // "pain points" (unclassified cells the language model wants evaluated),
  // classify them, and propagate language-model state until an acceptable
  // choice is found or too many classifications prove futile.
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // TODO(antonova): Due to the fact that we currently do not re-start the
  // segmentation search from the best choice the chopper found, sometimes
  // the segmentation search does not find the best path (that chopper
  // did discover) and does not have a chance to adapt to it. As soon as we
  // transition to using new-style language model penalties in the chopper
  // this issue will be resolved. But for now we are forced to clear the
  // accumulator choices.
  //
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;

  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               rating_cert_scale,
                               pain_points, chunks_record,
                               blamer_bundle, wordrec_debug_blamer);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column,
  // since this guarantees that all the parents would be up to date before
  // an update of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle, blamer_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (!SegSearchDone(num_futile_classifications) ||
         (blamer_bundle != NULL &&
          blamer_bundle->segsearch_is_looking_for_blame)) {
    // Get the next valid "pain point": skip (and free) entries whose
    // coordinates became invalid or that were classified in the meantime.
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
                              best_choice_bundle.best_choice, &pending,
                              chunks_record, pain_points, blamer_bundle);

    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle.updated = false;  // reset updated
    delete pain_point;  // done using this pain point

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
        blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
        !blamer_bundle->segsearch_is_looking_for_blame &&
        blamer_bundle->truth_has_char_boxes &&
        !ChoiceIsCorrect(getDict().getUnicharset(),
                         best_choice, blamer_bundle->truth_text)) {
      InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
                             pain_points, blamer_bundle, &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
                           blamer_bundle, &blamer_debug);

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
bool tesseract::Wordrec::SegSearchDone | ( | int | num_futile_classifications | ) | [inline, protected] |
Definition at line 520 of file wordrec.h.
{
  // The search terminates either because the language model accepted a
  // choice, or because too many classifications in a row failed to improve
  // the best path.
  if (language_model_->AcceptableChoiceFound())
    return true;
  return num_futile_classifications >= segsearch_max_futile_classifications;
}
inT16 tesseract::Wordrec::select_blob_to_split | ( | const BLOB_CHOICE_LIST_VECTOR & | char_choices, |
float | rating_ceiling, | ||
bool | split_next_to_fragment | ||
) |
Definition at line 801 of file chopper.cpp.
{
  // Return the index of the blob to chop next: the blob whose best choice
  // has the worst (highest) rating below rating_ceiling and certainty below
  // tessedit_certainty_threshold.  An unclassified position wins outright.
  // When split_next_to_fragment is set, a blob adjacent to an incomplete
  // character fragment is preferred over the plain worst blob.
  BLOB_CHOICE_IT blob_choice_it;
  BLOB_CHOICE *blob_choice;
  BLOB_CHOICE_IT temp_it;
  int x;
  float worst = -MAX_FLOAT32;
  int worst_index = -1;
  float worst_near_fragment = -MAX_FLOAT32;
  int worst_index_near_fragment = -1;
  // fragments[i] caches the fragment info of position i's top choice;
  // only allocated when split_next_to_fragment is requested.
  const CHAR_FRAGMENT **fragments = NULL;

  if (chop_debug) {
    if (rating_ceiling < MAX_FLOAT32)
      cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    else
      cprintf("rating_ceiling = No Limit\n");
  }

  if (split_next_to_fragment && char_choices.length() > 0) {
    fragments = new const CHAR_FRAGMENT *[char_choices.length()];
    if (char_choices.get(0) != NULL) {
      temp_it.set_to_list(char_choices.get(0));
      fragments[0] = getDict().getUnicharset().get_fragment(
          temp_it.data()->unichar_id());
    } else {
      fragments[0] = NULL;
    }
  }

  for (x = 0; x < char_choices.length(); ++x) {
    if (char_choices.get(x) == NULL) {
      // An unclassified position is always chosen immediately.
      if (fragments != NULL) {
        delete[] fragments;
      }
      return x;
    } else {
      blob_choice_it.set_to_list(char_choices.get(x));
      blob_choice = blob_choice_it.data();
      // Populate fragments for the following position.
      if (split_next_to_fragment && x+1 < char_choices.length()) {
        if (char_choices.get(x+1) != NULL) {
          temp_it.set_to_list(char_choices.get(x+1));
          fragments[x+1] = getDict().getUnicharset().get_fragment(
              temp_it.data()->unichar_id());
        } else {
          fragments[x+1] = NULL;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          // A neighbor fragment that is not a beginning (following) or not
          // an ending (preceding) suggests a chop is missing near x.
          bool expand_following_fragment =
            (x + 1 < char_choices.length() &&
             fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
          bool expand_preceding_fragment =
            (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              cprintf("worst_index_near_fragment=%d"
                      " expand_following_fragment=%d"
                      " expand_preceding_fragment=%d\n",
                      worst_index_near_fragment,
                      expand_following_fragment,
                      expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  if (fragments != NULL) {
    delete[] fragments;
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ?
    worst_index_near_fragment : worst_index;
}
Definition at line 898 of file chopper.cpp.
{
  // Scan the danger points for a position that covers exactly one blob
  // (begin == end), is flagged dangerous, and whose correct interpretation
  // is an ngram; return its blob index.  Returns -1 when no fixpt is
  // supplied or no such position exists.
  if (!fixpt)
    return -1;
  int idx = 0;
  while (idx < fixpt->size()) {
    bool single_blob = (*fixpt)[idx].begin == (*fixpt)[idx].end;
    if (single_blob &&
        (*fixpt)[idx].dangerous &&
        (*fixpt)[idx].correct_is_ngram) {
      return (*fixpt)[idx].begin;
    }
    ++idx;
  }
  return -1;
}
void tesseract::Wordrec::set_chopper_blame | ( | WERD_RES * | word | ) |
Definition at line 917 of file chopper.cpp.
{
  // Compare the maximally chopped word against the truth boxes and, if the
  // chopper failed to produce a chop at a truth boundary, blame the chopper
  // (IRR_CHOPPER) with a detailed debug string.
  BlamerBundle *blamer_bundle = word->blamer_bundle;
  assert(blamer_bundle != NULL);
  // Nothing to do without truth boxes or without any chopped blobs.
  if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
      word->chopped_word->blobs == NULL) {
    return;
  }
  // NOTE(review): this outer `debug` is never used — it is shadowed by the
  // inner `STRING debug` declared in the if-block below.
  STRING debug;
  bool missing_chop = false;
  TBLOB * curr_blob = word->chopped_word->blobs;
  int b = 0;
  inT16 truth_x;
  // Walk blobs and truth boxes in parallel, matching right edges within
  // norm_box_tolerance.
  while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
    truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
    if (curr_blob->bounding_box().right() <
        (truth_x - blamer_bundle->norm_box_tolerance)) {
      curr_blob = curr_blob->next;
      continue;  // encountered an extra chop, keep looking
    } else if (curr_blob->bounding_box().right() >
                (truth_x + blamer_bundle->norm_box_tolerance)) {
      // The blob extends past the truth boundary: a chop is missing here.
      missing_chop = true;
      break;
    } else {
      curr_blob = curr_blob->next;
      ++b;
    }
  }
  if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
    STRING debug;
    char debug_buffer[256];
    if (missing_chop) {
      sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
              blamer_bundle->norm_box_tolerance);
      debug += debug_buffer;
      curr_blob->bounding_box().append_debug(&debug);
      debug.add_str_int("\nNo chop for truth at x=", truth_x);
    } else {
      debug.add_str_int("Missing chops for last ",
                        blamer_bundle->norm_truth_word.length()-b);
      debug += " truth box(es)";
    }
    // Dump both box sequences to make the mismatch inspectable.
    debug += "\nMaximally chopped word boxes:\n";
    for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
         curr_blob = curr_blob->next) {
      const TBOX &tbox = curr_blob->bounding_box();
      sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
              tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
      debug += debug_buffer;
    }
    debug += "Truth bounding boxes:\n";
    for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
      const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
      sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
              tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
      debug += debug_buffer;
    }
    blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
                            wordrec_debug_blamer);
  }
}
void tesseract::Wordrec::set_outline_bounds | ( | register EDGEPT * | point1, |
register EDGEPT * | point2, | ||
BOUNDS_RECT | rect | ||
) |
Definition at line 213 of file gradechop.cpp.
{
  // Fill rect with the x-extents of the two outline sections delimited by
  // point1 and point2: rect[0..1] = {min,max} of the point1->point2 walk,
  // rect[2..3] = {min,max} of the point2->point1 walk.
  // Fix: the original declared an unused local `this_point`; removed.
  // (`register` is also dropped from locals — it is a no-op and was
  // removed from the language in C++17.)
  inT16 x_min;
  inT16 x_max;

  find_bounds_loop(point1, point2, x_min, x_max);
  rect[0] = x_min;
  rect[1] = x_max;

  find_bounds_loop(point2, point1, x_min, x_max);
  rect[2] = x_min;
  rect[3] = x_max;
}
void tesseract::Wordrec::set_pass1 | ( | ) |
Definition at line 93 of file tface.cpp.
{
  // Configure recognition pass 1: set the segmentation-state budget and
  // the pass-1 split-acceptance threshold, then run the pass-1 setup.
  // (The two parameter assignments are independent of each other.)
  wordrec_num_seg_states.set_value(15);
  chop_ok_split.set_value(70.0);
  SettupPass1();
}
void tesseract::Wordrec::set_pass2 | ( | ) |
Definition at line 105 of file tface.cpp.
{
  // Configure recognition pass 2 from the pass-2 tuning parameters, then
  // run the pass-2 setup.  (The two assignments are order-independent.)
  wordrec_num_seg_states.set_value(pass2_seg_states);
  chop_ok_split.set_value(pass2_ok_split);
  SettupPass2();
}
WIDTH_RECORD * tesseract::Wordrec::state_char_widths | ( | WIDTH_RECORD * | chunk_widths, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 58 of file heuristic.cpp.
{
  // Build a WIDTH_RECORD of per-character widths and inter-character gaps
  // for the segmentation described by state, aggregating the underlying
  // chunk widths/gaps over each character's chunk range.
  SEARCH_STATE chunks = bin_to_chunks(state, num_joints);
  int num_chars = chunks[0] + 1;

  // allocate and store (n+1,w0,g0,w1,g1...,wn) in int[2*(n+1)] as a
  // struct { num_chars, widths[2*n+1]; }
  WIDTH_RECORD *char_widths =
    (WIDTH_RECORD*) memalloc(sizeof(int)*num_chars*2);
  char_widths->num_chars = num_chars;

  int first_blob = 0;
  int last_blob;
  for (int i = 1; i <= num_chars; i++) {
    // The last character runs to the final joint; earlier ones span
    // chunks[i] additional chunks past first_blob.
    last_blob = (i > chunks[0]) ? num_joints : first_blob + chunks[i];

    char_widths->widths[2*i-2] =
      AssociateUtils::GetChunksWidth(chunk_widths, first_blob, last_blob);
    // No gap after the final character.
    if (i <= chunks[0]) {
      char_widths->widths[2*i-1] =
        AssociateUtils::GetChunksGap(chunk_widths, last_blob);
    }

    if (segment_adjust_debug > 3)
      tprintf("width_record[%d]s%d--s%d(%d) %d %d:%d\n",
              i-1, first_blob, last_blob, chunks[i],
              char_widths->widths[2*i-2], char_widths->widths[2*i-1],
              chunk_widths->widths[2*last_blob+1]);
    first_blob = last_blob + 1;
  }

  memfree(chunks);
  return char_widths;
}
void tesseract::Wordrec::try_point_pairs | ( | EDGEPT * | points[MAX_NUM_POINTS], |
inT16 | num_points, | ||
SEAM_QUEUE | seam_queue, | ||
SEAM_PILE * | seam_pile, | ||
SEAM ** | seam, | ||
TBLOB * | blob | ||
) |
Definition at line 507 of file findseam.cpp.
{
  // Try every admissible pair of the candidate points as a split: the two
  // points must be within chop_split_length (weighted distance), must not
  // be adjacent on the outline, and must pass the is_exterior_point test
  // in both directions.  Each admissible split is offered to
  // choose_best_seam, which keeps the best seam found so far.
  inT16 x;
  inT16 y;
  SPLIT *split;
  PRIORITY priority;

  for (x = 0; x < num_points; x++) {
    for (y = x + 1; y < num_points; y++) {
      if (points[y] &&
          weighted_edgept_dist(points[x], points[y],
                               chop_x_y_weight) < chop_split_length &&
          points[x] != points[y]->next &&
          points[y] != points[x]->next &&
          !is_exterior_point(points[x], points[y]) &&
          !is_exterior_point(points[y], points[x])) {
        split = new_split (points[x], points[y]);
        priority = partial_split_priority (split);

        choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
      }
    }
  }
}
void tesseract::Wordrec::try_vertical_splits | ( | EDGEPT * | points[MAX_NUM_POINTS], |
inT16 | num_points, | ||
EDGEPT_CLIST * | new_points, | ||
SEAM_QUEUE | seam_queue, | ||
SEAM_PILE * | seam_pile, | ||
SEAM ** | seam, | ||
TBLOB * | blob | ||
) |
Definition at line 549 of file findseam.cpp.
{
  // For each candidate point, find the closest vertically projected point
  // on any outline of the blob and, when it is close enough and not
  // adjacent on the outline, offer the pair as a split to choose_best_seam.
  EDGEPT *vertical_point = NULL;
  SPLIT *split;
  inT16 x;
  PRIORITY priority;
  TESSLINE *outline;

  for (x = 0; x < num_points; x++) {
    vertical_point = NULL;
    // Accumulate the best vertical projection across all outlines;
    // any newly created points are collected in new_points.
    for (outline = blob->outlines; outline; outline = outline->next) {
      vertical_projection_point(points[x], outline->loop,
                                &vertical_point, new_points);
    }

    if (vertical_point &&
        points[x] != vertical_point->next &&
        vertical_point != points[x]->next &&
        weighted_edgept_dist(points[x], vertical_point,
                             chop_x_y_weight) < chop_split_length) {
      split = new_split (points[x], vertical_point);
      priority = partial_split_priority (split);

      choose_best_seam(seam_queue, seam_pile, split, priority, seam, blob);
    }
  }
}
void tesseract::Wordrec::update_blob_classifications | ( | TWERD * | word, |
const BLOB_CHOICE_LIST_VECTOR & | choices | ||
) |
Definition at line 152 of file wordclass.cpp.
void tesseract::Wordrec::update_ratings | ( | const BLOB_CHOICE_LIST_VECTOR & | new_choices, |
const CHUNKS_RECORD * | chunks_record, | ||
const SEARCH_STATE | search_state | ||
) |
void tesseract::Wordrec::UpdateSegSearchNodes | ( | int | starting_col, |
SEG_SEARCH_PENDING_LIST * | pending[], | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) | [protected] |
Definition at line 186 of file segsearch.cpp.
{
  // Propagate language model state through the ratings matrix, starting at
  // starting_col and sweeping right.  Columns must be processed in
  // non-decreasing order so every parent is up to date before its children.
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      // Record each distinct child row (the pending list is sorted, so a
      // duplicate can only be the immediately preceding entry).
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
        language_model_->UpdateState(p->changed, col, p->child_row,
                                     current_node, p->parent, pain_points,
                                     best_path_by_column, chunks_record,
                                     best_choice_bundle, blamer_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add all the
        // pairs with it as a parent and each of its children to pending, so
        // that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
              new SEG_SEARCH_PENDING(child_row, current_node, 0);
            // add_sorted_and_find returns the existing entry when one with
            // the same key is already present; then free the duplicate.
            SEG_SEARCH_PENDING *actual_new_pending =
              reinterpret_cast<SEG_SEARCH_PENDING *>(
                  (*pending)[child_col].add_sorted_and_find(
                      SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
              tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                      " changed=0x%x to pending\n", child_col,
                      actual_new_pending->child_row, col, p->child_row,
                      actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    }  // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
        col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
        pain_points, best_path_by_column, chunks_record);
  }  // end for col
  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }
  language_model_->CleanUp();
}
void tesseract::Wordrec::vertical_projection_point | ( | EDGEPT * | split_point, |
EDGEPT * | target_point, | ||
EDGEPT ** | best_point, | ||
EDGEPT_CLIST * | new_points | ||
) |
Definition at line 332 of file chop.cpp.
{
  // Find the point on target_point's outline closest (by edgept_dist) to
  // the vertical line through split_point, updating *best_point; points
  // created along the way are appended to new_points.
  EDGEPT *p;                     /* Iterator */
  EDGEPT *this_edgept;           /* Iterator */
  EDGEPT_C_IT new_point_it(new_points);
  int x = split_point->pos.x;    /* X value of vertical */
  int best_dist = LARGE_DISTANCE;/* Best point found */

  if (*best_point != NULL)
    best_dist = edgept_dist(split_point, *best_point);

  p = target_point;
  /* Look at each edge point */
  do {
    // Consider edges whose x-span straddles the vertical line, excluding
    // degenerate matches with split_point or the current best point.
    if ((((p->pos.x <= x) && (x <= p->next->pos.x)) ||
         ((p->next->pos.x <= x) && (x <= p->pos.x))) &&
        !same_point (split_point->pos, p->pos) &&
        !same_point (split_point->pos, p->next->pos) &&
        (*best_point == NULL ||
         !same_point ((*best_point)->pos, p->pos))) {
      // NOTE(review): this_edgept is read below even when near_point
      // returns false — presumably near_point always sets it; confirm.
      if (near_point(split_point, p, p->next, &this_edgept)) {
        new_point_it.add_before_then_move(this_edgept);
      }

      if (*best_point == NULL)
        best_dist = edgept_dist (split_point, this_edgept);

      this_edgept = pick_close_point(split_point, this_edgept, &best_dist);
      if (this_edgept)
        *best_point = this_edgept;
    }
    p = p->next;
  } while (p != target_point);
}
FLOAT32 tesseract::Wordrec::width_priority | ( | CHUNKS_RECORD * | chunks_record, |
STATE * | state, | ||
int | num_joints | ||
) |
Definition at line 222 of file heuristic.cpp.
{
  // Compute a segmentation penalty based on character widths and gaps:
  // squat (width/height) beyond the allowed ratio is penalized, with
  // additional fixed-pitch penalties when fixed pitch is assumed.
  FLOAT32 penalty = 0.0;
  WIDTH_RECORD *width_rec = state_char_widths(chunks_record->chunk_widths,
                                              state, num_joints);
  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (assume_fixed_pitch_char_segment) {
    // For fixed pitch language like CJK, we use the full text height as the
    // normalizing factor so we are not dependent on xheight calculation.
    // In the normalized coord. xheight * scale == BASELINE_SCALE(128),
    // so add proportionally scaled ascender zone to get full text height.
    const DENORM& denorm = chunks_record->word_res->denorm;
    normalizing_height = denorm.y_scale() *
        (denorm.row()->x_height() + denorm.row()->ascenders());
    if (segment_adjust_debug > 1)
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              denorm.row()->x_height(), denorm.row()->ascenders(),
              normalizing_height);
    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  for (int x = 0; x < width_rec->num_chars; x++) {
    FLOAT32 squat = width_rec->widths[2*x];
    // The last character has no trailing gap.
    FLOAT32 gap = (x < width_rec->num_chars-1) ? width_rec->widths[2*x+1] : 0;
    squat /= normalizing_height;
    gap /= normalizing_height;
    if (assume_fixed_pitch_char_segment) {
      penalty += AssociateUtils::FixedPitchWidthCost(
          squat, 0.0f, x == 0 || x == width_rec->num_chars -1,
          heuristic_max_char_wh_ratio);
      penalty += AssociateUtils::FixedPitchGapCost(
          gap, x == width_rec->num_chars - 1);
      // A single over-wide "character" is almost certainly unsplit text.
      if (width_rec->num_chars == 1 &&
          squat > AssociateUtils::kMaxFixedPitchCharAspectRatio) {
        penalty += 10;
      }
    } else {
      // Original equation when
      // heuristic_max_char_ratio == AssociateUtils::kMaxSquat
      if (squat > heuristic_max_char_wh_ratio)
        penalty += squat - heuristic_max_char_wh_ratio;
    }
  }
  free_widths(width_rec);
  return (penalty);
}
MATRIX * tesseract::Wordrec::word_associator | ( | bool | only_create_ratings_matrix, |
WERD_RES * | word, | ||
STATE * | state, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
DANGERR * | fixpt, | ||
STATE * | best_state | ||
) |
Definition at line 984 of file chopper.cpp.
{
  // Build the CHUNKS_RECORD (piece ratings, widths, weights) for the
  // chopped word and, unless only the ratings matrix was requested, run
  // either the new segmentation search or the old best-first search.
  // The caller takes ownership of the returned ratings matrix.
  CHUNKS_RECORD chunks_record;
  BLOB_WEIGHTS blob_weights;
  int x;
  int num_chunks;
  BLOB_CHOICE_IT blob_choice_it;

  num_chunks = array_count(word->seam_array) + 1;

  TBLOB* blobs = word->chopped_word->blobs;
  chunks_record.ratings = record_piece_ratings(blobs);
  chunks_record.chunks = blobs;
  chunks_record.word_res = word;
  chunks_record.splits = word->seam_array;
  chunks_record.chunk_widths = blobs_widths(blobs);
  chunks_record.char_widths = blobs_widths(blobs);
  /* Save chunk weights */
  for (x = 0; x < num_chunks; x++) {
    BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings,
                                                 blobs,
                                                 chunks_record.word_res->denorm,
                                                 word->seam_array,
                                                 x, x,
                                                 word->blamer_bundle);
    blob_choice_it.set_to_list(choices);
    //This is done by Jetsoft. Divide by zero is possible.
    if (blob_choice_it.data()->certainty() == 0) {
      blob_weights[x]=0;
    } else {
      blob_weights[x] =
        -(inT16) (10 * blob_choice_it.data()->rating() /
                  blob_choice_it.data()->certainty());
    }
  }
  chunks_record.weights = blob_weights;

  if (chop_debug)
    chunks_record.ratings->print(getDict().getUnicharset());

  if (!only_create_ratings_matrix) {
    if (enable_new_segsearch) {
      SegSearch(&chunks_record, word->best_choice,
                best_char_choices, word->raw_choice,
                state, word->blamer_bundle);
    } else {
      best_first_search(&chunks_record, best_char_choices, word,
                        state, fixpt, best_state);
    }
  }

  // The widths are owned here; the ratings matrix is returned to caller.
  free_widths(chunks_record.chunk_widths);
  free_widths(chunks_record.char_widths);
  return chunks_record.ratings;
}
double tesseract::Wordrec::chop_center_knob = 0.15 |
int tesseract::Wordrec::chop_debug = 0 |
bool tesseract::Wordrec::chop_enable = 1 |
double tesseract::Wordrec::chop_good_split = 50.0 |
int tesseract::Wordrec::chop_inside_angle = -50 |
int tesseract::Wordrec::chop_min_outline_area = 2000 |
double tesseract::Wordrec::chop_ok_split = 100.0 |
double tesseract::Wordrec::chop_overlap_knob = 0.9 |
double tesseract::Wordrec::chop_sharpness_knob = 0.06 |
double tesseract::Wordrec::chop_split_dist_knob = 0.5 |
int tesseract::Wordrec::chop_split_length = 10000 |
double tesseract::Wordrec::chop_width_change_knob = 5.0 |
bool tesseract::Wordrec::enable_new_segsearch = false |
void(Wordrec::* tesseract::Wordrec::fill_lattice_)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
bool tesseract::Wordrec::force_word_assoc = 0 |
double tesseract::Wordrec::heuristic_max_char_wh_ratio = 2.0 |
double tesseract::Wordrec::heuristic_segcost_rating_base = 1.25 |
double tesseract::Wordrec::heuristic_weight_rating = 1 |
double tesseract::Wordrec::heuristic_weight_seamcut = 0 |
double tesseract::Wordrec::heuristic_weight_width = 0 |
bool tesseract::Wordrec::save_alt_choices = false |
double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0 |
double tesseract::Wordrec::tessedit_certainty_threshold = -2.25 |
bool tesseract::Wordrec::wordrec_debug_blamer = false |
bool tesseract::Wordrec::wordrec_no_block = 0 |
bool tesseract::Wordrec::wordrec_run_blamer = false |
double tesseract::Wordrec::wordrec_worst_state = 1 |