tesseract-doc/fixspace_8cpp_source.html

00001 /******************************************************************
00002  * File:        fixspace.cpp  (Formerly fixspace.c)
00003  * Description: Implements a pass over the page res, exploring the alternative
00004  *              spacing possibilities, trying to use context to improve the
00005  *              word spacing
00006 * Author:               Phil Cheatle
00007 * Created:              Thu Oct 21 11:38:43 BST 1993
00008 *
00009 * (C) Copyright 1993, Hewlett-Packard Ltd.
00010 ** Licensed under the Apache License, Version 2.0 (the "License");
00011 ** you may not use this file except in compliance with the License.
00012 ** You may obtain a copy of the License at
00013 ** http://www.apache.org/licenses/LICENSE-2.0
00014 ** Unless required by applicable law or agreed to in writing, software
00015 ** distributed under the License is distributed on an "AS IS" BASIS,
00016 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 ** See the License for the specific language governing permissions and
00018 ** limitations under the License.
00019 *
00020 **********************************************************************/
00021
00022 #include "mfcpch.h"
00023 #include <ctype.h>
00024 #include "reject.h"
00025 #include "statistc.h"
00026 #include "control.h"
00027 #include "fixspace.h"
00028 #include "genblob.h"
00029 #include "tessvars.h"
00030 #include "tessbox.h"
00031 #include "secname.h"
00032 #include "globals.h"
00033 #include "tesseractclass.h"
00034
00035 #define PERFECT_WERDS   999
00036 #define MAXSPACING      128      /*max expected spacing in pix */
00037
00038 namespace tesseract {
00049 void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
00050                                  inT32 word_count,
00051                                  PAGE_RES *page_res) {
00052   BLOCK_RES_IT block_res_it;
00053   ROW_RES_IT row_res_it;
00054   WERD_RES_IT word_res_it_from;
00055   WERD_RES_IT word_res_it_to;
00056   WERD_RES *word_res;
00057   WERD_RES_LIST fuzzy_space_words;
00058   inT16 new_length;
00059   BOOL8 prevent_null_wd_fixsp;   // DONT process blobless wds
00060   inT32 word_index;              // current word
00061
00062   block_res_it.set_to_list(&page_res->block_res_list);
00063   word_index = 0;
00064   for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
00065        block_res_it.forward()) {
00066     row_res_it.set_to_list(&block_res_it.data()->row_res_list);
00067     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
00068          row_res_it.forward()) {
00069       word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
00070       while (!word_res_it_from.at_last()) {
00071         word_res = word_res_it_from.data();
00072         while (!word_res_it_from.at_last() &&
00073                !(word_res->combination ||
00074                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
00075                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
00076           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
00077                          block_res_it.data()->block);
00078           word_res = word_res_it_from.forward();
00079           word_index++;
00080           if (monitor != NULL) {
00081             monitor->ocr_alive = TRUE;
00082             monitor->progress = 90 + 5 * word_index / word_count;
00083             if (monitor->deadline_exceeded() ||
00084                 (monitor->cancel != NULL &&
00085                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
00086             return;
00087           }
00088         }
00089
00090         if (!word_res_it_from.at_last()) {
00091           word_res_it_to = word_res_it_from;
00092           prevent_null_wd_fixsp =
00093             word_res->word->cblob_list()->empty();
00094           if (check_debug_pt(word_res, 60))
00095             debug_fix_space_level.set_value(10);
00096           word_res_it_to.forward();
00097           word_index++;
00098           if (monitor != NULL) {
00099             monitor->ocr_alive = TRUE;
00100             monitor->progress = 90 + 5 * word_index / word_count;
00101             if (monitor->deadline_exceeded() ||
00102                 (monitor->cancel != NULL &&
00103                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
00104             return;
00105           }
00106           while (!word_res_it_to.at_last () &&
00107                  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
00108                   word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
00109             if (check_debug_pt(word_res, 60))
00110               debug_fix_space_level.set_value(10);
00111             if (word_res->word->cblob_list()->empty())
00112               prevent_null_wd_fixsp = TRUE;
00113             word_res = word_res_it_to.forward();
00114           }
00115           if (check_debug_pt(word_res, 60))
00116             debug_fix_space_level.set_value(10);
00117           if (word_res->word->cblob_list()->empty())
00118             prevent_null_wd_fixsp = TRUE;
00119           if (prevent_null_wd_fixsp) {
00120             word_res_it_from = word_res_it_to;
00121           } else {
00122             fuzzy_space_words.assign_to_sublist(&word_res_it_from,
00123                                                 &word_res_it_to);
00124             fix_fuzzy_space_list(fuzzy_space_words,
00125                                  row_res_it.data()->row,
00126                                  block_res_it.data()->block);
00127             new_length = fuzzy_space_words.length();
00128             word_res_it_from.add_list_before(&fuzzy_space_words);
00129             for (;
00130                  !word_res_it_from.at_last() && new_length > 0;
00131                  new_length--) {
00132               word_res_it_from.forward();
00133             }
00134           }
00135           if (test_pt)
00136             debug_fix_space_level.set_value(0);
00137         }
00138         fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
00139                        block_res_it.data()->block);
00140         // Last word in row
00141       }
00142     }
00143   }
00144 }
00145
00146 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
00147                                      ROW *row,
00148                                      BLOCK* block) {
00149   inT16 best_score;
00150   WERD_RES_LIST current_perm;
00151   inT16 current_score;
00152   BOOL8 improved = FALSE;
00153
00154   best_score = eval_word_spacing(best_perm);  // default score
00155   dump_words(best_perm, best_score, 1, improved);
00156
00157   if (best_score != PERFECT_WERDS)
00158     initialise_search(best_perm, current_perm);
00159
00160   while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
00161     match_current_words(current_perm, row, block);
00162     current_score = eval_word_spacing(current_perm);
00163     dump_words(current_perm, current_score, 2, improved);
00164     if (current_score > best_score) {
00165       best_perm.clear();
00166       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
00167       best_score = current_score;
00168       improved = TRUE;
00169     }
00170     if (current_score < PERFECT_WERDS)
00171       transform_to_next_perm(current_perm);
00172   }
00173   dump_words(best_perm, best_score, 3, improved);
00174 }
00175
00176 }  // namespace tesseract
00177
00178 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
00179   WERD_RES_IT src_it(&src_list);
00180   WERD_RES_IT new_it(&new_list);
00181   WERD_RES *src_wd;
00182   WERD_RES *new_wd;
00183
00184   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00185     src_wd = src_it.data();
00186     if (!src_wd->combination) {
00187       new_wd = new WERD_RES(*src_wd);
00188       new_wd->combination = FALSE;
00189       new_wd->part_of_combo = FALSE;
00190       new_it.add_after_then_move(new_wd);
00191     }
00192   }
00193 }
00194
00195
00196 namespace tesseract {
00197 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
00198                                     BLOCK* block) {
00199   WERD_RES_IT word_it(&words);
00200   WERD_RES *word;
00201   // Since we are not using PAGE_RES to iterate over words, we need to update
00202   // prev_word_best_choice_ before calling classify_word_pass2().
00203   prev_word_best_choice_ = NULL;
00204   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00205     word = word_it.data();
00206     if ((!word->part_of_combo) && (word->box_word == NULL)) {
00207       classify_word_and_language(&Tesseract::classify_word_pass2,
00208                                  block, row, word);
00209     }
00210     prev_word_best_choice_ = word->best_choice;
00211   }
00212 }
00213
00214
00240 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
00241   WERD_RES_IT word_res_it(&word_res_list);
00242   inT16 total_score = 0;
00243   inT16 word_count = 0;
00244   inT16 done_word_count = 0;
00245   inT16 word_len;
00246   inT16 i;
00247   inT16 offset;
00248   WERD_RES *word;                 // current word
00249   inT16 prev_word_score = 0;
00250   BOOL8 prev_word_done = FALSE;
00251   BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
00252   BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
00253   BOOL8 current_char_1 = FALSE;
00254   BOOL8 current_word_ok_so_far;
00255   STRING punct_chars = "!\"`',.:;";
00256   BOOL8 prev_char_punct = FALSE;
00257   BOOL8 current_char_punct = FALSE;
00258   BOOL8 word_done = FALSE;
00259
00260   do {
00261     word = word_res_it.data();
00262     word_done = fixspace_thinks_word_done(word);
00263     word_count++;
00264     if (word->tess_failed) {
00265       total_score += prev_word_score;
00266       if (prev_word_done)
00267         done_word_count++;
00268       prev_word_score = 0;
00269       prev_char_1 = FALSE;
00270       prev_char_digit = FALSE;
00271       prev_word_done = FALSE;
00272     } else {
00273       /*
00274         Can we add the prev word score and potentially count this word?
00275         Yes IF it didnt end in a 1 when the first char of this word is a digit
00276           AND it didnt end in a digit when the first char of this word is a 1
00277       */
00278       word_len = word->reject_map.length();
00279       current_word_ok_so_far = FALSE;
00280       if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
00281             (prev_char_digit && (
00282                 (word_done &&
00283                  word->best_choice->unichar_lengths().string()[0] == 1 &&
00284                  word->best_choice->unichar_string()[0] == '1') ||
00285                 (!word_done && STRING(conflict_set_I_l_1).contains(
00286                       word->best_choice->unichar_string()[0])))))) {
00287         total_score += prev_word_score;
00288         if (prev_word_done)
00289           done_word_count++;
00290         current_word_ok_so_far = word_done;
00291       }
00292
00293       if (current_word_ok_so_far) {
00294         prev_word_done = TRUE;
00295         prev_word_score = word_len;
00296       } else {
00297         prev_word_done = FALSE;
00298         prev_word_score = 0;
00299       }
00300
00301       /* Add 1 to total score for every joined 1 regardless of context and
00302          rejtn */
00303       for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
00304         current_char_1 = word->best_choice->unichar_string()[i] == '1';
00305         if (prev_char_1 || (current_char_1 && (i > 0)))
00306           total_score++;
00307         prev_char_1 = current_char_1;
00308       }
00309
00310       /* Add 1 to total score for every joined punctuation regardless of context
00311         and rejtn */
00312       if (tessedit_prefer_joined_punct) {
00313         for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
00314              offset += word->best_choice->unichar_lengths()[i++]) {
00315           current_char_punct =
00316             punct_chars.contains(word->best_choice->unichar_string()[offset]);
00317           if (prev_char_punct || (current_char_punct && i > 0))
00318             total_score++;
00319           prev_char_punct = current_char_punct;
00320         }
00321       }
00322       prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
00323       for (i = 0, offset = 0; i < word_len - 1;
00324            offset += word->best_choice->unichar_lengths()[i++]);
00325       prev_char_1 =
00326           ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
00327            || (!word_done && STRING(conflict_set_I_l_1).contains(
00328                    word->best_choice->unichar_string()[offset])));
00329     }
00330     /* Find next word */
00331     do {
00332       word_res_it.forward();
00333     } while (word_res_it.data()->part_of_combo);
00334   } while (!word_res_it.at_first());
00335   total_score += prev_word_score;
00336   if (prev_word_done)
00337     done_word_count++;
00338   if (done_word_count == word_count)
00339     return PERFECT_WERDS;
00340   else
00341     return total_score;
00342 }
00343
00344 BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
00345   int i;
00346   int offset;
00347
00348   for (i = 0, offset = 0; i < char_position;
00349        offset += word->best_choice->unichar_lengths()[i++]);
00350   return (
00351       word->uch_set->get_isdigit(
00352           word->best_choice->unichar_string().string() + offset,
00353           word->best_choice->unichar_lengths()[i]) ||
00354       (word->best_choice->permuter() == NUMBER_PERM &&
00355        STRING(numeric_punctuation).contains(
00356            word->best_choice->unichar_string().string()[offset])));
00357 }
00358
00359 }  // namespace tesseract
00360
00361
00373 void transform_to_next_perm(WERD_RES_LIST &words) {
00374   WERD_RES_IT word_it(&words);
00375   WERD_RES_IT prev_word_it(&words);
00376   WERD_RES *word;
00377   WERD_RES *prev_word;
00378   WERD_RES *combo;
00379   WERD *copy_word;
00380   inT16 prev_right = -MAX_INT16;
00381   TBOX box;
00382   inT16 gap;
00383   inT16 min_gap = MAX_INT16;
00384
00385   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00386     word = word_it.data();
00387     if (!word->part_of_combo) {
00388       box = word->word->bounding_box();
00389       if (prev_right > -MAX_INT16) {
00390         gap = box.left() - prev_right;
00391         if (gap < min_gap)
00392           min_gap = gap;
00393       }
00394       prev_right = box.right();
00395     }
00396   }
00397   if (min_gap < MAX_INT16) {
00398     prev_right = -MAX_INT16;        // back to start
00399     word_it.set_to_list(&words);
00400     // Note: we can't use cycle_pt due to inserted combos at start of list.
00401     for (; (prev_right == -MAX_INT16) || !word_it.at_first();
00402          word_it.forward()) {
00403       word = word_it.data();
00404       if (!word->part_of_combo) {
00405         box = word->word->bounding_box();
00406         if (prev_right > -MAX_INT16) {
00407           gap = box.left() - prev_right;
00408           if (gap <= min_gap) {
00409             prev_word = prev_word_it.data();
00410             if (prev_word->combination) {
00411               combo = prev_word;
00412             } else {
00413               /* Make a new combination and insert before
00414                * the first word being joined. */
00415               copy_word = new WERD;
00416               *copy_word = *(prev_word->word);
00417               // deep copy
00418               combo = new WERD_RES(copy_word);
00419               combo->combination = TRUE;
00420               combo->x_height = prev_word->x_height;
00421               prev_word->part_of_combo = TRUE;
00422               prev_word_it.add_before_then_move(combo);
00423             }
00424             combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
00425             if (word->combination) {
00426               combo->word->join_on(word->word);
00427               // Move blobs to combo
00428               // old combo no longer needed
00429               delete word_it.extract();
00430             } else {
00431               // Copy current wd to combo
00432               combo->copy_on(word);
00433               word->part_of_combo = TRUE;
00434             }
00435             combo->done = FALSE;
00436             combo->ClearResults();
00437           } else {
00438             prev_word_it = word_it;  // catch up
00439           }
00440         }
00441         prev_right = box.right();
00442       }
00443     }
00444   } else {
00445     words.clear();  // signal termination
00446   }
00447 }
00448
00449 namespace tesseract {
00450 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
00451                            inT16 mode, BOOL8 improved) {
00452   WERD_RES_IT word_res_it(&perm);
00453
00454   if (debug_fix_space_level > 0) {
00455     if (mode == 1) {
00456       stats_.dump_words_str = "";
00457       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00458            word_res_it.forward()) {
00459         if (!word_res_it.data()->part_of_combo) {
00460           stats_.dump_words_str +=
00461               word_res_it.data()->best_choice->unichar_string();
00462           stats_.dump_words_str += ' ';
00463         }
00464       }
00465     }
00466
00467     #ifndef SECURE_NAMES
00468     if (debug_fix_space_level > 1) {
00469       switch (mode) {
00470         case 1:
00471           tprintf("EXTRACTED (%d): \"", score);
00472           break;
00473         case 2:
00474           tprintf("TESTED (%d): \"", score);
00475           break;
00476         case 3:
00477           tprintf("RETURNED (%d): \"", score);
00478           break;
00479       }
00480
00481       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00482            word_res_it.forward()) {
00483         if (!word_res_it.data()->part_of_combo) {
00484           tprintf("%s/%1d ",
00485                   word_res_it.data()->best_choice->unichar_string().string(),
00486                   (int)word_res_it.data()->best_choice->permuter());
00487         }
00488       }
00489       tprintf("\"\n");
00490     } else if (improved) {
00491       tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
00492       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
00493            word_res_it.forward()) {
00494         if (!word_res_it.data()->part_of_combo) {
00495           tprintf("%s/%1d ",
00496                   word_res_it.data()->best_choice->unichar_string().string(),
00497                   (int)word_res_it.data()->best_choice->permuter());
00498         }
00499       }
00500       tprintf("\"\n");
00501     }
00502     #endif
00503   }
00504 }
00505
00506
00515 BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
00516   TBOX box;
00517   inT16 prev_right = -MAX_INT16;
00518   inT16 gap;
00519   inT16 max_gap = -MAX_INT16;
00520   inT16 max_gap_count = 0;
00521   STATS gap_stats(0, MAXSPACING);
00522   BOOL8 result;
00523   const ROW *row = word->denorm.row();
00524   float max_non_space;
00525   float normalised_max_nonspace;
00526   inT16 i = 0;
00527   inT16 offset = 0;
00528   STRING punct_chars = "\"`',.:;";
00529
00530   for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
00531        blob = blob->next) {
00532     box = blob->bounding_box();
00533     if ((prev_right > -MAX_INT16) &&
00534         (!punct_chars.contains(
00535              word->best_choice->unichar_string()
00536                  [offset - word->best_choice->unichar_lengths()[i - 1]]) &&
00537          !punct_chars.contains(
00538              word->best_choice->unichar_string()[offset]))) {
00539       gap = box.left() - prev_right;
00540       if (gap < max_gap) {
00541         gap_stats.add(gap, 1);
00542       } else if (gap == max_gap) {
00543         max_gap_count++;
00544       } else {
00545         if (max_gap_count > 0)
00546           gap_stats.add(max_gap, max_gap_count);
00547         max_gap = gap;
00548         max_gap_count = 1;
00549       }
00550     }
00551     prev_right = box.right();
00552     offset += word->best_choice->unichar_lengths()[i++];
00553   }
00554
00555   max_non_space = (row->space() + 3 * row->kern()) / 4;
00556   normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();
00557
00558   result = (
00559       gap_stats.get_total() == 0 ||
00560       max_gap <= normalised_max_nonspace ||
00561       (gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
00562       (gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
00563   #ifndef SECURE_NAMES
00564   if ((debug_fix_space_level > 1)) {
00565     if (result) {
00566       tprintf(
00567           "ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
00568           "total=%d mean=%f median=%f\n",
00569           word->best_choice->unichar_string().string(), normalised_max_nonspace,
00570           max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
00571           gap_stats.median());
00572     } else {
00573       tprintf(
00574           "REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
00575           "total=%d mean=%f median=%f\n",
00576           word->best_choice->unichar_string().string(), normalised_max_nonspace,
00577           max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
00578           gap_stats.median());
00579     }
00580   }
00581   #endif
00582
00583   return result;
00584 }
00585
00586 BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
00587   if (word->done)
00588     return TRUE;
00589
00590   /*
00591     Use all the standard pass 2 conditions for mode 5 in set_done() in
00592     reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
00593     CARE WHETHER WE HAVE of/at on/an etc.
00594   */
00595   if (fixsp_done_mode > 0 &&
00596       (word->tess_accepted ||
00597        (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
00598        fixsp_done_mode == 3) &&
00599       (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
00600       ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
00601        (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
00602        (word->best_choice->permuter() == USER_DAWG_PERM) ||
00603        (word->best_choice->permuter() == NUMBER_PERM))) {
00604     return TRUE;
00605   } else {
00606     return FALSE;
00607   }
00608 }
00609
00610
00618 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
00619                                BLOCK* block) {
00620   WERD_RES *word_res;
00621   WERD_RES_LIST sub_word_list;
00622   WERD_RES_IT sub_word_list_it(&sub_word_list);
00623   inT16 blob_index;
00624   inT16 new_length;
00625   float junk;
00626
00627   word_res = word_res_it.data();
00628   if (word_res->word->flag(W_REP_CHAR) ||
00629       word_res->combination ||
00630       word_res->part_of_combo ||
00631       !word_res->word->flag(W_DONT_CHOP))
00632     return;
00633
00634   blob_index = worst_noise_blob(word_res, &junk);
00635   if (blob_index < 0)
00636     return;
00637
00638   if (debug_fix_space_level > 1) {
00639     tprintf("FP fixspace working on \"%s\"\n",
00640             word_res->best_choice->unichar_string().string());
00641   }
00642   word_res->word->rej_cblob_list()->sort(c_blob_comparator);
00643   sub_word_list_it.add_after_stay_put(word_res_it.extract());
00644   fix_noisy_space_list(sub_word_list, row, block);
00645   new_length = sub_word_list.length();
00646   word_res_it.add_list_before(&sub_word_list);
00647   for (; !word_res_it.at_last() && new_length > 1; new_length--) {
00648     word_res_it.forward();
00649   }
00650 }
00651
00652 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
00653                                      BLOCK* block) {
00654   inT16 best_score;
00655   WERD_RES_IT best_perm_it(&best_perm);
00656   WERD_RES_LIST current_perm;
00657   WERD_RES_IT current_perm_it(&current_perm);
00658   WERD_RES *old_word_res;
00659   WERD_RES *new_word_res;
00660   inT16 current_score;
00661   BOOL8 improved = FALSE;
00662
00663   best_score = fp_eval_word_spacing(best_perm);  // default score
00664
00665   dump_words(best_perm, best_score, 1, improved);
00666
00667   new_word_res = new WERD_RES;
00668   old_word_res = best_perm_it.data();
00669   old_word_res->combination = TRUE;   // Kludge to force deep copy
00670   *new_word_res = *old_word_res;      // deep copy
00671   old_word_res->combination = FALSE;  // Undo kludge
00672   current_perm_it.add_to_end(new_word_res);
00673
00674   break_noisiest_blob_word(current_perm);
00675
00676   while (best_score != PERFECT_WERDS && !current_perm.empty()) {
00677     match_current_words(current_perm, row, block);
00678     current_score = fp_eval_word_spacing(current_perm);
00679     dump_words(current_perm, current_score, 2, improved);
00680     if (current_score > best_score) {
00681       best_perm.clear();
00682       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
00683       best_score = current_score;
00684       improved = TRUE;
00685     }
00686     if (current_score < PERFECT_WERDS) {
00687       break_noisiest_blob_word(current_perm);
00688     }
00689   }
00690   dump_words(best_perm, best_score, 3, improved);
00691 }
00692
00693
00699 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
00700   WERD_RES_IT word_it(&words);
00701   WERD_RES_IT worst_word_it;
00702   float worst_noise_score = 9999;
00703   int worst_blob_index = -1;     // Noisiest blob of noisiest wd
00704   int blob_index;                // of wds noisiest blob
00705   float noise_score;             // of wds noisiest blob
00706   WERD_RES *word_res;
00707   C_BLOB_IT blob_it;
00708   C_BLOB_IT rej_cblob_it;
00709   C_BLOB_LIST new_blob_list;
00710   C_BLOB_IT new_blob_it;
00711   C_BLOB_IT new_rej_cblob_it;
00712   WERD *new_word;
00713   inT16 start_of_noise_blob;
00714   inT16 i;
00715
00716   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00717     blob_index = worst_noise_blob(word_it.data(), &noise_score);
00718     if (blob_index > -1 && worst_noise_score > noise_score) {
00719       worst_noise_score = noise_score;
00720       worst_blob_index = blob_index;
00721       worst_word_it = word_it;
00722     }
00723   }
00724   if (worst_blob_index < 0) {
00725     words.clear();          // signal termination
00726     return;
00727   }
00728
00729   /* Now split the worst_word_it */
00730
00731   word_res = worst_word_it.data();
00732
00733   /* Move blobs before noise blob to a new bloblist */
00734
00735   new_blob_it.set_to_list(&new_blob_list);
00736   blob_it.set_to_list(word_res->word->cblob_list());
00737   for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
00738     new_blob_it.add_after_then_move(blob_it.extract());
00739   }
00740   start_of_noise_blob = blob_it.data()->bounding_box().left();
00741   delete blob_it.extract();     // throw out noise blob
00742
00743   new_word = new WERD(&new_blob_list, word_res->word);
00744   new_word->set_flag(W_EOL, FALSE);
00745   word_res->word->set_flag(W_BOL, FALSE);
00746   word_res->word->set_blanks(1);  // After break
00747
00748   new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
00749   rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
00750   for (;
00751        (!rej_cblob_it.empty() &&
00752         (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
00753        rej_cblob_it.forward()) {
00754     new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
00755   }
00756
00757   WERD_RES* new_word_res = new WERD_RES(new_word);
00758   new_word_res->combination = TRUE;
00759   worst_word_it.add_before_then_move(new_word_res);
00760
00761   word_res->ClearResults();
00762 }
00763
00764 inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
00765                                   float *worst_noise_score) {
00766   float noise_score[512];
00767   int i;
00768   int min_noise_blob;            // 1st contender
00769   int max_noise_blob;            // last contender
00770   int non_noise_count;
00771   int worst_noise_blob;          // Worst blob
00772   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
00773   float non_noise_limit = kBlnXHeight * 0.8;
00774
00775   if (word_res->rebuild_word == NULL)
00776     return -1;  // Can't handle cube words.
00777
00778   TBLOB* blob = word_res->rebuild_word->blobs;
00779   // Normalised.
00780   int blob_count = word_res->box_word->length();
00781   ASSERT_HOST(blob_count <= 512);
00782   if (blob_count < 5)
00783     return -1;                   // too short to split
00784
00785   /* Get the noise scores for all blobs */
00786
00787   #ifndef SECURE_NAMES
00788   if (debug_fix_space_level > 5)
00789     tprintf("FP fixspace Noise metrics for \"%s\": ",
00790             word_res->best_choice->unichar_string().string());
00791   #endif
00792
00793   for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
00794     if (word_res->reject_map[i].accepted())
00795       noise_score[i] = non_noise_limit;
00796     else
00797       noise_score[i] = blob_noise_score(blob);
00798
00799     if (debug_fix_space_level > 5)
00800       tprintf("%1.1f ", noise_score[i]);
00801   }
00802   if (debug_fix_space_level > 5)
00803     tprintf("\n");
00804
00805   /* Now find the worst one which is far enough away from the end of the word */
00806
00807   non_noise_count = 0;
00808   for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
00809     if (noise_score[i] >= non_noise_limit) {
00810       non_noise_count++;
00811     }
00812   }
00813   if (non_noise_count < fixsp_non_noise_limit)
00814     return -1;
00815
00816   min_noise_blob = i;
00817
00818   non_noise_count = 0;
00819   for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
00820        i--) {
00821     if (noise_score[i] >= non_noise_limit) {
00822       non_noise_count++;
00823     }
00824   }
00825   if (non_noise_count < fixsp_non_noise_limit)
00826     return -1;
00827
00828   max_noise_blob = i;
00829
00830   if (min_noise_blob > max_noise_blob)
00831     return -1;
00832
00833   *worst_noise_score = small_limit;
00834   worst_noise_blob = -1;
00835   for (i = min_noise_blob; i <= max_noise_blob; i++) {
00836     if (noise_score[i] < *worst_noise_score) {
00837       worst_noise_blob = i;
00838       *worst_noise_score = noise_score[i];
00839     }
00840   }
00841   return worst_noise_blob;
00842 }
00843
00844 float Tesseract::blob_noise_score(TBLOB *blob) {
00845   TBOX box;                       // BB of outline
00846   inT16 outline_count = 0;
00847   inT16 max_dimension;
00848   inT16 largest_outline_dimension = 0;
00849
00850   for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
00851     outline_count++;
00852     box = ol->bounding_box();
00853     if (box.height() > box.width()) {
00854       max_dimension = box.height();
00855     } else {
00856       max_dimension = box.width();
00857     }
00858
00859     if (largest_outline_dimension < max_dimension)
00860       largest_outline_dimension = max_dimension;
00861   }
00862
00863   if (outline_count > 5) {
00864     // penalise LOTS of blobs
00865     largest_outline_dimension *= 2;
00866   }
00867
00868   box = blob->bounding_box();
00869   if (box.bottom() > kBlnBaselineOffset * 4 ||
00870       box.top() < kBlnBaselineOffset / 2) {
00871     // Lax blob is if high or low
00872     largest_outline_dimension /= 2;
00873   }
00874
00875   return largest_outline_dimension;
00876 }
00877 }  // namespace tesseract
00878
00879 void fixspace_dbg(WERD_RES *word) {
00880   TBOX box = word->word->bounding_box();
00881   BOOL8 show_map_detail = FALSE;
00882   inT16 i;
00883
00884   box.print();
00885   tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
00886   tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
00887           word->word->cblob_list()->length(),
00888           word->rebuild_word->NumBlobs(),
00889           word->box_word->length());
00890   word->reject_map.print(debug_fp);
00891   tprintf("\n");
00892   if (show_map_detail) {
00893     tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
00894     for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
00895       tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
00896       word->reject_map[i].full_print(debug_fp);
00897     }
00898   }
00899
00900   tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
00901   tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
00902 }
00903
00904
00913 namespace tesseract {
00914 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
00915   WERD_RES_IT word_it(&word_res_list);
00916   WERD_RES *word;
00917   inT16 word_length;
00918   inT16 score = 0;
00919   inT16 i;
00920   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
00921
00922   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00923     word = word_it.data();
00924     if (word->rebuild_word == NULL)
00925       continue;  // Can't handle cube words.
00926     word_length = word->reject_map.length();
00927     if (word->done ||
00928         word->tess_accepted ||
00929         word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00930         word->best_choice->permuter() == FREQ_DAWG_PERM ||
00931         word->best_choice->permuter() == USER_DAWG_PERM ||
00932         safe_dict_word(word) > 0) {
00933       TBLOB* blob = word->rebuild_word->blobs;
00934       UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
00935       for (i = 0; i < word->best_choice->length() && blob != NULL;
00936            ++i, blob = blob->next) {
00937         if (word->best_choice->unichar_id(i) == space ||
00938             blob_noise_score(blob) < small_limit) {
00939           score -= 1;  // penalise possibly erroneous non-space
00940         } else if (word->reject_map[i].accepted()) {
00941           score++;
00942         }
00943       }
00944     }
00945   }
00946   if (score < 0)
00947     score = 0;
00948   return score;
00949 }
00950
00951 }  // namespace tesseract