tesseract-doc/docqual_8cpp_source.html

00001 /******************************************************************
00002  * File:        docqual.cpp  (Formerly docqual.c)
00003  * Description: Document Quality Metrics
00004  * Author:              Phil Cheatle
00005  * Created:             Mon May  9 11:27:28 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023
00024 #include "mfcpch.h"
00025 #include          <ctype.h>
00026 #include          "docqual.h"
00027 #include          "tfacep.h"
00028 #include          "reject.h"
00029 #include          "tesscallback.h"
00030 #include          "tessvars.h"
00031 #include          "secname.h"
00032 #include          "globals.h"
00033 #include          "tesseractclass.h"
00034
00035 namespace tesseract{
00036
00037 // A little class to provide the callbacks as we have no pre-bound args.
00038 struct DocQualCallbacks {
00039   explicit DocQualCallbacks(WERD_RES* word0)
00040     : word(word0), match_count(0), accepted_match_count(0) {}
00041
00042   void CountMatchingBlobs(int index) {
00043     ++match_count;
00044   }
00045
00046   void CountAcceptedBlobs(int index) {
00047     if (word->reject_map[index].accepted())
00048       ++accepted_match_count;
00049     ++match_count;
00050   }
00051
00052   void AcceptIfGoodQuality(int index) {
00053     if (word->reject_map[index].accept_if_good_quality())
00054       word->reject_map[index].setrej_quality_accept();
00055   }
00056
00057   WERD_RES* word;
00058   inT16 match_count;
00059   inT16 accepted_match_count;
00060 };
00061
00062 /*************************************************************************
00063  * word_blob_quality()
00064  * How many blobs in the box_word are identical to those of the inword?
00065  * ASSUME blobs in both initial word and box_word are in ascending order of
00066  * left hand blob edge.
00067  *************************************************************************/
00068 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
00069   if (word->bln_boxes == NULL ||
00070       word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
00071     return 0;
00072
00073   DocQualCallbacks cb(word);
00074   word->bln_boxes->ProcessMatchedBlobs(
00075       *word->rebuild_word,
00076       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
00077   return cb.match_count;
00078 }
00079
00080 inT16 Tesseract::word_outline_errs(WERD_RES *word) {
00081   inT16 i = 0;
00082   inT16 err_count = 0;
00083
00084   if (word->rebuild_word != NULL) {
00085     TBLOB* blob = word->rebuild_word->blobs;
00086     for (; blob != NULL; blob = blob->next) {
00087       err_count += count_outline_errs(word->best_choice->unichar_string()[i],
00088                                       blob->NumOutlines());
00089       i++;
00090     }
00091   }
00092   return err_count;
00093 }
00094
00095 /*************************************************************************
00096  * word_char_quality()
00097  * Combination of blob quality and outline quality - how many good chars are
00098  * there? - I.e chars which pass the blob AND outline tests.
00099  *************************************************************************/
00100 void Tesseract::word_char_quality(WERD_RES *word,
00101                                   ROW *row,
00102                                   inT16 *match_count,
00103                                   inT16 *accepted_match_count) {
00104   if (word->bln_boxes == NULL ||
00105       word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
00106     return;
00107
00108   DocQualCallbacks cb(word);
00109   word->bln_boxes->ProcessMatchedBlobs(
00110       *word->rebuild_word,
00111       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
00112   *match_count = cb.match_count;
00113   *accepted_match_count = cb.accepted_match_count;
00114 }
00115
00116 /*************************************************************************
00117  * unrej_good_chs()
00118  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
00119  *************************************************************************/
00120 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
00121   if (word->bln_boxes == NULL ||
00122       word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
00123     return;
00124
00125   DocQualCallbacks cb(word);
00126   word->bln_boxes->ProcessMatchedBlobs(
00127       *word->rebuild_word,
00128       NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
00129 }
00130
00131 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
00132   int expected_outline_count;
00133
00134   if (STRING (outlines_odd).contains (c))
00135     return 0;                    //Dont use this char
00136   else if (STRING (outlines_2).contains (c))
00137     expected_outline_count = 2;
00138   else
00139     expected_outline_count = 1;
00140   return abs (outline_count - expected_outline_count);
00141 }
00142
00143 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
00144                                         BOOL8 good_quality_doc) {
00145   if ((tessedit_good_quality_unrej && good_quality_doc))
00146     unrej_good_quality_words(page_res_it);
00147   doc_and_block_rejection(page_res_it, good_quality_doc);
00148   if (unlv_tilde_crunching) {
00149     tilde_crunch(page_res_it);
00150     tilde_delete(page_res_it);
00151   }
00152 }
00153
00154
00155 /*************************************************************************
00156  * unrej_good_quality_words()
00157  * Accept potential rejects in words which pass the following checks:
00158  *    - Contains a potential reject
00159  *    - Word looks like a sensible alpha word.
00160  *    - Word segmentation is the same as the original image
00161  *              - All characters have the expected number of outlines
00162  * NOTE - the rejection counts are recalculated after unrejection
00163  *      - CANT do it in a single pass without a bit of fiddling
00164  *              - keep it simple but inefficient
00165  *************************************************************************/
00166 void Tesseract::unrej_good_quality_words(  //unreject potential
00167                                          PAGE_RES_IT &page_res_it) {
00168   WERD_RES *word;
00169   ROW_RES *current_row;
00170   BLOCK_RES *current_block;
00171   int i;
00172
00173   page_res_it.restart_page ();
00174   while (page_res_it.word () != NULL) {
00175     check_debug_pt (page_res_it.word (), 100);
00176     if (bland_unrej) {
00177       word = page_res_it.word ();
00178       for (i = 0; i < word->reject_map.length (); i++) {
00179         if (word->reject_map[i].accept_if_good_quality ())
00180           word->reject_map[i].setrej_quality_accept ();
00181       }
00182       page_res_it.forward ();
00183     }
00184     else if ((page_res_it.row ()->char_count > 0) &&
00185       ((page_res_it.row ()->rej_count /
00186       (float) page_res_it.row ()->char_count) <=
00187     quality_rowrej_pc)) {
00188       word = page_res_it.word ();
00189       if (word->reject_map.quality_recoverable_rejects() &&
00190           (tessedit_unrej_any_wd ||
00191            acceptable_word_string(*word->uch_set,
00192                                   word->best_choice->unichar_string().string(),
00193                                   word->best_choice->unichar_lengths().string())
00194                != AC_UNACCEPTABLE)) {
00195         unrej_good_chs(word, page_res_it.row ()->row);
00196       }
00197       page_res_it.forward ();
00198     }
00199     else {
00200       /* Skip to end of dodgy row */
00201       current_row = page_res_it.row ();
00202       while ((page_res_it.word () != NULL) &&
00203         (page_res_it.row () == current_row))
00204         page_res_it.forward ();
00205     }
00206     check_debug_pt (page_res_it.word (), 110);
00207   }
00208   page_res_it.restart_page ();
00209   page_res_it.page_res->char_count = 0;
00210   page_res_it.page_res->rej_count = 0;
00211   current_block = NULL;
00212   current_row = NULL;
00213   while (page_res_it.word () != NULL) {
00214     if (current_block != page_res_it.block ()) {
00215       current_block = page_res_it.block ();
00216       current_block->char_count = 0;
00217       current_block->rej_count = 0;
00218     }
00219     if (current_row != page_res_it.row ()) {
00220       current_row = page_res_it.row ();
00221       current_row->char_count = 0;
00222       current_row->rej_count = 0;
00223       current_row->whole_word_rej_count = 0;
00224     }
00225     page_res_it.rej_stat_word ();
00226     page_res_it.forward ();
00227   }
00228 }
00229
00230
00231 /*************************************************************************
00232  * doc_and_block_rejection()
00233  *
00234  * If the page has too many rejects - reject all of it.
00235  * If any block has too many rejects - reject all words in the block
00236  *************************************************************************/
00237
00238 void Tesseract::doc_and_block_rejection(  //reject big chunks
00239                                         PAGE_RES_IT &page_res_it,
00240                                         BOOL8 good_quality_doc) {
00241   inT16 block_no = 0;
00242   inT16 row_no = 0;
00243   BLOCK_RES *current_block;
00244   ROW_RES *current_row;
00245
00246   BOOL8 rej_word;
00247   BOOL8 prev_word_rejected;
00248   inT16 char_quality = 0;
00249   inT16 accepted_char_quality;
00250
00251   if (page_res_it.page_res->rej_count * 100.0 /
00252       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
00253     reject_whole_page(page_res_it);
00254     if (tessedit_debug_doc_rejection) {
00255       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
00256               page_res_it.page_res->char_count,
00257               page_res_it.page_res->rej_count);
00258     }
00259   } else {
00260     if (tessedit_debug_doc_rejection) {
00261       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
00262               page_res_it.page_res->char_count,
00263               page_res_it.page_res->rej_count);
00264     }
00265
00266     /* Walk blocks testing for block rejection */
00267
00268     page_res_it.restart_page();
00269     WERD_RES* word;
00270     while ((word = page_res_it.word()) != NULL) {
00271       current_block = page_res_it.block();
00272       block_no = current_block->block->index();
00273       if (current_block->char_count > 0 &&
00274           (current_block->rej_count * 100.0 / current_block->char_count) >
00275            tessedit_reject_block_percent) {
00276         if (tessedit_debug_block_rejection) {
00277           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
00278                   block_no, current_block->char_count,
00279                   current_block->rej_count);
00280         }
00281         prev_word_rejected = FALSE;
00282         while ((word = page_res_it.word()) != NULL &&
00283                (page_res_it.block() == current_block)) {
00284           if (tessedit_preserve_blk_rej_perfect_wds) {
00285             rej_word = word->reject_map.reject_count() > 0 ||
00286                 word->reject_map.length () < tessedit_preserve_min_wd_len;
00287             if (rej_word && tessedit_dont_blkrej_good_wds &&
00288                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00289                 acceptable_word_string(
00290                     *word->uch_set,
00291                     word->best_choice->unichar_string().string(),
00292                     word->best_choice->unichar_lengths().string()) !=
00293                 AC_UNACCEPTABLE) {
00294               word_char_quality(word, page_res_it.row()->row,
00295                                 &char_quality,
00296                                 &accepted_char_quality);
00297               rej_word = char_quality !=  word->reject_map.length();
00298             }
00299           } else {
00300             rej_word = TRUE;
00301           }
00302           if (rej_word) {
00303             /*
00304               Reject spacing if both current and prev words are rejected.
00305               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
00306               generated more space errors.
00307             */
00308             if (tessedit_use_reject_spaces &&
00309                 prev_word_rejected &&
00310                 page_res_it.prev_row() == page_res_it.row() &&
00311                 word->word->space() == 1)
00312               word->reject_spaces = TRUE;
00313             word->reject_map.rej_word_block_rej();
00314           }
00315           prev_word_rejected = rej_word;
00316           page_res_it.forward();
00317         }
00318       } else {
00319         if (tessedit_debug_block_rejection) {
00320           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
00321                   block_no, page_res_it.block()->char_count,
00322                   page_res_it.block()->rej_count);
00323         }
00324
00325         /* Walk rows in block testing for row rejection */
00326         row_no = 0;
00327         while ((word = page_res_it.word()) != NULL &&
00328                page_res_it.block() == current_block) {
00329           current_row = page_res_it.row();
00330           row_no++;
00331           /* Reject whole row if:
00332             fraction of chars on row which are rejected exceed a limit AND
00333             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
00334             limit
00335           */
00336           if (current_row->char_count > 0 &&
00337               (current_row->rej_count * 100.0 / current_row->char_count) >
00338               tessedit_reject_row_percent &&
00339               (current_row->whole_word_rej_count * 100.0 /
00340                   current_row->rej_count) <
00341               tessedit_whole_wd_rej_row_percent) {
00342             if (tessedit_debug_block_rejection) {
00343               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
00344                       row_no, current_row->char_count,
00345                       current_row->rej_count);
00346             }
00347             prev_word_rejected = FALSE;
00348             while ((word = page_res_it.word()) != NULL &&
00349                    page_res_it.row () == current_row) {
00350               /* Preserve words on good docs unless they are mostly rejected*/
00351               if (!tessedit_row_rej_good_docs && good_quality_doc) {
00352                 rej_word = word->reject_map.reject_count() /
00353                     static_cast<float>(word->reject_map.length()) >
00354                     tessedit_good_doc_still_rowrej_wd;
00355               } else if (tessedit_preserve_row_rej_perfect_wds) {
00356                 /* Preserve perfect words anyway */
00357                 rej_word = word->reject_map.reject_count() > 0 ||
00358                     word->reject_map.length () < tessedit_preserve_min_wd_len;
00359                 if (rej_word && tessedit_dont_rowrej_good_wds &&
00360                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00361                     acceptable_word_string(*word->uch_set,
00362                         word->best_choice->unichar_string().string(),
00363                         word->best_choice->unichar_lengths().string()) !=
00364                             AC_UNACCEPTABLE) {
00365                   word_char_quality(word, page_res_it.row()->row,
00366                                     &char_quality,
00367                                     &accepted_char_quality);
00368                   rej_word = char_quality != word->reject_map.length();
00369                 }
00370               } else {
00371                 rej_word = TRUE;
00372               }
00373               if (rej_word) {
00374                 /*
00375                   Reject spacing if both current and prev words are rejected.
00376                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
00377                   this generated more space errors.
00378                 */
00379                 if (tessedit_use_reject_spaces &&
00380                     prev_word_rejected &&
00381                     page_res_it.prev_row() == page_res_it.row() &&
00382                     word->word->space () == 1)
00383                   word->reject_spaces = TRUE;
00384                 word->reject_map.rej_word_row_rej();
00385               }
00386               prev_word_rejected = rej_word;
00387               page_res_it.forward();
00388             }
00389           } else {
00390             if (tessedit_debug_block_rejection) {
00391               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
00392                       row_no, current_row->char_count, current_row->rej_count);
00393             }
00394             while (page_res_it.word() != NULL &&
00395                    page_res_it.row() == current_row)
00396               page_res_it.forward();
00397           }
00398         }
00399       }
00400     }
00401   }
00402 }
00403
00404 }  // namespace tesseract
00405
00406
00407 /*************************************************************************
00408  * reject_whole_page()
00409  * Dont believe any of it - set the reject map to 00..00 in all words
00410  *
00411  *************************************************************************/
00412
00413 void reject_whole_page(PAGE_RES_IT &page_res_it) {
00414   page_res_it.restart_page ();
00415   while (page_res_it.word () != NULL) {
00416     page_res_it.word ()->reject_map.rej_word_doc_rej ();
00417     page_res_it.forward ();
00418   }
00419                                  //whole page is rejected
00420   page_res_it.page_res->rejected = TRUE;
00421 }
00422
00423 namespace tesseract {
00424 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
00425   WERD_RES *word;
00426   GARBAGE_LEVEL garbage_level;
00427   PAGE_RES_IT copy_it;
00428   BOOL8 prev_potential_marked = FALSE;
00429   BOOL8 found_terrible_word = FALSE;
00430   BOOL8 ok_dict_word;
00431
00432   page_res_it.restart_page();
00433   while (page_res_it.word() != NULL) {
00434     POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
00435     if (pb != NULL && !pb->IsText()) {
00436       page_res_it.forward();
00437       continue;
00438     }
00439     word = page_res_it.word();
00440
00441     if (crunch_early_convert_bad_unlv_chs)
00442       convert_bad_unlv_chs(word);
00443
00444     if (crunch_early_merge_tess_fails)
00445       word->merge_tess_fails();
00446
00447     if (word->reject_map.accept_count () != 0) {
00448       found_terrible_word = FALSE;
00449                                  //Forget earlier potential crunches
00450       prev_potential_marked = FALSE;
00451     }
00452     else {
00453       ok_dict_word = safe_dict_word(word);
00454       garbage_level = garbage_word (word, ok_dict_word);
00455
00456       if ((garbage_level != G_NEVER_CRUNCH) &&
00457       (terrible_word_crunch (word, garbage_level))) {
00458         if (crunch_debug > 0) {
00459           tprintf ("T CRUNCHING: \"%s\"\n",
00460             word->best_choice->unichar_string().string());
00461         }
00462         word->unlv_crunch_mode = CR_KEEP_SPACE;
00463         if (prev_potential_marked) {
00464           while (copy_it.word () != word) {
00465             if (crunch_debug > 0) {
00466               tprintf ("P1 CRUNCHING: \"%s\"\n",
00467                 copy_it.word()->best_choice->unichar_string().string());
00468             }
00469             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00470             copy_it.forward ();
00471           }
00472           prev_potential_marked = FALSE;
00473         }
00474         found_terrible_word = TRUE;
00475       }
00476       else if ((garbage_level != G_NEVER_CRUNCH) &&
00477         (potential_word_crunch (word,
00478       garbage_level, ok_dict_word))) {
00479         if (found_terrible_word) {
00480           if (crunch_debug > 0) {
00481             tprintf ("P2 CRUNCHING: \"%s\"\n",
00482               word->best_choice->unichar_string().string());
00483           }
00484           word->unlv_crunch_mode = CR_KEEP_SPACE;
00485         }
00486         else if (!prev_potential_marked) {
00487           copy_it = page_res_it;
00488           prev_potential_marked = TRUE;
00489           if (crunch_debug > 1) {
00490             tprintf ("P3 CRUNCHING: \"%s\"\n",
00491               word->best_choice->unichar_string().string());
00492           }
00493         }
00494       }
00495       else {
00496         found_terrible_word = FALSE;
00497                                  //Forget earlier potential crunches
00498         prev_potential_marked = FALSE;
00499         if (crunch_debug > 2) {
00500           tprintf ("NO CRUNCH: \"%s\"\n",
00501             word->best_choice->unichar_string().string());
00502         }
00503       }
00504     }
00505     page_res_it.forward ();
00506   }
00507 }
00508
00509
00510 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
00511                                       GARBAGE_LEVEL garbage_level) {
00512   float rating_per_ch;
00513   int adjusted_len;
00514   int crunch_mode = 0;
00515
00516   if ((word->best_choice->unichar_string().length () == 0) ||
00517     (strspn (word->best_choice->unichar_string().string(), " ") ==
00518     word->best_choice->unichar_string().length ()))
00519     crunch_mode = 1;
00520   else {
00521     adjusted_len = word->reject_map.length ();
00522     if (adjusted_len > crunch_rating_max)
00523       adjusted_len = crunch_rating_max;
00524     rating_per_ch = word->best_choice->rating () / adjusted_len;
00525
00526     if (rating_per_ch > crunch_terrible_rating)
00527       crunch_mode = 2;
00528     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00529       crunch_mode = 3;
00530     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00531       (garbage_level != G_OK))
00532       crunch_mode = 4;
00533     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00534       (garbage_level != G_OK))
00535       crunch_mode = 5;
00536   }
00537   if (crunch_mode > 0) {
00538     if (crunch_debug > 2) {
00539       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00540         crunch_mode, word->best_choice->unichar_string().string());
00541     }
00542     return TRUE;
00543   }
00544   else
00545     return FALSE;
00546 }
00547
00548 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
00549                                        GARBAGE_LEVEL garbage_level,
00550                                        BOOL8 ok_dict_word) {
00551   float rating_per_ch;
00552   int adjusted_len;
00553   const char *str = word->best_choice->unichar_string().string();
00554   const char *lengths = word->best_choice->unichar_lengths().string();
00555   BOOL8 word_crunchable;
00556   int poor_indicator_count = 0;
00557
00558   word_crunchable = !crunch_leave_accept_strings ||
00559                     word->reject_map.length() < 3 ||
00560                     (acceptable_word_string(*word->uch_set,
00561                                             str, lengths) == AC_UNACCEPTABLE &&
00562                      !ok_dict_word);
00563
00564   adjusted_len = word->reject_map.length();
00565   if (adjusted_len > 10)
00566     adjusted_len = 10;
00567   rating_per_ch = word->best_choice->rating() / adjusted_len;
00568
00569   if (rating_per_ch > crunch_pot_poor_rate) {
00570     if (crunch_debug > 2) {
00571       tprintf("Potential poor rating on \"%s\"\n",
00572               word->best_choice->unichar_string().string());
00573     }
00574     poor_indicator_count++;
00575   }
00576
00577   if (word_crunchable &&
00578       word->best_choice->certainty() < crunch_pot_poor_cert) {
00579     if (crunch_debug > 2) {
00580       tprintf("Potential poor cert on \"%s\"\n",
00581               word->best_choice->unichar_string().string());
00582     }
00583     poor_indicator_count++;
00584   }
00585
00586   if (garbage_level != G_OK) {
00587     if (crunch_debug > 2) {
00588       tprintf("Potential garbage on \"%s\"\n",
00589               word->best_choice->unichar_string().string());
00590     }
00591     poor_indicator_count++;
00592   }
00593   return poor_indicator_count >= crunch_pot_indicators;
00594 }
00595
00596 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
00597   WERD_RES *word;
00598   PAGE_RES_IT copy_it;
00599   BOOL8 deleting_from_bol = FALSE;
00600   BOOL8 marked_delete_point = FALSE;
00601   inT16 debug_delete_mode;
00602   CRUNCH_MODE delete_mode;
00603   inT16 x_debug_delete_mode;
00604   CRUNCH_MODE x_delete_mode;
00605
00606   page_res_it.restart_page();
00607   while (page_res_it.word() != NULL) {
00608     word = page_res_it.word();
00609
00610     delete_mode = word_deletable (word, debug_delete_mode);
00611     if (delete_mode != CR_NONE) {
00612       if (word->word->flag (W_BOL) || deleting_from_bol) {
00613         if (crunch_debug > 0) {
00614           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
00615             debug_delete_mode,
00616             word->best_choice->unichar_string().string());
00617         }
00618         word->unlv_crunch_mode = delete_mode;
00619         deleting_from_bol = TRUE;
00620       } else if (word->word->flag(W_EOL)) {
00621         if (marked_delete_point) {
00622           while (copy_it.word() != word) {
00623             x_delete_mode = word_deletable (copy_it.word (),
00624               x_debug_delete_mode);
00625             if (crunch_debug > 0) {
00626               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00627                 x_debug_delete_mode,
00628                 copy_it.word()->best_choice->unichar_string().string());
00629             }
00630             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
00631             copy_it.forward ();
00632           }
00633         }
00634         if (crunch_debug > 0) {
00635           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00636             debug_delete_mode,
00637             word->best_choice->unichar_string().string());
00638         }
00639         word->unlv_crunch_mode = delete_mode;
00640         deleting_from_bol = FALSE;
00641         marked_delete_point = FALSE;
00642       }
00643       else {
00644         if (!marked_delete_point) {
00645           copy_it = page_res_it;
00646           marked_delete_point = TRUE;
00647         }
00648       }
00649     }
00650     else {
00651       deleting_from_bol = FALSE;
00652                                  //Forget earlier potential crunches
00653       marked_delete_point = FALSE;
00654     }
00655     /*
00656       The following step has been left till now as the tess fails are used to
00657       determine if the word is deletable.
00658     */
00659     if (!crunch_early_merge_tess_fails)
00660       word->merge_tess_fails();
00661     page_res_it.forward ();
00662   }
00663 }
00664
00665
00666 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
00667   int i;
00668   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00669   UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
00670   UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
00671   UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
00672   bool modified = false;
00673   for (i = 0; i < word_res->reject_map.length(); ++i) {
00674     if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
00675       word_res->best_choice->set_unichar_id(unichar_dash, i);
00676       modified = true;
00677       if (word_res->reject_map[i].accepted ())
00678         word_res->reject_map[i].setrej_unlv_rej ();
00679     }
00680     if (word_res->best_choice->unichar_id(i) == unichar_pow) {
00681       word_res->best_choice->set_unichar_id(unichar_space, i);
00682       modified = true;
00683       if (word_res->reject_map[i].accepted ())
00684         word_res->reject_map[i].setrej_unlv_rej ();
00685     }
00686   }
00687 }
00688
00689 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
00690   enum STATES
00691   {
00692     JUNK,
00693     FIRST_UPPER,
00694     FIRST_LOWER,
00695     FIRST_NUM,
00696     SUBSEQUENT_UPPER,
00697     SUBSEQUENT_LOWER,
00698     SUBSEQUENT_NUM
00699   };
00700   const char *str = word->best_choice->unichar_string().string();
00701   const char *lengths = word->best_choice->unichar_lengths().string();
00702   STATES state = JUNK;
00703   int len = 0;
00704   int isolated_digits = 0;
00705   int isolated_alphas = 0;
00706   int bad_char_count = 0;
00707   int tess_rejs = 0;
00708   int dodgy_chars = 0;
00709   int ok_chars;
00710   UNICHAR_ID last_char = -1;
00711   int alpha_repetition_count = 0;
00712   int longest_alpha_repetition_count = 0;
00713   int longest_lower_run_len = 0;
00714   int lower_string_count = 0;
00715   int longest_upper_run_len = 0;
00716   int upper_string_count = 0;
00717   int total_alpha_count = 0;
00718   int total_digit_count = 0;
00719
00720   for (; *str != '\0'; str += *(lengths++)) {
00721     len++;
00722     if (word->uch_set->get_isupper (str, *lengths)) {
00723       total_alpha_count++;
00724       switch (state) {
00725         case SUBSEQUENT_UPPER:
00726         case FIRST_UPPER:
00727           state = SUBSEQUENT_UPPER;
00728           upper_string_count++;
00729           if (longest_upper_run_len < upper_string_count)
00730             longest_upper_run_len = upper_string_count;
00731           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00732             alpha_repetition_count++;
00733             if (longest_alpha_repetition_count < alpha_repetition_count) {
00734               longest_alpha_repetition_count = alpha_repetition_count;
00735             }
00736           }
00737           else {
00738             last_char = word->uch_set->unichar_to_id(str, *lengths);
00739             alpha_repetition_count = 1;
00740           }
00741           break;
00742         case FIRST_NUM:
00743           isolated_digits++;
00744         default:
00745           state = FIRST_UPPER;
00746           last_char = word->uch_set->unichar_to_id(str, *lengths);
00747           alpha_repetition_count = 1;
00748           upper_string_count = 1;
00749           break;
00750       }
00751     }
00752     else if (word->uch_set->get_islower (str, *lengths)) {
00753       total_alpha_count++;
00754       switch (state) {
00755         case SUBSEQUENT_LOWER:
00756         case FIRST_LOWER:
00757           state = SUBSEQUENT_LOWER;
00758           lower_string_count++;
00759           if (longest_lower_run_len < lower_string_count)
00760             longest_lower_run_len = lower_string_count;
00761           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00762             alpha_repetition_count++;
00763             if (longest_alpha_repetition_count < alpha_repetition_count) {
00764               longest_alpha_repetition_count = alpha_repetition_count;
00765             }
00766           }
00767           else {
00768             last_char = word->uch_set->unichar_to_id(str, *lengths);
00769             alpha_repetition_count = 1;
00770           }
00771           break;
00772         case FIRST_NUM:
00773           isolated_digits++;
00774         default:
00775           state = FIRST_LOWER;
00776           last_char = word->uch_set->unichar_to_id(str, *lengths);
00777           alpha_repetition_count = 1;
00778           lower_string_count = 1;
00779           break;
00780       }
00781     }
00782     else if (word->uch_set->get_isdigit (str, *lengths)) {
00783       total_digit_count++;
00784       switch (state) {
00785         case FIRST_NUM:
00786           state = SUBSEQUENT_NUM;
00787         case SUBSEQUENT_NUM:
00788           break;
00789         case FIRST_UPPER:
00790         case FIRST_LOWER:
00791           isolated_alphas++;
00792         default:
00793           state = FIRST_NUM;
00794           break;
00795       }
00796     }
00797     else {
00798       if (*lengths == 1 && *str == ' ')
00799         tess_rejs++;
00800       else
00801         bad_char_count++;
00802       switch (state) {
00803         case FIRST_NUM:
00804           isolated_digits++;
00805           break;
00806         case FIRST_UPPER:
00807         case FIRST_LOWER:
00808           isolated_alphas++;
00809         default:
00810           break;
00811       }
00812       state = JUNK;
00813     }
00814   }
00815
00816   switch (state) {
00817     case FIRST_NUM:
00818       isolated_digits++;
00819       break;
00820     case FIRST_UPPER:
00821     case FIRST_LOWER:
00822       isolated_alphas++;
00823     default:
00824       break;
00825   }
00826
00827   if (crunch_include_numerals) {
00828     total_alpha_count += total_digit_count - isolated_digits;
00829   }
00830
00831   if (crunch_leave_ok_strings && len >= 4 &&
00832       2 * (total_alpha_count - isolated_alphas) > len &&
00833       longest_alpha_repetition_count < crunch_long_repetitions) {
00834     if ((crunch_accept_ok &&
00835          acceptable_word_string(*word->uch_set, str, lengths) !=
00836              AC_UNACCEPTABLE) ||
00837         longest_lower_run_len > crunch_leave_lc_strings ||
00838         longest_upper_run_len > crunch_leave_uc_strings)
00839       return G_NEVER_CRUNCH;
00840   }
00841   if (word->reject_map.length() > 1 &&
00842       strpbrk(str, " ") == NULL &&
00843       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00844        word->best_choice->permuter() == FREQ_DAWG_PERM ||
00845        word->best_choice->permuter() == USER_DAWG_PERM ||
00846        word->best_choice->permuter() == NUMBER_PERM ||
00847        acceptable_word_string(*word->uch_set, str, lengths) !=
00848            AC_UNACCEPTABLE || ok_dict_word))
00849     return G_OK;
00850
00851   ok_chars = len - bad_char_count - isolated_digits -
00852     isolated_alphas - tess_rejs;
00853
00854   if (crunch_debug > 3) {
00855     tprintf("garbage_word: \"%s\"\n",
00856             word->best_choice->unichar_string().string());
00857     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
00858             len,
00859             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
00860   }
00861   if (bad_char_count == 0 &&
00862       tess_rejs == 0 &&
00863       (len > isolated_digits + isolated_alphas || len <= 2))
00864     return G_OK;
00865
00866   if (tess_rejs > ok_chars ||
00867       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
00868     return G_TERRIBLE;
00869
00870   if (len > 4) {
00871     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
00872         isolated_alphas;
00873     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
00874       return G_DODGY;
00875     else
00876       return G_OK;
00877   } else {
00878     dodgy_chars = 2 * tess_rejs + bad_char_count;
00879     if ((len == 4 && dodgy_chars > 2) ||
00880         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
00881       return G_DODGY;
00882     else
00883       return G_OK;
00884   }
00885 }
00886
00887
00888 /*************************************************************************
00889  * word_deletable()
00890  *     DELETE WERDS AT ENDS OF ROWS IF
00891  *        Word is crunched &&
00892  *        ( string length = 0                                          OR
00893  *          > 50% of chars are "|" (before merging)                    OR
00894  *          certainty < -10                                            OR
00895  *          rating /char > 60                                          OR
00896  *          TOP of word is more than 0.5 xht BELOW baseline            OR
00897  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
00898  *          length of word < 3xht                                      OR
00899  *          height of word < 0.7 xht                                   OR
00900  *          height of word > 3.0 xht                                   OR
00901  *          >75% of the outline BBs have longest dimension < 0.5xht
00902  *************************************************************************/
00903
00904 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
00905   int word_len = word->reject_map.length ();
00906   float rating_per_ch;
00907   TBOX box;                       //BB of word
00908
00909   if (word->unlv_crunch_mode == CR_NONE) {
00910     delete_mode = 0;
00911     return CR_NONE;
00912   }
00913
00914   if (word_len == 0) {
00915     delete_mode = 1;
00916     return CR_DELETE;
00917   }
00918
00919   if (word->rebuild_word != NULL) {
00920     // Cube leaves rebuild_word NULL.
00921     box = word->rebuild_word->bounding_box();
00922     if (box.height () < crunch_del_min_ht * kBlnXHeight) {
00923       delete_mode = 4;
00924       return CR_DELETE;
00925     }
00926
00927     if (noise_outlines(word->rebuild_word)) {
00928       delete_mode = 5;
00929       return CR_DELETE;
00930     }
00931   }
00932
00933   if ((failure_count (word) * 1.5) > word_len) {
00934     delete_mode = 2;
00935     return CR_LOOSE_SPACE;
00936   }
00937
00938   if (word->best_choice->certainty () < crunch_del_cert) {
00939     delete_mode = 7;
00940     return CR_LOOSE_SPACE;
00941   }
00942
00943   rating_per_ch = word->best_choice->rating () / word_len;
00944
00945   if (rating_per_ch > crunch_del_rating) {
00946     delete_mode = 8;
00947     return CR_LOOSE_SPACE;
00948   }
00949
00950   if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
00951     delete_mode = 9;
00952     return CR_LOOSE_SPACE;
00953   }
00954
00955   if (box.bottom () >
00956   kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
00957     delete_mode = 10;
00958     return CR_LOOSE_SPACE;
00959   }
00960
00961   if (box.height () > crunch_del_max_ht * kBlnXHeight) {
00962     delete_mode = 11;
00963     return CR_LOOSE_SPACE;
00964   }
00965
00966   if (box.width () < crunch_del_min_width * kBlnXHeight) {
00967     delete_mode = 3;
00968     return CR_LOOSE_SPACE;
00969   }
00970
00971   delete_mode = 0;
00972   return CR_NONE;
00973 }
00974
00975 inT16 Tesseract::failure_count(WERD_RES *word) {
00976   const char *str = word->best_choice->unichar_string().string();
00977   int tess_rejs = 0;
00978
00979   for (; *str != '\0'; str++) {
00980     if (*str == ' ')
00981       tess_rejs++;
00982   }
00983   return tess_rejs;
00984 }
00985
00986
00987 BOOL8 Tesseract::noise_outlines(TWERD *word) {
00988   TBOX box;                       // BB of outline
00989   inT16 outline_count = 0;
00990   inT16 small_outline_count = 0;
00991   inT16 max_dimension;
00992   float small_limit = kBlnXHeight * crunch_small_outlines_size;
00993
00994   for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
00995     for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
00996       outline_count++;
00997       box = ol->bounding_box();
00998       if (box.height() > box.width())
00999         max_dimension = box.height();
01000       else
01001         max_dimension = box.width();
01002       if (max_dimension < small_limit)
01003         small_outline_count++;
01004     }
01005   }
01006   return (small_outline_count >= outline_count);
01007 }
01008 }  // namespace tesseract