tesseract-doc/tesseractclass_8cpp_source.html

00001
00002 // File:        tesseractclass.cpp
00003 // Description: An instance of Tesseract. For thread safety, *every*
00004 //              global variable goes in here, directly, or indirectly.
00005 // Author:      Ray Smith
00006 // Created:     Fri Mar 07 08:17:01 PST 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020
00021 #include "tesseractclass.h"
00022
00023 #include "allheaders.h"
00024 #include "cube_reco_context.h"
00025 #include "edgblob.h"
00026 #include "equationdetect.h"
00027 #include "globals.h"
00028 #include "tesseract_cube_combiner.h"
00029
00030 // Include automatically generated configuration file if running autoconf.
00031 #ifdef HAVE_CONFIG_H
00032 #include "config_auto.h"
00033 #endif
00034
00035 namespace tesseract {
00036
00037 Tesseract::Tesseract()
00038   : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
00039                 "Take segmentation and labeling from box file",
00040                 this->params()),
00041     BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
00042                 "Conversion of word/line box file to char box file",
00043                 this->params()),
00044     BOOL_MEMBER(tessedit_train_from_boxes, false,
00045                 "Generate training data from boxed chars", this->params()),
00046     BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
00047                 "Generate more boxes from boxed chars", this->params()),
00048     BOOL_MEMBER(tessedit_dump_pageseg_images, false,
00049                "Dump intermediate images made during page segmentation",
00050                this->params()),
00051     // The default for pageseg_mode is the old behaviour, so as not to
00052     // upset anything that relies on that.
00053     INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
00054                "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
00055                " 5=line, 6=word, 7=char"
00056                " (Values from PageSegMode enum in publictypes.h)",
00057                this->params()),
00058     INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
00059                     "Which OCR engine(s) to run (Tesseract, Cube, both)."
00060                     " Defaults to loading and running only Tesseract"
00061                     " (no Cube,no combiner)."
00062                     " Values from OcrEngineMode enum in tesseractclass.h)",
00063                this->params()),
00064     STRING_MEMBER(tessedit_char_blacklist, "",
00065                   "Blacklist of chars not to recognize", this->params()),
00066     STRING_MEMBER(tessedit_char_whitelist, "",
00067                   "Whitelist of chars to recognize", this->params()),
00068     BOOL_MEMBER(tessedit_ambigs_training, false,
00069                 "Perform training for ambiguities", this->params()),
00070     INT_MEMBER(pageseg_devanagari_split_strategy,
00071               tesseract::ShiroRekhaSplitter::NO_SPLIT,
00072               "Whether to use the top-line splitting process for Devanagari "
00073               "documents while performing page-segmentation.", this->params()),
00074     INT_MEMBER(ocr_devanagari_split_strategy,
00075               tesseract::ShiroRekhaSplitter::NO_SPLIT,
00076               "Whether to use the top-line splitting process for Devanagari "
00077               "documents while performing ocr.", this->params()),
00078     STRING_MEMBER(tessedit_write_params_to_file, "",
00079                   "Write all parameters to the given file.", this->params()),
00080     BOOL_MEMBER(tessedit_adapt_to_char_fragments, true,
00081                 "Adapt to words that contain "
00082                 " a character composed form fragments", this->params()),
00083     BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
00084                 " information for adaption", this->params()),
00085     INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
00086     INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
00087     INT_MEMBER(applybox_page, 0,
00088                "Page number to apply boxes from", this->params()),
00089     STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
00090                   " this pattern in the image filename. The name of the image"
00091                   " files are expected to be in the form"
00092                   " [lang].[fontname].exp[num].tif", this->params()),
00093     BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
00094                "Learn both character fragments (as is done in the"
00095                " special low exposure mode) as well as unfragmented"
00096                " characters.", this->params()),
00097     BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
00098                 " is assumed to contain ngrams. Only learn the ngrams"
00099                 " whose outlines overlap horizontally.", this->params()),
00100     BOOL_MEMBER(tessedit_display_outwords, false,
00101                 "Draw output words", this->params()),
00102     BOOL_MEMBER(tessedit_training_tess, false,
00103                 "Call Tess to learn blobs", this->params()),
00104     BOOL_MEMBER(tessedit_dump_choices, false,
00105                 "Dump char choices", this->params()),
00106     BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
00107                 "Try to improve fuzzy spaces", this->params()),
00108     BOOL_MEMBER(tessedit_unrej_any_wd, false,
00109                 "Dont bother with word plausibility", this->params()),
00110     BOOL_MEMBER(tessedit_fix_hyphens, true,
00111                 "Crunch double hyphens?", this->params()),
00112     BOOL_MEMBER(tessedit_redo_xheight, true,
00113                 "Check/Correct x-height", this->params()),
00114     BOOL_MEMBER(tessedit_enable_doc_dict, true,
00115                 "Add words to the document dictionary", this->params()),
00116     BOOL_MEMBER(tessedit_debug_fonts, false,
00117                 "Output font info per char", this->params()),
00118     BOOL_MEMBER(tessedit_debug_block_rejection, false,
00119                 "Block and Row stats", this->params()),
00120     BOOL_MEMBER(tessedit_enable_bigram_correction, false,
00121                 "Enable correction based on the word bigram dictionary.",
00122                 this->params()),
00123     INT_MEMBER(tessedit_bigram_debug, 0,
00124                "Amount of debug output for bigram correction.",
00125                this->params()),
00126     INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
00127     BOOL_MEMBER(debug_acceptable_wds, false,
00128                 "Dump word pass/fail chk", this->params()),
00129     STRING_MEMBER(chs_leading_punct, "('`\"",
00130                   "Leading punctuation", this->params()),
00131     STRING_MEMBER(chs_trailing_punct1, ").,;:?!",
00132                   "1st Trailing punctuation", this->params()),
00133     STRING_MEMBER(chs_trailing_punct2, ")'`\"",
00134                   "2nd Trailing punctuation", this->params()),
00135     double_MEMBER(quality_rej_pc, 0.08,
00136                   "good_quality_doc lte rejection limit", this->params()),
00137     double_MEMBER(quality_blob_pc, 0.0,
00138                   "good_quality_doc gte good blobs limit", this->params()),
00139     double_MEMBER(quality_outline_pc, 1.0,
00140                   "good_quality_doc lte outline error limit", this->params()),
00141     double_MEMBER(quality_char_pc, 0.95,
00142                   "good_quality_doc gte good char limit", this->params()),
00143     INT_MEMBER(quality_min_initial_alphas_reqd, 2,
00144                "alphas in a good word", this->params()),
00145     BOOL_MEMBER(tessedit_tess_adapt_to_rejmap, false,
00146                 "Use reject map to control Tesseract adaption", this->params()),
00147     INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
00148                "Adaptation decision algorithm for tess", this->params()),
00149     BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
00150                 "Do minimal rejection on pass 1 output", this->params()),
00151     BOOL_MEMBER(tessedit_test_adaption, false,
00152                 "Test adaption criteria", this->params()),
00153     BOOL_MEMBER(tessedit_matcher_log, false,
00154                 "Log matcher activity", this->params()),
00155     INT_MEMBER(tessedit_test_adaption_mode, 3,
00156                "Adaptation decision algorithm for tess", this->params()),
00157     BOOL_MEMBER(save_blob_choices, false,
00158                 "Save the results of the recognition step (blob_choices)"
00159                 " within the corresponding WERD_CHOICE", this->params()),
00160     BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
00161     double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
00162     double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
00163     INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
00164                this->params()),
00165     INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
00166     STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
00167                   this->params()),
00168     STRING_MEMBER(outlines_2, "ij!?%\":;",
00169                   "Non standard number of outlines", this->params()),
00170     BOOL_MEMBER(docqual_excuse_outline_errs, false,
00171                 "Allow outline errs in unrejection?", this->params()),
00172     BOOL_MEMBER(tessedit_good_quality_unrej, true,
00173                 "Reduce rejection on good docs", this->params()),
00174     BOOL_MEMBER(tessedit_use_reject_spaces, true,
00175                 "Reject spaces?", this->params()),
00176     double_MEMBER(tessedit_reject_doc_percent, 65.00,
00177                   "%rej allowed before rej whole doc", this->params()),
00178     double_MEMBER(tessedit_reject_block_percent, 45.00,
00179                   "%rej allowed before rej whole block", this->params()),
00180     double_MEMBER(tessedit_reject_row_percent, 40.00,
00181                 "%rej allowed before rej whole row", this->params()),
00182     double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
00183                   "Number of row rejects in whole word rejects"
00184                   "which prevents whole row rejection", this->params()),
00185     BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
00186                 "Only rej partially rejected words in block rejection",
00187                 this->params()),
00188     BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
00189                 "Only rej partially rejected words in row rejection",
00190                 this->params()),
00191     BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
00192                 "Use word segmentation quality metric", this->params()),
00193     BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
00194                 "Use word segmentation quality metric", this->params()),
00195     INT_MEMBER(tessedit_preserve_min_wd_len, 2,
00196                "Only preserve wds longer than this", this->params()),
00197     BOOL_MEMBER(tessedit_row_rej_good_docs, true,
00198                 "Apply row rejection to good docs", this->params()),
00199     double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
00200                   "rej good doc wd if more than this fraction rejected",
00201                   this->params()),
00202     BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
00203                 "Reject all bad quality wds", this->params()),
00204     BOOL_MEMBER(tessedit_debug_doc_rejection, false,
00205                 "Page stats", this->params()),
00206     BOOL_MEMBER(tessedit_debug_quality_metrics, false,
00207                 "Output data to debug file", this->params()),
00208     BOOL_MEMBER(bland_unrej, false,
00209                 "unrej potential with no chekcs", this->params()),
00210     double_MEMBER(quality_rowrej_pc, 1.1,
00211                   "good_quality_doc gte good char limit", this->params()),
00212     BOOL_MEMBER(unlv_tilde_crunching, true,
00213                 "Mark v.bad words for tilde crunch", this->params()),
00214     BOOL_MEMBER(crunch_early_merge_tess_fails, true,
00215                 "Before word crunch?", this->params()),
00216     BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
00217                 "Take out ~^ early?", this->params()),
00218     double_MEMBER(crunch_terrible_rating, 80.0,
00219                   "crunch rating lt this", this->params()),
00220     BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
00221     double_MEMBER(crunch_poor_garbage_cert, -9.0,
00222                   "crunch garbage cert lt this", this->params()),
00223     double_MEMBER(crunch_poor_garbage_rate, 60,
00224                   "crunch garbage rating lt this", this->params()),
00225     double_MEMBER(crunch_pot_poor_rate, 40,
00226                   "POTENTIAL crunch rating lt this", this->params()),
00227     double_MEMBER(crunch_pot_poor_cert, -8.0,
00228                   "POTENTIAL crunch cert lt this", this->params()),
00229     BOOL_MEMBER(crunch_pot_garbage, true,
00230                 "POTENTIAL crunch garbage", this->params()),
00231     double_MEMBER(crunch_del_rating, 60,
00232                   "POTENTIAL crunch rating lt this", this->params()),
00233     double_MEMBER(crunch_del_cert, -10.0,
00234                   "POTENTIAL crunch cert lt this", this->params()),
00235     double_MEMBER(crunch_del_min_ht, 0.7,
00236                   "Del if word ht lt xht x this", this->params()),
00237     double_MEMBER(crunch_del_max_ht, 3.0,
00238                   "Del if word ht gt xht x this", this->params()),
00239     double_MEMBER(crunch_del_min_width, 3.0,
00240                   "Del if word width lt xht x this", this->params()),
00241     double_MEMBER(crunch_del_high_word, 1.5,
00242                   "Del if word gt xht x this above bl", this->params()),
00243     double_MEMBER(crunch_del_low_word, 0.5,
00244                   "Del if word gt xht x this below bl", this->params()),
00245     double_MEMBER(crunch_small_outlines_size, 0.6,
00246                   "Small if lt xht x this", this->params()),
00247     INT_MEMBER(crunch_rating_max, 10,
00248                "For adj length in rating per ch", this->params()),
00249     INT_MEMBER(crunch_pot_indicators, 1,
00250                "How many potential indicators needed", this->params()),
00251     BOOL_MEMBER(crunch_leave_ok_strings, true,
00252                 "Dont touch sensible strings", this->params()),
00253     BOOL_MEMBER(crunch_accept_ok, true,
00254                 "Use acceptability in okstring", this->params()),
00255     BOOL_MEMBER(crunch_leave_accept_strings, false,
00256                 "Dont pot crunch sensible strings", this->params()),
00257     BOOL_MEMBER(crunch_include_numerals, false,
00258                 "Fiddle alpha figures", this->params()),
00259     INT_MEMBER(crunch_leave_lc_strings, 4,
00260                "Dont crunch words with long lower case strings",
00261                this->params()),
00262     INT_MEMBER(crunch_leave_uc_strings, 4,
00263                "Dont crunch words with long lower case strings",
00264                this->params()),
00265     INT_MEMBER(crunch_long_repetitions, 3,
00266                "Crunch words with long repetitions", this->params()),
00267     INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
00268     INT_MEMBER(fixsp_non_noise_limit, 1,
00269                "How many non-noise blbs either side?", this->params()),
00270     double_MEMBER(fixsp_small_outlines_size, 0.28,
00271                   "Small if lt xht x this", this->params()),
00272     BOOL_MEMBER(tessedit_prefer_joined_punct, false,
00273                 "Reward punctation joins", this->params()),
00274     INT_MEMBER(fixsp_done_mode, 1,
00275                "What constitues done for spacing", this->params()),
00276     INT_MEMBER(debug_fix_space_level, 0,
00277                "Contextual fixspace debug", this->params()),
00278     STRING_MEMBER(numeric_punctuation, ".,",
00279                   "Punct. chs expected WITHIN numbers", this->params()),
00280     INT_MEMBER(x_ht_acceptance_tolerance, 8,
00281                "Max allowed deviation of blob top outside of font data",
00282                this->params()),
00283     INT_MEMBER(x_ht_min_change, 8,
00284                "Min change in xht before actually trying it", this->params()),
00285     BOOL_MEMBER(tessedit_write_block_separators, false,
00286                 "Write block separators in output", this->params()),
00287     BOOL_MEMBER(tessedit_write_rep_codes, false,
00288                 "Write repetition char code", this->params()),
00289     BOOL_MEMBER(tessedit_write_unlv, false,
00290                 "Write .unlv output file", this->params()),
00291     BOOL_MEMBER(tessedit_create_hocr, false,
00292                 "Write .html hOCR output file", this->params()),
00293     STRING_MEMBER(unrecognised_char, "|",
00294                   "Output char for unidentified blobs", this->params()),
00295     INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
00296     INT_MEMBER(suspect_space_level, 100,
00297                "Min suspect level for rejecting spaces", this->params()),
00298     INT_MEMBER(suspect_short_words, 2,
00299                "Dont Suspect dict wds longer than this", this->params()),
00300     BOOL_MEMBER(suspect_constrain_1Il, false,
00301                 "UNLV keep 1Il chars rejected", this->params()),
00302     double_MEMBER(suspect_rating_per_ch, 999.9,
00303                   "Dont touch bad rating limit", this->params()),
00304     double_MEMBER(suspect_accept_rating, -999.9,
00305                   "Accept good rating limit", this->params()),
00306     BOOL_MEMBER(tessedit_minimal_rejection, false,
00307                 "Only reject tess failures", this->params()),
00308     BOOL_MEMBER(tessedit_zero_rejection, false,
00309                 "Dont reject ANYTHING", this->params()),
00310     BOOL_MEMBER(tessedit_word_for_word, false,
00311                 "Make output have exactly one word per WERD", this->params()),
00312     BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
00313                 "Dont reject ANYTHING AT ALL", this->params()),
00314     BOOL_MEMBER(tessedit_consistent_reps, true,
00315                 "Force all rep chars the same", this->params()),
00316     INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
00317     INT_MEMBER(tessedit_ok_mode, 5,
00318                "Acceptance decision algorithm", this->params()),
00319     BOOL_MEMBER(tessedit_rejection_debug, false,
00320                 "Adaption debug", this->params()),
00321     BOOL_MEMBER(tessedit_flip_0O, true,
00322                 "Contextual 0O O0 flips", this->params()),
00323     double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
00324                   "Aspect ratio dot/hyphen test", this->params()),
00325     double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
00326                   "Aspect ratio dot/hyphen test", this->params()),
00327     BOOL_MEMBER(rej_trust_doc_dawg, false,
00328                 "Use DOC dawg in 11l conf. detector", this->params()),
00329     BOOL_MEMBER(rej_1Il_use_dict_word, false,
00330                 "Use dictword test", this->params()),
00331     BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
00332                 "Dont double check", this->params()),
00333     BOOL_MEMBER(rej_use_tess_accepted, true,
00334                 "Individual rejection control", this->params()),
00335     BOOL_MEMBER(rej_use_tess_blanks, true,
00336                 "Individual rejection control", this->params()),
00337     BOOL_MEMBER(rej_use_good_perm, true,
00338                 "Individual rejection control", this->params()),
00339     BOOL_MEMBER(rej_use_sensible_wd, false,
00340                 "Extend permuter check", this->params()),
00341     BOOL_MEMBER(rej_alphas_in_number_perm, false,
00342                 "Extend permuter check", this->params()),
00343     double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
00344                   "if >this fract", this->params()),
00345     INT_MEMBER(tessedit_image_border, 2,
00346                "Rej blbs near image edge limit", this->params()),
00347     STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
00348                   "Allow NN to unrej", this->params()),
00349     STRING_MEMBER(conflict_set_I_l_1, "Il1[]",
00350                   "Il1 conflict set", this->params()),
00351     INT_MEMBER(min_sane_x_ht_pixels, 8,
00352                "Reject any x-ht lt or eq than this", this->params()),
00353     BOOL_MEMBER(tessedit_create_boxfile, false,
00354                 "Output text with boxes", this->params()),
00355     INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
00356                " , else specifc page to process", this->params()),
00357     BOOL_MEMBER(tessedit_write_images, false,
00358                 "Capture the image from the IPE", this->params()),
00359     BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
00360                 this->params()),
00361     STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
00362     BOOL_MEMBER(tessedit_override_permuter, true,
00363                 "According to dict_word", this->params()),
00364     INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for"
00365                " TessdataManager functions.", this->params()),
00366     STRING_MEMBER(tessedit_load_sublangs, "",
00367                   "List of languages to load with this one", this->params()),
00368     double_MEMBER(min_orientation_margin, 7.0,
00369                   "Min acceptable orientation margin", this->params()),
00370     BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
00371                 this->params()),
00372     BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
00373                 this->params()),
00374     BOOL_INIT_MEMBER(tessedit_init_config_only, false,
00375                      "Only initialize with the config file. Useful if the "
00376                      "instance is not going to be used for OCR but say only "
00377                      "for layout analysis.", this->params()),
00378     BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
00379                 this->params()),
00380     backup_config_file_(NULL),
00381     pix_binary_(NULL),
00382     cube_binary_(NULL),
00383     pix_grey_(NULL),
00384     source_resolution_(0),
00385     textord_(this),
00386     right_to_left_(false),
00387     scaled_color_(NULL),
00388     scaled_factor_(-1),
00389     deskew_(1.0f, 0.0f),
00390     reskew_(1.0f, 0.0f),
00391     most_recently_used_(this),
00392     font_table_size_(0),
00393     cube_cntxt_(NULL),
00394     tess_cube_combiner_(NULL),
00395     equ_detect_(NULL) {
00396 }
00397
00398 Tesseract::~Tesseract() {
00399   Clear();
00400   end_tesseract();
00401   sub_langs_.delete_data_pointers();
00402   // Delete cube objects.
00403   if (cube_cntxt_ != NULL) {
00404     delete cube_cntxt_;
00405     cube_cntxt_ = NULL;
00406   }
00407   if (tess_cube_combiner_ != NULL) {
00408     delete tess_cube_combiner_;
00409     tess_cube_combiner_ = NULL;
00410   }
00411 }
00412
00413 void Tesseract::Clear() {
00414   pixDestroy(&pix_binary_);
00415   pixDestroy(&cube_binary_);
00416   pixDestroy(&pix_grey_);
00417   pixDestroy(&scaled_color_);
00418   deskew_ = FCOORD(1.0f, 0.0f);
00419   reskew_ = FCOORD(1.0f, 0.0f);
00420   splitter_.Clear();
00421   scaled_factor_ = -1;
00422   ResetFeaturesHaveBeenExtracted();
00423   for (int i = 0; i < sub_langs_.size(); ++i)
00424     sub_langs_[i]->Clear();
00425 }
00426
00427 void Tesseract::SetEquationDetect(EquationDetect* detector) {
00428   equ_detect_ = detector;
00429   equ_detect_->SetLangTesseract(this);
00430 }
00431
00432 // Clear all memory of adaption for this and all subclassifiers.
00433 void Tesseract::ResetAdaptiveClassifier() {
00434   ResetAdaptiveClassifierInternal();
00435   for (int i = 0; i < sub_langs_.size(); ++i) {
00436     sub_langs_[i]->ResetAdaptiveClassifierInternal();
00437   }
00438 }
00439
00440 // Clear the document dictionary for this and all subclassifiers.
00441 void Tesseract::ResetDocumentDictionary() {
00442   getDict().ResetDocumentDictionary();
00443   for (int i = 0; i < sub_langs_.size(); ++i) {
00444     sub_langs_[i]->getDict().ResetDocumentDictionary();
00445   }
00446 }
00447
00448 void Tesseract::SetBlackAndWhitelist() {
00449   // Set the white and blacklists (if any)
00450   unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
00451                                      tessedit_char_whitelist.string());
00452   // Black and white lists should apply to all loaded classifiers.
00453   for (int i = 0; i < sub_langs_.size(); ++i) {
00454     sub_langs_[i]->unicharset.set_black_and_whitelist(
00455         tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
00456   }
00457 }
00458
00459 // Perform steps to prepare underlying binary image/other data structures for
00460 // page segmentation.
00461 void Tesseract::PrepareForPageseg() {
00462   textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
00463   pixDestroy(&cube_binary_);
00464   cube_binary_ = pixClone(pix_binary());
00465   // Find the max splitter strategy over all langs.
00466   ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
00467       static_cast<ShiroRekhaSplitter::SplitStrategy>(
00468       static_cast<inT32>(pageseg_devanagari_split_strategy));
00469   for (int i = 0; i < sub_langs_.size(); ++i) {
00470     ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
00471         static_cast<ShiroRekhaSplitter::SplitStrategy>(
00472         static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
00473     if (pageseg_strategy > max_pageseg_strategy)
00474       max_pageseg_strategy = pageseg_strategy;
00475     // Clone the cube image to all the sub langs too.
00476     pixDestroy(&sub_langs_[i]->cube_binary_);
00477     sub_langs_[i]->cube_binary_ = pixClone(pix_binary());
00478     pixDestroy(&sub_langs_[i]->pix_binary_);
00479     sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
00480   }
00481   // Perform shiro-rekha (top-line) splitting and replace the current image by
00482   // the newly splitted image.
00483   splitter_.set_orig_pix(pix_binary());
00484   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
00485   if (splitter_.Split(true)) {
00486     ASSERT_HOST(splitter_.splitted_image());
00487     pixDestroy(&pix_binary_);
00488     pix_binary_ = pixClone(splitter_.splitted_image());
00489   }
00490 }
00491
00492 // Perform steps to prepare underlying binary image/other data structures for
00493 // OCR. The current segmentation is required by this method.
00494 // Note that this method resets pix_binary_ to the original binarized image,
00495 // which may be different from the image actually used for OCR depending on the
00496 // value of devanagari_ocr_split_strategy.
00497 void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list,
00498                                   Tesseract* osd_tess, OSResults* osr) {
00499   // Find the max splitter strategy over all langs.
00500   ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
00501       static_cast<ShiroRekhaSplitter::SplitStrategy>(
00502       static_cast<inT32>(ocr_devanagari_split_strategy));
00503   for (int i = 0; i < sub_langs_.size(); ++i) {
00504     ShiroRekhaSplitter::SplitStrategy ocr_strategy =
00505         static_cast<ShiroRekhaSplitter::SplitStrategy>(
00506         static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
00507     if (ocr_strategy > max_ocr_strategy)
00508       max_ocr_strategy = ocr_strategy;
00509   }
00510   // Utilize the segmentation information available.
00511   splitter_.set_segmentation_block_list(block_list);
00512   splitter_.set_ocr_split_strategy(max_ocr_strategy);
00513   // Run the splitter for OCR
00514   bool split_for_ocr = splitter_.Split(false);
00515   // Restore pix_binary to the binarized original pix for future reference.
00516   ASSERT_HOST(splitter_.orig_pix());
00517   pixDestroy(&pix_binary_);
00518   pix_binary_ = pixClone(splitter_.orig_pix());
00519   // If the pageseg and ocr strategies are different, refresh the block list
00520   // (from the last SegmentImage call) with blobs from the real image to be used
00521   // for OCR.
00522   if (splitter_.HasDifferentSplitStrategies()) {
00523     BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
00524                 pixGetHeight(pix_binary_));
00525     Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
00526         splitter_.orig_pix();
00527     extract_edges(pix_for_ocr, &block);
00528     splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
00529   }
00530   // The splitter isn't needed any more after this, so save memory by clearing.
00531   splitter_.Clear();
00532 }
00533
00534 }  // namespace tesseract