Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: baseapi.cpp 00003 * Description: Simple API for calling tesseract. 00004 * Author: Ray Smith 00005 * Created: Fri Oct 06 15:35:01 PDT 2006 00006 * 00007 * (C) Copyright 2006, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // Include automatically generated configuration file if running autoconf. 
00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #include "allheaders.h" 00026 00027 #ifdef USING_GETTEXT 00028 #include <libintl.h> 00029 #include <locale.h> 00030 #define _(x) gettext(x) 00031 #else 00032 #define _(x) (x) 00033 #endif 00034 00035 #include "baseapi.h" 00036 00037 #include "resultiterator.h" 00038 #include "mutableiterator.h" 00039 #include "thresholder.h" 00040 #include "tesseractclass.h" 00041 #include "pageres.h" 00042 #include "paragraphs.h" 00043 #include "tessvars.h" 00044 #include "control.h" 00045 #include "pgedit.h" 00046 #include "paramsd.h" 00047 #include "output.h" 00048 #include "globals.h" 00049 #include "edgblob.h" 00050 #include "equationdetect.h" 00051 #include "tessbox.h" 00052 #include "imgs.h" 00053 #include "imgtiff.h" 00054 #include "makerow.h" 00055 #include "permute.h" 00056 #include "otsuthr.h" 00057 #include "osdetect.h" 00058 #include "params.h" 00059 00060 #if defined(_WIN32) && !defined(VERSION) 00061 #include "version.h" 00062 #endif 00063 00064 namespace tesseract { 00065 00067 const int kMinRectSize = 10; 00069 const char kTesseractReject = '~'; 00071 const char kUNLVReject = '~'; 00073 const char kUNLVSuspect = '^'; 00078 const char* kInputFile = "noname.tif"; 00080 const char* kOldVarsFile = "failed_vars.txt"; 00082 const int kMaxIntSize = 22; 00087 const int kMinCredibleResolution = 70; 00089 const int kMaxCredibleResolution = 2400; 00090 00091 TessBaseAPI::TessBaseAPI() 00092 : tesseract_(NULL), 00093 osd_tesseract_(NULL), 00094 equ_detect_(NULL), 00095 // Thresholder is initialized to NULL here, but will be set before use by: 00096 // A constructor of a derived API, SetThresholder(), or 00097 // created implicitly when used in InternalSetImage. 
00098 thresholder_(NULL), 00099 paragraph_models_(NULL), 00100 block_list_(NULL), 00101 page_res_(NULL), 00102 input_file_(NULL), 00103 output_file_(NULL), 00104 datapath_(NULL), 00105 language_(NULL), 00106 last_oem_requested_(OEM_DEFAULT), 00107 recognition_done_(false), 00108 truth_cb_(NULL), 00109 rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), 00110 image_width_(0), image_height_(0) { 00111 } 00112 00113 TessBaseAPI::~TessBaseAPI() { 00114 End(); 00115 } 00116 00120 const char* TessBaseAPI::Version() { 00121 return VERSION; 00122 } 00123 00128 void TessBaseAPI::SetInputName(const char* name) { 00129 if (input_file_ == NULL) 00130 input_file_ = new STRING(name); 00131 else 00132 *input_file_ = name; 00133 } 00134 00136 void TessBaseAPI::SetOutputName(const char* name) { 00137 if (output_file_ == NULL) 00138 output_file_ = new STRING(name); 00139 else 00140 *output_file_ = name; 00141 } 00142 00143 bool TessBaseAPI::SetVariable(const char* name, const char* value) { 00144 if (tesseract_ == NULL) tesseract_ = new Tesseract; 00145 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, 00146 tesseract_->params()); 00147 } 00148 00149 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) { 00150 if (tesseract_ == NULL) tesseract_ = new Tesseract; 00151 return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00152 tesseract_->params()); 00153 } 00154 00155 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { 00156 IntParam *p = ParamUtils::FindParam<IntParam>( 00157 name, GlobalParams()->int_params, tesseract_->params()->int_params); 00158 if (p == NULL) return false; 00159 *value = (inT32)(*p); 00160 return true; 00161 } 00162 00163 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { 00164 BoolParam *p = ParamUtils::FindParam<BoolParam>( 00165 name, GlobalParams()->bool_params, tesseract_->params()->bool_params); 00166 if (p == NULL) return false; 00167 
*value = (BOOL8)(*p); 00168 return true; 00169 } 00170 00171 const char *TessBaseAPI::GetStringVariable(const char *name) const { 00172 StringParam *p = ParamUtils::FindParam<StringParam>( 00173 name, GlobalParams()->string_params, tesseract_->params()->string_params); 00174 return (p != NULL) ? p->string() : NULL; 00175 } 00176 00177 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { 00178 DoubleParam *p = ParamUtils::FindParam<DoubleParam>( 00179 name, GlobalParams()->double_params, tesseract_->params()->double_params); 00180 if (p == NULL) return false; 00181 *value = (double)(*p); 00182 return true; 00183 } 00184 00186 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) { 00187 return ParamUtils::GetParamAsString(name, tesseract_->params(), val); 00188 } 00189 00191 void TessBaseAPI::PrintVariables(FILE *fp) const { 00192 ParamUtils::PrintParams(fp, tesseract_->params()); 00193 } 00194 00203 int TessBaseAPI::Init(const char* datapath, const char* language, 00204 OcrEngineMode oem, char **configs, int configs_size, 00205 const GenericVector<STRING> *vars_vec, 00206 const GenericVector<STRING> *vars_values, 00207 bool set_only_non_debug_params) { 00208 // Default language is "eng". 00209 if (language == NULL) language = "eng"; 00210 // If the datapath, OcrEngineMode or the language have changed - start again. 00211 // Note that the language_ field stores the last requested language that was 00212 // initialized successfully, while tesseract_->lang stores the language 00213 // actually used. They differ only if the requested language was NULL, in 00214 // which case tesseract_->lang is set to the Tesseract default ("eng"). 
00215 if (tesseract_ != NULL && 00216 (datapath_ == NULL || language_ == NULL || 00217 *datapath_ != datapath || last_oem_requested_ != oem || 00218 (*language_ != language && tesseract_->lang != language))) { 00219 delete tesseract_; 00220 tesseract_ = NULL; 00221 } 00222 00223 bool reset_classifier = true; 00224 if (tesseract_ == NULL) { 00225 reset_classifier = false; 00226 tesseract_ = new Tesseract; 00227 if (tesseract_->init_tesseract( 00228 datapath, output_file_ != NULL ? output_file_->string() : NULL, 00229 language, oem, configs, configs_size, vars_vec, vars_values, 00230 set_only_non_debug_params) != 0) { 00231 return -1; 00232 } 00233 } 00234 // Update datapath and language requested for the last valid initialization. 00235 if (datapath_ == NULL) 00236 datapath_ = new STRING(datapath); 00237 else 00238 *datapath_ = datapath; 00239 if (language_ == NULL) 00240 language_ = new STRING(language); 00241 else 00242 *language_ = language; 00243 last_oem_requested_ = oem; 00244 00245 // For same language and datapath, just reset the adaptive classifier. 00246 if (reset_classifier) tesseract_->ResetAdaptiveClassifier(); 00247 00248 return 0; 00249 } 00250 00259 const char* TessBaseAPI::GetInitLanguagesAsString() const { 00260 return (language_ == NULL || language_->string() == NULL) ? 
00261 "" : language_->string(); 00262 } 00263 00269 void TessBaseAPI::GetLoadedLanguagesAsVector( 00270 GenericVector<STRING>* langs) const { 00271 langs->clear(); 00272 if (tesseract_ != NULL) { 00273 langs->push_back(tesseract_->lang); 00274 int num_subs = tesseract_->num_sub_langs(); 00275 for (int i = 0; i < num_subs; ++i) 00276 langs->push_back(tesseract_->get_sub_lang(i)->lang); 00277 } 00278 } 00279 00286 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { 00287 if (tesseract_ == NULL) 00288 tesseract_ = new Tesseract; 00289 return tesseract_->init_tesseract_lm(datapath, NULL, language); 00290 } 00291 00296 void TessBaseAPI::InitForAnalysePage() { 00297 if (tesseract_ == NULL) { 00298 tesseract_ = new Tesseract; 00299 tesseract_->InitAdaptiveClassifier(false); 00300 } 00301 } 00302 00308 void TessBaseAPI::ReadConfigFile(const char* filename) { 00309 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY); 00310 } 00311 00313 void TessBaseAPI::ReadDebugConfigFile(const char* filename) { 00314 tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY); 00315 } 00316 00322 void TessBaseAPI::SetPageSegMode(PageSegMode mode) { 00323 if (tesseract_ == NULL) 00324 tesseract_ = new Tesseract; 00325 tesseract_->tessedit_pageseg_mode.set_value(mode); 00326 } 00327 00329 PageSegMode TessBaseAPI::GetPageSegMode() const { 00330 if (tesseract_ == NULL) 00331 return PSM_SINGLE_BLOCK; 00332 return static_cast<PageSegMode>( 00333 static_cast<int>(tesseract_->tessedit_pageseg_mode)); 00334 } 00335 00349 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, 00350 int bytes_per_pixel, 00351 int bytes_per_line, 00352 int left, int top, 00353 int width, int height) { 00354 if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize) 00355 return NULL; // Nothing worth doing. 00356 00357 // Since this original api didn't give the exact size of the image, 00358 // we have to invent a reasonable value. 
00359 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; 00360 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, 00361 bytes_per_pixel, bytes_per_line); 00362 SetRectangle(left, top, width, height); 00363 00364 return GetUTF8Text(); 00365 } 00366 00371 void TessBaseAPI::ClearAdaptiveClassifier() { 00372 if (tesseract_ == NULL) 00373 return; 00374 tesseract_->ResetAdaptiveClassifier(); 00375 tesseract_->ResetDocumentDictionary(); 00376 } 00377 00387 void TessBaseAPI::SetImage(const unsigned char* imagedata, 00388 int width, int height, 00389 int bytes_per_pixel, int bytes_per_line) { 00390 if (InternalSetImage()) 00391 thresholder_->SetImage(imagedata, width, height, 00392 bytes_per_pixel, bytes_per_line); 00393 } 00394 00395 void TessBaseAPI::SetSourceResolution(int ppi) { 00396 if (thresholder_) 00397 thresholder_->SetSourceYResolution(ppi); 00398 else 00399 tprintf("Please call SetImage before SetSourceResolution.\n"); 00400 } 00401 00412 void TessBaseAPI::SetImage(const Pix* pix) { 00413 if (InternalSetImage()) 00414 thresholder_->SetImage(pix); 00415 } 00416 00422 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { 00423 if (thresholder_ == NULL) 00424 return; 00425 thresholder_->SetRectangle(left, top, width, height); 00426 ClearResults(); 00427 } 00428 00433 Pix* TessBaseAPI::GetThresholdedImage() { 00434 if (tesseract_ == NULL) 00435 return NULL; 00436 if (tesseract_->pix_binary() == NULL) 00437 Threshold(tesseract_->mutable_pix_binary()); 00438 return pixClone(tesseract_->pix_binary()); 00439 } 00440 00446 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) { 00447 return GetComponentImages(RIL_BLOCK, false, pixa, NULL); 00448 } 00449 00456 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) { 00457 return GetComponentImages(RIL_TEXTLINE, true, pixa, blockids); 00458 } 00459 00468 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) { 00469 return GetComponentImages(RIL_TEXTLINE, false, 
pixa, blockids); 00470 } 00471 00477 Boxa* TessBaseAPI::GetWords(Pixa** pixa) { 00478 return GetComponentImages(RIL_WORD, true, pixa, NULL); 00479 } 00480 00487 Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) { 00488 return GetComponentImages(RIL_SYMBOL, true, pixa, NULL); 00489 } 00490 00499 Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level, 00500 bool text_only, 00501 Pixa** pixa, int** blockids) { 00502 PageIterator* page_it = GetIterator(); 00503 if (page_it == NULL) 00504 page_it = AnalyseLayout(); 00505 if (page_it == NULL) 00506 return NULL; // Failed. 00507 00508 // Count the components to get a size for the arrays. 00509 int component_count = 0; 00510 int left, top, right, bottom; 00511 do { 00512 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) && 00513 (!text_only || PTIsTextType(page_it->BlockType()))) 00514 ++component_count; 00515 } while (page_it->Next(level)); 00516 00517 Boxa* boxa = boxaCreate(component_count); 00518 if (pixa != NULL) 00519 *pixa = pixaCreate(component_count); 00520 if (blockids != NULL) 00521 *blockids = new int[component_count]; 00522 00523 int blockid = 0; 00524 int component_index = 0; 00525 page_it->Begin(); 00526 do { 00527 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) && 00528 (!text_only || PTIsTextType(page_it->BlockType()))) { 00529 Box* lbox = boxCreate(left, top, right - left, bottom - top); 00530 boxaAddBox(boxa, lbox, L_INSERT); 00531 if (pixa != NULL) { 00532 Pix* pix = page_it->GetBinaryImage(level); 00533 pixaAddPix(*pixa, pix, L_INSERT); 00534 pixaAddBox(*pixa, lbox, L_CLONE); 00535 } 00536 if (blockids != NULL) { 00537 (*blockids)[component_index] = blockid; 00538 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) 00539 ++blockid; 00540 } 00541 ++component_index; 00542 } 00543 } while (page_it->Next(level)); 00544 delete page_it; 00545 return boxa; 00546 } 00547 00548 int TessBaseAPI::GetThresholdedImageScaleFactor() const { 00549 if (thresholder_ == 
NULL) { 00550 return 0; 00551 } 00552 return thresholder_->GetScaleFactor(); 00553 } 00554 00556 void TessBaseAPI::DumpPGM(const char* filename) { 00557 if (tesseract_ == NULL) 00558 return; 00559 FILE *fp = fopen(filename, "wb"); 00560 Pix* pix = tesseract_->pix_binary(); 00561 int width = pixGetWidth(pix); 00562 int height = pixGetHeight(pix); 00563 l_uint32* data = pixGetData(pix); 00564 fprintf(fp, "P5 %d %d 255\n", width, height); 00565 for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) { 00566 for (int x = 0; x < width; ++x) { 00567 uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255; 00568 fwrite(&b, 1, 1, fp); 00569 } 00570 } 00571 fclose(fp); 00572 } 00573 00580 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks, 00581 Boxa* boxa_words, Pixa* pixa_words, 00582 const FCOORD& reskew, Pix* page_pix, 00583 PAGE_RES* page_res) { 00584 int block_count = boxaGetCount(boxa_blocks); 00585 ASSERT_HOST(block_count == pixaGetCount(pixa_blocks)); 00586 // Write each block to the current directory as junk_write_display.nnn.png. 00587 for (int i = 0; i < block_count; ++i) { 00588 Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE); 00589 pixDisplayWrite(pix, 1); 00590 } 00591 int word_count = boxaGetCount(boxa_words); 00592 ASSERT_HOST(word_count == pixaGetCount(pixa_words)); 00593 int pr_word = 0; 00594 PAGE_RES_IT page_res_it(page_res); 00595 for (page_res_it.restart_page(); page_res_it.word () != NULL; 00596 page_res_it.forward(), ++pr_word) { 00597 WERD_RES *word = page_res_it.word(); 00598 WERD_CHOICE* choice = word->best_choice; 00599 // Write the first 100 words to files names wordims/<wordstring>.tif. 
00600 if (pr_word < 100) { 00601 STRING filename("wordims/"); 00602 if (choice != NULL) { 00603 filename += choice->unichar_string(); 00604 } else { 00605 char numbuf[32]; 00606 filename += "unclassified"; 00607 snprintf(numbuf, 32, "%03d", pr_word); 00608 filename += numbuf; 00609 } 00610 filename += ".tif"; 00611 Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE); 00612 pixWrite(filename.string(), pix, IFF_TIFF_G4); 00613 } 00614 } 00615 ASSERT_HOST(pr_word == word_count); 00616 return 0; 00617 } 00618 00630 PageIterator* TessBaseAPI::AnalyseLayout() { 00631 if (FindLines() == 0) { 00632 if (block_list_->empty()) 00633 return NULL; // The page was empty. 00634 page_res_ = new PAGE_RES(block_list_, NULL); 00635 return new PageIterator(page_res_, tesseract_, 00636 thresholder_->GetScaleFactor(), 00637 thresholder_->GetScaledYResolution(), 00638 rect_left_, rect_top_, rect_width_, rect_height_); 00639 } 00640 return NULL; 00641 } 00642 00647 int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { 00648 if (tesseract_ == NULL) 00649 return -1; 00650 if (FindLines() != 0) 00651 return -1; 00652 if (page_res_ != NULL) 00653 delete page_res_; 00654 00655 tesseract_->SetBlackAndWhitelist(); 00656 recognition_done_ = true; 00657 if (tesseract_->tessedit_resegment_from_line_boxes) 00658 page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_); 00659 else if (tesseract_->tessedit_resegment_from_boxes) 00660 page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_); 00661 else 00662 page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_); 00663 if (tesseract_->tessedit_make_boxes_from_boxes) { 00664 tesseract_->CorrectClassifyWords(page_res_); 00665 return 0; 00666 } 00667 00668 if (truth_cb_ != NULL) { 00669 tesseract_->wordrec_run_blamer.set_value(true); 00670 truth_cb_->Run(tesseract_->getDict().getUnicharset(), 00671 image_height_, page_res_); 00672 } 00673 00674 int result = 0; 00675 if (tesseract_->interactive_display_mode) { 00676 
#ifndef GRAPHICS_DISABLED 00677 tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); 00678 #endif // GRAPHICS_DISABLED 00679 // The page_res is invalid after an interactive session, so cleanup 00680 // in a way that lets us continue to the next page without crashing. 00681 delete page_res_; 00682 page_res_ = NULL; 00683 return -1; 00684 } else if (tesseract_->tessedit_train_from_boxes) { 00685 tesseract_->ApplyBoxTraining(*output_file_, page_res_); 00686 } else if (tesseract_->tessedit_ambigs_training) { 00687 FILE *training_output_file = tesseract_->init_recog_training(*input_file_); 00688 // OCR the page segmented into words by tesseract. 00689 tesseract_->recog_training_segmented( 00690 *input_file_, page_res_, monitor, training_output_file); 00691 fclose(training_output_file); 00692 } else { 00693 // Now run the main recognition. 00694 if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) { 00695 int paragraph_debug_level = 0; 00696 GetIntVariable("paragraph_debug_level", &paragraph_debug_level); 00697 DetectParagraphs(paragraph_debug_level); 00698 } else { 00699 result = -1; 00700 } 00701 } 00702 return result; 00703 } 00704 00706 int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) { 00707 if (tesseract_ == NULL) 00708 return -1; 00709 if (thresholder_ == NULL || thresholder_->IsEmpty()) { 00710 tprintf("Please call SetImage before attempting recognition."); 00711 return -1; 00712 } 00713 if (page_res_ != NULL) 00714 ClearResults(); 00715 if (FindLines() != 0) 00716 return -1; 00717 // Additional conditions under which chopper test cannot be run 00718 if (tesseract_->interactive_display_mode) return -1; 00719 00720 recognition_done_ = true; 00721 00722 page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_)); 00723 00724 PAGE_RES_IT page_res_it(page_res_); 00725 00726 while (page_res_it.word() != NULL) { 00727 WERD_RES *word_res = page_res_it.word(); 00728 GenericVector<TBOX> boxes; 00729 
tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block, 00730 page_res_it.row()->row, word_res); 00731 page_res_it.forward(); 00732 } 00733 return 0; 00734 } 00735 00752 bool TessBaseAPI::ProcessPages(const char* filename, 00753 const char* retry_config, int timeout_millisec, 00754 STRING* text_out) { 00755 int page = tesseract_->tessedit_page_number; 00756 if (page < 0) 00757 page = 0; 00758 FILE* fp = fopen(filename, "rb"); 00759 if (fp == NULL) { 00760 tprintf(_("Image file %s cannot be opened!\n"), filename); 00761 return false; 00762 } 00763 // Find the number of pages if a tiff file, or zero otherwise. 00764 int npages = CountTiffPages(fp); 00765 fclose(fp); 00766 00767 if (tesseract_->tessedit_create_hocr) { 00768 *text_out = 00769 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" 00770 " \"http://www.w3.org/TR/html4/loose.dtd\">\n" 00771 "<html>\n<head>\n<title></title>\n" 00772 "<meta http-equiv=\"Content-Type\" content=\"text/html;" 00773 "charset=utf-8\" />\n<meta name='ocr-system' content='tesseract'/>\n" 00774 "</head>\n<body>\n"; 00775 } else { 00776 *text_out = ""; 00777 } 00778 00779 bool success = true; 00780 Pix *pix; 00781 if (npages > 0) { 00782 for (; page < npages && (pix = pixReadTiff(filename, page)) != NULL; 00783 ++page) { 00784 if ((page >= 0) && (npages > 1)) 00785 tprintf(_("Page %d of %d\n"), page + 1, npages); 00786 char page_str[kMaxIntSize]; 00787 snprintf(page_str, kMaxIntSize - 1, "%d", page); 00788 SetVariable("applybox_page", page_str); 00789 success &= ProcessPage(pix, page, filename, retry_config, 00790 timeout_millisec, text_out); 00791 pixDestroy(&pix); 00792 if (tesseract_->tessedit_page_number >= 0 || npages == 1) { 00793 break; 00794 } 00795 } 00796 } else { 00797 // The file is not a tiff file, so use the general pixRead function. 
00798 pix = pixRead(filename); 00799 if (pix != NULL) { 00800 success &= ProcessPage(pix, 0, filename, retry_config, 00801 timeout_millisec, text_out); 00802 pixDestroy(&pix); 00803 } else { 00804 // The file is not an image file, so try it as a list of filenames. 00805 FILE* fimg = fopen(filename, "rb"); 00806 if (fimg == NULL) { 00807 tprintf(_("File %s cannot be opened!\n"), filename); 00808 return false; 00809 } 00810 tprintf(_("Reading %s as a list of filenames...\n"), filename); 00811 char pagename[MAX_PATH]; 00812 // Skip to the requested page number. 00813 for (int i = 0; i < page && 00814 fgets(pagename, sizeof(pagename), fimg) != NULL; 00815 ++i); 00816 while (fgets(pagename, sizeof(pagename), fimg) != NULL) { 00817 chomp_string(pagename); 00818 pix = pixRead(pagename); 00819 if (pix == NULL) { 00820 tprintf(_("Image file %s cannot be read!\n"), pagename); 00821 fclose(fimg); 00822 return false; 00823 } 00824 tprintf(_("Page %d : %s\n"), page, pagename); 00825 success &= ProcessPage(pix, page, pagename, retry_config, 00826 timeout_millisec, text_out); 00827 pixDestroy(&pix); 00828 ++page; 00829 } 00830 fclose(fimg); 00831 } 00832 } 00833 if (tesseract_->tessedit_create_hocr) 00834 *text_out += "</body>\n</html>\n"; 00835 return success; 00836 } 00837 00849 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, 00850 const char* retry_config, int timeout_millisec, 00851 STRING* text_out) { 00852 SetInputName(filename); 00853 SetImage(pix); 00854 bool failed = false; 00855 if (timeout_millisec > 0) { 00856 // Running with a timeout. 00857 ETEXT_DESC monitor; 00858 monitor.cancel = NULL; 00859 monitor.cancel_this = NULL; 00860 monitor.set_deadline_msecs(timeout_millisec); 00861 // Now run the main recognition. 00862 failed = Recognize(&monitor) < 0; 00863 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY || 00864 tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) { 00865 // Disabled character recognition. 
00866 PageIterator* it = AnalyseLayout(); 00867 if (it == NULL) { 00868 failed = true; 00869 } else { 00870 delete it; 00871 return true; 00872 } 00873 } else { 00874 // Normal layout and character recognition with no timeout. 00875 failed = Recognize(NULL) < 0; 00876 } 00877 if (tesseract_->tessedit_write_images) { 00878 Pix* page_pix = GetThresholdedImage(); 00879 pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4); 00880 } 00881 if (failed && retry_config != NULL && retry_config[0] != '\0') { 00882 // Save current config variables before switching modes. 00883 FILE* fp = fopen(kOldVarsFile, "wb"); 00884 PrintVariables(fp); 00885 fclose(fp); 00886 // Switch to alternate mode for retry. 00887 ReadConfigFile(retry_config); 00888 SetImage(pix); 00889 Recognize(NULL); 00890 // Restore saved config variables. 00891 ReadConfigFile(kOldVarsFile); 00892 } 00893 // Get text only if successful. 00894 if (!failed) { 00895 char* text; 00896 if (tesseract_->tessedit_create_boxfile || 00897 tesseract_->tessedit_make_boxes_from_boxes) { 00898 text = GetBoxText(page_index); 00899 } else if (tesseract_->tessedit_write_unlv) { 00900 text = GetUNLVText(); 00901 } else if (tesseract_->tessedit_create_hocr) { 00902 text = GetHOCRText(page_index); 00903 } else { 00904 text = GetUTF8Text(); 00905 } 00906 *text_out += text; 00907 delete [] text; 00908 return true; 00909 } 00910 return false; 00911 } 00912 00917 LTRResultIterator* TessBaseAPI::GetLTRIterator() { 00918 if (tesseract_ == NULL || page_res_ == NULL) 00919 return NULL; 00920 return new LTRResultIterator( 00921 page_res_, tesseract_, 00922 thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), 00923 rect_left_, rect_top_, rect_width_, rect_height_); 00924 } 00925 00934 ResultIterator* TessBaseAPI::GetIterator() { 00935 if (tesseract_ == NULL || page_res_ == NULL) 00936 return NULL; 00937 return ResultIterator::StartOfParagraph(LTRResultIterator( 00938 page_res_, tesseract_, 00939 thresholder_->GetScaleFactor(), 
thresholder_->GetScaledYResolution(), 00940 rect_left_, rect_top_, rect_width_, rect_height_)); 00941 } 00942 00951 MutableIterator* TessBaseAPI::GetMutableIterator() { 00952 if (tesseract_ == NULL || page_res_ == NULL) 00953 return NULL; 00954 return new MutableIterator(page_res_, tesseract_, 00955 thresholder_->GetScaleFactor(), 00956 thresholder_->GetScaledYResolution(), 00957 rect_left_, rect_top_, rect_width_, rect_height_); 00958 } 00959 00961 char* TessBaseAPI::GetUTF8Text() { 00962 if (tesseract_ == NULL || 00963 (!recognition_done_ && Recognize(NULL) < 0)) 00964 return NULL; 00965 STRING text(""); 00966 ResultIterator *it = GetIterator(); 00967 do { 00968 if (it->Empty(RIL_PARA)) continue; 00969 char *para_text = it->GetUTF8Text(RIL_PARA); 00970 text += para_text; 00971 delete []para_text; 00972 } while (it->Next(RIL_PARA)); 00973 char* result = new char[text.length() + 1]; 00974 strncpy(result, text.string(), text.length() + 1); 00975 delete it; 00976 return result; 00977 } 00978 00979 static void AddBoxTohOCR(const PageIterator *it, 00980 PageIteratorLevel level, 00981 STRING* hocr_str) { 00982 int left, top, right, bottom; 00983 it->BoundingBox(level, &left, &top, &right, &bottom); 00984 hocr_str->add_str_int("' title=\"bbox ", left); 00985 hocr_str->add_str_int(" ", top); 00986 hocr_str->add_str_int(" ", right); 00987 hocr_str->add_str_int(" ", bottom); 00988 *hocr_str += "\">"; 00989 } 00990 00999 char* TessBaseAPI::GetHOCRText(int page_number) { 01000 if (tesseract_ == NULL || 01001 (page_res_ == NULL && Recognize(NULL) < 0)) 01002 return NULL; 01003 01004 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; 01005 int page_id = page_number + 1; // hOCR uses 1-based page numbers. 01006 01007 STRING hocr_str(""); 01008 01009 if (input_file_ == NULL) 01010 SetInputName(NULL); 01011 01012 hocr_str.add_str_int("<div class='ocr_page' id='page_", page_id); 01013 hocr_str += "' title='image \""; 01014 hocr_str += input_file_ ? 
*input_file_ : "unknown"; 01015 hocr_str.add_str_int("\"; bbox ", rect_left_); 01016 hocr_str.add_str_int(" ", rect_top_); 01017 hocr_str.add_str_int(" ", rect_width_); 01018 hocr_str.add_str_int(" ", rect_height_); 01019 hocr_str += "'>\n"; 01020 01021 ResultIterator *res_it = GetIterator(); 01022 for (; !res_it->Empty(RIL_BLOCK); wcnt++) { 01023 if (res_it->Empty(RIL_WORD)) { 01024 res_it->Next(RIL_WORD); 01025 continue; 01026 } 01027 01028 // Open any new block/paragraph/textline. 01029 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { 01030 hocr_str.add_str_int("<div class='ocr_carea' id='block_", bcnt); 01031 hocr_str.add_str_int("_", bcnt); 01032 AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str); 01033 } 01034 if (res_it->IsAtBeginningOf(RIL_PARA)) { 01035 if (res_it->ParagraphIsLtr()) { 01036 hocr_str.add_str_int("\n<p class='ocr_par' dir='ltr' id='par_", pcnt); 01037 } else { 01038 hocr_str.add_str_int("\n<p class='ocr_par' dir='rtl' id='par_", pcnt); 01039 } 01040 AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); 01041 } 01042 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { 01043 hocr_str.add_str_int("<span class='ocr_line' id='line_", lcnt); 01044 AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str); 01045 } 01046 01047 // Now, process the word... 
01048 hocr_str.add_str_int("<span class='ocr_word' id='word_", wcnt); 01049 AddBoxTohOCR(res_it, RIL_WORD, &hocr_str); 01050 const char *font_name; 01051 bool bold, italic, underlined, monospace, serif, smallcaps; 01052 int pointsize, font_id; 01053 font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, 01054 &monospace, &serif, &smallcaps, 01055 &pointsize, &font_id); 01056 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); 01057 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); 01058 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); 01059 if (bold) hocr_str += "<strong>"; 01060 if (italic) hocr_str += "<em>"; 01061 do { 01062 const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); 01063 if (grapheme && grapheme[0] != 0) { 01064 if (grapheme[1] == 0) { 01065 switch (grapheme[0]) { 01066 case '<': hocr_str += "&lt;"; break; 01067 case '>': hocr_str += "&gt;"; break; 01068 case '&': hocr_str += "&amp;"; break; 01069 case '"': hocr_str += "&quot;"; break; 01070 case '\'': hocr_str += "&#39;"; break; 01071 default: hocr_str += grapheme; 01072 } 01073 } else { 01074 hocr_str += grapheme; 01075 } 01076 } 01077 delete []grapheme; 01078 res_it->Next(RIL_SYMBOL); 01079 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); 01080 if (italic) hocr_str += "</em>"; 01081 if (bold) hocr_str += "</strong>"; 01082 hocr_str += "</span> "; 01083 wcnt++; 01084 // Close any ending block/paragraph/textline. 
01085 if (last_word_in_line) { 01086 hocr_str += "</span>\n"; 01087 lcnt++; 01088 } 01089 if (last_word_in_para) { 01090 hocr_str += "</p>\n"; 01091 pcnt++; 01092 } 01093 if (last_word_in_block) { 01094 hocr_str += "</div>\n"; 01095 bcnt++; 01096 } 01097 } 01098 hocr_str += "</div>\n"; 01099 01100 char *ret = new char[hocr_str.length() + 1]; 01101 strcpy(ret, hocr_str.string()); 01102 delete res_it; 01103 return ret; 01104 } 01105 01107 const int kNumbersPerBlob = 5; 01112 const int kBytesPerNumber = 5; 01118 const int kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1; 01119 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; 01121 const int kBytesPer64BitNumber = 20; 01128 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + 01129 UNICHAR_LEN; 01130 01136 char* TessBaseAPI::GetBoxText(int page_number) { 01137 if (tesseract_ == NULL || 01138 (!recognition_done_ && Recognize(NULL) < 0)) 01139 return NULL; 01140 int blob_count; 01141 int utf8_length = TextLength(&blob_count); 01142 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + 01143 kMaxBytesPerLine; 01144 char* result = new char[total_length]; 01145 int output_length = 0; 01146 LTRResultIterator* it = GetLTRIterator(); 01147 do { 01148 int left, top, right, bottom; 01149 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { 01150 char* text = it->GetUTF8Text(RIL_SYMBOL); 01151 // Tesseract uses space for recognition failure. Fix to a reject 01152 // character, kTesseractReject so we don't create illegal box files. 01153 for (int i = 0; text[i] != '\0'; ++i) { 01154 if (text[i] == ' ') 01155 text[i] = kTesseractReject; 01156 } 01157 snprintf(result + output_length, total_length - output_length, 01158 "%s %d %d %d %d %d\n", 01159 text, left, image_height_ - bottom, 01160 right, image_height_ - top, page_number); 01161 output_length += strlen(result + output_length); 01162 delete [] text; 01163 // Just in case... 
01164 if (output_length + kMaxBytesPerLine > total_length) 01165 break; 01166 } 01167 } while (it->Next(RIL_SYMBOL)); 01168 delete it; 01169 return result; 01170 } 01171 01177 const int kUniChs[] = { 01178 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 01179 }; 01181 const int kLatinChs[] = { 01182 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 01183 }; 01184 01190 char* TessBaseAPI::GetUNLVText() { 01191 if (tesseract_ == NULL || 01192 (!recognition_done_ && Recognize(NULL) < 0)) 01193 return NULL; 01194 bool tilde_crunch_written = false; 01195 bool last_char_was_newline = true; 01196 bool last_char_was_tilde = false; 01197 01198 int total_length = TextLength(NULL); 01199 PAGE_RES_IT page_res_it(page_res_); 01200 char* result = new char[total_length]; 01201 char* ptr = result; 01202 for (page_res_it.restart_page(); page_res_it.word () != NULL; 01203 page_res_it.forward()) { 01204 WERD_RES *word = page_res_it.word(); 01205 // Process the current word. 01206 if (word->unlv_crunch_mode != CR_NONE) { 01207 if (word->unlv_crunch_mode != CR_DELETE && 01208 (!tilde_crunch_written || 01209 (word->unlv_crunch_mode == CR_KEEP_SPACE && 01210 word->word->space() > 0 && 01211 !word->word->flag(W_FUZZY_NON) && 01212 !word->word->flag(W_FUZZY_SP)))) { 01213 if (!word->word->flag(W_BOL) && 01214 word->word->space() > 0 && 01215 !word->word->flag(W_FUZZY_NON) && 01216 !word->word->flag(W_FUZZY_SP)) { 01217 /* Write a space to separate from preceding good text */ 01218 *ptr++ = ' '; 01219 last_char_was_tilde = false; 01220 } 01221 if (!last_char_was_tilde) { 01222 // Write a reject char. 01223 last_char_was_tilde = true; 01224 *ptr++ = kUNLVReject; 01225 tilde_crunch_written = true; 01226 last_char_was_newline = false; 01227 } 01228 } 01229 } else { 01230 // NORMAL PROCESSING of non tilde crunched words. 
01231 tilde_crunch_written = false; 01232 tesseract_->set_unlv_suspects(word); 01233 const char* wordstr = word->best_choice->unichar_string().string(); 01234 const STRING& lengths = word->best_choice->unichar_lengths(); 01235 int length = lengths.length(); 01236 int i = 0; 01237 int offset = 0; 01238 01239 if (last_char_was_tilde && 01240 word->word->space() == 0 && wordstr[offset] == ' ') { 01241 // Prevent adjacent tilde across words - we know that adjacent tildes 01242 // within words have been removed. 01243 // Skip the first character. 01244 offset = lengths[i++]; 01245 } 01246 if (i < length && wordstr[offset] != 0) { 01247 if (!last_char_was_newline) 01248 *ptr++ = ' '; 01249 else 01250 last_char_was_newline = false; 01251 for (; i < length; offset += lengths[i++]) { 01252 if (wordstr[offset] == ' ' || 01253 wordstr[offset] == kTesseractReject) { 01254 *ptr++ = kUNLVReject; 01255 last_char_was_tilde = true; 01256 } else { 01257 if (word->reject_map[i].rejected()) 01258 *ptr++ = kUNLVSuspect; 01259 UNICHAR ch(wordstr + offset, lengths[i]); 01260 int uni_ch = ch.first_uni(); 01261 for (int j = 0; kUniChs[j] != 0; ++j) { 01262 if (kUniChs[j] == uni_ch) { 01263 uni_ch = kLatinChs[j]; 01264 break; 01265 } 01266 } 01267 if (uni_ch <= 0xff) { 01268 *ptr++ = static_cast<char>(uni_ch); 01269 last_char_was_tilde = false; 01270 } else { 01271 *ptr++ = kUNLVReject; 01272 last_char_was_tilde = true; 01273 } 01274 } 01275 } 01276 } 01277 } 01278 if (word->word->flag(W_EOL) && !last_char_was_newline) { 01279 /* Add a new line output */ 01280 *ptr++ = '\n'; 01281 tilde_crunch_written = false; 01282 last_char_was_newline = true; 01283 last_char_was_tilde = false; 01284 } 01285 } 01286 *ptr++ = '\n'; 01287 *ptr = '\0'; 01288 return result; 01289 } 01290 01292 int TessBaseAPI::MeanTextConf() { 01293 int* conf = AllWordConfidences(); 01294 if (!conf) return 0; 01295 int sum = 0; 01296 int *pt = conf; 01297 while (*pt >= 0) sum += *pt++; 01298 if (pt != conf) sum /= pt - conf; 
01299 delete [] conf; 01300 return sum; 01301 } 01302 01304 int* TessBaseAPI::AllWordConfidences() { 01305 if (tesseract_ == NULL || 01306 (!recognition_done_ && Recognize(NULL) < 0)) 01307 return NULL; 01308 int n_word = 0; 01309 PAGE_RES_IT res_it(page_res_); 01310 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) 01311 n_word++; 01312 01313 int* conf = new int[n_word+1]; 01314 n_word = 0; 01315 for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) { 01316 WERD_RES *word = res_it.word(); 01317 WERD_CHOICE* choice = word->best_choice; 01318 int w_conf = static_cast<int>(100 + 5 * choice->certainty()); 01319 // This is the eq for converting Tesseract confidence to 1..100 01320 if (w_conf < 0) w_conf = 0; 01321 if (w_conf > 100) w_conf = 100; 01322 conf[n_word++] = w_conf; 01323 } 01324 conf[n_word] = -1; 01325 return conf; 01326 } 01327 01338 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) { 01339 int debug = 0; 01340 GetIntVariable("applybox_debug", &debug); 01341 bool success = true; 01342 PageSegMode current_psm = GetPageSegMode(); 01343 SetPageSegMode(mode); 01344 SetVariable("classify_enable_learning", "0"); 01345 char* text = GetUTF8Text(); 01346 if (debug) { 01347 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr); 01348 } 01349 if (text != NULL) { 01350 PAGE_RES_IT it(page_res_); 01351 WERD_RES* word_res = it.word(); 01352 if (word_res != NULL) { 01353 word_res->word->set_text(wordstr); 01354 } else { 01355 success = false; 01356 } 01357 // Check to see if text matches wordstr. 01358 int w = 0; 01359 int t = 0; 01360 for (t = 0; text[t] != '\0'; ++t) { 01361 if (text[t] == '\n' || text[t] == ' ') 01362 continue; 01363 while (wordstr[w] != '\0' && wordstr[w] == ' ') 01364 ++w; 01365 if (text[t] != wordstr[w]) 01366 break; 01367 ++w; 01368 } 01369 if (text[t] != '\0' || wordstr[w] != '\0') { 01370 // No match. 
01371 delete page_res_; 01372 GenericVector<TBOX> boxes; 01373 page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); 01374 tesseract_->ReSegmentByClassification(page_res_); 01375 tesseract_->TidyUp(page_res_); 01376 PAGE_RES_IT pr_it(page_res_); 01377 if (pr_it.word() == NULL) 01378 success = false; 01379 else 01380 word_res = pr_it.word(); 01381 } else { 01382 word_res->BestChoiceToCorrectText(); 01383 } 01384 if (success) { 01385 tesseract_->EnableLearning = true; 01386 tesseract_->LearnWord(NULL, NULL, word_res); 01387 } 01388 delete [] text; 01389 } else { 01390 success = false; 01391 } 01392 SetPageSegMode(current_psm); 01393 return success; 01394 } 01395 01402 void TessBaseAPI::Clear() { 01403 if (thresholder_ != NULL) 01404 thresholder_->Clear(); 01405 ClearResults(); 01406 } 01407 01414 void TessBaseAPI::End() { 01415 if (thresholder_ != NULL) { 01416 delete thresholder_; 01417 thresholder_ = NULL; 01418 } 01419 if (page_res_ != NULL) { 01420 delete page_res_; 01421 page_res_ = NULL; 01422 } 01423 if (block_list_ != NULL) { 01424 delete block_list_; 01425 block_list_ = NULL; 01426 } 01427 if (paragraph_models_ != NULL) { 01428 paragraph_models_->delete_data_pointers(); 01429 delete paragraph_models_; 01430 paragraph_models_ = NULL; 01431 } 01432 if (tesseract_ != NULL) { 01433 delete tesseract_; 01434 if (osd_tesseract_ == tesseract_) 01435 osd_tesseract_ = NULL; 01436 tesseract_ = NULL; 01437 } 01438 if (osd_tesseract_ != NULL) { 01439 delete osd_tesseract_; 01440 osd_tesseract_ = NULL; 01441 } 01442 if (equ_detect_ != NULL) { 01443 delete equ_detect_; 01444 equ_detect_ = NULL; 01445 } 01446 if (input_file_ != NULL) { 01447 delete input_file_; 01448 input_file_ = NULL; 01449 } 01450 if (output_file_ != NULL) { 01451 delete output_file_; 01452 output_file_ = NULL; 01453 } 01454 if (datapath_ != NULL) { 01455 delete datapath_; 01456 datapath_ = NULL; 01457 } 01458 if (language_ != NULL) { 01459 delete language_; 01460 language_ = NULL; 01461 } 01462 } 
01463 01468 int TessBaseAPI::IsValidWord(const char *word) { 01469 return tesseract_->getDict().valid_word(word); 01470 } 01471 01472 01473 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { 01474 if (page_res_ == NULL) 01475 FindLines(); 01476 if (block_list_->length() < 1) { 01477 return false; 01478 } 01479 01480 // Get first block 01481 BLOCK_IT block_it(block_list_); 01482 block_it.move_to_first(); 01483 ROW_LIST* rows = block_it.data()->row_list(); 01484 if (rows->length() < 1) { 01485 return false; 01486 } 01487 01488 // Get first line of block 01489 ROW_IT row_it(rows); 01490 row_it.move_to_first(); 01491 ROW* row = row_it.data(); 01492 01493 // Calculate offset and slope (NOTE: Kind of ugly) 01494 *out_offset = static_cast<int>(row->base_line(0.0)); 01495 *out_slope = row->base_line(1.0) - row->base_line(0.0); 01496 01497 return true; 01498 } 01499 01501 void TessBaseAPI::SetDictFunc(DictFunc f) { 01502 if (tesseract_ != NULL) { 01503 tesseract_->getDict().letter_is_okay_ = f; 01504 } 01505 } 01506 01511 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { 01512 if (tesseract_ != NULL) { 01513 tesseract_->getDict().probability_in_context_ = f; 01514 // Set it for the sublangs too. 
01515 int num_subs = tesseract_->num_sub_langs(); 01516 for (int i = 0; i < num_subs; ++i) { 01517 tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; 01518 } 01519 } 01520 } 01521 01523 void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) { 01524 if (tesseract_ != NULL) tesseract_->fill_lattice_ = f; 01525 } 01526 01528 bool TessBaseAPI::InternalSetImage() { 01529 if (tesseract_ == NULL) { 01530 tprintf("Please call Init before attempting to send an image."); 01531 return false; 01532 } 01533 if (thresholder_ == NULL) 01534 thresholder_ = new ImageThresholder; 01535 ClearResults(); 01536 return true; 01537 } 01538 01545 void TessBaseAPI::Threshold(Pix** pix) { 01546 ASSERT_HOST(pix != NULL); 01547 if (!thresholder_->IsBinary()) { 01548 tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); 01549 } 01550 if (*pix != NULL) 01551 pixDestroy(pix); 01552 // Zero resolution messes up the algorithms, so make sure it is credible. 01553 int y_res = thresholder_->GetScaledYResolution(); 01554 if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { 01555 // Use the minimum default resolution, as it is safer to under-estimate 01556 // than over-estimate resolution. 01557 thresholder_->SetSourceYResolution(kMinCredibleResolution); 01558 } 01559 thresholder_->ThresholdToPix(pix); 01560 thresholder_->GetImageSizes(&rect_left_, &rect_top_, 01561 &rect_width_, &rect_height_, 01562 &image_width_, &image_height_); 01563 // Set the internal resolution that is used for layout parameters from the 01564 // estimated resolution, rather than the image resolution, which may be 01565 // fabricated, but we will use the image resolution, if there is one, to 01566 // report output point sizes. 
01567 int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), 01568 kMinCredibleResolution, 01569 kMaxCredibleResolution); 01570 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { 01571 tprintf("Estimated resolution %d out of range! Corrected to %d\n", 01572 thresholder_->GetScaledEstimatedResolution(), estimated_res); 01573 } 01574 tesseract_->set_source_resolution(estimated_res); 01575 } 01576 01578 int TessBaseAPI::FindLines() { 01579 if (thresholder_ == NULL || thresholder_->IsEmpty()) { 01580 tprintf("Please call SetImage before attempting recognition."); 01581 return -1; 01582 } 01583 if (recognition_done_) 01584 ClearResults(); 01585 if (!block_list_->empty()) { 01586 return 0; 01587 } 01588 if (tesseract_ == NULL) { 01589 tesseract_ = new Tesseract; 01590 tesseract_->InitAdaptiveClassifier(false); 01591 } 01592 if (tesseract_->pix_binary() == NULL) 01593 Threshold(tesseract_->mutable_pix_binary()); 01594 if (tesseract_->ImageWidth() > MAX_INT16 || 01595 tesseract_->ImageHeight() > MAX_INT16) { 01596 tprintf("Image too large: (%d, %d)\n", 01597 tesseract_->ImageWidth(), tesseract_->ImageHeight()); 01598 return -1; 01599 } 01600 01601 tesseract_->PrepareForPageseg(); 01602 01603 if (tesseract_->textord_equation_detect) { 01604 if (equ_detect_ == NULL && datapath_ != NULL) { 01605 equ_detect_ = new EquationDetect(datapath_->string(), NULL); 01606 } 01607 tesseract_->SetEquationDetect(equ_detect_); 01608 } 01609 01610 Tesseract* osd_tess = osd_tesseract_; 01611 OSResults osr; 01612 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) { 01613 if (strcmp(language_->string(), "osd") == 0) { 01614 osd_tess = tesseract_; 01615 } else { 01616 osd_tesseract_ = new Tesseract; 01617 if (osd_tesseract_->init_tesseract( 01618 datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY, 01619 NULL, 0, NULL, NULL, false) == 0) { 01620 osd_tess = osd_tesseract_; 01621 osd_tesseract_->set_source_resolution( 01622 
thresholder_->GetSourceYResolution()); 01623 } else { 01624 tprintf("Warning: Auto orientation and script detection requested," 01625 " but osd language failed to load\n"); 01626 delete osd_tesseract_; 01627 osd_tesseract_ = NULL; 01628 } 01629 } 01630 } 01631 01632 if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0) 01633 return -1; 01634 // If Devanagari is being recognized, we use different images for page seg 01635 // and for OCR. 01636 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); 01637 return 0; 01638 } 01639 01641 void TessBaseAPI::ClearResults() { 01642 if (tesseract_ != NULL) { 01643 tesseract_->Clear(); 01644 } 01645 if (page_res_ != NULL) { 01646 delete page_res_; 01647 page_res_ = NULL; 01648 } 01649 recognition_done_ = false; 01650 if (block_list_ == NULL) 01651 block_list_ = new BLOCK_LIST; 01652 else 01653 block_list_->clear(); 01654 if (paragraph_models_ != NULL) { 01655 paragraph_models_->delete_data_pointers(); 01656 delete paragraph_models_; 01657 paragraph_models_ = NULL; 01658 } 01659 } 01660 01668 int TessBaseAPI::TextLength(int* blob_count) { 01669 if (tesseract_ == NULL || page_res_ == NULL) 01670 return 0; 01671 01672 PAGE_RES_IT page_res_it(page_res_); 01673 int total_length = 2; 01674 int total_blobs = 0; 01675 // Iterate over the data structures to extract the recognition result. 
01676 for (page_res_it.restart_page(); page_res_it.word () != NULL; 01677 page_res_it.forward()) { 01678 WERD_RES *word = page_res_it.word(); 01679 WERD_CHOICE* choice = word->best_choice; 01680 if (choice != NULL) { 01681 total_blobs += choice->length() + 2; 01682 total_length += choice->unichar_string().length() + 2; 01683 for (int i = 0; i < word->reject_map.length(); ++i) { 01684 if (word->reject_map[i].rejected()) 01685 ++total_length; 01686 } 01687 } 01688 } 01689 if (blob_count != NULL) 01690 *blob_count = total_blobs; 01691 return total_length; 01692 } 01693 01698 bool TessBaseAPI::DetectOS(OSResults* osr) { 01699 if (tesseract_ == NULL) 01700 return false; 01701 ClearResults(); 01702 if (tesseract_->pix_binary() == NULL) 01703 Threshold(tesseract_->mutable_pix_binary()); 01704 if (input_file_ == NULL) 01705 input_file_ = new STRING(kInputFile); 01706 return orientation_and_script_detection(*input_file_, osr, tesseract_); 01707 } 01708 01709 void TessBaseAPI::set_min_orientation_margin(double margin) { 01710 tesseract_->min_orientation_margin.set_value(margin); 01711 } 01712 01727 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation, 01728 bool** vertical_writing) { 01729 delete[] *block_orientation; 01730 *block_orientation = NULL; 01731 delete[] *vertical_writing; 01732 *vertical_writing = NULL; 01733 BLOCK_IT block_it(block_list_); 01734 01735 block_it.move_to_first(); 01736 int num_blocks = 0; 01737 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 01738 if (!block_it.data()->poly_block()->IsText()) { 01739 continue; 01740 } 01741 ++num_blocks; 01742 } 01743 if (!num_blocks) { 01744 tprintf("WARNING: Found no blocks\n"); 01745 return; 01746 } 01747 *block_orientation = new int[num_blocks]; 01748 *vertical_writing = new bool[num_blocks]; 01749 block_it.move_to_first(); 01750 int i = 0; 01751 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 01752 block_it.forward()) { 01753 if 
(!block_it.data()->poly_block()->IsText()) { 01754 continue; 01755 } 01756 FCOORD re_rotation = block_it.data()->re_rotation(); 01757 float re_theta = re_rotation.angle(); 01758 FCOORD classify_rotation = block_it.data()->classify_rotation(); 01759 float classify_theta = classify_rotation.angle(); 01760 double rot_theta = - (re_theta - classify_theta) * 2.0 / PI; 01761 if (rot_theta < 0) rot_theta += 4; 01762 int num_rotations = static_cast<int>(rot_theta + 0.5); 01763 (*block_orientation)[i] = num_rotations; 01764 // The classify_rotation is non-zero only if the text has vertical 01765 // writing direction. 01766 (*vertical_writing)[i] = classify_rotation.y() != 0.0f; 01767 ++i; 01768 } 01769 } 01770 01771 // ____________________________________________________________________________ 01772 // Ocropus add-ons. 01773 01775 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { 01776 FindLines(); 01777 BLOCK_LIST* result = block_list_; 01778 block_list_ = NULL; 01779 return result; 01780 } 01781 01787 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { 01788 delete block_list; 01789 } 01790 01791 01792 ROW *TessBaseAPI::MakeTessOCRRow(float baseline, 01793 float xheight, 01794 float descender, 01795 float ascender) { 01796 inT32 xstarts[] = {-32000}; 01797 double quad_coeffs[] = {0, 0, baseline}; 01798 return new ROW(1, 01799 xstarts, 01800 quad_coeffs, 01801 xheight, 01802 ascender - (baseline + xheight), 01803 descender - baseline, 01804 0, 01805 0); 01806 } 01807 01809 TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) { 01810 int width = pixGetWidth(pix); 01811 int height = pixGetHeight(pix); 01812 BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height); 01813 01814 // Create C_BLOBs from the page 01815 extract_edges(pix, &block); 01816 01817 // Merge all C_BLOBs 01818 C_BLOB_LIST *list = block.blob_list(); 01819 C_BLOB_IT c_blob_it(list); 01820 if (c_blob_it.empty()) 01821 return NULL; 01822 // Move all the outlines to the first blob. 
01823 C_OUTLINE_IT ol_it(c_blob_it.data()->out_list()); 01824 for (c_blob_it.forward(); 01825 !c_blob_it.at_first(); 01826 c_blob_it.forward()) { 01827 C_BLOB *c_blob = c_blob_it.data(); 01828 ol_it.add_list_after(c_blob->out_list()); 01829 } 01830 // Convert the first blob to the output TBLOB. 01831 return TBLOB::PolygonalCopy(c_blob_it.data()); 01832 } 01833 01839 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, 01840 bool numeric_mode, DENORM *denorm) { 01841 TWERD word; 01842 word.blobs = tblob; 01843 if (denorm != NULL) { 01844 word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, denorm); 01845 word.Normalize(*denorm); 01846 } else { 01847 DENORM normer; 01848 word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, &normer); 01849 word.Normalize(normer); 01850 } 01851 word.blobs = NULL; 01852 } 01853 01858 TBLOB *make_tesseract_blob(float baseline, float xheight, 01859 float descender, float ascender, 01860 bool numeric_mode, Pix* pix) { 01861 TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix); 01862 01863 // Normalize TBLOB 01864 ROW *row = 01865 TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender); 01866 TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode, NULL); 01867 delete row; 01868 return tblob; 01869 } 01870 01876 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, 01877 int length, 01878 float baseline, 01879 float xheight, 01880 float descender, 01881 float ascender) { 01882 UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); 01883 TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender, 01884 tesseract_->classify_bln_numeric_mode, 01885 tesseract_->pix_binary()); 01886 float threshold; 01887 UNICHAR_ID best_class = 0; 01888 float best_rating = -100; 01889 01890 01891 // Classify to get a raw choice. 
01892 BLOB_CHOICE_LIST choices; 01893 DENORM denorm; 01894 tesseract_->AdaptiveClassifier(blob, denorm, &choices, NULL); 01895 BLOB_CHOICE_IT choice_it; 01896 choice_it.set_to_list(&choices); 01897 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 01898 choice_it.forward()) { 01899 if (choice_it.data()->rating() > best_rating) { 01900 best_rating = choice_it.data()->rating(); 01901 best_class = choice_it.data()->unichar_id(); 01902 } 01903 } 01904 01905 threshold = tesseract_->matcher_good_threshold; 01906 01907 if (blob->outlines) 01908 tesseract_->AdaptToChar(blob, denorm, id, kUnknownFontinfoId, threshold); 01909 delete blob; 01910 } 01911 01912 01913 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { 01914 PAGE_RES *page_res = new PAGE_RES(block_list, 01915 &(tesseract_->prev_word_best_choice_)); 01916 tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1); 01917 return page_res; 01918 } 01919 01920 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, 01921 PAGE_RES* pass1_result) { 01922 if (!pass1_result) 01923 pass1_result = new PAGE_RES(block_list, 01924 &(tesseract_->prev_word_best_choice_)); 01925 tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2); 01926 return pass1_result; 01927 } 01928 01929 void TessBaseAPI::DetectParagraphs(int debug_level) { 01930 if (paragraph_models_ == NULL) 01931 paragraph_models_ = new GenericVector<ParagraphModel*>; 01932 MutableIterator *result_it = GetMutableIterator(); 01933 do { // Detect paragraphs for this block 01934 GenericVector<ParagraphModel *> models; 01935 ::tesseract::DetectParagraphs(debug_level, result_it, &models); 01936 *paragraph_models_ += models; 01937 } while (result_it->Next(RIL_BLOCK)); 01938 delete result_it; 01939 } 01940 01941 struct TESS_CHAR : ELIST_LINK { 01942 char *unicode_repr; 01943 int length; // of unicode_repr 01944 float cost; 01945 TBOX box; 01946 01947 TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { 01948 length = 
(len == -1 ? strlen(repr) : len); 01949 unicode_repr = new char[length + 1]; 01950 strncpy(unicode_repr, repr, length); 01951 } 01952 01953 TESS_CHAR() { // Satisfies ELISTIZE. 01954 } 01955 ~TESS_CHAR() { 01956 delete [] unicode_repr; 01957 } 01958 }; 01959 01960 ELISTIZEH(TESS_CHAR) 01961 ELISTIZE(TESS_CHAR) 01962 01963 static void add_space(TESS_CHAR_IT* it) { 01964 TESS_CHAR *t = new TESS_CHAR(0, " "); 01965 it->add_after_then_move(t); 01966 } 01967 01968 01969 static float rating_to_cost(float rating) { 01970 rating = 100 + rating; 01971 // cuddled that to save from coverage profiler 01972 // (I have never seen ratings worse than -100, 01973 // but the check won't hurt) 01974 if (rating < 0) rating = 0; 01975 return rating; 01976 } 01977 01982 static void extract_result(TESS_CHAR_IT* out, 01983 PAGE_RES* page_res) { 01984 PAGE_RES_IT page_res_it(page_res); 01985 int word_count = 0; 01986 while (page_res_it.word() != NULL) { 01987 WERD_RES *word = page_res_it.word(); 01988 const char *str = word->best_choice->unichar_string().string(); 01989 const char *len = word->best_choice->unichar_lengths().string(); 01990 TBOX real_rect = word->word->bounding_box(); 01991 01992 if (word_count) 01993 add_space(out); 01994 int n = strlen(len); 01995 for (int i = 0; i < n; i++) { 01996 TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()), 01997 str, *len); 01998 tc->box = real_rect.intersection(word->box_word->BlobBox(i)); 01999 out->add_after_then_move(tc); 02000 str += *len; 02001 len++; 02002 } 02003 page_res_it.forward(); 02004 word_count++; 02005 } 02006 } 02007 02012 int TessBaseAPI::TesseractExtractResult(char** text, 02013 int** lengths, 02014 float** costs, 02015 int** x0, 02016 int** y0, 02017 int** x1, 02018 int** y1, 02019 PAGE_RES* page_res) { 02020 TESS_CHAR_LIST tess_chars; 02021 TESS_CHAR_IT tess_chars_it(&tess_chars); 02022 extract_result(&tess_chars_it, page_res); 02023 tess_chars_it.move_to_first(); 02024 int n = tess_chars.length(); 
02025 int text_len = 0; 02026 *lengths = new int[n]; 02027 *costs = new float[n]; 02028 *x0 = new int[n]; 02029 *y0 = new int[n]; 02030 *x1 = new int[n]; 02031 *y1 = new int[n]; 02032 int i = 0; 02033 for (tess_chars_it.mark_cycle_pt(); 02034 !tess_chars_it.cycled_list(); 02035 tess_chars_it.forward(), i++) { 02036 TESS_CHAR *tc = tess_chars_it.data(); 02037 text_len += (*lengths)[i] = tc->length; 02038 (*costs)[i] = tc->cost; 02039 (*x0)[i] = tc->box.left(); 02040 (*y0)[i] = tc->box.bottom(); 02041 (*x1)[i] = tc->box.right(); 02042 (*y1)[i] = tc->box.top(); 02043 } 02044 char *p = *text = new char[text_len]; 02045 02046 tess_chars_it.move_to_first(); 02047 for (tess_chars_it.mark_cycle_pt(); 02048 !tess_chars_it.cycled_list(); 02049 tess_chars_it.forward()) { 02050 TESS_CHAR *tc = tess_chars_it.data(); 02051 strncpy(p, tc->unicode_repr, tc->length); 02052 p += tc->length; 02053 } 02054 return n; 02055 } 02056 02058 void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob, const DENORM& denorm, 02059 INT_FEATURE_ARRAY int_features, 02060 int* num_features, 02061 int* FeatureOutlineIndex) { 02062 if (tesseract_) { 02063 tesseract_->ResetFeaturesHaveBeenExtracted(); 02064 } 02065 uinT8* norm_array = new uinT8[MAX_NUM_CLASSES]; 02066 inT32 len; 02067 *num_features = tesseract_->GetCharNormFeatures( 02068 blob, denorm, tesseract_->PreTrainedTemplates, 02069 int_features, norm_array, norm_array, &len, FeatureOutlineIndex); 02070 delete [] norm_array; 02071 } 02072 02077 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks, 02078 int left, int top, int right, int bottom) { 02079 TBOX box(left, bottom, right, top); 02080 BLOCK_IT b_it(blocks); 02081 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 02082 BLOCK* block = b_it.data(); 02083 if (!box.major_overlap(block->bounding_box())) 02084 continue; 02085 ROW_IT r_it(block->row_list()); 02086 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { 02087 ROW* row = r_it.data(); 02088 if 
(!box.major_overlap(row->bounding_box())) 02089 continue; 02090 WERD_IT w_it(row->word_list()); 02091 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 02092 WERD* word = w_it.data(); 02093 if (box.major_overlap(word->bounding_box())) 02094 return row; 02095 } 02096 } 02097 } 02098 return NULL; 02099 } 02100 02102 void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob, const DENORM& denorm, 02103 int num_max_matches, 02104 int* unichar_ids, 02105 float* ratings, 02106 int* num_matches_returned) { 02107 BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; 02108 tesseract_->AdaptiveClassifier(blob, denorm, choices, NULL); 02109 BLOB_CHOICE_IT choices_it(choices); 02110 int& index = *num_matches_returned; 02111 index = 0; 02112 for (choices_it.mark_cycle_pt(); 02113 !choices_it.cycled_list() && index < num_max_matches; 02114 choices_it.forward()) { 02115 BLOB_CHOICE* choice = choices_it.data(); 02116 unichar_ids[index] = choice->unichar_id(); 02117 ratings[index] = choice->rating(); 02118 ++index; 02119 } 02120 *num_matches_returned = index; 02121 delete choices; 02122 } 02123 02125 const char* TessBaseAPI::GetUnichar(int unichar_id) { 02126 return tesseract_->unicharset.id_to_unichar(unichar_id); 02127 } 02128 02130 const Dawg *TessBaseAPI::GetDawg(int i) const { 02131 if (tesseract_ == NULL || i >= NumDawgs()) return NULL; 02132 return tesseract_->getDict().GetDawg(i); 02133 } 02134 02136 int TessBaseAPI::NumDawgs() const { 02137 return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); 02138 } 02139 02141 CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const { 02142 return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext(); 02143 } 02144 } // namespace tesseract.