Tesseract  3.02
tesseract-ocr/api/baseapi.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        baseapi.cpp
00003  * Description: Simple API for calling tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Fri Oct 06 15:35:01 PDT 2006
00006  *
00007  * (C) Copyright 2006, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #include "allheaders.h"
00026 
00027 #ifdef USING_GETTEXT
00028 #include <libintl.h>
00029 #include <locale.h>
00030 #define _(x) gettext(x)
00031 #else
00032 #define _(x) (x)
00033 #endif
00034 
00035 #include "baseapi.h"
00036 
00037 #include "resultiterator.h"
00038 #include "mutableiterator.h"
00039 #include "thresholder.h"
00040 #include "tesseractclass.h"
00041 #include "pageres.h"
00042 #include "paragraphs.h"
00043 #include "tessvars.h"
00044 #include "control.h"
00045 #include "pgedit.h"
00046 #include "paramsd.h"
00047 #include "output.h"
00048 #include "globals.h"
00049 #include "edgblob.h"
00050 #include "equationdetect.h"
00051 #include "tessbox.h"
00052 #include "imgs.h"
00053 #include "imgtiff.h"
00054 #include "makerow.h"
00055 #include "permute.h"
00056 #include "otsuthr.h"
00057 #include "osdetect.h"
00058 #include "params.h"
00059 
00060 #if defined(_WIN32) && !defined(VERSION)
00061 #include "version.h"
00062 #endif
00063 
00064 namespace tesseract {
00065 
// Smallest width or height that TesseractRect() accepts; it returns NULL for
// anything smaller.
00067 const int kMinRectSize = 10;
// NOTE(review): not referenced in the visible part of this file; presumably
// the character emitted for an unrecognizable result -- confirm at its users.
00069 const char kTesseractReject = '~';
// NOTE(review): not referenced in the visible part of this file; presumably
// the reject marker used for UNLV-format output -- confirm at its users.
00071 const char kUNLVReject = '~';
// NOTE(review): not referenced in the visible part of this file; presumably
// the suspect marker used for UNLV-format output -- confirm at its users.
00073 const char kUNLVSuspect = '^';
// NOTE(review): not referenced in the visible part of this file; looks like a
// fallback input file name -- confirm at its users.
00078 const char* kInputFile = "noname.tif";
// File that ProcessPage() writes the current parameters to before applying a
// retry config, and reads them back from afterwards.
00080 const char* kOldVarsFile = "failed_vars.txt";
// Size of the character buffer ProcessPages() uses to format a page number.
00082 const int kMaxIntSize = 22;
// NOTE(review): not referenced in the visible part of this file; presumably
// the lower bound for a plausible image resolution -- confirm at its users.
00087 const int kMinCredibleResolution = 70;
// NOTE(review): not referenced in the visible part of this file; presumably
// the upper bound for a plausible image resolution -- confirm at its users.
00089 const int kMaxCredibleResolution = 2400;
00090 
00091 TessBaseAPI::TessBaseAPI()
00092   : tesseract_(NULL),
00093     osd_tesseract_(NULL),
00094     equ_detect_(NULL),
00095     // Thresholder is initialized to NULL here, but will be set before use by:
00096     // A constructor of a derived API,  SetThresholder(), or
00097     // created implicitly when used in InternalSetImage.
00098     thresholder_(NULL),
00099     paragraph_models_(NULL),
00100     block_list_(NULL),
00101     page_res_(NULL),
00102     input_file_(NULL),
00103     output_file_(NULL),
00104     datapath_(NULL),
00105     language_(NULL),
00106     last_oem_requested_(OEM_DEFAULT),
00107     recognition_done_(false),
00108     truth_cb_(NULL),
00109     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
00110     image_width_(0), image_height_(0) {
00111 }
00112 
00113 TessBaseAPI::~TessBaseAPI() {
00114   End();
00115 }
00116 
00120 const char* TessBaseAPI::Version() {
00121   return VERSION;
00122 }
00123 
00128 void TessBaseAPI::SetInputName(const char* name) {
00129   if (input_file_ == NULL)
00130     input_file_ = new STRING(name);
00131   else
00132     *input_file_ = name;
00133 }
00134 
00136 void TessBaseAPI::SetOutputName(const char* name) {
00137   if (output_file_ == NULL)
00138     output_file_ = new STRING(name);
00139   else
00140     *output_file_ = name;
00141 }
00142 
00143 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
00144   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00145   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
00146                               tesseract_->params());
00147 }
00148 
00149 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
00150   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00151   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00152                               tesseract_->params());
00153 }
00154 
00155 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
00156   IntParam *p = ParamUtils::FindParam<IntParam>(
00157       name, GlobalParams()->int_params, tesseract_->params()->int_params);
00158   if (p == NULL) return false;
00159   *value = (inT32)(*p);
00160   return true;
00161 }
00162 
00163 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
00164   BoolParam *p = ParamUtils::FindParam<BoolParam>(
00165       name, GlobalParams()->bool_params, tesseract_->params()->bool_params);
00166   if (p == NULL) return false;
00167   *value = (BOOL8)(*p);
00168   return true;
00169 }
00170 
00171 const char *TessBaseAPI::GetStringVariable(const char *name) const {
00172   StringParam *p = ParamUtils::FindParam<StringParam>(
00173       name, GlobalParams()->string_params, tesseract_->params()->string_params);
00174   return (p != NULL) ? p->string() : NULL;
00175 }
00176 
00177 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
00178   DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
00179       name, GlobalParams()->double_params, tesseract_->params()->double_params);
00180   if (p == NULL) return false;
00181   *value = (double)(*p);
00182   return true;
00183 }
00184 
00186 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
00187   return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
00188 }
00189 
00191 void TessBaseAPI::PrintVariables(FILE *fp) const {
00192   ParamUtils::PrintParams(fp, tesseract_->params());
00193 }
00194 
00203 int TessBaseAPI::Init(const char* datapath, const char* language,
00204                       OcrEngineMode oem, char **configs, int configs_size,
00205                       const GenericVector<STRING> *vars_vec,
00206                       const GenericVector<STRING> *vars_values,
00207                       bool set_only_non_debug_params) {
00208   // Default language is "eng".
00209   if (language == NULL) language = "eng";
00210   // If the datapath, OcrEngineMode or the language have changed - start again.
00211   // Note that the language_ field stores the last requested language that was
00212   // initialized successfully, while tesseract_->lang stores the language
00213   // actually used. They differ only if the requested language was NULL, in
00214   // which case tesseract_->lang is set to the Tesseract default ("eng").
00215   if (tesseract_ != NULL &&
00216       (datapath_ == NULL || language_ == NULL ||
00217        *datapath_ != datapath || last_oem_requested_ != oem ||
00218        (*language_ != language && tesseract_->lang != language))) {
00219     delete tesseract_;
00220     tesseract_ = NULL;
00221   }
00222 
00223   bool reset_classifier = true;
00224   if (tesseract_ == NULL) {
00225     reset_classifier = false;
00226     tesseract_ = new Tesseract;
00227     if (tesseract_->init_tesseract(
00228             datapath, output_file_ != NULL ? output_file_->string() : NULL,
00229             language, oem, configs, configs_size, vars_vec, vars_values,
00230             set_only_non_debug_params) != 0) {
00231       return -1;
00232     }
00233   }
00234   // Update datapath and language requested for the last valid initialization.
00235   if (datapath_ == NULL)
00236     datapath_ = new STRING(datapath);
00237   else
00238     *datapath_ = datapath;
00239   if (language_ == NULL)
00240     language_ = new STRING(language);
00241   else
00242     *language_ = language;
00243   last_oem_requested_ = oem;
00244 
00245   // For same language and datapath, just reset the adaptive classifier.
00246   if (reset_classifier) tesseract_->ResetAdaptiveClassifier();
00247 
00248   return 0;
00249 }
00250 
00259 const char* TessBaseAPI::GetInitLanguagesAsString() const {
00260   return (language_ == NULL || language_->string() == NULL) ?
00261       "" : language_->string();
00262 }
00263 
00269 void TessBaseAPI::GetLoadedLanguagesAsVector(
00270     GenericVector<STRING>* langs) const {
00271   langs->clear();
00272   if (tesseract_ != NULL) {
00273     langs->push_back(tesseract_->lang);
00274     int num_subs = tesseract_->num_sub_langs();
00275     for (int i = 0; i < num_subs; ++i)
00276       langs->push_back(tesseract_->get_sub_lang(i)->lang);
00277   }
00278 }
00279 
00286 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
00287   if (tesseract_ == NULL)
00288     tesseract_ = new Tesseract;
00289   return tesseract_->init_tesseract_lm(datapath, NULL, language);
00290 }
00291 
00296 void TessBaseAPI::InitForAnalysePage() {
00297   if (tesseract_ == NULL) {
00298     tesseract_ = new Tesseract;
00299     tesseract_->InitAdaptiveClassifier(false);
00300   }
00301 }
00302 
00308 void TessBaseAPI::ReadConfigFile(const char* filename) {
00309   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
00310 }
00311 
00313 void TessBaseAPI::ReadDebugConfigFile(const char* filename) {
00314   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
00315 }
00316 
00322 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
00323   if (tesseract_ == NULL)
00324     tesseract_ = new Tesseract;
00325   tesseract_->tessedit_pageseg_mode.set_value(mode);
00326 }
00327 
00329 PageSegMode TessBaseAPI::GetPageSegMode() const {
00330   if (tesseract_ == NULL)
00331     return PSM_SINGLE_BLOCK;
00332   return static_cast<PageSegMode>(
00333     static_cast<int>(tesseract_->tessedit_pageseg_mode));
00334 }
00335 
00349 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
00350                                  int bytes_per_pixel,
00351                                  int bytes_per_line,
00352                                  int left, int top,
00353                                  int width, int height) {
00354   if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
00355     return NULL;  // Nothing worth doing.
00356 
00357   // Since this original api didn't give the exact size of the image,
00358   // we have to invent a reasonable value.
00359   int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
00360   SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
00361            bytes_per_pixel, bytes_per_line);
00362   SetRectangle(left, top, width, height);
00363 
00364   return GetUTF8Text();
00365 }
00366 
00371 void TessBaseAPI::ClearAdaptiveClassifier() {
00372   if (tesseract_ == NULL)
00373     return;
00374   tesseract_->ResetAdaptiveClassifier();
00375   tesseract_->ResetDocumentDictionary();
00376 }
00377 
00387 void TessBaseAPI::SetImage(const unsigned char* imagedata,
00388                            int width, int height,
00389                            int bytes_per_pixel, int bytes_per_line) {
00390   if (InternalSetImage())
00391     thresholder_->SetImage(imagedata, width, height,
00392                            bytes_per_pixel, bytes_per_line);
00393 }
00394 
00395 void TessBaseAPI::SetSourceResolution(int ppi) {
00396   if (thresholder_)
00397     thresholder_->SetSourceYResolution(ppi);
00398   else
00399     tprintf("Please call SetImage before SetSourceResolution.\n");
00400 }
00401 
00412 void TessBaseAPI::SetImage(const Pix* pix) {
00413   if (InternalSetImage())
00414     thresholder_->SetImage(pix);
00415 }
00416 
00422 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
00423   if (thresholder_ == NULL)
00424     return;
00425   thresholder_->SetRectangle(left, top, width, height);
00426   ClearResults();
00427 }
00428 
00433 Pix* TessBaseAPI::GetThresholdedImage() {
00434   if (tesseract_ == NULL)
00435     return NULL;
00436   if (tesseract_->pix_binary() == NULL)
00437     Threshold(tesseract_->mutable_pix_binary());
00438   return pixClone(tesseract_->pix_binary());
00439 }
00440 
00446 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
00447   return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
00448 }
00449 
00456 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
00457   return GetComponentImages(RIL_TEXTLINE, true, pixa, blockids);
00458 }
00459 
00468 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
00469   return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
00470 }
00471 
00477 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
00478   return GetComponentImages(RIL_WORD, true, pixa, NULL);
00479 }
00480 
00487 Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) {
00488   return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
00489 }
00490 
00499 Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
00500                                       bool text_only,
00501                                       Pixa** pixa, int** blockids) {
00502   PageIterator* page_it = GetIterator();
00503   if (page_it == NULL)
00504     page_it = AnalyseLayout();
00505   if (page_it == NULL)
00506     return NULL;  // Failed.
00507 
00508   // Count the components to get a size for the arrays.
00509   int component_count = 0;
00510   int left, top, right, bottom;
00511   do {
00512     if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
00513         (!text_only || PTIsTextType(page_it->BlockType())))
00514       ++component_count;
00515   } while (page_it->Next(level));
00516 
00517   Boxa* boxa = boxaCreate(component_count);
00518   if (pixa != NULL)
00519     *pixa = pixaCreate(component_count);
00520   if (blockids != NULL)
00521     *blockids = new int[component_count];
00522 
00523   int blockid = 0;
00524   int component_index = 0;
00525   page_it->Begin();
00526   do {
00527     if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
00528         (!text_only || PTIsTextType(page_it->BlockType()))) {
00529       Box* lbox = boxCreate(left, top, right - left, bottom - top);
00530       boxaAddBox(boxa, lbox, L_INSERT);
00531       if (pixa != NULL) {
00532         Pix* pix = page_it->GetBinaryImage(level);
00533         pixaAddPix(*pixa, pix, L_INSERT);
00534         pixaAddBox(*pixa, lbox, L_CLONE);
00535       }
00536       if (blockids != NULL) {
00537         (*blockids)[component_index] = blockid;
00538         if (page_it->IsAtFinalElement(RIL_BLOCK, level))
00539           ++blockid;
00540       }
00541       ++component_index;
00542     }
00543   } while (page_it->Next(level));
00544   delete page_it;
00545   return boxa;
00546 }
00547 
00548 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
00549   if (thresholder_ == NULL) {
00550     return 0;
00551   }
00552   return thresholder_->GetScaleFactor();
00553 }
00554 
00556 void TessBaseAPI::DumpPGM(const char* filename) {
00557   if (tesseract_ == NULL)
00558     return;
00559   FILE *fp = fopen(filename, "wb");
00560   Pix* pix = tesseract_->pix_binary();
00561   int width = pixGetWidth(pix);
00562   int height = pixGetHeight(pix);
00563   l_uint32* data = pixGetData(pix);
00564   fprintf(fp, "P5 %d %d 255\n", width, height);
00565   for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
00566     for (int x = 0; x < width; ++x) {
00567       uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255;
00568       fwrite(&b, 1, 1, fp);
00569     }
00570   }
00571   fclose(fp);
00572 }
00573 
00580 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
00581                 Boxa* boxa_words, Pixa* pixa_words,
00582                 const FCOORD& reskew, Pix* page_pix,
00583                 PAGE_RES* page_res) {
00584   int block_count = boxaGetCount(boxa_blocks);
00585   ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
00586   // Write each block to the current directory as junk_write_display.nnn.png.
00587   for (int i = 0; i < block_count; ++i) {
00588     Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
00589     pixDisplayWrite(pix, 1);
00590   }
00591   int word_count = boxaGetCount(boxa_words);
00592   ASSERT_HOST(word_count == pixaGetCount(pixa_words));
00593   int pr_word = 0;
00594   PAGE_RES_IT page_res_it(page_res);
00595   for (page_res_it.restart_page(); page_res_it.word () != NULL;
00596        page_res_it.forward(), ++pr_word) {
00597     WERD_RES *word = page_res_it.word();
00598     WERD_CHOICE* choice = word->best_choice;
00599     // Write the first 100 words to files names wordims/<wordstring>.tif.
00600     if (pr_word < 100) {
00601       STRING filename("wordims/");
00602       if (choice != NULL) {
00603         filename += choice->unichar_string();
00604       } else {
00605         char numbuf[32];
00606         filename += "unclassified";
00607         snprintf(numbuf, 32, "%03d", pr_word);
00608         filename += numbuf;
00609       }
00610       filename += ".tif";
00611       Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
00612       pixWrite(filename.string(), pix, IFF_TIFF_G4);
00613     }
00614   }
00615   ASSERT_HOST(pr_word == word_count);
00616   return 0;
00617 }
00618 
00630 PageIterator* TessBaseAPI::AnalyseLayout() {
00631   if (FindLines() == 0) {
00632     if (block_list_->empty())
00633       return NULL;  // The page was empty.
00634     page_res_ = new PAGE_RES(block_list_, NULL);
00635     return new PageIterator(page_res_, tesseract_,
00636                             thresholder_->GetScaleFactor(),
00637                             thresholder_->GetScaledYResolution(),
00638                             rect_left_, rect_top_, rect_width_, rect_height_);
00639   }
00640   return NULL;
00641 }
00642 
// Runs layout analysis (FindLines) followed by the processing mode selected
// by the tessedit_* parameters, leaving the results in page_res_.
//   monitor - passed through to recog_training_segmented() / recog_all_words().
// Returns 0 on success and -1 if there is no Tesseract instance, FindLines()
// fails, interactive display mode was run, or recog_all_words() fails.
// NOTE(review): input_file_ and output_file_ are dereferenced below without a
// NULL check in the box/training modes -- callers presumably must have called
// SetInputName()/SetOutputName() first; confirm.
00647 int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
00648   if (tesseract_ == NULL)
00649     return -1;
00650   if (FindLines() != 0)
00651     return -1;
        // Discard the results of any earlier run before building new ones.
00652   if (page_res_ != NULL)
00653     delete page_res_;
00654 
00655   tesseract_->SetBlackAndWhitelist();
        // Set before any recognition actually runs, so it stays true even if
        // one of the steps below fails.
00656   recognition_done_ = true;
        // Build page_res_ either from box files (line- or character-level
        // resegmentation) or directly from the block list.
00657   if (tesseract_->tessedit_resegment_from_line_boxes)
00658     page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
00659   else if (tesseract_->tessedit_resegment_from_boxes)
00660     page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
00661   else
00662     page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
        // In this mode only CorrectClassifyWords() runs; no recognition.
00663   if (tesseract_->tessedit_make_boxes_from_boxes) {
00664     tesseract_->CorrectClassifyWords(page_res_);
00665     return 0;
00666   }
00667 
        // When a truth callback is installed, turn on the blamer and let the
        // callback run on the fresh page_res_.
00668   if (truth_cb_ != NULL) {
00669     tesseract_->wordrec_run_blamer.set_value(true);
00670     truth_cb_->Run(tesseract_->getDict().getUnicharset(),
00671                    image_height_, page_res_);
00672   }
00673 
00674   int result = 0;
00675   if (tesseract_->interactive_display_mode) {
00676     #ifndef GRAPHICS_DISABLED
00677     tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
00678     #endif  // GRAPHICS_DISABLED
00679     // The page_res is invalid after an interactive session, so cleanup
00680     // in a way that lets us continue to the next page without crashing.
00681     delete page_res_;
00682     page_res_ = NULL;
00683     return -1;
00684   } else if (tesseract_->tessedit_train_from_boxes) {
00685     tesseract_->ApplyBoxTraining(*output_file_, page_res_);
00686   } else if (tesseract_->tessedit_ambigs_training) {
00687     FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
00688     // OCR the page segmented into words by tesseract.
00689     tesseract_->recog_training_segmented(
00690         *input_file_, page_res_, monitor, training_output_file);
00691     fclose(training_output_file);
00692   } else {
00693     // Now run the main recognition.
00694     if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
              // paragraph_debug_level stays 0 if the parameter lookup fails.
00695       int paragraph_debug_level = 0;
00696       GetIntVariable("paragraph_debug_level", &paragraph_debug_level);
00697       DetectParagraphs(paragraph_debug_level);
00698     } else {
00699       result = -1;
00700     }
00701   }
00702   return result;
00703 }
00704 
00706 int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
00707   if (tesseract_ == NULL)
00708     return -1;
00709   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
00710     tprintf("Please call SetImage before attempting recognition.");
00711     return -1;
00712   }
00713   if (page_res_ != NULL)
00714     ClearResults();
00715   if (FindLines() != 0)
00716     return -1;
00717   // Additional conditions under which chopper test cannot be run
00718   if (tesseract_->interactive_display_mode) return -1;
00719 
00720   recognition_done_ = true;
00721 
00722   page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_));
00723 
00724   PAGE_RES_IT page_res_it(page_res_);
00725 
00726   while (page_res_it.word() != NULL) {
00727     WERD_RES *word_res = page_res_it.word();
00728     GenericVector<TBOX> boxes;
00729     tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
00730                                   page_res_it.row()->row, word_res);
00731     page_res_it.forward();
00732   }
00733   return 0;
00734 }
00735 
// Recognizes all pages of `filename` and accumulates the output in *text_out
// (which is overwritten first). The file is tried, in this order, as:
//   1. a TIFF file (CountTiffPages() > 0), read page by page,
//   2. any other image readable by pixRead(),
//   3. a text file listing one image file name per line.
// retry_config and timeout_millisec are passed through to ProcessPage().
// When tessedit_create_hocr is set the output is wrapped in an HTML
// header and footer.
// Returns false if a file cannot be opened or read, or if ProcessPage()
// failed for any page; true otherwise.
// NOTE(review): tesseract_ is dereferenced without a NULL check, and the
// early `return false` paths in the file-list branch skip the hOCR footer.
00752 bool TessBaseAPI::ProcessPages(const char* filename,
00753                                const char* retry_config, int timeout_millisec,
00754                                STRING* text_out) {
        // First page to process; negative values are treated as 0.
00755   int page = tesseract_->tessedit_page_number;
00756   if (page < 0)
00757     page = 0;
00758   FILE* fp = fopen(filename, "rb");
00759   if (fp == NULL) {
00760     tprintf(_("Image file %s cannot be opened!\n"), filename);
00761     return false;
00762   }
00763   // Find the number of pages if a tiff file, or zero otherwise.
00764   int npages = CountTiffPages(fp);
00765   fclose(fp);
00766 
00767   if (tesseract_->tessedit_create_hocr) {
00768     *text_out =
00769         "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
00770         " \"http://www.w3.org/TR/html4/loose.dtd\">\n"
00771         "<html>\n<head>\n<title></title>\n"
00772         "<meta http-equiv=\"Content-Type\" content=\"text/html;"
00773         "charset=utf-8\" />\n<meta name='ocr-system' content='tesseract'/>\n"
00774         "</head>\n<body>\n";
00775   } else {
00776     *text_out = "";
00777   }
00778 
00779   bool success = true;
00780   Pix *pix;
00781   if (npages > 0) {
          // TIFF: read pages until one fails to load or the last is reached.
00782     for (; page < npages && (pix = pixReadTiff(filename, page)) != NULL;
00783          ++page) {
00784       if ((page >= 0) && (npages > 1))
00785         tprintf(_("Page %d of %d\n"), page + 1, npages);
            // Publish the current page index through the applybox_page
            // variable.
00786       char page_str[kMaxIntSize];
00787       snprintf(page_str, kMaxIntSize - 1, "%d", page);
00788       SetVariable("applybox_page", page_str);
00789       success &= ProcessPage(pix, page, filename, retry_config,
00790                              timeout_millisec, text_out);
00791       pixDestroy(&pix);
            // Stop after one page when a specific page was requested or the
            // file holds only a single page.
00792       if (tesseract_->tessedit_page_number >= 0 || npages == 1) {
00793         break;
00794       }
00795     }
00796   } else {
00797     // The file is not a tiff file, so use the general pixRead function.
00798     pix = pixRead(filename);
00799     if (pix != NULL) {
00800       success &= ProcessPage(pix, 0, filename, retry_config,
00801                              timeout_millisec, text_out);
00802       pixDestroy(&pix);
00803     } else {
00804       // The file is not an image file, so try it as a list of filenames.
00805       FILE* fimg = fopen(filename, "rb");
00806       if (fimg == NULL) {
00807         tprintf(_("File %s cannot be opened!\n"), filename);
00808         return false;
00809       }
00810       tprintf(_("Reading %s as a list of filenames...\n"), filename);
00811       char pagename[MAX_PATH];
00812       // Skip to the requested page number.
00813       for (int i = 0; i < page &&
00814            fgets(pagename, sizeof(pagename), fimg) != NULL;
00815            ++i);
            // Process every remaining line as an image file name; the first
            // unreadable image aborts the whole run.
00816       while (fgets(pagename, sizeof(pagename), fimg) != NULL) {
00817         chomp_string(pagename);
00818         pix = pixRead(pagename);
00819         if (pix == NULL) {
00820           tprintf(_("Image file %s cannot be read!\n"), pagename);
00821           fclose(fimg);
00822           return false;
00823         }
00824         tprintf(_("Page %d : %s\n"), page, pagename);
00825         success &= ProcessPage(pix, page, pagename, retry_config,
00826                                timeout_millisec, text_out);
00827         pixDestroy(&pix);
00828         ++page;
00829       }
00830       fclose(fimg);
00831     }
00832   }
00833   if (tesseract_->tessedit_create_hocr)
00834     *text_out += "</body>\n</html>\n";
00835   return success;
00836 }
00837 
00849 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
00850                               const char* retry_config, int timeout_millisec,
00851                               STRING* text_out) {
00852   SetInputName(filename);
00853   SetImage(pix);
00854   bool failed = false;
00855   if (timeout_millisec > 0) {
00856     // Running with a timeout.
00857     ETEXT_DESC monitor;
00858     monitor.cancel = NULL;
00859     monitor.cancel_this = NULL;
00860     monitor.set_deadline_msecs(timeout_millisec);
00861     // Now run the main recognition.
00862     failed = Recognize(&monitor) < 0;
00863   } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY ||
00864              tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
00865     // Disabled character recognition.
00866     PageIterator* it = AnalyseLayout();
00867     if (it == NULL) {
00868       failed = true;
00869     } else {
00870       delete it;
00871       return true;
00872     }
00873   } else {
00874     // Normal layout and character recognition with no timeout.
00875     failed = Recognize(NULL) < 0;
00876   }
00877   if (tesseract_->tessedit_write_images) {
00878     Pix* page_pix = GetThresholdedImage();
00879     pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
00880   }
00881   if (failed && retry_config != NULL && retry_config[0] != '\0') {
00882     // Save current config variables before switching modes.
00883     FILE* fp = fopen(kOldVarsFile, "wb");
00884     PrintVariables(fp);
00885     fclose(fp);
00886     // Switch to alternate mode for retry.
00887     ReadConfigFile(retry_config);
00888     SetImage(pix);
00889     Recognize(NULL);
00890     // Restore saved config variables.
00891     ReadConfigFile(kOldVarsFile);
00892   }
00893   // Get text only if successful.
00894   if (!failed) {
00895     char* text;
00896     if (tesseract_->tessedit_create_boxfile ||
00897         tesseract_->tessedit_make_boxes_from_boxes) {
00898       text = GetBoxText(page_index);
00899     } else if (tesseract_->tessedit_write_unlv) {
00900       text = GetUNLVText();
00901     } else if (tesseract_->tessedit_create_hocr) {
00902       text = GetHOCRText(page_index);
00903     } else {
00904       text = GetUTF8Text();
00905     }
00906     *text_out += text;
00907     delete [] text;
00908     return true;
00909   }
00910   return false;
00911 }
00912 
00917 LTRResultIterator* TessBaseAPI::GetLTRIterator() {
00918   if (tesseract_ == NULL || page_res_ == NULL)
00919     return NULL;
00920   return new LTRResultIterator(
00921       page_res_, tesseract_,
00922       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
00923       rect_left_, rect_top_, rect_width_, rect_height_);
00924 }
00925 
00934 ResultIterator* TessBaseAPI::GetIterator() {
00935   if (tesseract_ == NULL || page_res_ == NULL)
00936     return NULL;
00937   return ResultIterator::StartOfParagraph(LTRResultIterator(
00938       page_res_, tesseract_,
00939       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
00940       rect_left_, rect_top_, rect_width_, rect_height_));
00941 }
00942 
00951 MutableIterator* TessBaseAPI::GetMutableIterator() {
00952   if (tesseract_ == NULL || page_res_ == NULL)
00953     return NULL;
00954   return new MutableIterator(page_res_, tesseract_,
00955                              thresholder_->GetScaleFactor(),
00956                              thresholder_->GetScaledYResolution(),
00957                              rect_left_, rect_top_, rect_width_, rect_height_);
00958 }
00959 
00961 char* TessBaseAPI::GetUTF8Text() {
00962   if (tesseract_ == NULL ||
00963       (!recognition_done_ && Recognize(NULL) < 0))
00964     return NULL;
00965   STRING text("");
00966   ResultIterator *it = GetIterator();
00967   do {
00968     if (it->Empty(RIL_PARA)) continue;
00969     char *para_text = it->GetUTF8Text(RIL_PARA);
00970     text += para_text;
00971     delete []para_text;
00972   } while (it->Next(RIL_PARA));
00973   char* result = new char[text.length() + 1];
00974   strncpy(result, text.string(), text.length() + 1);
00975   delete it;
00976   return result;
00977 }
00978 
00979 static void AddBoxTohOCR(const PageIterator *it,
00980                          PageIteratorLevel level,
00981                          STRING* hocr_str) {
00982   int left, top, right, bottom;
00983   it->BoundingBox(level, &left, &top, &right, &bottom);
00984   hocr_str->add_str_int("' title=\"bbox ", left);
00985   hocr_str->add_str_int(" ", top);
00986   hocr_str->add_str_int(" ", right);
00987   hocr_str->add_str_int(" ", bottom);
00988   *hocr_str += "\">";
00989 }
00990 
00999 char* TessBaseAPI::GetHOCRText(int page_number) {
01000   if (tesseract_ == NULL ||
01001       (page_res_ == NULL && Recognize(NULL) < 0))
01002     return NULL;
01003 
01004   int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
01005   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
01006 
01007   STRING hocr_str("");
01008 
01009   if (input_file_ == NULL)
01010       SetInputName(NULL);
01011 
01012   hocr_str.add_str_int("<div class='ocr_page' id='page_", page_id);
01013   hocr_str += "' title='image \"";
01014   hocr_str += input_file_ ? *input_file_ : "unknown";
01015   hocr_str.add_str_int("\"; bbox ", rect_left_);
01016   hocr_str.add_str_int(" ", rect_top_);
01017   hocr_str.add_str_int(" ", rect_width_);
01018   hocr_str.add_str_int(" ", rect_height_);
01019   hocr_str += "'>\n";
01020 
01021   ResultIterator *res_it = GetIterator();
01022   for (; !res_it->Empty(RIL_BLOCK); wcnt++) {
01023     if (res_it->Empty(RIL_WORD)) {
01024       res_it->Next(RIL_WORD);
01025       continue;
01026     }
01027 
01028     // Open any new block/paragraph/textline.
01029     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
01030       hocr_str.add_str_int("<div class='ocr_carea' id='block_", bcnt);
01031       hocr_str.add_str_int("_", bcnt);
01032       AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
01033     }
01034     if (res_it->IsAtBeginningOf(RIL_PARA)) {
01035       if (res_it->ParagraphIsLtr()) {
01036         hocr_str.add_str_int("\n<p class='ocr_par' dir='ltr' id='par_", pcnt);
01037       } else {
01038         hocr_str.add_str_int("\n<p class='ocr_par' dir='rtl' id='par_", pcnt);
01039       }
01040       AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
01041     }
01042     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
01043       hocr_str.add_str_int("<span class='ocr_line' id='line_", lcnt);
01044       AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
01045     }
01046 
01047     // Now, process the word...
01048     hocr_str.add_str_int("<span class='ocr_word' id='word_", wcnt);
01049     AddBoxTohOCR(res_it, RIL_WORD, &hocr_str);
01050     const char *font_name;
01051     bool bold, italic, underlined, monospace, serif, smallcaps;
01052     int pointsize, font_id;
01053     font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
01054                                            &monospace, &serif, &smallcaps,
01055                                            &pointsize, &font_id);
01056     bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
01057     bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
01058     bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
01059     if (bold) hocr_str += "<strong>";
01060     if (italic) hocr_str += "<em>";
01061     do {
01062       const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
01063       if (grapheme && grapheme[0] != 0) {
01064         if (grapheme[1] == 0) {
01065           switch (grapheme[0]) {
01066             case '<': hocr_str += "&lt;"; break;
01067             case '>': hocr_str += "&gt;"; break;
01068             case '&': hocr_str += "&amp;"; break;
01069             case '"': hocr_str += "&quot;"; break;
01070             case '\'': hocr_str += "&#39;"; break;
01071             default: hocr_str += grapheme;
01072           }
01073         } else {
01074           hocr_str += grapheme;
01075         }
01076       }
01077       delete []grapheme;
01078       res_it->Next(RIL_SYMBOL);
01079     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
01080     if (italic) hocr_str += "</em>";
01081     if (bold) hocr_str += "</strong>";
01082     hocr_str += "</span> ";
01083     wcnt++;
01084     // Close any ending block/paragraph/textline.
01085     if (last_word_in_line) {
01086       hocr_str += "</span>\n";
01087       lcnt++;
01088     }
01089     if (last_word_in_para) {
01090       hocr_str += "</p>\n";
01091       pcnt++;
01092     }
01093     if (last_word_in_block) {
01094       hocr_str += "</div>\n";
01095       bcnt++;
01096     }
01097   }
01098   hocr_str += "</div>\n";
01099 
01100   char *ret = new char[hocr_str.length() + 1];
01101   strcpy(ret, hocr_str.string());
01102   delete res_it;
01103   return ret;
01104 }
01105 
/** Number of integer fields that follow the symbol text on a box-file line. */
const int kNumbersPerBlob = 5;
/** Bytes budgeted for each of those integers (separator not included). */
const int kBytesPerNumber = 5;
/**
 * Bytes budgeted per blob: every number plus one separator byte each, and
 * one further byte.
 */
const int kBytesPerBlob =
    kNumbersPerBlob * kBytesPerNumber + kNumbersPerBlob + 1;
/** The same budget, under the name used when sizing box-file output. */
const int kBytesPerBoxFileLine = kBytesPerBlob;
/**
 * Worst-case bytes budgeted for a single number; presumably the decimal
 * width of a 64-bit value -- TODO confirm.
 */
const int kBytesPer64BitNumber = 20;
01128 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
01129     UNICHAR_LEN;
01130 
01136 char* TessBaseAPI::GetBoxText(int page_number) {
01137   if (tesseract_ == NULL ||
01138       (!recognition_done_ && Recognize(NULL) < 0))
01139     return NULL;
01140   int blob_count;
01141   int utf8_length = TextLength(&blob_count);
01142   int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
01143       kMaxBytesPerLine;
01144   char* result = new char[total_length];
01145   int output_length = 0;
01146   LTRResultIterator* it = GetLTRIterator();
01147   do {
01148     int left, top, right, bottom;
01149     if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
01150       char* text = it->GetUTF8Text(RIL_SYMBOL);
01151       // Tesseract uses space for recognition failure. Fix to a reject
01152       // character, kTesseractReject so we don't create illegal box files.
01153       for (int i = 0; text[i] != '\0'; ++i) {
01154         if (text[i] == ' ')
01155           text[i] = kTesseractReject;
01156       }
01157       snprintf(result + output_length, total_length - output_length,
01158                "%s %d %d %d %d %d\n",
01159                text, left, image_height_ - bottom,
01160                right, image_height_ - top, page_number);
01161       output_length += strlen(result + output_length);
01162       delete [] text;
01163       // Just in case...
01164       if (output_length + kMaxBytesPerLine > total_length)
01165         break;
01166     }
01167   } while (it->Next(RIL_SYMBOL));
01168   delete it;
01169   return result;
01170 }
01171 
/**
 * Unicode code points that are replaced before Latin-1 output. The list is
 * terminated by 0 and is index-aligned with kLatinChs.
 */
const int kUniChs[] = {
  0x20ac,  // euro sign
  0x201c,  // left double quotation mark
  0x201d,  // right double quotation mark
  0x2018,  // left single quotation mark
  0x2019,  // right single quotation mark
  0x2022,  // bullet
  0x2014,  // em dash
  0        // terminator
};
/**
 * Replacement for the entry at the same index in kUniChs; every value is
 * at most 0xff. Terminated by 0.
 */
const int kLatinChs[] = {
  0x00a2,  // cent sign
  0x0022,  // quotation mark
  0x0022,  // quotation mark
  0x0027,  // apostrophe
  0x0027,  // apostrophe
  0x00b7,  // middle dot
  0x002d,  // hyphen-minus
  0        // terminator
};
01184 
01190 char* TessBaseAPI::GetUNLVText() {
01191   if (tesseract_ == NULL ||
01192       (!recognition_done_ && Recognize(NULL) < 0))
01193     return NULL;
01194   bool tilde_crunch_written = false;
01195   bool last_char_was_newline = true;
01196   bool last_char_was_tilde = false;
01197 
01198   int total_length = TextLength(NULL);
01199   PAGE_RES_IT   page_res_it(page_res_);
01200   char* result = new char[total_length];
01201   char* ptr = result;
01202   for (page_res_it.restart_page(); page_res_it.word () != NULL;
01203        page_res_it.forward()) {
01204     WERD_RES *word = page_res_it.word();
01205     // Process the current word.
01206     if (word->unlv_crunch_mode != CR_NONE) {
01207       if (word->unlv_crunch_mode != CR_DELETE &&
01208           (!tilde_crunch_written ||
01209            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
01210             word->word->space() > 0 &&
01211             !word->word->flag(W_FUZZY_NON) &&
01212             !word->word->flag(W_FUZZY_SP)))) {
01213         if (!word->word->flag(W_BOL) &&
01214             word->word->space() > 0 &&
01215             !word->word->flag(W_FUZZY_NON) &&
01216             !word->word->flag(W_FUZZY_SP)) {
01217           /* Write a space to separate from preceeding good text */
01218           *ptr++ = ' ';
01219           last_char_was_tilde = false;
01220         }
01221         if (!last_char_was_tilde) {
01222           // Write a reject char.
01223           last_char_was_tilde = true;
01224           *ptr++ = kUNLVReject;
01225           tilde_crunch_written = true;
01226           last_char_was_newline = false;
01227         }
01228       }
01229     } else {
01230       // NORMAL PROCESSING of non tilde crunched words.
01231       tilde_crunch_written = false;
01232       tesseract_->set_unlv_suspects(word);
01233       const char* wordstr = word->best_choice->unichar_string().string();
01234       const STRING& lengths = word->best_choice->unichar_lengths();
01235       int length = lengths.length();
01236       int i = 0;
01237       int offset = 0;
01238 
01239       if (last_char_was_tilde &&
01240           word->word->space() == 0 && wordstr[offset] == ' ') {
01241         // Prevent adjacent tilde across words - we know that adjacent tildes
01242         // within words have been removed.
01243         // Skip the first character.
01244         offset = lengths[i++];
01245       }
01246       if (i < length && wordstr[offset] != 0) {
01247         if (!last_char_was_newline)
01248           *ptr++ = ' ';
01249         else
01250           last_char_was_newline = false;
01251         for (; i < length; offset += lengths[i++]) {
01252           if (wordstr[offset] == ' ' ||
01253               wordstr[offset] == kTesseractReject) {
01254             *ptr++ = kUNLVReject;
01255             last_char_was_tilde = true;
01256           } else {
01257             if (word->reject_map[i].rejected())
01258               *ptr++ = kUNLVSuspect;
01259             UNICHAR ch(wordstr + offset, lengths[i]);
01260             int uni_ch = ch.first_uni();
01261             for (int j = 0; kUniChs[j] != 0; ++j) {
01262               if (kUniChs[j] == uni_ch) {
01263                 uni_ch = kLatinChs[j];
01264                 break;
01265               }
01266             }
01267             if (uni_ch <= 0xff) {
01268               *ptr++ = static_cast<char>(uni_ch);
01269               last_char_was_tilde = false;
01270             } else {
01271               *ptr++ = kUNLVReject;
01272               last_char_was_tilde = true;
01273             }
01274           }
01275         }
01276       }
01277     }
01278     if (word->word->flag(W_EOL) && !last_char_was_newline) {
01279       /* Add a new line output */
01280       *ptr++ = '\n';
01281       tilde_crunch_written = false;
01282       last_char_was_newline = true;
01283       last_char_was_tilde = false;
01284     }
01285   }
01286   *ptr++ = '\n';
01287   *ptr = '\0';
01288   return result;
01289 }
01290 
01292 int TessBaseAPI::MeanTextConf() {
01293   int* conf = AllWordConfidences();
01294   if (!conf) return 0;
01295   int sum = 0;
01296   int *pt = conf;
01297   while (*pt >= 0) sum += *pt++;
01298   if (pt != conf) sum /= pt - conf;
01299   delete [] conf;
01300   return sum;
01301 }
01302 
01304 int* TessBaseAPI::AllWordConfidences() {
01305   if (tesseract_ == NULL ||
01306       (!recognition_done_ && Recognize(NULL) < 0))
01307     return NULL;
01308   int n_word = 0;
01309   PAGE_RES_IT res_it(page_res_);
01310   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
01311     n_word++;
01312 
01313   int* conf = new int[n_word+1];
01314   n_word = 0;
01315   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
01316     WERD_RES *word = res_it.word();
01317     WERD_CHOICE* choice = word->best_choice;
01318     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
01319                  // This is the eq for converting Tesseract confidence to 1..100
01320     if (w_conf < 0) w_conf = 0;
01321     if (w_conf > 100) w_conf = 100;
01322     conf[n_word++] = w_conf;
01323   }
01324   conf[n_word] = -1;
01325   return conf;
01326 }
01327 
01338 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
01339   int debug = 0;
01340   GetIntVariable("applybox_debug", &debug);
01341   bool success = true;
01342   PageSegMode current_psm = GetPageSegMode();
01343   SetPageSegMode(mode);
01344   SetVariable("classify_enable_learning", "0");
01345   char* text = GetUTF8Text();
01346   if (debug) {
01347     tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr);
01348   }
01349   if (text != NULL) {
01350     PAGE_RES_IT it(page_res_);
01351     WERD_RES* word_res = it.word();
01352     if (word_res != NULL) {
01353       word_res->word->set_text(wordstr);
01354     } else {
01355       success = false;
01356     }
01357     // Check to see if text matches wordstr.
01358     int w = 0;
01359     int t = 0;
01360     for (t = 0; text[t] != '\0'; ++t) {
01361       if (text[t] == '\n' || text[t] == ' ')
01362         continue;
01363       while (wordstr[w] != '\0' && wordstr[w] == ' ')
01364         ++w;
01365       if (text[t] != wordstr[w])
01366         break;
01367       ++w;
01368     }
01369     if (text[t] != '\0' || wordstr[w] != '\0') {
01370       // No match.
01371       delete page_res_;
01372       GenericVector<TBOX> boxes;
01373       page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
01374       tesseract_->ReSegmentByClassification(page_res_);
01375       tesseract_->TidyUp(page_res_);
01376       PAGE_RES_IT pr_it(page_res_);
01377       if (pr_it.word() == NULL)
01378         success = false;
01379       else
01380         word_res = pr_it.word();
01381     } else {
01382       word_res->BestChoiceToCorrectText();
01383     }
01384     if (success) {
01385       tesseract_->EnableLearning = true;
01386       tesseract_->LearnWord(NULL, NULL, word_res);
01387     }
01388     delete [] text;
01389   } else {
01390     success = false;
01391   }
01392   SetPageSegMode(current_psm);
01393   return success;
01394 }
01395 
01402 void TessBaseAPI::Clear() {
01403   if (thresholder_ != NULL)
01404     thresholder_->Clear();
01405   ClearResults();
01406 }
01407 
01414 void TessBaseAPI::End() {
01415   if (thresholder_ != NULL) {
01416     delete thresholder_;
01417     thresholder_ = NULL;
01418   }
01419   if (page_res_ != NULL) {
01420     delete page_res_;
01421     page_res_ = NULL;
01422   }
01423   if (block_list_ != NULL) {
01424     delete block_list_;
01425     block_list_ = NULL;
01426   }
01427   if (paragraph_models_ != NULL) {
01428     paragraph_models_->delete_data_pointers();
01429     delete paragraph_models_;
01430     paragraph_models_ = NULL;
01431   }
01432   if (tesseract_ != NULL) {
01433     delete tesseract_;
01434     if (osd_tesseract_ == tesseract_)
01435       osd_tesseract_ = NULL;
01436     tesseract_ = NULL;
01437   }
01438   if (osd_tesseract_ != NULL) {
01439     delete osd_tesseract_;
01440     osd_tesseract_ = NULL;
01441   }
01442   if (equ_detect_ != NULL) {
01443     delete equ_detect_;
01444     equ_detect_ = NULL;
01445   }
01446   if (input_file_ != NULL) {
01447     delete input_file_;
01448     input_file_ = NULL;
01449   }
01450   if (output_file_ != NULL) {
01451     delete output_file_;
01452     output_file_ = NULL;
01453   }
01454   if (datapath_ != NULL) {
01455     delete datapath_;
01456     datapath_ = NULL;
01457   }
01458   if (language_ != NULL) {
01459     delete language_;
01460     language_ = NULL;
01461   }
01462 }
01463 
01468 int TessBaseAPI::IsValidWord(const char *word) {
01469   return tesseract_->getDict().valid_word(word);
01470 }
01471 
01472 
01473 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
01474   if (page_res_ == NULL)
01475     FindLines();
01476   if (block_list_->length() < 1) {
01477     return false;
01478   }
01479 
01480   // Get first block
01481   BLOCK_IT block_it(block_list_);
01482   block_it.move_to_first();
01483   ROW_LIST* rows = block_it.data()->row_list();
01484   if (rows->length() < 1) {
01485     return false;
01486   }
01487 
01488   // Get first line of block
01489   ROW_IT row_it(rows);
01490   row_it.move_to_first();
01491   ROW* row = row_it.data();
01492 
01493   // Calculate offset and slope (NOTE: Kind of ugly)
01494   *out_offset = static_cast<int>(row->base_line(0.0));
01495   *out_slope = row->base_line(1.0) - row->base_line(0.0);
01496 
01497   return true;
01498 }
01499 
01501 void TessBaseAPI::SetDictFunc(DictFunc f) {
01502   if (tesseract_ != NULL) {
01503     tesseract_->getDict().letter_is_okay_ = f;
01504   }
01505 }
01506 
01511 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
01512   if (tesseract_ != NULL) {
01513     tesseract_->getDict().probability_in_context_ = f;
01514     // Set it for the sublangs too.
01515     int num_subs = tesseract_->num_sub_langs();
01516     for (int i = 0; i < num_subs; ++i) {
01517       tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
01518     }
01519   }
01520 }
01521 
01523 void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) {
01524   if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
01525 }
01526 
01528 bool TessBaseAPI::InternalSetImage() {
01529   if (tesseract_ == NULL) {
01530     tprintf("Please call Init before attempting to send an image.");
01531     return false;
01532   }
01533   if (thresholder_ == NULL)
01534     thresholder_ = new ImageThresholder;
01535   ClearResults();
01536   return true;
01537 }
01538 
01545 void TessBaseAPI::Threshold(Pix** pix) {
01546   ASSERT_HOST(pix != NULL);
01547   if (!thresholder_->IsBinary()) {
01548     tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
01549   }
01550   if (*pix != NULL)
01551     pixDestroy(pix);
01552   // Zero resolution messes up the algorithms, so make sure it is credible.
01553   int y_res = thresholder_->GetScaledYResolution();
01554   if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
01555     // Use the minimum default resolution, as it is safer to under-estimate
01556     // than over-estimate resolution.
01557     thresholder_->SetSourceYResolution(kMinCredibleResolution);
01558   }
01559   thresholder_->ThresholdToPix(pix);
01560   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
01561                               &rect_width_, &rect_height_,
01562                               &image_width_, &image_height_);
01563   // Set the internal resolution that is used for layout parameters from the
01564   // estimated resolution, rather than the image resolution, which may be
01565   // fabricated, but we will use the image resolution, if there is one, to
01566   // report output point sizes.
01567   int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
01568                                   kMinCredibleResolution,
01569                                   kMaxCredibleResolution);
01570   if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
01571     tprintf("Estimated resolution %d out of range! Corrected to %d\n",
01572             thresholder_->GetScaledEstimatedResolution(), estimated_res);
01573   }
01574   tesseract_->set_source_resolution(estimated_res);
01575 }
01576 
01578 int TessBaseAPI::FindLines() {
01579   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
01580     tprintf("Please call SetImage before attempting recognition.");
01581     return -1;
01582   }
01583   if (recognition_done_)
01584     ClearResults();
01585   if (!block_list_->empty()) {
01586     return 0;
01587   }
01588   if (tesseract_ == NULL) {
01589     tesseract_ = new Tesseract;
01590     tesseract_->InitAdaptiveClassifier(false);
01591   }
01592   if (tesseract_->pix_binary() == NULL)
01593     Threshold(tesseract_->mutable_pix_binary());
01594   if (tesseract_->ImageWidth() > MAX_INT16 ||
01595       tesseract_->ImageHeight() > MAX_INT16) {
01596     tprintf("Image too large: (%d, %d)\n",
01597             tesseract_->ImageWidth(), tesseract_->ImageHeight());
01598     return -1;
01599   }
01600 
01601   tesseract_->PrepareForPageseg();
01602 
01603   if (tesseract_->textord_equation_detect) {
01604     if (equ_detect_ == NULL && datapath_ != NULL) {
01605       equ_detect_ = new EquationDetect(datapath_->string(), NULL);
01606     }
01607     tesseract_->SetEquationDetect(equ_detect_);
01608   }
01609 
01610   Tesseract* osd_tess = osd_tesseract_;
01611   OSResults osr;
01612   if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
01613     if (strcmp(language_->string(), "osd") == 0) {
01614       osd_tess = tesseract_;
01615     } else {
01616       osd_tesseract_ = new Tesseract;
01617       if (osd_tesseract_->init_tesseract(
01618           datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY,
01619           NULL, 0, NULL, NULL, false) == 0) {
01620         osd_tess = osd_tesseract_;
01621         osd_tesseract_->set_source_resolution(
01622             thresholder_->GetSourceYResolution());
01623       } else {
01624         tprintf("Warning: Auto orientation and script detection requested,"
01625                 " but osd language failed to load\n");
01626         delete osd_tesseract_;
01627         osd_tesseract_ = NULL;
01628       }
01629     }
01630   }
01631 
01632   if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
01633     return -1;
01634   // If Devanagari is being recognized, we use different images for page seg
01635   // and for OCR.
01636   tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
01637   return 0;
01638 }
01639 
01641 void TessBaseAPI::ClearResults() {
01642   if (tesseract_ != NULL) {
01643     tesseract_->Clear();
01644   }
01645   if (page_res_ != NULL) {
01646     delete page_res_;
01647     page_res_ = NULL;
01648   }
01649   recognition_done_ = false;
01650   if (block_list_ == NULL)
01651     block_list_ = new BLOCK_LIST;
01652   else
01653     block_list_->clear();
01654   if (paragraph_models_ != NULL) {
01655     paragraph_models_->delete_data_pointers();
01656     delete paragraph_models_;
01657     paragraph_models_ = NULL;
01658   }
01659 }
01660 
01668 int TessBaseAPI::TextLength(int* blob_count) {
01669   if (tesseract_ == NULL || page_res_ == NULL)
01670     return 0;
01671 
01672   PAGE_RES_IT   page_res_it(page_res_);
01673   int total_length = 2;
01674   int total_blobs = 0;
01675   // Iterate over the data structures to extract the recognition result.
01676   for (page_res_it.restart_page(); page_res_it.word () != NULL;
01677        page_res_it.forward()) {
01678     WERD_RES *word = page_res_it.word();
01679     WERD_CHOICE* choice = word->best_choice;
01680     if (choice != NULL) {
01681       total_blobs += choice->length() + 2;
01682       total_length += choice->unichar_string().length() + 2;
01683       for (int i = 0; i < word->reject_map.length(); ++i) {
01684         if (word->reject_map[i].rejected())
01685           ++total_length;
01686       }
01687     }
01688   }
01689   if (blob_count != NULL)
01690     *blob_count = total_blobs;
01691   return total_length;
01692 }
01693 
01698 bool TessBaseAPI::DetectOS(OSResults* osr) {
01699   if (tesseract_ == NULL)
01700     return false;
01701   ClearResults();
01702   if (tesseract_->pix_binary() == NULL)
01703     Threshold(tesseract_->mutable_pix_binary());
01704   if (input_file_ == NULL)
01705     input_file_ = new STRING(kInputFile);
01706   return orientation_and_script_detection(*input_file_, osr, tesseract_);
01707 }
01708 
01709 void TessBaseAPI::set_min_orientation_margin(double margin) {
01710   tesseract_->min_orientation_margin.set_value(margin);
01711 }
01712 
01727 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
01728                                            bool** vertical_writing) {
01729   delete[] *block_orientation;
01730   *block_orientation = NULL;
01731   delete[] *vertical_writing;
01732   *vertical_writing = NULL;
01733   BLOCK_IT block_it(block_list_);
01734 
01735   block_it.move_to_first();
01736   int num_blocks = 0;
01737   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
01738     if (!block_it.data()->poly_block()->IsText()) {
01739       continue;
01740     }
01741     ++num_blocks;
01742   }
01743   if (!num_blocks) {
01744     tprintf("WARNING: Found no blocks\n");
01745     return;
01746   }
01747   *block_orientation = new int[num_blocks];
01748   *vertical_writing = new bool[num_blocks];
01749   block_it.move_to_first();
01750   int i = 0;
01751   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
01752        block_it.forward()) {
01753     if (!block_it.data()->poly_block()->IsText()) {
01754       continue;
01755     }
01756     FCOORD re_rotation = block_it.data()->re_rotation();
01757     float re_theta = re_rotation.angle();
01758     FCOORD classify_rotation = block_it.data()->classify_rotation();
01759     float classify_theta = classify_rotation.angle();
01760     double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
01761     if (rot_theta < 0) rot_theta += 4;
01762     int num_rotations = static_cast<int>(rot_theta + 0.5);
01763     (*block_orientation)[i] = num_rotations;
01764     // The classify_rotation is non-zero only if the text has vertical
01765     // writing direction.
01766     (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
01767     ++i;
01768   }
01769 }
01770 
01771 // ____________________________________________________________________________
01772 // Ocropus add-ons.
01773 
01775 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
01776   FindLines();
01777   BLOCK_LIST* result = block_list_;
01778   block_list_ = NULL;
01779   return result;
01780 }
01781 
01787 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
01788   delete block_list;
01789 }
01790 
01791 
01792 ROW *TessBaseAPI::MakeTessOCRRow(float baseline,
01793                                  float xheight,
01794                                  float descender,
01795                                  float ascender) {
01796   inT32 xstarts[] = {-32000};
01797   double quad_coeffs[] = {0, 0, baseline};
01798   return new ROW(1,
01799                  xstarts,
01800                  quad_coeffs,
01801                  xheight,
01802                  ascender - (baseline + xheight),
01803                  descender - baseline,
01804                  0,
01805                  0);
01806 }
01807 
01809 TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) {
01810   int width = pixGetWidth(pix);
01811   int height = pixGetHeight(pix);
01812   BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
01813 
01814   // Create C_BLOBs from the page
01815   extract_edges(pix, &block);
01816 
01817   // Merge all C_BLOBs
01818   C_BLOB_LIST *list = block.blob_list();
01819   C_BLOB_IT c_blob_it(list);
01820   if (c_blob_it.empty())
01821     return NULL;
01822   // Move all the outlines to the first blob.
01823   C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
01824   for (c_blob_it.forward();
01825        !c_blob_it.at_first();
01826        c_blob_it.forward()) {
01827       C_BLOB *c_blob = c_blob_it.data();
01828       ol_it.add_list_after(c_blob->out_list());
01829   }
01830   // Convert the first blob to the output TBLOB.
01831   return TBLOB::PolygonalCopy(c_blob_it.data());
01832 }
01833 
01839 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row,
01840                                  bool numeric_mode, DENORM *denorm) {
01841   TWERD word;
01842   word.blobs = tblob;
01843   if (denorm != NULL) {
01844     word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, denorm);
01845     word.Normalize(*denorm);
01846   } else {
01847     DENORM normer;
01848     word.SetupBLNormalize(NULL, row, row->x_height(), numeric_mode, &normer);
01849     word.Normalize(normer);
01850   }
01851   word.blobs = NULL;
01852 }
01853 
01858 TBLOB *make_tesseract_blob(float baseline, float xheight,
01859                            float descender, float ascender,
01860                            bool numeric_mode, Pix* pix) {
01861   TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
01862 
01863   // Normalize TBLOB
01864   ROW *row =
01865       TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
01866   TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode, NULL);
01867   delete row;
01868   return tblob;
01869 }
01870 
01876 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
01877                                    int length,
01878                                    float baseline,
01879                                    float xheight,
01880                                    float descender,
01881                                    float ascender) {
01882   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
01883   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
01884                                     tesseract_->classify_bln_numeric_mode,
01885                                     tesseract_->pix_binary());
01886   float threshold;
01887   UNICHAR_ID best_class = 0;
01888   float best_rating = -100;
01889 
01890 
01891   // Classify to get a raw choice.
01892   BLOB_CHOICE_LIST choices;
01893   DENORM denorm;
01894   tesseract_->AdaptiveClassifier(blob, denorm, &choices, NULL);
01895   BLOB_CHOICE_IT choice_it;
01896   choice_it.set_to_list(&choices);
01897   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
01898        choice_it.forward()) {
01899     if (choice_it.data()->rating() > best_rating) {
01900       best_rating = choice_it.data()->rating();
01901       best_class = choice_it.data()->unichar_id();
01902     }
01903   }
01904 
01905   threshold = tesseract_->matcher_good_threshold;
01906 
01907   if (blob->outlines)
01908     tesseract_->AdaptToChar(blob, denorm, id, kUnknownFontinfoId, threshold);
01909   delete blob;
01910 }
01911 
01912 
01913 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
01914   PAGE_RES *page_res = new PAGE_RES(block_list,
01915                                     &(tesseract_->prev_word_best_choice_));
01916   tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
01917   return page_res;
01918 }
01919 
01920 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
01921                                         PAGE_RES* pass1_result) {
01922   if (!pass1_result)
01923     pass1_result = new PAGE_RES(block_list,
01924                                 &(tesseract_->prev_word_best_choice_));
01925   tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
01926   return pass1_result;
01927 }
01928 
01929 void TessBaseAPI::DetectParagraphs(int debug_level) {
01930   if (paragraph_models_ == NULL)
01931     paragraph_models_ = new GenericVector<ParagraphModel*>;
01932   MutableIterator *result_it = GetMutableIterator();
01933   do {  // Detect paragraphs for this block
01934     GenericVector<ParagraphModel *> models;
01935     ::tesseract::DetectParagraphs(debug_level, result_it, &models);
01936     *paragraph_models_ += models;
01937   } while (result_it->Next(RIL_BLOCK));
01938   delete result_it;
01939 }
01940 
01941 struct TESS_CHAR : ELIST_LINK {
01942   char *unicode_repr;
01943   int length;  // of unicode_repr
01944   float cost;
01945   TBOX box;
01946 
01947   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
01948     length = (len == -1 ? strlen(repr) : len);
01949     unicode_repr = new char[length + 1];
01950     strncpy(unicode_repr, repr, length);
01951   }
01952 
01953   TESS_CHAR() {  // Satisfies ELISTIZE.
01954   }
01955   ~TESS_CHAR() {
01956     delete [] unicode_repr;
01957   }
01958 };
01959 
01960 ELISTIZEH(TESS_CHAR)
01961 ELISTIZE(TESS_CHAR)
01962 
01963 static void add_space(TESS_CHAR_IT* it) {
01964   TESS_CHAR *t = new TESS_CHAR(0, " ");
01965   it->add_after_then_move(t);
01966 }
01967 
01968 
/**
 * Converts a rating to a non-negative cost: 100 + rating, floored at 0.
 * (Ratings worse than -100 are not expected, but are clamped anyway.)
 */
static float rating_to_cost(float rating) {
  const float cost = 100 + rating;
  return cost < 0 ? 0.0f : cost;
}
01977 
01982 static void extract_result(TESS_CHAR_IT* out,
01983                            PAGE_RES* page_res) {
01984   PAGE_RES_IT page_res_it(page_res);
01985   int word_count = 0;
01986   while (page_res_it.word() != NULL) {
01987     WERD_RES *word = page_res_it.word();
01988     const char *str = word->best_choice->unichar_string().string();
01989     const char *len = word->best_choice->unichar_lengths().string();
01990     TBOX real_rect = word->word->bounding_box();
01991 
01992     if (word_count)
01993       add_space(out);
01994     int n = strlen(len);
01995     for (int i = 0; i < n; i++) {
01996       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
01997                                     str, *len);
01998       tc->box = real_rect.intersection(word->box_word->BlobBox(i));
01999       out->add_after_then_move(tc);
02000        str += *len;
02001       len++;
02002     }
02003     page_res_it.forward();
02004     word_count++;
02005   }
02006 }
02007 
02012 int TessBaseAPI::TesseractExtractResult(char** text,
02013                                         int** lengths,
02014                                         float** costs,
02015                                         int** x0,
02016                                         int** y0,
02017                                         int** x1,
02018                                         int** y1,
02019                                         PAGE_RES* page_res) {
02020   TESS_CHAR_LIST tess_chars;
02021   TESS_CHAR_IT tess_chars_it(&tess_chars);
02022   extract_result(&tess_chars_it, page_res);
02023   tess_chars_it.move_to_first();
02024   int n = tess_chars.length();
02025   int text_len = 0;
02026   *lengths = new int[n];
02027   *costs = new float[n];
02028   *x0 = new int[n];
02029   *y0 = new int[n];
02030   *x1 = new int[n];
02031   *y1 = new int[n];
02032   int i = 0;
02033   for (tess_chars_it.mark_cycle_pt();
02034        !tess_chars_it.cycled_list();
02035        tess_chars_it.forward(), i++) {
02036     TESS_CHAR *tc = tess_chars_it.data();
02037     text_len += (*lengths)[i] = tc->length;
02038     (*costs)[i] = tc->cost;
02039     (*x0)[i] = tc->box.left();
02040     (*y0)[i] = tc->box.bottom();
02041     (*x1)[i] = tc->box.right();
02042     (*y1)[i] = tc->box.top();
02043   }
02044   char *p = *text = new char[text_len];
02045 
02046   tess_chars_it.move_to_first();
02047   for (tess_chars_it.mark_cycle_pt();
02048         !tess_chars_it.cycled_list();
02049        tess_chars_it.forward()) {
02050     TESS_CHAR *tc = tess_chars_it.data();
02051     strncpy(p, tc->unicode_repr, tc->length);
02052     p += tc->length;
02053   }
02054   return n;
02055 }
02056 
02058 void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob, const DENORM& denorm,
02059                                      INT_FEATURE_ARRAY int_features,
02060                                      int* num_features,
02061                                      int* FeatureOutlineIndex) {
02062   if (tesseract_) {
02063     tesseract_->ResetFeaturesHaveBeenExtracted();
02064   }
02065   uinT8* norm_array = new uinT8[MAX_NUM_CLASSES];
02066   inT32 len;
02067   *num_features = tesseract_->GetCharNormFeatures(
02068       blob, denorm, tesseract_->PreTrainedTemplates,
02069       int_features, norm_array, norm_array, &len, FeatureOutlineIndex);
02070   delete [] norm_array;
02071 }
02072 
02077 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
02078                                 int left, int top, int right, int bottom) {
02079   TBOX box(left, bottom, right, top);
02080   BLOCK_IT b_it(blocks);
02081   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
02082     BLOCK* block = b_it.data();
02083     if (!box.major_overlap(block->bounding_box()))
02084       continue;
02085     ROW_IT r_it(block->row_list());
02086     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
02087       ROW* row = r_it.data();
02088       if (!box.major_overlap(row->bounding_box()))
02089         continue;
02090       WERD_IT w_it(row->word_list());
02091       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
02092         WERD* word = w_it.data();
02093         if (box.major_overlap(word->bounding_box()))
02094           return row;
02095       }
02096     }
02097   }
02098   return NULL;
02099 }
02100 
02102 void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob, const DENORM& denorm,
02103                                         int num_max_matches,
02104                                         int* unichar_ids,
02105                                         float* ratings,
02106                                         int* num_matches_returned) {
02107   BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
02108   tesseract_->AdaptiveClassifier(blob, denorm, choices, NULL);
02109   BLOB_CHOICE_IT choices_it(choices);
02110   int& index = *num_matches_returned;
02111   index = 0;
02112   for (choices_it.mark_cycle_pt();
02113        !choices_it.cycled_list() && index < num_max_matches;
02114        choices_it.forward()) {
02115     BLOB_CHOICE* choice = choices_it.data();
02116     unichar_ids[index] = choice->unichar_id();
02117     ratings[index] = choice->rating();
02118     ++index;
02119   }
02120   *num_matches_returned = index;
02121   delete choices;
02122 }
02123 
02125 const char* TessBaseAPI::GetUnichar(int unichar_id) {
02126   return tesseract_->unicharset.id_to_unichar(unichar_id);
02127 }
02128 
02130 const Dawg *TessBaseAPI::GetDawg(int i) const {
02131   if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
02132   return tesseract_->getDict().GetDawg(i);
02133 }
02134 
02136 int TessBaseAPI::NumDawgs() const {
02137   return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
02138 }
02139 
02141 CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const {
02142   return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
02143 }
02144 }  // namespace tesseract.