Tesseract  3.02
tesseract-ocr/ccmain/osdetect.cpp
Go to the documentation of this file.
00001 
00002 // File:        osdetect.cpp
00003 // Description: Orientation and script detection.
00004 // Author:      Samuel Charron
00005 //              Ranjith Unnikrishnan
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "osdetect.h"
00021 
00022 #include "blobbox.h"
00023 #include "blread.h"
00024 #include "colfind.h"
00025 #include "fontinfo.h"
00026 #include "imagefind.h"
00027 #include "linefind.h"
00028 #include "oldlist.h"
00029 #include "qrsequence.h"
00030 #include "ratngs.h"
00031 #include "strngs.h"
00032 #include "tabvector.h"
00033 #include "tesseractclass.h"
00034 #include "textord.h"
00035 
// Sampling limits: the blob count that must be exceeded before an early stop
// is allowed, and the largest number of blobs ever classified on one page.
const int kMinCharactersToTry = 50;
const int kMaxCharactersToTry = kMinCharactersToTry * 5;

// Blob filter: a blob is rejected when the ratio of its longer side to its
// shorter side exceeds kSizeRatioToReject, or when its height is below
// kMinAcceptableBlobHeight.
const float kSizeRatioToReject = 2.0f;
const int kMinAcceptableBlobHeight = 10;

// Best-to-second-best score ratios associated with accepting a result.
const float kOrientationAcceptRatio = 1.3f;
const float kScriptAcceptRatio = 1.3f;

// Weight credited to the Korean / Japanese pseudo-scripts for each
// unambiguous Han blob.
const float kHanRatioInKorean = 0.7f;
const float kHanRatioInJapanese = 0.3f;

// Certainty margin inside which a match from a second script makes the blob
// ambiguous.
const float kNonAmbiguousMargin = 1.0f;
00049 
// Names of the real scripts that ScriptDetector looks up in the unicharset.
// They are private to this translation unit.
namespace {
const char* han_script = "Han";
const char* latin_script = "Latin";
const char* katakana_script = "Katakana";
const char* hiragana_script = "Hiragana";
const char* hangul_script = "Hangul";
}  // namespace
00056 
00057 // Pseudo-scripts Name
00058 const char* ScriptDetector::korean_script_ = "Korean";
00059 const char* ScriptDetector::japanese_script_ = "Japanese";
00060 const char* ScriptDetector::fraktur_script_ = "Fraktur";
00061 
// Lowest image resolution that is taken at face value.
static const int kMinCredibleResolution = 70;
// Resolution substituted when the image reports something below the minimum.
static const int kDefaultResolution = 300;
00066 
00067 void OSResults::update_best_orientation() {
00068   float first = orientations[0];
00069   float second = orientations[1];
00070   best_result.orientation_id = 0;
00071   if (orientations[0] < orientations[1]) {
00072     first = orientations[1];
00073     second = orientations[0];
00074     best_result.orientation_id = 1;
00075   }
00076   for (int i = 2; i < 4; ++i) {
00077     if (orientations[i] > first) {
00078       second = first;
00079       first = orientations[i];
00080       best_result.orientation_id = i;
00081     } else if (orientations[i] > second) {
00082       second = orientations[i];
00083     }
00084   }
00085   // Store difference of top two orientation scores.
00086   best_result.oconfidence = first - second;
00087 }
00088 
00089 void OSResults::set_best_orientation(int orientation_id) {
00090   best_result.orientation_id = orientation_id;
00091   best_result.oconfidence = 0;
00092 }
00093 
00094 void OSResults::update_best_script(int orientation) {
00095   // We skip index 0 to ignore the "Common" script.
00096   float first = scripts_na[orientation][1];
00097   float second = scripts_na[orientation][2];
00098   best_result.script_id = 1;
00099   if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
00100     first = scripts_na[orientation][2];
00101     second = scripts_na[orientation][1];
00102     best_result.script_id = 2;
00103   }
00104   for (int i = 3; i < kMaxNumberOfScripts; ++i) {
00105     if (scripts_na[orientation][i] > first) {
00106       best_result.script_id = i;
00107       second = first;
00108       first = scripts_na[orientation][i];
00109     } else if (scripts_na[orientation][i] > second) {
00110       second = scripts_na[orientation][i];
00111     }
00112   }
00113   best_result.sconfidence =
00114       (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
00115 }
00116 
00117 int OSResults::get_best_script(int orientation_id) const {
00118   int max_id = -1;
00119   for (int j = 0; j < kMaxNumberOfScripts; ++j) {
00120     const char *script = unicharset->get_script_from_script_id(j);
00121     if (strcmp(script, "Common") && strcmp(script, "NULL")) {
00122       if (max_id == -1 ||
00123           scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
00124         max_id = j;
00125     }
00126   }
00127   return max_id;
00128 }
00129 
00130 // Print the script scores for all possible orientations.
00131 void OSResults::print_scores(void) const {
00132   for (int i = 0; i < 4; ++i) {
00133     printf("Orientation id #%d", i);
00134     print_scores(i);
00135   }
00136 }
00137 
00138 // Print the script scores for the given candidate orientation.
00139 void OSResults::print_scores(int orientation_id) const {
00140   for (int j = 0; j < kMaxNumberOfScripts; ++j) {
00141     if (scripts_na[orientation_id][j]) {
00142       printf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
00143              scripts_na[orientation_id][j]);
00144     }
00145   }
00146 }
00147 
00148 // Accumulate scores with given OSResults instance and update the best script.
00149 void OSResults::accumulate(const OSResults& osr) {
00150   for (int i = 0; i < 4; ++i) {
00151     orientations[i] += osr.orientations[i];
00152     for (int j = 0; j < kMaxNumberOfScripts; ++j)
00153       scripts_na[i][j] += osr.scripts_na[i][j];
00154   }
00155   unicharset = osr.unicharset;
00156   update_best_orientation();
00157   update_best_script(best_result.orientation_id);
00158 }
00159 
00160 // Detect and erase horizontal/vertical lines and picture regions from the
00161 // image, so that non-text blobs are removed from consideration.
00162 void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
00163                             TO_BLOCK_LIST *to_blocks) {
00164   Pix *pix = tess->pix_binary();
00165   ASSERT_HOST(pix != NULL);
00166   int vertical_x = 0;
00167   int vertical_y = 1;
00168   tesseract::TabVector_LIST v_lines;
00169   tesseract::TabVector_LIST h_lines;
00170   const int kMinCredibleResolution = 70;
00171   int resolution = (kMinCredibleResolution > pixGetXRes(pix)) ?
00172       kMinCredibleResolution : pixGetXRes(pix);
00173 
00174   tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
00175                                             &vertical_x, &vertical_y,
00176                                             NULL, &v_lines, &h_lines);
00177   Pix* im_pix = tesseract::ImageFind::FindImages(pix);
00178   if (im_pix != NULL) {
00179     pixSubtract(pix, pix, im_pix);
00180     pixDestroy(&im_pix);
00181   }
00182   tess->mutable_textord()->find_components(tess->pix_binary(),
00183                                            blocks, to_blocks);
00184 }
00185 
00186 // Find connected components in the page and process a subset until finished or
00187 // a stopping criterion is met.
00188 // Returns the number of blobs used in making the estimate. 0 implies failure.
00189 int orientation_and_script_detection(STRING& filename,
00190                                      OSResults* osr,
00191                                      tesseract::Tesseract* tess) {
00192   STRING name = filename;        //truncated name
00193   const char *lastdot;           //of name
00194   TBOX page_box;
00195 
00196   lastdot = strrchr (name.string (), '.');
00197   if (lastdot != NULL)
00198     name[lastdot-name.string()] = '\0';
00199 
00200   ASSERT_HOST(tess->pix_binary() != NULL)
00201   int width = pixGetWidth(tess->pix_binary());
00202   int height = pixGetHeight(tess->pix_binary());
00203   int resolution = pixGetXRes(tess->pix_binary());
00204   // Zero resolution messes up the algorithms, so make sure it is credible.
00205   if (resolution < kMinCredibleResolution)
00206     resolution = kDefaultResolution;
00207 
00208   BLOCK_LIST blocks;
00209   if (!read_unlv_file(name, width, height, &blocks))
00210     FullPageBlock(width, height, &blocks);
00211 
00212   // Try to remove non-text regions from consideration.
00213   TO_BLOCK_LIST land_blocks, port_blocks;
00214   remove_nontext_regions(tess, &blocks, &port_blocks);
00215 
00216   if (port_blocks.empty()) {
00217     // page segmentation did not succeed, so we need to find_components first.
00218     tess->mutable_textord()->find_components(tess->pix_binary(),
00219                                              &blocks, &port_blocks);
00220   } else {
00221     page_box.set_left(0);
00222     page_box.set_bottom(0);
00223     page_box.set_right(width);
00224     page_box.set_top(height);
00225     // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
00226     tess->mutable_textord()->filter_blobs(page_box.topright(),
00227                                           &port_blocks, true);
00228   }
00229 
00230   return os_detect(&port_blocks, osr, tess);
00231 }
00232 
00233 // Filter and sample the blobs.
00234 // Returns a non-zero number of blobs if the page was successfully processed, or
00235 // zero if the page had too few characters to be reliable
00236 int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
00237               tesseract::Tesseract* tess) {
00238   int blobs_total = 0;
00239   TO_BLOCK_IT block_it;
00240   block_it.set_to_list(port_blocks);
00241 
00242   BLOBNBOX_CLIST filtered_list;
00243   BLOBNBOX_C_IT filtered_it(&filtered_list);
00244 
00245   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00246        block_it.forward ()) {
00247     TO_BLOCK* to_block = block_it.data();
00248     if (to_block->block->poly_block() &&
00249         !to_block->block->poly_block()->IsText()) continue;
00250     BLOBNBOX_IT bbox_it;
00251     bbox_it.set_to_list(&to_block->blobs);
00252     for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
00253          bbox_it.forward ()) {
00254       BLOBNBOX* bbox = bbox_it.data();
00255       C_BLOB*   blob = bbox->cblob();
00256       TBOX      box = blob->bounding_box();
00257       ++blobs_total;
00258 
00259       float y_x = fabs((box.height() * 1.0) / box.width());
00260       float x_y = 1.0f / y_x;
00261       // Select a >= 1.0 ratio
00262       float ratio = x_y > y_x ? x_y : y_x;
00263       // Blob is ambiguous
00264       if (ratio > kSizeRatioToReject) continue;
00265       if (box.height() < kMinAcceptableBlobHeight) continue;
00266       filtered_it.add_to_end(bbox);
00267     }
00268   }
00269   return os_detect_blobs(&filtered_list, osr, tess);
00270 }
00271 
00272 // Detect orientation and script from a list of blobs.
00273 // Returns a non-zero number of blobs if the list was successfully processed, or
00274 // zero if the list had too few characters to be reliable
00275 int os_detect_blobs(BLOBNBOX_CLIST* blob_list, OSResults* osr,
00276                     tesseract::Tesseract* tess) {
00277   OSResults osr_;
00278   if (osr == NULL)
00279     osr = &osr_;
00280 
00281   osr->unicharset = &tess->unicharset;
00282   OrientationDetector o(osr);
00283   ScriptDetector s(osr, tess);
00284 
00285   BLOBNBOX_C_IT filtered_it(blob_list);
00286   int real_max = MIN(filtered_it.length(), kMaxCharactersToTry);
00287   // printf("Total blobs found = %d\n", blobs_total);
00288   // printf("Number of blobs post-filtering = %d\n", filtered_it.length());
00289   // printf("Number of blobs to try = %d\n", real_max);
00290 
00291   // If there are too few characters, skip this page entirely.
00292   if (real_max < kMinCharactersToTry / 2) {
00293     printf("Too few characters. Skipping this page\n");
00294     return 0;
00295   }
00296 
00297   BLOBNBOX** blobs = new BLOBNBOX*[filtered_it.length()];
00298   int number_of_blobs = 0;
00299   for (filtered_it.mark_cycle_pt (); !filtered_it.cycled_list ();
00300        filtered_it.forward ()) {
00301     blobs[number_of_blobs++] = (BLOBNBOX*)filtered_it.data();
00302   }
00303   QRSequenceGenerator sequence(number_of_blobs);
00304   int num_blobs_evaluated = 0;
00305   for (int i = 0; i < real_max; ++i) {
00306     if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
00307         && i > kMinCharactersToTry) {
00308       break;
00309     }
00310     ++num_blobs_evaluated;
00311   }
00312   delete [] blobs;
00313 
00314   // Make sure the best_result is up-to-date
00315   int orientation = o.get_orientation();
00316   osr->update_best_script(orientation);
00317   return num_blobs_evaluated;
00318 }
00319 
00320 // Processes a single blob to estimate script and orientation.
00321 // Return true if estimate of orientation and script satisfies stopping
00322 // criteria.
00323 bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
00324                     ScriptDetector* s, OSResults* osr,
00325                     tesseract::Tesseract* tess) {
00326   tess->tess_cn_matching.set_value(true); // turn it on
00327   tess->tess_bn_matching.set_value(false);
00328   C_BLOB* blob = bbox->cblob();
00329   TBLOB* tblob = TBLOB::PolygonalCopy(blob);
00330   TBOX box = tblob->bounding_box();
00331   FCOORD current_rotation(1.0f, 0.0f);
00332   FCOORD rotation90(0.0f, 1.0f);
00333   BLOB_CHOICE_LIST ratings[4];
00334   // Test the 4 orientations
00335   for (int i = 0; i < 4; ++i) {
00336     // Normalize the blob. Set the origin to the place we want to be the
00337     // bottom-middle after rotation.
00338     // Scaling is to make the rotated height the x-height.
00339     float scaling = static_cast<float>(kBlnXHeight) / box.height();
00340     float x_origin = (box.left() + box.right()) / 2.0f;
00341     float y_origin = (box.bottom() + box.top()) / 2.0f;
00342     if (i == 0 || i == 2) {
00343       // Rotation is 0 or 180.
00344       y_origin = i == 0 ? box.bottom() : box.top();
00345     } else {
00346       // Rotation is 90 or 270.
00347       scaling = static_cast<float>(kBlnXHeight) / box.width();
00348       x_origin = i == 1 ? box.left() : box.right();
00349     }
00350     DENORM denorm;
00351     denorm.SetupNormalization(NULL, NULL, &current_rotation, NULL, NULL, 0,
00352                               x_origin, y_origin, scaling, scaling,
00353                               0.0f, static_cast<float>(kBlnBaselineOffset));
00354     TBLOB* rotated_blob = new TBLOB(*tblob);
00355     rotated_blob->Normalize(denorm);
00356     tess->AdaptiveClassifier(rotated_blob, denorm, ratings + i, NULL);
00357     delete rotated_blob;
00358     current_rotation.rotate(rotation90);
00359   }
00360   delete tblob;
00361 
00362   bool stop = o->detect_blob(ratings);
00363   s->detect_blob(ratings);
00364   int orientation = o->get_orientation();
00365   stop = s->must_stop(orientation) && stop;
00366   return stop;
00367 }
00368 
00369 
00370 OrientationDetector::OrientationDetector(OSResults* osr) {
00371   osr_ = osr;
00372 }
00373 
00374 // Score the given blob and return true if it is now sure of the orientation
00375 // after adding this block.
00376 bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
00377   float blob_o_score[4] = {0.0, 0.0, 0.0, 0.0};
00378   float total_blob_o_score = 0.0;
00379 
00380   for (int i = 0; i < 4; ++i) {
00381     BLOB_CHOICE_IT choice_it;
00382     choice_it.set_to_list(scores + i);
00383     if (!choice_it.empty()) {
00384       // The certainty score ranges between [-20,0]. This is converted here to
00385       // [0,1], with 1 indicating best match.
00386       blob_o_score[i] = 1 + 0.05 * choice_it.data()->certainty();
00387       total_blob_o_score += blob_o_score[i];
00388     }
00389   }
00390   // Normalize the orientation scores for the blob and use them to
00391   // update the aggregated orientation score.
00392   for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
00393     osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
00394   }
00395 
00396   float first = -1;
00397   float second = -1;
00398 
00399   int idx = -1;
00400   for (int i = 0; i < 4; ++i) {
00401     if (osr_->orientations[i] > first) {
00402       idx = i;
00403       second = first;
00404       first = osr_->orientations[i];
00405     } else if (osr_->orientations[i] > second) {
00406       second = osr_->orientations[i];
00407     }
00408   }
00409 
00410   return first / second > kOrientationAcceptRatio;
00411 }
00412 
00413 int OrientationDetector::get_orientation() {
00414   osr_->update_best_orientation();
00415   return osr_->best_result.orientation_id;
00416 }
00417 
00418 
00419 ScriptDetector::ScriptDetector(OSResults* osr, tesseract::Tesseract* tess) {
00420   osr_ = osr;
00421   tess_ = tess;
00422   katakana_id_ = tess_->unicharset.add_script(katakana_script);
00423   hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
00424   han_id_ = tess_->unicharset.add_script(han_script);
00425   hangul_id_ = tess_->unicharset.add_script(hangul_script);
00426   japanese_id_ = tess_->unicharset.add_script(japanese_script_);
00427   korean_id_ = tess_->unicharset.add_script(korean_script_);
00428   latin_id_ = tess_->unicharset.add_script(latin_script);
00429   fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
00430 }
00431 
00432 
00433 // Score the given blob and return true if it is now sure of the script after
00434 // adding this blob.
00435 void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
00436   bool done[kMaxNumberOfScripts];
00437   for (int i = 0; i < 4; ++i) {
00438     for (int j = 0; j < kMaxNumberOfScripts; ++j)
00439       done[j] = false;
00440 
00441     BLOB_CHOICE_IT choice_it;
00442     choice_it.set_to_list(scores + i);
00443 
00444     float prev_score = -1;
00445     int script_count = 0;
00446     int prev_id = -1;
00447     int prev_script;
00448     int prev_class_id = -1;
00449     int prev_fontinfo_id = -1;
00450     const char* prev_unichar = "";
00451     const char* unichar = "";
00452     float next_best_score = -1.0;
00453     int next_best_script_id = -1;
00454     const char* next_best_unichar = "";
00455 
00456     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
00457          choice_it.forward()) {
00458       BLOB_CHOICE* choice = choice_it.data();
00459       int id = choice->script_id();
00460       // Script already processed before.
00461       if (done[id]) continue;
00462       done[id] = true;
00463 
00464       unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
00465       // Save data from the first match
00466       if (prev_score < 0) {
00467         prev_score = -choice->certainty();
00468         script_count = 1;
00469         prev_id = id;
00470         prev_script = choice->script_id();
00471         prev_unichar = unichar;
00472         prev_class_id = choice->unichar_id();
00473         prev_fontinfo_id = choice->fontinfo_id();
00474       } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
00475         ++script_count;
00476         next_best_score = -choice->certainty();
00477         next_best_script_id = choice->script_id();
00478         next_best_unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
00479       }
00480 
00481       if (strlen(prev_unichar) == 1)
00482         if (unichar[0] >= '0' && unichar[0] <= '9')
00483           break;
00484 
00485       // if script_count is >= 2, character is ambiguous, skip other matches
00486       // since they are useless.
00487       if (script_count >= 2)
00488         break;
00489     }
00490     // Character is non ambiguous
00491     if (script_count == 1) {
00492       // Update the score of the winning script
00493       osr_->scripts_na[i][prev_id] += 1.0;
00494 
00495       // Workaround for Fraktur
00496       if (prev_id == latin_id_) {
00497         if (prev_fontinfo_id >= 0) {
00498           const tesseract::FontInfo &fi =
00499               tess_->get_fontinfo_table().get(prev_fontinfo_id);
00500           //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
00501           //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
00502           //       fi.is_serif(), fi.is_fraktur(),
00503           //       prev_unichar);
00504           if (fi.is_fraktur()) {
00505             osr_->scripts_na[i][prev_id] -= 1.0;
00506             osr_->scripts_na[i][fraktur_id_] += 1.0;
00507           }
00508         }
00509       }
00510 
00511       // Update Japanese / Korean pseudo-scripts
00512       if (prev_id == katakana_id_)
00513         osr_->scripts_na[i][japanese_id_] += 1.0;
00514       if (prev_id == hiragana_id_)
00515         osr_->scripts_na[i][japanese_id_] += 1.0;
00516       if (prev_id == hangul_id_)
00517         osr_->scripts_na[i][korean_id_] += 1.0;
00518       if (prev_id == han_id_)
00519         osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
00520       if (prev_id == han_id_)
00521         osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
00522     }
00523   }  // iterate over each orientation
00524 }
00525 
00526 bool ScriptDetector::must_stop(int orientation) {
00527   osr_->update_best_script(orientation);
00528   return osr_->best_result.sconfidence > 1;
00529 }
00530 
// Helper method to convert an orientation index to its value in degrees.
// The value represents the amount of clockwise rotation in degrees that must be
// applied for the text to be upright (readable).
// Ids 0..3 map to 0, 270, 180 and 90; any other id yields -1.
const int OrientationIdToValue(const int& id) {
  static const int kDegreesById[4] = {0, 270, 180, 90};
  if (id < 0 || id > 3)
    return -1;
  return kDegreesById[id];
}
00547 }