Tesseract  3.02
tesseract-ocr/textord/tordmain.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tordmain.cpp  (Formerly textordp.c)
00003  * Description: C++ top level textord code.
00004  * Author:                  Ray Smith
00005  * Created:                 Tue Jul 28 17:12:33 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #include "mfcpch.h"
00020 #ifdef __UNIX__
00021 #include <assert.h>
00022 #endif
00023 #include "stderr.h"
00024 #include "globaloc.h"
00025 #include "blread.h"
00026 #include "blobbox.h"
00027 #include "ccstruct.h"
00028 #include "edgblob.h"
00029 #include "drawtord.h"
00030 #include "makerow.h"
00031 #include "wordseg.h"
00032 #include "imgs.h"
00033 #include "textord.h"
00034 #include "tordmain.h"
00035 #include "secname.h"
00036 
00037 // Include automatically generated configuration file if running autoconf.
00038 #ifdef HAVE_CONFIG_H
00039 #include "config_auto.h"
00040 #endif
00041 
00042 #include "allheaders.h"
00043 
00044 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
00045 
00046 #undef EXTERN
00047 #define EXTERN
00048 
00049 #define MAX_NEAREST_DIST  600    //for block skew stats
00050 
00051 /**********************************************************************
00052  * SetBlobStrokeWidth
00053  *
00054  * Set the horizontal and vertical stroke widths in the blob.
00055  **********************************************************************/
00056 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
00057   // Cut the blob rectangle into a Pix.
00058   int pix_height = pixGetHeight(pix);
00059   const TBOX& box = blob->bounding_box();
00060   int width = box.width();
00061   int height = box.height();
00062   Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
00063                                 width, height);
00064   Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
00065   boxDestroy(&blob_pix_box);
00066   Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
00067   pixDestroy(&pix_blob);
00068   // Compute the stroke widths.
00069   uinT32* data = pixGetData(dist_pix);
00070   int wpl = pixGetWpl(dist_pix);
00071   // Horizontal width of stroke.
00072   STATS h_stats(0, width + 1);
00073   for (int y = 0; y < height; ++y) {
00074     uinT32* pixels = data + y*wpl;
00075     int prev_pixel = 0;
00076     int pixel = GET_DATA_BYTE(pixels, 0);
00077     for (int x = 1; x < width; ++x) {
00078       int next_pixel = GET_DATA_BYTE(pixels, x);
00079       // We are looking for a pixel that is equal to its vertical neighbours,
00080       // yet greater than its left neighbour.
00081       if (prev_pixel < pixel &&
00082           (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00083           (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
00084         if (pixel > next_pixel) {
00085           // Single local max, so an odd width.
00086           h_stats.add(pixel * 2 - 1, 1);
00087         } else if (pixel == next_pixel && x + 1 < width &&
00088                  pixel > GET_DATA_BYTE(pixels, x + 1)) {
00089           // Double local max, so an even width.
00090           h_stats.add(pixel * 2, 1);
00091         }
00092       }
00093       prev_pixel = pixel;
00094       pixel = next_pixel;
00095     }
00096   }
00097   // Vertical width of stroke.
00098   STATS v_stats(0, height + 1);
00099   for (int x = 0; x < width; ++x) {
00100     int prev_pixel = 0;
00101     int pixel = GET_DATA_BYTE(data, x);
00102     for (int y = 1; y < height; ++y) {
00103       uinT32* pixels = data + y*wpl;
00104       int next_pixel = GET_DATA_BYTE(pixels, x);
00105       // We are looking for a pixel that is equal to its horizontal neighbours,
00106       // yet greater than its upper neighbour.
00107       if (prev_pixel < pixel &&
00108           (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00109           (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
00110         if (pixel > next_pixel) {
00111           // Single local max, so an odd width.
00112           v_stats.add(pixel * 2 - 1, 1);
00113         } else if (pixel == next_pixel && y + 1 < height &&
00114                  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
00115           // Double local max, so an even width.
00116           v_stats.add(pixel * 2, 1);
00117         }
00118       }
00119       prev_pixel = pixel;
00120       pixel = next_pixel;
00121     }
00122   }
00123   pixDestroy(&dist_pix);
00124   // Store the horizontal and vertical width in the blob, keeping both
00125   // widths if there is enough information, otherwse only the one with
00126   // the most samples.
00127   // If there are insufficent samples, store zero, rather than using
00128   // 2*area/perimeter, as the numbers that gives do not match the numbers
00129   // from the distance method.
00130   if (h_stats.get_total() >= (width + height) / 4) {
00131     blob->set_horz_stroke_width(h_stats.ile(0.5f));
00132     if (v_stats.get_total() >= (width + height) / 4)
00133       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00134     else
00135       blob->set_vert_stroke_width(0.0f);
00136   } else {
00137     if (v_stats.get_total() >= (width + height) / 4 ||
00138         v_stats.get_total() > h_stats.get_total()) {
00139       blob->set_horz_stroke_width(0.0f);
00140       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00141     } else {
00142       blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
00143                                                           : 0.0f);
00144       blob->set_vert_stroke_width(0.0f);
00145     }
00146   }
00147 }
00148 
00149 
00150 /**********************************************************************
00151  * assign_blobs_to_blocks2
00152  *
00152  * Make a TO_BLOCK for each BLOCK and append it to the output list (port_blocks).
00154  **********************************************************************/
00155 
00156 void assign_blobs_to_blocks2(Pix* pix,
00157                              BLOCK_LIST *blocks,          // blocks to process
00158                              TO_BLOCK_LIST *port_blocks) {  // output list
00159   BLOCK *block;                  // current block
00160   BLOBNBOX *newblob;             // created blob
00161   C_BLOB *blob;                  // current blob
00162   BLOCK_IT block_it = blocks;
00163   C_BLOB_IT blob_it;             // iterator
00164   BLOBNBOX_IT port_box_it;       // iterator
00165                                  // destination iterator
00166   TO_BLOCK_IT port_block_it = port_blocks;
00167   TO_BLOCK *port_block;          // created block
00168 
00169   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00170     block = block_it.data();
00171     port_block = new TO_BLOCK(block);
00172 
00173     // Convert the good outlines to block->blob_list
00174     port_box_it.set_to_list(&port_block->blobs);
00175     blob_it.set_to_list(block->blob_list());
00176     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00177       blob = blob_it.extract();
00178       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00179       SetBlobStrokeWidth(pix, newblob);
00180       port_box_it.add_after_then_move(newblob);
00181     }
00182 
00183     // Put the rejected outlines in block->noise_blobs, which allows them to
00184     // be reconsidered and sorted back into rows and recover outlines mistakenly
00185     // rejected.
00186     port_box_it.set_to_list(&port_block->noise_blobs);
00187     blob_it.set_to_list(block->reject_blobs());
00188     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00189       blob = blob_it.extract();
00190       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00191       SetBlobStrokeWidth(pix, newblob);
00192       port_box_it.add_after_then_move(newblob);
00193     }
00194 
00195     port_block_it.add_after_then_move(port_block);
00196   }
00197 }
00198 
00199 namespace tesseract {
00200 /**********************************************************************
00201  * find_components
00202  *
00203  * Find the C_OUTLINEs of the connected components in each block, put them
00204  * in C_BLOBs, and filter them by size, putting the different size
00205  * grades on different lists in the matching TO_BLOCK in to_blocks.
00206  **********************************************************************/
00207 
00208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
00209                               TO_BLOCK_LIST *to_blocks) {
00210   int width = pixGetWidth(pix);
00211   int height = pixGetHeight(pix);
00212   if (width > MAX_INT16 || height > MAX_INT16) {
00213     tprintf("Input image too large! (%d, %d)\n", width, height);
00214     return;  // Can't handle it.
00215   }
00216 
00217   set_global_loc_code(LOC_EDGE_PROG);
00218 
00219   BLOCK_IT block_it(blocks);    // iterator
00220   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00221        block_it.forward()) {
00222     BLOCK* block = block_it.data();
00223     if (block->poly_block() == NULL || block->poly_block()->IsText()) {
00224       extract_edges(pix, block);
00225     }
00226   }
00227 
00228   assign_blobs_to_blocks2(pix, blocks, to_blocks);
00229   ICOORD page_tr(width, height);
00230   filter_blobs(page_tr, to_blocks, !textord_test_landscape);
00231 }
00232 
00233 /**********************************************************************
00234  * filter_blobs
00235  *
00236  * Sort the blobs into sizes in all the blocks for later work.
00237  **********************************************************************/
00238 
00239 void Textord::filter_blobs(ICOORD page_tr,         // top right
00240                            TO_BLOCK_LIST *blocks,  // output list
00241                            BOOL8 testing_on) {     // for plotting
00242   TO_BLOCK_IT block_it = blocks;          // destination iterator
00243   TO_BLOCK *block;                        // created block
00244 
00245   #ifndef GRAPHICS_DISABLED
00246   if (to_win != NULL)
00247     to_win->Clear();
00248   #endif  // GRAPHICS_DISABLED
00249 
00250   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00251        block_it.forward()) {
00252     block = block_it.data();
00253     block->line_size = filter_noise_blobs(&block->blobs,
00254       &block->noise_blobs,
00255       &block->small_blobs,
00256       &block->large_blobs);
00257     block->line_spacing = block->line_size *
00258         (tesseract::CCStruct::kDescenderFraction +
00259          tesseract::CCStruct::kXHeightFraction +
00260          2 * tesseract::CCStruct::kAscenderFraction) /
00261          tesseract::CCStruct::kXHeightFraction;
00262     block->line_size *= textord_min_linesize;
00263     block->max_blob_size = block->line_size * textord_excess_blobsize;
00264 
00265     #ifndef GRAPHICS_DISABLED
00266     if (textord_show_blobs && testing_on) {
00267       if (to_win == NULL)
00268         create_to_win(page_tr);
00269       block->plot_graded_blobs(to_win);
00270     }
00271     if (textord_show_boxes && testing_on) {
00272       if (to_win == NULL)
00273         create_to_win(page_tr);
00274       plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);
00275       plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);
00276       plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);
00277       plot_box_list(to_win, &block->blobs, ScrollView::WHITE);
00278     }
00279     #endif  // GRAPHICS_DISABLED
00280   }
00281 }
00282 
00283 /**********************************************************************
00284  * filter_noise_blobs
00285  *
00286  * Move small blobs to a separate list.
00287  **********************************************************************/
00288 
00289 float Textord::filter_noise_blobs(
00290     BLOBNBOX_LIST *src_list,      // original list
00291     BLOBNBOX_LIST *noise_list,    // noise list
00292     BLOBNBOX_LIST *small_list,    // small blobs
00293     BLOBNBOX_LIST *large_list) {  // large blobs
00294   inT16 height;                  //height of blob
00295   inT16 width;                   //of blob
00296   BLOBNBOX *blob;                //current blob
00297   float initial_x;               //first guess
00298   BLOBNBOX_IT src_it = src_list; //iterators
00299   BLOBNBOX_IT noise_it = noise_list;
00300   BLOBNBOX_IT small_it = small_list;
00301   BLOBNBOX_IT large_it = large_list;
00302   STATS size_stats (0, MAX_NEAREST_DIST);
00303   //blob heights
00304   float min_y;                   //size limits
00305   float max_y;
00306   float max_x;
00307   float max_height;              //of good blobs
00308 
00309   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00310     blob = src_it.data ();
00311     if (blob->bounding_box ().height () < textord_max_noise_size)
00312       noise_it.add_after_then_move (src_it.extract ());
00313     else if (blob->enclosed_area () >= blob->bounding_box ().height ()
00314       * blob->bounding_box ().width () * textord_noise_area_ratio)
00315       small_it.add_after_then_move (src_it.extract ());
00316   }
00317   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00318     size_stats.add (src_it.data ()->bounding_box ().height (), 1);
00319   }
00320   initial_x = size_stats.ile (textord_initialx_ile);
00321   max_y = ceil(initial_x *
00322                (tesseract::CCStruct::kDescenderFraction +
00323                 tesseract::CCStruct::kXHeightFraction +
00324                 2 * tesseract::CCStruct::kAscenderFraction) /
00325                tesseract::CCStruct::kXHeightFraction);
00326   min_y = floor (initial_x / 2);
00327   max_x = ceil (initial_x * textord_width_limit);
00328   small_it.move_to_first ();
00329   for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
00330   small_it.forward ()) {
00331     height = small_it.data()->bounding_box().height();
00332     if (height > max_y)
00333       large_it.add_after_then_move(small_it.extract ());
00334     else if (height >= min_y)
00335       src_it.add_after_then_move(small_it.extract ());
00336   }
00337   size_stats.clear ();
00338   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00339     height = src_it.data ()->bounding_box ().height ();
00340     width = src_it.data ()->bounding_box ().width ();
00341     if (height < min_y)
00342       small_it.add_after_then_move (src_it.extract ());
00343     else if (height > max_y || width > max_x)
00344       large_it.add_after_then_move (src_it.extract ());
00345     else
00346       size_stats.add (height, 1);
00347   }
00348   max_height = size_stats.ile (textord_initialasc_ile);
00349   //      printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
00350   //              max_y,min_y,initial_x,max_height);
00351   max_height *= tesseract::CCStruct::kXHeightCapRatio;
00352   if (max_height > initial_x)
00353     initial_x = max_height;
00354   //      printf(" ret=%g\n",initial_x);
00355   return initial_x;
00356 }
00357 
00358 /**********************************************************************
00359  * cleanup_blocks
00360  *
00361  * Delete empty blocks, rows from the page.
00362  **********************************************************************/
00363 
00364 void Textord::cleanup_blocks(                    //remove empties
00365                              BLOCK_LIST *blocks  //list
00366                             ) {
00367   BLOCK_IT block_it = blocks;    //iterator
00368   ROW_IT row_it;                 //row iterator
00369 
00370   int num_rows = 0;
00371   int num_rows_all = 0;
00372   int num_blocks = 0;
00373   int num_blocks_all = 0;
00374   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00375        block_it.forward ()) {
00376     num_rows = 0;
00377     num_rows_all = 0;
00378     row_it.set_to_list (block_it.data ()->row_list ());
00379     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00380       ++num_rows_all;
00381       clean_small_noise_from_words(row_it.data());
00382       if ((textord_noise_rejrows
00383            && !row_it.data ()->word_list ()->empty ()
00384            && clean_noise_from_row (row_it.data ()))
00385           || row_it.data ()->word_list ()->empty ())
00386         delete row_it.extract ();//lose empty row
00387       else {
00388         if (textord_noise_rejwords)
00389           clean_noise_from_words (row_it.data ());
00390         if (textord_blshift_maxshift >= 0)
00391           tweak_row_baseline(row_it.data(),
00392                              textord_blshift_maxshift,
00393                              textord_blshift_xfraction);
00394         ++num_rows;
00395       }
00396     }
00397     if (block_it.data()->row_list()->empty() &&
00398         (block_it.data()->poly_block() == NULL ||
00399          block_it.data()->poly_block()->IsText())) {
00400       delete block_it.extract();  // Lose empty text blocks but not other types.
00401     } else {
00402       ++num_blocks;
00403     }
00404     ++num_blocks_all;
00405     if (textord_noise_debug)
00406       tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
00407   }
00408   if (textord_noise_debug)
00409     tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
00410 }
00411 
00412 
00413 /**********************************************************************
00414  * clean_noise_from_row
00415  *
00416  * Return TRUE if the row looks like garbage (noise) and should be deleted.
00417  **********************************************************************/
00418 
00419 BOOL8 Textord::clean_noise_from_row(          //remove empties
00420                                     ROW *row  //row to clean
00421                                    ) {
00422   BOOL8 testing_on;
00423   TBOX blob_box;                  //bounding box
00424   C_BLOB *blob;                  //current blob
00425   C_OUTLINE *outline;            //current outline
00426   WERD *word;                    //current word
00427   inT32 blob_size;               //biggest size
00428   inT32 trans_count = 0;         //no of transitions
00429   inT32 trans_threshold;         //noise tolerance
00430   inT32 dot_count;               //small objects
00431   inT32 norm_count;              //normal objects
00432   inT32 super_norm_count;        //real char-like
00433                                  //words of row
00434   WERD_IT word_it = row->word_list ();
00435   C_BLOB_IT blob_it;             //blob iterator
00436   C_OUTLINE_IT out_it;           //outline iterator
00437 
00438   if (textord_test_y > row->base_line (textord_test_x)
00439     && textord_show_blobs
00440     && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
00441     testing_on = TRUE;
00442   else
00443     testing_on = FALSE;
00444   dot_count = 0;
00445   norm_count = 0;
00446   super_norm_count = 0;
00447   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00448     word = word_it.data ();      //current word
00449                                  //blobs in word
00450     blob_it.set_to_list (word->cblob_list ());
00451     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00452     blob_it.forward ()) {
00453       blob = blob_it.data ();
00454       if (!word->flag (W_DONT_CHOP)) {
00455                                  //get outlines
00456         out_it.set_to_list (blob->out_list ());
00457         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00458         out_it.forward ()) {
00459           outline = out_it.data ();
00460           blob_box = outline->bounding_box ();
00461           blob_size =
00462             blob_box.width () >
00463             blob_box.height ()? blob_box.width () : blob_box.
00464             height();
00465           if (blob_size < textord_noise_sizelimit * row->x_height ())
00466             dot_count++;         //count smal outlines
00467           if (!outline->child ()->empty ()
00468             && blob_box.height () <
00469             (1 + textord_noise_syfract) * row->x_height ()
00470             && blob_box.height () >
00471             (1 - textord_noise_syfract) * row->x_height ()
00472             && blob_box.width () <
00473             (1 + textord_noise_sxfract) * row->x_height ()
00474             && blob_box.width () >
00475             (1 - textord_noise_sxfract) * row->x_height ())
00476             super_norm_count++;  //count smal outlines
00477         }
00478       }
00479       else
00480         super_norm_count++;
00481       blob_box = blob->bounding_box ();
00482       blob_size =
00483         blob_box.width () >
00484         blob_box.height ()? blob_box.width () : blob_box.height ();
00485       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00486           && blob_size < row->x_height () * 2) {
00487         trans_threshold = blob_size / textord_noise_sizefraction;
00488         trans_count = blob->count_transitions (trans_threshold);
00489         if (trans_count < textord_noise_translimit)
00490           norm_count++;
00491       }
00492       else if (blob_box.height () > row->x_height () * 2
00493         && (!word_it.at_first () || !blob_it.at_first ()))
00494         dot_count += 2;
00495       #ifndef SECURE_NAMES
00496       if (testing_on) {
00497         tprintf
00498           ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
00499           blob_box.left (), blob_box.bottom (), blob_box.right (),
00500           blob_box.top (), blob->out_list ()->length (), trans_count,
00501           blob_box.bottom () - row->base_line (blob_box.left ()));
00502       }
00503       #endif
00504     }
00505   }
00506   #ifndef SECURE_NAMES
00507   if (textord_noise_debug) {
00508     tprintf ("Row ending at (%d,%g):",
00509       blob_box.right (), row->base_line (blob_box.right ()));
00510     tprintf (" R=%g, dc=%d, nc=%d, %s\n",
00511       norm_count > 0 ? (float) dot_count / norm_count : 9999,
00512       dot_count, norm_count,
00513       dot_count > norm_count * textord_noise_normratio
00514       && dot_count > 2 ? "REJECTED" : "ACCEPTED");
00515   }
00516   #endif
00517   return super_norm_count < textord_noise_sncount
00518     && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
00519 }
00520 
00521 /**********************************************************************
00522  * clean_noise_from_words
00523  *
00524  * Move blobs of words from rows of garbage into the reject blobs list.
00525  **********************************************************************/
00526 
00527 void Textord::clean_noise_from_words(          //remove empties
00528                                      ROW *row  //row to clean
00529                                     ) {
00530   TBOX blob_box;                  //bounding box
00531   inT8 *word_dud;                //was it chucked
00532   C_BLOB *blob;                  //current blob
00533   C_OUTLINE *outline;            //current outline
00534   WERD *word;                    //current word
00535   inT32 blob_size;               //biggest size
00536   inT32 trans_count;             //no of transitions
00537   inT32 trans_threshold;         //noise tolerance
00538   inT32 dot_count;               //small objects
00539   inT32 norm_count;              //normal objects
00540   inT32 dud_words;               //number discarded
00541   inT32 ok_words;                //number remaining
00542   inT32 word_index;              //current word
00543                                  //words of row
00544   WERD_IT word_it = row->word_list ();
00545   C_BLOB_IT blob_it;             //blob iterator
00546   C_OUTLINE_IT out_it;           //outline iterator
00547 
00548   ok_words = word_it.length ();
00549   if (ok_words == 0 || textord_no_rejects)
00550     return;
00551   word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
00552   dud_words = 0;
00553   ok_words = 0;
00554   word_index = 0;
00555   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00556     word = word_it.data ();      //current word
00557     dot_count = 0;
00558     norm_count = 0;
00559                                  //blobs in word
00560     blob_it.set_to_list (word->cblob_list ());
00561     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00562     blob_it.forward ()) {
00563       blob = blob_it.data ();
00564       if (!word->flag (W_DONT_CHOP)) {
00565                                  //get outlines
00566         out_it.set_to_list (blob->out_list ());
00567         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00568         out_it.forward ()) {
00569           outline = out_it.data ();
00570           blob_box = outline->bounding_box ();
00571           blob_size =
00572             blob_box.width () >
00573             blob_box.height ()? blob_box.width () : blob_box.
00574             height();
00575           if (blob_size < textord_noise_sizelimit * row->x_height ())
00576             dot_count++;         //count smal outlines
00577           if (!outline->child ()->empty ()
00578             && blob_box.height () <
00579             (1 + textord_noise_syfract) * row->x_height ()
00580             && blob_box.height () >
00581             (1 - textord_noise_syfract) * row->x_height ()
00582             && blob_box.width () <
00583             (1 + textord_noise_sxfract) * row->x_height ()
00584             && blob_box.width () >
00585             (1 - textord_noise_sxfract) * row->x_height ())
00586             norm_count++;        //count smal outlines
00587         }
00588       }
00589       else
00590         norm_count++;
00591       blob_box = blob->bounding_box ();
00592       blob_size =
00593         blob_box.width () >
00594         blob_box.height ()? blob_box.width () : blob_box.height ();
00595       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00596       && blob_size < row->x_height () * 2) {
00597         trans_threshold = blob_size / textord_noise_sizefraction;
00598         trans_count = blob->count_transitions (trans_threshold);
00599         if (trans_count < textord_noise_translimit)
00600           norm_count++;
00601       }
00602       else if (blob_box.height () > row->x_height () * 2
00603         && (!word_it.at_first () || !blob_it.at_first ()))
00604         dot_count += 2;
00605     }
00606     if (dot_count > 2) {
00607       if (dot_count > norm_count * textord_noise_normratio * 2)
00608         word_dud[word_index] = 2;
00609       else if (dot_count > norm_count * textord_noise_normratio)
00610         word_dud[word_index] = 1;
00611       else
00612         word_dud[word_index] = 0;
00613     }
00614     else
00615       word_dud[word_index] = 0;
00616     if (word_dud[word_index] == 2)
00617       dud_words++;
00618     else
00619       ok_words++;
00620     word_index++;
00621   }
00622 
00623   word_index = 0;
00624   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00625     if (word_dud[word_index] == 2
00626     || (word_dud[word_index] == 1 && dud_words > ok_words)) {
00627       word = word_it.data ();    //current word
00628                                  //rejected blobs
00629       blob_it.set_to_list (word->rej_cblob_list ());
00630                                  //move from blobs
00631       blob_it.add_list_after (word->cblob_list ());
00632     }
00633     word_index++;
00634   }
00635   free_mem(word_dud);
00636 }
00637 
00638 // Remove outlines that are a tiny fraction in either width or height
00639 // of the word height.
00640 void Textord::clean_small_noise_from_words(ROW *row) {
00641   WERD_IT word_it(row->word_list());
00642   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00643     WERD* word = word_it.data();
00644     int min_size = static_cast<int>(
00645       textord_noise_hfract * word->bounding_box().height() + 0.5);
00646     C_BLOB_IT blob_it(word->cblob_list());
00647     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00648       C_BLOB* blob = blob_it.data();
00649       C_OUTLINE_IT out_it(blob->out_list());
00650       for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
00651         C_OUTLINE* outline = out_it.data();
00652         outline->RemoveSmallRecursive(min_size, &out_it);
00653       }
00654       if (blob->out_list()->empty()) {
00655         delete blob_it.extract();
00656       }
00657     }
00658     if (word->cblob_list()->empty()) {
00659       if (!word_it.at_last()) {
00660         // The next word is no longer a fuzzy non space if it was before,
00661         // since the word before is about to be deleted.
00662         WERD* next_word = word_it.data_relative(1);
00663         if (next_word->flag(W_FUZZY_NON)) {
00664           next_word->set_flag(W_FUZZY_NON, false);
00665         }
00666       }
00667       delete word_it.extract();
00668     }
00669   }
00670 }
00671 }  // tesseract
00672 
00673 /**********************************************************************
00674  * tweak_row_baseline
00675  *
00676  * Shift baseline to fit the blobs more accurately where they are
00677  * close enough.
00678  **********************************************************************/
00679 
00680 void tweak_row_baseline(ROW *row,
00681                         double blshift_maxshift,
00682                         double blshift_xfraction) {
00683   TBOX blob_box;                 //bounding box
00684   C_BLOB *blob;                  //current blob
00685   WERD *word;                    //current word
00686   inT32 blob_count;              //no of blobs
00687   inT32 src_index;               //source segment
00688   inT32 dest_index;              //destination segment
00689   inT32 *xstarts;                //spline segments
00690   double *coeffs;                //spline coeffs
00691   float ydiff;                   //baseline error
00692   float x_centre;                //centre of blob
00693                                  //words of row
00694   WERD_IT word_it = row->word_list ();
00695   C_BLOB_IT blob_it;             //blob iterator
00696 
00697   blob_count = 0;
00698   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00699     word = word_it.data ();      //current word
00700                                  //get total blobs
00701     blob_count += word->cblob_list ()->length ();
00702   }
00703   if (blob_count == 0)
00704     return;
00705   xstarts =
00706     (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
00707     sizeof (inT32));
00708   coeffs =
00709     (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
00710     sizeof (double));
00711 
00712   src_index = 0;
00713   dest_index = 0;
00714   xstarts[0] = row->baseline.xcoords[0];
00715   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00716     word = word_it.data ();      //current word
00717                                  //blobs in word
00718     blob_it.set_to_list (word->cblob_list ());
00719     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00720     blob_it.forward ()) {
00721       blob = blob_it.data ();
00722       blob_box = blob->bounding_box ();
00723       x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
00724       ydiff = blob_box.bottom () - row->base_line (x_centre);
00725       if (ydiff < 0)
00726         ydiff = -ydiff / row->x_height ();
00727       else
00728         ydiff = ydiff / row->x_height ();
00729       if (ydiff < blshift_maxshift
00730         && blob_box.height () / row->x_height () > blshift_xfraction) {
00731         if (xstarts[dest_index] >= x_centre)
00732           xstarts[dest_index] = blob_box.left ();
00733         coeffs[dest_index * 3] = 0;
00734         coeffs[dest_index * 3 + 1] = 0;
00735         coeffs[dest_index * 3 + 2] = blob_box.bottom ();
00736         //shift it
00737         dest_index++;
00738         xstarts[dest_index] = blob_box.right () + 1;
00739       }
00740       else {
00741         if (xstarts[dest_index] <= x_centre) {
00742           while (row->baseline.xcoords[src_index + 1] <= x_centre
00743           && src_index < row->baseline.segments - 1) {
00744             if (row->baseline.xcoords[src_index + 1] >
00745             xstarts[dest_index]) {
00746               coeffs[dest_index * 3] =
00747                 row->baseline.quadratics[src_index].a;
00748               coeffs[dest_index * 3 + 1] =
00749                 row->baseline.quadratics[src_index].b;
00750               coeffs[dest_index * 3 + 2] =
00751                 row->baseline.quadratics[src_index].c;
00752               dest_index++;
00753               xstarts[dest_index] =
00754                 row->baseline.xcoords[src_index + 1];
00755             }
00756             src_index++;
00757           }
00758           coeffs[dest_index * 3] =
00759             row->baseline.quadratics[src_index].a;
00760           coeffs[dest_index * 3 + 1] =
00761             row->baseline.quadratics[src_index].b;
00762           coeffs[dest_index * 3 + 2] =
00763             row->baseline.quadratics[src_index].c;
00764           dest_index++;
00765           xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
00766         }
00767       }
00768     }
00769   }
00770   while (src_index < row->baseline.segments
00771     && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
00772     src_index++;
00773   while (src_index < row->baseline.segments) {
00774     coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
00775     coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
00776     coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
00777     dest_index++;
00778     src_index++;
00779     xstarts[dest_index] = row->baseline.xcoords[src_index];
00780   }
00781                                  //turn to spline
00782   row->baseline = QSPLINE (dest_index, xstarts, coeffs);
00783   free_mem(xstarts);
00784   free_mem(coeffs);
00785 }
00786 
00787 /**********************************************************************
00788  * blob_y_order
00789  *
00790  * Sort function to sort blobs in y from page top.
00791  **********************************************************************/
00792 
00793 inT32 blob_y_order(              //sort function
00794                    void *item1,  //items to compare
00795                    void *item2) {
00796                                  //converted ptr
00797   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
00798                                  //converted ptr
00799   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
00800 
00801   if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
00802     return -1;
00803   else if (blob1->bounding_box ().bottom () <
00804     blob2->bounding_box ().bottom ())
00805     return 1;
00806   else {
00807     if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
00808       return -1;
00809     else if (blob1->bounding_box ().left () >
00810       blob2->bounding_box ().left ())
00811       return 1;
00812     else
00813       return 0;
00814   }
00815 }