Tesseract  3.02
tesseract-ocr/textord/makerow.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        makerow.cpp  (Formerly makerows.c)
00003  * Description: Code to arrange blobs into rows of text.
00004  * Author:              Ray Smith
00005  * Created:             Mon Sep 21 14:34:48 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include          <assert.h>
00023 #endif
00024 #include          "stderr.h"
00025 #include          "blobbox.h"
00026 #include          "ccstruct.h"
00027 #include          "detlinefit.h"
00028 #include          "statistc.h"
00029 #include          "drawtord.h"
00030 #include          "blkocc.h"
00031 #include          "sortflts.h"
00032 #include          "oldbasel.h"
00033 #include          "textord.h"
00034 #include          "tordmain.h"
00035 #include          "underlin.h"
00036 #include          "makerow.h"
00037 #include          "tprintf.h"
00038 #include          "tovars.h"
00039 
00040 // Include automatically generated configuration file if running autoconf.
00041 #ifdef HAVE_CONFIG_H
00042 #include "config_auto.h"
00043 #endif
00044 
// Tunable settings used by the row-making code in this file.
// Each entry is declared through a BOOL_VAR / INT_VAR / double_VAR macro
// invoked as (name, default value, description string).
// NOTE(review): the macros themselves are defined outside this file;
// presumably they create named, runtime-adjustable parameters -- confirm
// against the parameter headers before relying on that.
00045 BOOL_VAR(textord_heavy_nr, FALSE, "Vigorously remove noise");
// Debug display switches (see the description strings).
00046 BOOL_VAR(textord_show_initial_rows, FALSE, "Display row accumulation");
00047 BOOL_VAR(textord_show_parallel_rows, FALSE, "Display page correlated rows");
00048 BOOL_VAR(textord_show_expanded_rows, FALSE, "Display rows after expanding");
00049 BOOL_VAR(textord_show_final_rows, FALSE, "Display rows after final fitting");
00050 BOOL_VAR(textord_show_final_blobs, FALSE, "Display blob bounds after pre-ass");
// In this file the negation of textord_test_landscape is what gets passed
// as the testing_on argument of the per-block row functions.
00051 BOOL_VAR(textord_test_landscape, FALSE, "Tests refer to land/port");
00052 BOOL_VAR(textord_parallel_baselines, TRUE, "Force parallel baselines");
00053 BOOL_VAR(textord_straight_baselines, FALSE, "Force straight baselines");
00054 BOOL_VAR(textord_old_baselines, TRUE, "Use old baseline algorithm");
00055 BOOL_VAR(textord_old_xheight, FALSE, "Use old xheight algorithm");
00056 BOOL_VAR(textord_fix_xheight_bug, TRUE, "Use spline baseline");
00057 BOOL_VAR(textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
00058 BOOL_VAR(textord_debug_xheights, FALSE, "Test xheight algorithms");
// Selects the weighted branch of compute_page_skew().
00059 BOOL_VAR(textord_biased_skewcalc, TRUE, "Bias skew estimates with line length");
00060 BOOL_VAR(textord_interpolating_skew, TRUE, "Interpolate across gaps");
00061 INT_VAR(textord_skewsmooth_offset, 2, "For smooth factor");
00062 INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
00063 INT_VAR(textord_test_x, -1, "coord of test pt");
00064 INT_VAR(textord_test_y, -1, "coord of test pt");
// Used by the unweighted branch of compute_page_skew(): rows with fewer
// blobs than this do not contribute a gradient sample.
00065 INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
00066 INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
00067 INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
00068 INT_VAR(textord_max_blob_overlaps, 4,
00069         "Max number of blobs a big blob can overlap");
00070 INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
00071 double_VAR(textord_spline_shift_fraction, 0.02,
00072            "Fraction of line spacing for quad");
00073 double_VAR(textord_spline_outlier_fraction, 0.1,
00074            "Fraction of line spacing for outlier");
// compute_page_skew() picks the item at index (sample count * this value)
// from the collected gradients and errors.
00075 double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
00076 double_VAR(textord_skew_lag, 0.01, "Lag for skew on row accumulation");
00077 double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
00078 double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
00079 double_VAR(textord_chop_width, 1.5, "Max width before chopping");
00080 double_VAR(textord_expansion_factor, 1.0,
00081            "Factor to expand rows by in expand_rows");
00082 double_VAR(textord_overlap_x, 0.5, "Fraction of linespace for good overlap");
00083 double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
00084 double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
00085 double_VAR(textord_excess_blobsize, 1.3,
00086            "New row made if blob makes row this big");
00087 double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
00088 double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
00089 double_VAR(textord_min_blob_height_fraction, 0.75,
00090            "Min blob height/top to include blob top into xheight stats");
00091 double_VAR(textord_xheight_mode_fraction, 0.4,
00092            "Min pile height to make xheight");
00093 double_VAR(textord_ascheight_mode_fraction, 0.08,
00094            "Min pile height to make ascheight");
00095 double_VAR(textord_descheight_mode_fraction, 0.08,
00096            "Min pile height to make descheight");
00097 double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight");
00098 double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
00099 double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
00100 double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
00101 double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
00102 INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do");
00103 BOOL_VAR(textord_new_initial_xheight, TRUE, "Use test xheight mechanism");
00104 
// NOTE(review): neither of the two constants below is referenced in the
// part of the file visible here; their meaning is inferred from the names
// only -- confirm at the point of use.
00105 #define MAX_HEIGHT_MODES  12
00106 
00107 const int kMinLeaderCount = 5;
00108 
00109 // Factored-out helper to build a single row from a list of blobs.
00110 // Returns the mean blob size.
00111 static float MakeRowFromBlobs(float line_size,
00112                               BLOBNBOX_IT* blob_it, TO_ROW_IT* row_it) {
00113   blob_it->sort(blob_x_order);
00114   blob_it->move_to_first();
00115   TO_ROW* row = NULL;
00116   float total_size = 0.0f;
00117   int blob_count = 0;
00118   // Add all the blobs to a single TO_ROW.
00119   for (; !blob_it->empty(); blob_it->forward()) {
00120     BLOBNBOX* blob = blob_it->extract();
00121     int top = blob->bounding_box().top();
00122     int bottom = blob->bounding_box().bottom();
00123     if (row == NULL) {
00124       row = new TO_ROW(blob, top, bottom, line_size);
00125       row_it->add_before_then_move(row);
00126     } else {
00127       row->add_blob(blob, top, bottom, line_size);
00128     }
00129     total_size += top - bottom;
00130     ++blob_count;
00131   }
00132   return blob_count > 0 ? total_size / blob_count : total_size;
00133 }
00134 
00135 // Helper to make a row using the children of a single blob.
00136 // Returns the mean size of the blobs created.
00137 float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
00138   // The blobs made from the children will go in the small_blobs list.
00139   BLOBNBOX_IT bb_it(&block->small_blobs);
00140   C_OUTLINE_IT ol_it(blob->out_list());
00141   // Get the children.
00142   ol_it.set_to_list(ol_it.data()->child());
00143   if (ol_it.empty())
00144     return 0.0f;
00145   for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
00146     // Deep copy the child outline and use that to make a blob.
00147     C_OUTLINE* outline = C_OUTLINE::deep_copy(ol_it.data());
00148     // The constructor from a list of outlines corrects the direction.
00149     C_OUTLINE_LIST outlines;
00150     C_OUTLINE_IT ol_it(&outlines);
00151     ol_it.add_after_then_move(outline);
00152     C_BLOB* blob = new C_BLOB(&outlines);
00153     BLOBNBOX* bbox = new BLOBNBOX(blob);
00154     bb_it.add_after_then_move(bbox);
00155   }
00156   // Now we can make a row from the blobs.
00157   return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
00158 }
00159 
00167 float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
00168   BLOBNBOX_IT blob_it = &block->blobs;
00169   TO_ROW_IT row_it = block->get_rows();
00170 
00171   // Include all the small blobs and large blobs.
00172   blob_it.add_list_after(&block->small_blobs);
00173   blob_it.add_list_after(&block->noise_blobs);
00174   blob_it.add_list_after(&block->large_blobs);
00175   if (block->blobs.singleton()) {
00176     blob_it.move_to_first();
00177     float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
00178     if (size > block->line_size)
00179       block->line_size = size;
00180   }
00181   MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
00182   // Fit an LMS line to the rows.
00183   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
00184     fit_lms_line(row_it.data());
00185   float gradient;
00186   float fit_error;
00187   // Compute the skew based on the fitted line.
00188   compute_page_skew(blocks, gradient, fit_error);
00189   return gradient;
00190 }
00191 
00197 float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {
00198   float port_m;                  // global skew
00199   float port_err;                // global noise
00200   TO_BLOCK_IT block_it;          // iterator
00201 
00202   block_it.set_to_list(port_blocks);
00203   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00204        block_it.forward())
00205   make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f),
00206       !(BOOL8) textord_test_landscape);
00207                                  // compute globally
00208   compute_page_skew(port_blocks, port_m, port_err);
00209   block_it.set_to_list(port_blocks);
00210   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00211     cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
00212                  block_it.data()->block->bounding_box().left(),
00213                  !(BOOL8)textord_test_landscape);
00214   }
00215   return port_m;                 // global skew
00216 }
00217 
00218 namespace tesseract {
00219 
00220 void Textord::fit_rows(float gradient, ICOORD page_tr, TO_BLOCK_LIST *blocks) {
00221   TO_BLOCK_IT block_it(blocks);          // iterator
00222   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00223     cleanup_rows_fitting(page_tr, block_it.data(), gradient, FCOORD(1.0f, 0.0f),
00224                  block_it.data()->block->bounding_box().left(),
00225                  !(BOOL8)textord_test_landscape);
00226   }
00227 }
00228 
00229 }  // namespace tesseract.
00230 
00236 void make_initial_textrows(                  //find lines
00237                            ICOORD page_tr,
00238                            TO_BLOCK *block,  //block to do
00239                            FCOORD rotation,  //for drawing
00240                            BOOL8 testing_on  //correct orientation
00241                           ) {
00242   TO_ROW_IT row_it = block->get_rows ();
00243 
00244 #ifndef GRAPHICS_DISABLED
00245   ScrollView::Color colour;                 //of row
00246 
00247   if (textord_show_initial_rows && testing_on) {
00248     if (to_win == NULL)
00249       create_to_win(page_tr);
00250   }
00251 #endif
00252                                  //guess skew
00253   assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
00254   row_it.move_to_first ();
00255   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00256     fit_lms_line (row_it.data ());
00257 #ifndef GRAPHICS_DISABLED
00258   if (textord_show_initial_rows && testing_on) {
00259     colour = ScrollView::RED;
00260     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00261       plot_to_row (row_it.data (), colour, rotation);
00262       colour = (ScrollView::Color) (colour + 1);
00263       if (colour > ScrollView::MAGENTA)
00264         colour = ScrollView::RED;
00265     }
00266   }
00267 #endif
00268 }
00269 
00270 
00276 void fit_lms_line(TO_ROW *row) {
00277   float m, c;                    // fitted line
00278   tesseract::DetLineFit lms;
00279   BLOBNBOX_IT blob_it = row->blob_list();
00280 
00281   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00282     const TBOX& box = blob_it.data()->bounding_box();
00283     lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
00284   }
00285   double error = lms.Fit(&m, &c);
00286   row->set_line(m, c, error);
00287 }
00288 
00289 
00296 void compute_page_skew(                        //get average gradient
00297                        TO_BLOCK_LIST *blocks,  //list of blocks
00298                        float &page_m,          //average gradient
00299                        float &page_err         //average error
00300                       ) {
00301   inT32 row_count;               //total rows
00302   inT32 blob_count;              //total_blobs
00303   inT32 row_err;                 //integer error
00304   float *gradients;              //of rows
00305   float *errors;                 //of rows
00306   inT32 row_index;               //of total
00307   TO_ROW *row;                   //current row
00308   TO_BLOCK_IT block_it = blocks; //iterator
00309   TO_ROW_IT row_it;
00310 
00311   row_count = 0;
00312   blob_count = 0;
00313   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00314        block_it.forward ()) {
00315     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00316     if (pb != NULL && !pb->IsText())
00317       continue;  // Pretend non-text blocks don't exist.
00318     row_count += block_it.data ()->get_rows ()->length ();
00319     //count up rows
00320     row_it.set_to_list (block_it.data ()->get_rows ());
00321     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00322       blob_count += row_it.data ()->blob_list ()->length ();
00323   }
00324   if (row_count == 0) {
00325     page_m = 0.0f;
00326     page_err = 0.0f;
00327     return;
00328   }
00329   gradients = (float *) alloc_mem (blob_count * sizeof (float));
00330   //get mem
00331   errors = (float *) alloc_mem (blob_count * sizeof (float));
00332   if (gradients == NULL || errors == NULL)
00333     MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
00334 
00335   row_index = 0;
00336   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00337        block_it.forward ()) {
00338     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00339     if (pb != NULL && !pb->IsText())
00340       continue;  // Pretend non-text blocks don't exist.
00341     row_it.set_to_list (block_it.data ()->get_rows ());
00342     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00343       row = row_it.data ();
00344       blob_count = row->blob_list ()->length ();
00345       row_err = (inT32) ceil (row->line_error ());
00346       if (row_err <= 0)
00347         row_err = 1;
00348       if (textord_biased_skewcalc) {
00349         blob_count /= row_err;
00350         for (blob_count /= row_err; blob_count > 0; blob_count--) {
00351           gradients[row_index] = row->line_m ();
00352           errors[row_index] = row->line_error ();
00353           row_index++;
00354         }
00355       }
00356       else if (blob_count >= textord_min_blobs_in_row) {
00357                                  //get gradient
00358         gradients[row_index] = row->line_m ();
00359         errors[row_index] = row->line_error ();
00360         row_index++;
00361       }
00362     }
00363   }
00364   if (row_index == 0) {
00365                                  //desperate
00366     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00367          block_it.forward ()) {
00368       POLY_BLOCK* pb = block_it.data()->block->poly_block();
00369       if (pb != NULL && !pb->IsText())
00370         continue;  // Pretend non-text blocks don't exist.
00371       row_it.set_to_list (block_it.data ()->get_rows ());
00372       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00373            row_it.forward ()) {
00374         row = row_it.data ();
00375         gradients[row_index] = row->line_m ();
00376         errors[row_index] = row->line_error ();
00377         row_index++;
00378       }
00379     }
00380   }
00381   row_count = row_index;
00382   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00383     gradients, row_count);
00384   page_m = gradients[row_index];
00385   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00386     errors, row_count);
00387   page_err = errors[row_index];
00388   free_mem(gradients);
00389   free_mem(errors);
00390 }
00391 
// Thresholds used by vigorous_noise_removal().
// kNoiseSize: a blob whose height is below kNoiseSize times the row's
// median-height estimate is a candidate for deletion.
// kMinSize: blobs shorter than this many pixels are left out of the height
// histogram that the estimate is taken from.
00392 const double kNoiseSize = 0.5;  // Fraction of xheight.
00393 const int kMinSize = 8;  // Min pixels to be xheight.
00394 
00399 static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
00400   const TBOX& ibox = i->bounding_box();
00401   const TBOX& dotbox = dot->bounding_box();
00402 
00403   // Must overlap horizontally by enough and be high enough.
00404   int overlap = MIN(dotbox.right(), ibox.right()) -
00405                 MAX(dotbox.left(), ibox.left());
00406   if (ibox.height() <= 2 * dotbox.height() ||
00407       (overlap * 2 < ibox.width() && overlap < dotbox.width()))
00408     return false;
00409 
00410   // If the i is tall and thin then it is good.
00411   if (ibox.height() > ibox.width() * 2)
00412     return true;  // The i or ! must be tall and thin.
00413 
00414   // It might still be tall and thin, but it might be joined to something.
00415   // So search the outline for a piece of large height close to the edges
00416   // of the dot.
00417   const double kHeightFraction = 0.6;
00418   double target_height = MIN(dotbox.bottom(), ibox.top());
00419   target_height -= row->line_m()*dotbox.left() + row->line_c();
00420   target_height *= kHeightFraction;
00421   int left_min = dotbox.left() - dotbox.width();
00422   int middle = (dotbox.left() + dotbox.right())/2;
00423   int right_max = dotbox.right() + dotbox.width();
00424   int left_miny = 0;
00425   int left_maxy = 0;
00426   int right_miny = 0;
00427   int right_maxy = 0;
00428   bool found_left = false;
00429   bool found_right = false;
00430   bool in_left = false;
00431   bool in_right = false;
00432   C_BLOB* blob = i->cblob();
00433   C_OUTLINE_IT o_it = blob->out_list();
00434   for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
00435     C_OUTLINE* outline = o_it.data();
00436     int length = outline->pathlength();
00437     ICOORD pos = outline->start_pos();
00438     for (int step = 0; step < length; pos += outline->step(step++)) {
00439       int x = pos.x();
00440       int y = pos.y();
00441       if (x >= left_min && x < middle && !found_left) {
00442         // We are in the left part so find min and max y.
00443         if (in_left) {
00444           if (y > left_maxy) left_maxy = y;
00445           if (y < left_miny) left_miny = y;
00446         } else {
00447           left_maxy = left_miny = y;
00448           in_left = true;
00449         }
00450       } else if (in_left) {
00451         // We just left the left so look for size.
00452         if (left_maxy - left_miny > target_height) {
00453           if (found_right)
00454             return true;
00455           found_left = true;
00456         }
00457         in_left = false;
00458       }
00459       if (x <= right_max && x > middle && !found_right) {
00460         // We are in the right part so find min and max y.
00461         if (in_right) {
00462           if (y > right_maxy) right_maxy = y;
00463           if (y < right_miny) right_miny = y;
00464         } else {
00465           right_maxy = right_miny = y;
00466           in_right = true;
00467         }
00468       } else if (in_right) {
00469         // We just left the right so look for size.
00470         if (right_maxy - right_miny > target_height) {
00471           if (found_left)
00472             return true;
00473           found_right = true;
00474         }
00475         in_right = false;
00476       }
00477     }
00478   }
00479   return false;
00480 }
00481 
00482 static void vigorous_noise_removal(TO_BLOCK* block) {
00483   TO_ROW_IT row_it = block->get_rows ();
00484   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00485     TO_ROW* row = row_it.data();
00486     BLOBNBOX_IT b_it = row->blob_list();
00487     // Estimate the xheight on the row.
00488     int max_height = 0;
00489     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00490       BLOBNBOX* blob = b_it.data();
00491       if (blob->bounding_box().height() > max_height)
00492         max_height = blob->bounding_box().height();
00493     }
00494     STATS hstats(0, max_height + 1);
00495     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00496       BLOBNBOX* blob = b_it.data();
00497       int height = blob->bounding_box().height();
00498       if (height >= kMinSize)
00499         hstats.add(blob->bounding_box().height(), 1);
00500     }
00501     float xheight = hstats.median();
00502     // Delete small objects.
00503     BLOBNBOX* prev = NULL;
00504     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00505       BLOBNBOX* blob = b_it.data();
00506       const TBOX& box = blob->bounding_box();
00507       if (box.height() < kNoiseSize * xheight) {
00508         // Small so delete unless it looks like an i dot.
00509         if (prev != NULL) {
00510           if (dot_of_i(blob, prev, row))
00511             continue;  // Looks OK.
00512         }
00513         if (!b_it.at_last()) {
00514           BLOBNBOX* next = b_it.data_relative(1);
00515           if (dot_of_i(blob, next, row))
00516             continue;  // Looks OK.
00517         }
00518         // It might be noise so get rid of it.
00519         if (blob->cblob() != NULL)
00520           delete blob->cblob();
00521         delete b_it.extract();
00522       } else {
00523         prev = blob;
00524       }
00525     }
00526   }
00527 }
00528 
00534 void cleanup_rows_making(                   //find lines
00535                   ICOORD page_tr,    //top right
00536                   TO_BLOCK *block,   //block to do
00537                   float gradient,    //gradient to fit
00538                   FCOORD rotation,   //for drawing
00539                   inT32 block_edge,  //edge of block
00540                   BOOL8 testing_on  //correct orientation
00541                  ) {
00542                                  //iterators
00543   BLOBNBOX_IT blob_it = &block->blobs;
00544   TO_ROW_IT row_it = block->get_rows ();
00545 
00546 #ifndef GRAPHICS_DISABLED
00547   if (textord_show_parallel_rows && testing_on) {
00548     if (to_win == NULL)
00549       create_to_win(page_tr);
00550   }
00551 #endif
00552                                  //get row coords
00553   fit_parallel_rows(block,
00554                     gradient,
00555                     rotation,
00556                     block_edge,
00557                     textord_show_parallel_rows &&testing_on);
00558   delete_non_dropout_rows(block,
00559                           gradient,
00560                           rotation,
00561                           block_edge,
00562                           textord_show_parallel_rows &&testing_on);
00563   expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
00564   blob_it.set_to_list (&block->blobs);
00565   row_it.set_to_list (block->get_rows ());
00566   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00567     blob_it.add_list_after (row_it.data ()->blob_list ());
00568   //give blobs back
00569   assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
00570   //now new rows must be genuine
00571   blob_it.set_to_list (&block->blobs);
00572   blob_it.add_list_after (&block->large_blobs);
00573   assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
00574   //safe to use big ones now
00575   blob_it.set_to_list (&block->blobs);
00576                                  //throw all blobs in
00577   blob_it.add_list_after (&block->noise_blobs);
00578   blob_it.add_list_after (&block->small_blobs);
00579   assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
00580 }
00581 
00582 namespace tesseract {
00583 
00584 void Textord::cleanup_rows_fitting(ICOORD page_tr,    // top right
00585                                    TO_BLOCK *block,   // block to do
00586                                    float gradient,    // gradient to fit
00587                                    FCOORD rotation,   // for drawing
00588                                    inT32 block_edge,  // edge of block
00589                                    BOOL8 testing_on) {  // correct orientation
00590   BLOBNBOX_IT blob_it = &block->blobs;
00591   TO_ROW_IT row_it = block->get_rows();
00592 
00593 #ifndef GRAPHICS_DISABLED
00594   if (textord_show_parallel_rows && testing_on) {
00595     if (to_win == NULL)
00596       create_to_win(page_tr);
00597   }
00598 #endif
00599   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
00600     row_it.data()->blob_list()->sort(blob_x_order);
00601   fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
00602   if (textord_heavy_nr) {
00603     vigorous_noise_removal(block);
00604   }
00605   POLY_BLOCK* pb = block->block->poly_block();
00606   if (pb == NULL || pb->IsText()) {
00607     separate_underlines(block, gradient, rotation, testing_on);
00608     pre_associate_blobs(page_tr, block, rotation, testing_on);
00609   }
00610 
00611 #ifndef GRAPHICS_DISABLED
00612   if (textord_show_final_rows && testing_on) {
00613     if (to_win == NULL)
00614       create_to_win(page_tr);
00615   }
00616 #endif
00617 
00618   fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
00619   //              textord_show_final_rows && testing_on);
00620   make_spline_rows(block,
00621                    gradient,
00622                    rotation,
00623                    block_edge,
00624                    textord_show_final_rows && testing_on);
00625   // We only want to call compute_block_xheight() if
00626   // both textord_old_xheight and textord_old_baselines are false.
00627   // No need to call compute_block_xheight() if textord_old_baselines
00628   // is true, since all appropriate xheight computation functions
00629   // would be called from make_old_baselines().
00630   // Note: it can not be the case that textord_old_baselines is
00631   // false, and textord_old_xheight is true.
00632   if (!textord_old_xheight && !textord_old_baselines)
00633     compute_block_xheight(block, gradient);
00634   if (textord_restore_underlines)  // fix underlines
00635     restore_underlined_blobs(block);
00636 #ifndef GRAPHICS_DISABLED
00637   if (textord_show_final_rows && testing_on) {
00638     ScrollView::Color colour = ScrollView::RED;
00639     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00640       plot_parallel_row(row_it.data(), gradient,
00641         block_edge, colour, rotation);
00642       colour = (ScrollView::Color) (colour + 1);
00643       if (colour > ScrollView::MAGENTA)
00644         colour = ScrollView::RED;
00645     }
00646     plot_blob_list(to_win, &block->blobs,
00647                    ScrollView::MAGENTA, ScrollView::WHITE);
00648     //show discarded blobs
00649     plot_blob_list(to_win, &block->underlines,
00650                    ScrollView::YELLOW, ScrollView::CORAL);
00651   }
00652   if (textord_show_final_rows && testing_on && block->blobs.length () > 0)
00653     tprintf ("%d blobs discarded as noise\n", block->blobs.length ());
00654   if (textord_show_final_rows && testing_on) {
00655     draw_meanlines(block, gradient, block_edge, ScrollView::WHITE, rotation);
00656   }
00657 #endif
00658 }
00659 
00660 }  // namespace tesseract.
00661 
00667 void delete_non_dropout_rows(                   //find lines
00668                              TO_BLOCK *block,   //block to do
00669                              float gradient,    //global skew
00670                              FCOORD rotation,   //deskew vector
00671                              inT32 block_edge,  //left edge
00672                              BOOL8 testing_on   //correct orientation
00673                             ) {
00674   TBOX block_box;                 //deskewed block
00675   inT32 *deltas;                 //change in occupation
00676   inT32 *occupation;             //of pixel coords
00677   inT32 max_y;                   //in block
00678   inT32 min_y;
00679   inT32 line_index;              //of scan line
00680   inT32 line_count;              //no of scan lines
00681   inT32 distance;                //to drop-out
00682   inT32 xleft;                   //of block
00683   inT32 ybottom;                 //of block
00684   TO_ROW *row;                   //current row
00685   TO_ROW_IT row_it = block->get_rows ();
00686   BLOBNBOX_IT blob_it = &block->blobs;
00687 
00688   if (row_it.length () == 0)
00689     return;                      //empty block
00690   block_box = deskew_block_coords (block, gradient);
00691   xleft = block->block->bounding_box ().left ();
00692   ybottom = block->block->bounding_box ().bottom ();
00693   min_y = block_box.bottom () - 1;
00694   max_y = block_box.top () + 1;
00695   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00696     line_index = (inT32) floor (row_it.data ()->intercept ());
00697     if (line_index <= min_y)
00698       min_y = line_index - 1;
00699     if (line_index >= max_y)
00700       max_y = line_index + 1;
00701   }
00702   line_count = max_y - min_y + 1;
00703   if (line_count <= 0)
00704     return;                      //empty block
00705   deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00706   occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00707   if (deltas == NULL || occupation == NULL)
00708     MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
00709 
00710   compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
00711   compute_occupation_threshold ((inT32)
00712     ceil (block->line_spacing *
00713     (tesseract::CCStruct::kDescenderFraction +
00714     tesseract::CCStruct::kAscenderFraction)),
00715     (inT32) ceil (block->line_spacing *
00716     (tesseract::CCStruct::kXHeightFraction +
00717     tesseract::CCStruct::kAscenderFraction)),
00718     max_y - min_y + 1, occupation, deltas);
00719 #ifndef GRAPHICS_DISABLED
00720   if (testing_on) {
00721     draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
00722   }
00723 #endif
00724   compute_dropout_distances(occupation, deltas, line_count);
00725   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00726     row = row_it.data ();
00727     line_index = (inT32) floor (row->intercept ());
00728     distance = deltas[line_index - min_y];
00729     if (find_best_dropout_row (row, distance, block->line_spacing / 2,
00730     line_index, &row_it, testing_on)) {
00731 #ifndef GRAPHICS_DISABLED
00732       if (testing_on)
00733         plot_parallel_row(row, gradient, block_edge,
00734                           ScrollView::WHITE, rotation);
00735 #endif
00736       blob_it.add_list_after (row_it.data ()->blob_list ());
00737       delete row_it.extract ();  //too far away
00738     }
00739   }
00740   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00741     blob_it.add_list_after (row_it.data ()->blob_list ());
00742   }
00743 
00744   free_mem(deltas);
00745   free_mem(occupation);
00746 }
00747 
00748 
00755 BOOL8 find_best_dropout_row(                    //find neighbours
00756                             TO_ROW *row,        //row to test
00757                             inT32 distance,     //dropout dist
00758                             float dist_limit,   //threshold distance
00759                             inT32 line_index,   //index of row
00760                             TO_ROW_IT *row_it,  //current position
00761                             BOOL8 testing_on    //correct orientation
00762                            ) {
00763   inT32 next_index;              //of neigbouring row
00764   inT32 row_offset;              //from current row
00765   inT32 abs_dist;                //absolute distance
00766   inT8 row_inc;                  //increment to row_index
00767   TO_ROW *next_row;              //nextious row
00768 
00769   if (testing_on)
00770     tprintf ("Row at %g(%g), dropout dist=%d,",
00771       row->intercept (), row->parallel_c (), distance);
00772   if (distance < 0) {
00773     row_inc = 1;
00774     abs_dist = -distance;
00775   }
00776   else {
00777     row_inc = -1;
00778     abs_dist = distance;
00779   }
00780   if (abs_dist > dist_limit) {
00781     if (testing_on) {
00782       tprintf (" too far - deleting\n");
00783     }
00784     return TRUE;
00785   }
00786   if ((distance < 0 && !row_it->at_last ())
00787   || (distance >= 0 && !row_it->at_first ())) {
00788     row_offset = row_inc;
00789     do {
00790       next_row = row_it->data_relative (row_offset);
00791       next_index = (inT32) floor (next_row->intercept ());
00792       if ((distance < 0
00793         && next_index < line_index
00794         && next_index > line_index + distance + distance)
00795         || (distance >= 0
00796         && next_index > line_index
00797       && next_index < line_index + distance + distance)) {
00798         if (testing_on) {
00799           tprintf (" nearer neighbour (%d) at %g\n",
00800             line_index + distance - next_index,
00801             next_row->intercept ());
00802         }
00803         return TRUE;             //other is nearer
00804       }
00805       else if (next_index == line_index
00806       || next_index == line_index + distance + distance) {
00807         if (row->believability () <= next_row->believability ()) {
00808           if (testing_on) {
00809             tprintf (" equal but more believable at %g (%g/%g)\n",
00810               next_row->intercept (),
00811               row->believability (),
00812               next_row->believability ());
00813           }
00814           return TRUE;           //other is more believable
00815         }
00816       }
00817       row_offset += row_inc;
00818     }
00819     while ((next_index == line_index
00820       || next_index == line_index + distance + distance)
00821       && row_offset < row_it->length ());
00822     if (testing_on)
00823       tprintf (" keeping\n");
00824   }
00825   return FALSE;
00826 }
00827 
00828 
00835 TBOX deskew_block_coords(                  //block box
00836                         TO_BLOCK *block,  //block to do
00837                         float gradient    //global skew
00838                        ) {
00839   TBOX result;                    //block bounds
00840   TBOX blob_box;                  //of block
00841   FCOORD rotation;               //deskew vector
00842   float length;                  //of gradient vector
00843   TO_ROW_IT row_it = block->get_rows ();
00844   TO_ROW *row;                   //current row
00845   BLOBNBOX *blob;                //current blob
00846   BLOBNBOX_IT blob_it;           //iterator
00847 
00848   length = sqrt (gradient * gradient + 1);
00849   rotation = FCOORD (1 / length, -gradient / length);
00850   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00851     row = row_it.data ();
00852     blob_it.set_to_list (row->blob_list ());
00853     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00854     blob_it.forward ()) {
00855       blob = blob_it.data ();
00856       blob_box = blob->bounding_box ();
00857       blob_box.rotate (rotation);//de-skew it
00858       result += blob_box;
00859     }
00860   }
00861   return result;
00862 }
00863 
00864 
00871 void compute_line_occupation(                    //project blobs
00872                              TO_BLOCK *block,    //block to do
00873                              float gradient,     //global skew
00874                              inT32 min_y,        //min coord in block
00875                              inT32 max_y,        //in block
00876                              inT32 *occupation,  //output projection
00877                              inT32 *deltas       //derivative
00878                             ) {
00879   inT32 line_count;              //maxy-miny+1
00880   inT32 line_index;              //of scan line
00881   int index;                     //array index for daft compilers
00882   float top, bottom;             //coords of blob
00883   inT32 width;                   //of blob
00884   TO_ROW *row;                   //current row
00885   TO_ROW_IT row_it = block->get_rows ();
00886   BLOBNBOX *blob;                //current blob
00887   BLOBNBOX_IT blob_it;           //iterator
00888   float length;                  //of skew vector
00889   TBOX blob_box;                  //bounding box
00890   FCOORD rotation;               //inverse of skew
00891 
00892   line_count = max_y - min_y + 1;
00893   length = sqrt (gradient * gradient + 1);
00894   rotation = FCOORD (1 / length, -gradient / length);
00895   for (line_index = 0; line_index < line_count; line_index++)
00896     deltas[line_index] = 0;
00897   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00898     row = row_it.data ();
00899     blob_it.set_to_list (row->blob_list ());
00900     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00901     blob_it.forward ()) {
00902       blob = blob_it.data ();
00903       blob_box = blob->bounding_box ();
00904       blob_box.rotate (rotation);//de-skew it
00905       top = blob_box.top ();
00906       bottom = blob_box.bottom ();
00907       width =
00908         (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
00909       if ((inT32) floor (bottom) < min_y
00910         || (inT32) floor (bottom) - min_y >= line_count)
00911         fprintf (stderr,
00912           "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
00913           INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
00914                                  //count transitions
00915       index = (inT32) floor (bottom) - min_y;
00916       deltas[index] += width;
00917       if ((inT32) floor (top) < min_y
00918         || (inT32) floor (top) - min_y >= line_count)
00919         fprintf (stderr,
00920           "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
00921           INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
00922       index = (inT32) floor (top) - min_y;
00923       deltas[index] -= width;
00924     }
00925   }
00926   occupation[0] = deltas[0];
00927   for (line_index = 1; line_index < line_count; line_index++)
00928     occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
00929 }
00930 
00931 
00937 void compute_occupation_threshold(                    //project blobs
00938                                   inT32 low_window,   //below result point
00939                                   inT32 high_window,  //above result point
00940                                   inT32 line_count,   //array sizes
00941                                   inT32 *occupation,  //input projection
00942                                   inT32 *thresholds   //output thresholds
00943                                  ) {
00944   inT32 line_index;              //of thresholds line
00945   inT32 low_index;               //in occupation
00946   inT32 high_index;              //in occupation
00947   inT32 sum;                     //current average
00948   inT32 divisor;                 //to get thresholds
00949   inT32 min_index;               //of min occ
00950   inT32 min_occ;                 //min in locality
00951   inT32 test_index;              //for finding min
00952 
00953   divisor =
00954     (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
00955   if (low_window + high_window < line_count) {
00956     for (sum = 0, high_index = 0; high_index < low_window; high_index++)
00957       sum += occupation[high_index];
00958     for (low_index = 0; low_index < high_window; low_index++, high_index++)
00959       sum += occupation[high_index];
00960     min_occ = occupation[0];
00961     min_index = 0;
00962     for (test_index = 1; test_index < high_index; test_index++) {
00963       if (occupation[test_index] <= min_occ) {
00964         min_occ = occupation[test_index];
00965         min_index = test_index;  //find min in region
00966       }
00967     }
00968     for (line_index = 0; line_index < low_window; line_index++)
00969       thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00970     //same out to end
00971     for (low_index = 0; high_index < line_count; low_index++, high_index++) {
00972       sum -= occupation[low_index];
00973       sum += occupation[high_index];
00974       if (occupation[high_index] <= min_occ) {
00975                                  //find min in region
00976         min_occ = occupation[high_index];
00977         min_index = high_index;
00978       }
00979                                  //lost min from region
00980       if (min_index <= low_index) {
00981         min_occ = occupation[low_index + 1];
00982         min_index = low_index + 1;
00983         for (test_index = low_index + 2; test_index <= high_index;
00984         test_index++) {
00985           if (occupation[test_index] <= min_occ) {
00986             min_occ = occupation[test_index];
00987                                  //find min in region
00988             min_index = test_index;
00989           }
00990         }
00991       }
00992       thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
00993     }
00994   }
00995   else {
00996     min_occ = occupation[0];
00997     min_index = 0;
00998     for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
00999       if (occupation[low_index] < min_occ) {
01000         min_occ = occupation[low_index];
01001         min_index = low_index;
01002       }
01003       sum += occupation[low_index];
01004     }
01005     line_index = 0;
01006   }
01007   for (; line_index < line_count; line_index++)
01008     thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
01009   //same out to end
01010 }
01011 
01012 
01018 void compute_dropout_distances(                    //project blobs
01019                                inT32 *occupation,  //input projection
01020                                inT32 *thresholds,  //output thresholds
01021                                inT32 line_count    //array sizes
01022                               ) {
01023   inT32 line_index;              //of thresholds line
01024   inT32 distance;                //from prev dropout
01025   inT32 next_dist;               //to next dropout
01026   inT32 back_index;              //for back filling
01027   inT32 prev_threshold;          //before overwrite
01028 
01029   distance = -line_count;
01030   line_index = 0;
01031   do {
01032     do {
01033       distance--;
01034       prev_threshold = thresholds[line_index];
01035                                  //distance from prev
01036       thresholds[line_index] = distance;
01037       line_index++;
01038     }
01039     while (line_index < line_count
01040       && (occupation[line_index] < thresholds[line_index]
01041       || occupation[line_index - 1] >= prev_threshold));
01042     if (line_index < line_count) {
01043       back_index = line_index - 1;
01044       next_dist = 1;
01045       while (next_dist < -distance && back_index >= 0) {
01046         thresholds[back_index] = next_dist;
01047         back_index--;
01048         next_dist++;
01049         distance++;
01050       }
01051       distance = 1;
01052     }
01053   }
01054   while (line_index < line_count);
01055 }
01056 
01057 
01065 void expand_rows(                   //find lines
01066                  ICOORD page_tr,    //top right
01067                  TO_BLOCK *block,   //block to do
01068                  float gradient,    //gradient to fit
01069                  FCOORD rotation,   //for drawing
01070                  inT32 block_edge,  //edge of block
01071                  BOOL8 testing_on   //correct orientation
01072                 ) {
01073   BOOL8 swallowed_row;           //eaten a neighbour
01074   float y_max, y_min;            //new row limits
01075   float y_bottom, y_top;         //allowed limits
01076   TO_ROW *test_row;              //next row
01077   TO_ROW *row;                   //current row
01078                                  //iterators
01079   BLOBNBOX_IT blob_it = &block->blobs;
01080   TO_ROW_IT row_it = block->get_rows ();
01081 
01082 #ifndef GRAPHICS_DISABLED
01083   if (textord_show_expanded_rows && testing_on) {
01084     if (to_win == NULL)
01085       create_to_win(page_tr);
01086   }
01087 #endif
01088 
01089   adjust_row_limits(block);  //shift min,max.
01090   if (textord_new_initial_xheight) {
01091     if (block->get_rows ()->length () == 0)
01092       return;
01093     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
01094   }
01095   assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
01096   //get real membership
01097   if (block->get_rows ()->length () == 0)
01098     return;
01099   fit_parallel_rows(block,
01100                     gradient,
01101                     rotation,
01102                     block_edge,
01103                     textord_show_expanded_rows &&testing_on);
01104   if (!textord_new_initial_xheight)
01105     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
01106   row_it.move_to_last ();
01107   do {
01108     row = row_it.data ();
01109     y_max = row->max_y ();       //get current limits
01110     y_min = row->min_y ();
01111     y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
01112       tesseract::CCStruct::kDescenderFraction;
01113     y_top = row->intercept () + block->line_size * textord_expansion_factor *
01114         (tesseract::CCStruct::kXHeightFraction +
01115          tesseract::CCStruct::kAscenderFraction);
01116     if (y_min > y_bottom) {      //expansion allowed
01117       if (textord_show_expanded_rows && testing_on)
01118         tprintf("Expanding bottom of row at %f from %f to %f\n",
01119                 row->intercept(), y_min, y_bottom);
01120                                  //expandable
01121       swallowed_row = TRUE;
01122       while (swallowed_row && !row_it.at_last ()) {
01123         swallowed_row = FALSE;
01124                                  //get next one
01125         test_row = row_it.data_relative (1);
01126                                  //overlaps space
01127         if (test_row->max_y () > y_bottom) {
01128           if (test_row->min_y () > y_bottom) {
01129             if (textord_show_expanded_rows && testing_on)
01130               tprintf("Eating row below at %f\n", test_row->intercept());
01131             row_it.forward ();
01132 #ifndef GRAPHICS_DISABLED
01133             if (textord_show_expanded_rows && testing_on)
01134               plot_parallel_row(test_row,
01135                                 gradient,
01136                                 block_edge,
01137                                 ScrollView::WHITE,
01138                                 rotation);
01139 #endif
01140             blob_it.set_to_list (row->blob_list ());
01141             blob_it.add_list_after (test_row->blob_list ());
01142                                  //swallow complete row
01143             delete row_it.extract ();
01144             row_it.backward ();
01145             swallowed_row = TRUE;
01146           }
01147           else if (test_row->max_y () < y_min) {
01148                                  //shorter limit
01149             y_bottom = test_row->max_y ();
01150             if (textord_show_expanded_rows && testing_on)
01151               tprintf("Truncating limit to %f due to touching row at %f\n",
01152                       y_bottom, test_row->intercept());
01153           }
01154           else {
01155             y_bottom = y_min;    //can't expand it
01156             if (textord_show_expanded_rows && testing_on)
01157               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01158                       y_bottom, test_row->intercept());
01159           }
01160         }
01161       }
01162       y_min = y_bottom;          //expand it
01163     }
01164     if (y_max < y_top) {         //expansion allowed
01165       if (textord_show_expanded_rows && testing_on)
01166         tprintf("Expanding top of row at %f from %f to %f\n",
01167                 row->intercept(), y_max, y_top);
01168       swallowed_row = TRUE;
01169       while (swallowed_row && !row_it.at_first ()) {
01170         swallowed_row = FALSE;
01171                                  //get one above
01172         test_row = row_it.data_relative (-1);
01173         if (test_row->min_y () < y_top) {
01174           if (test_row->max_y () < y_top) {
01175             if (textord_show_expanded_rows && testing_on)
01176               tprintf("Eating row above at %f\n", test_row->intercept());
01177             row_it.backward ();
01178             blob_it.set_to_list (row->blob_list ());
01179 #ifndef GRAPHICS_DISABLED
01180             if (textord_show_expanded_rows && testing_on)
01181               plot_parallel_row(test_row,
01182                                 gradient,
01183                                 block_edge,
01184                                 ScrollView::WHITE,
01185                                 rotation);
01186 #endif
01187             blob_it.add_list_after (test_row->blob_list ());
01188                                  //swallow complete row
01189             delete row_it.extract ();
01190             row_it.forward ();
01191             swallowed_row = TRUE;
01192           }
01193           else if (test_row->min_y () < y_max) {
01194                                  //shorter limit
01195             y_top = test_row->min_y ();
01196             if (textord_show_expanded_rows && testing_on)
01197               tprintf("Truncating limit to %f due to touching row at %f\n",
01198                       y_top, test_row->intercept());
01199           }
01200           else {
01201             y_top = y_max;       //can't expand it
01202             if (textord_show_expanded_rows && testing_on)
01203               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01204                       y_top, test_row->intercept());
01205           }
01206         }
01207       }
01208       y_max = y_top;
01209     }
01210                                  //new limits
01211     row->set_limits (y_min, y_max);
01212     row_it.backward ();
01213   }
01214   while (!row_it.at_last ());
01215 }
01216 
01217 
01223 void adjust_row_limits(                 //tidy limits
01224                        TO_BLOCK *block  //block to do
01225                       ) {
01226   TO_ROW *row;                   //current row
01227   float size;                    //size of row
01228   float ymax;                    //top of row
01229   float ymin;                    //bottom of row
01230   TO_ROW_IT row_it = block->get_rows ();
01231 
01232   if (textord_show_expanded_rows)
01233     tprintf("Adjusting row limits for block(%d,%d)\n",
01234             block->block->bounding_box().left(),
01235             block->block->bounding_box().top());
01236   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01237     row = row_it.data ();
01238     size = row->max_y () - row->min_y ();
01239     if (textord_show_expanded_rows)
01240       tprintf("Row at %f has min %f, max %f, size %f\n",
01241               row->intercept(), row->min_y(), row->max_y(), size);
01242     size /= tesseract::CCStruct::kXHeightFraction +
01243         tesseract::CCStruct::kAscenderFraction +
01244         tesseract::CCStruct::kDescenderFraction;
01245     ymax = size * (tesseract::CCStruct::kXHeightFraction +
01246                    tesseract::CCStruct::kAscenderFraction);
01247     ymin = -size * tesseract::CCStruct::kDescenderFraction;
01248     row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
01249     row->merged = FALSE;
01250   }
01251 }
01252 
01253 
01259 void compute_row_stats(                  //find lines
01260                        TO_BLOCK *block,  //block to do
01261                        BOOL8 testing_on  //correct orientation
01262                       ) {
01263   inT32 row_index;               //of median
01264   TO_ROW *row;                   //current row
01265   TO_ROW *prev_row;              //previous row
01266   float iqr;                     //inter quartile range
01267   TO_ROW_IT row_it = block->get_rows ();
01268                                  //number of rows
01269   inT16 rowcount = row_it.length ();
01270   TO_ROW **rows;                 //for choose nth
01271 
01272   rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
01273   if (rows == NULL)
01274     MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
01275   rowcount = 0;
01276   prev_row = NULL;
01277   row_it.move_to_last ();        //start at bottom
01278   do {
01279     row = row_it.data ();
01280     if (prev_row != NULL) {
01281       rows[rowcount++] = prev_row;
01282       prev_row->spacing = row->intercept () - prev_row->intercept ();
01283       if (testing_on)
01284         tprintf ("Row at %g yields spacing of %g\n",
01285           row->intercept (), prev_row->spacing);
01286     }
01287     prev_row = row;
01288     row_it.backward ();
01289   }
01290   while (!row_it.at_last ());
01291   block->key_row = prev_row;
01292   block->baseline_offset =
01293     fmod (prev_row->parallel_c (), block->line_spacing);
01294   if (testing_on)
01295     tprintf ("Blob based spacing=(%g,%g), offset=%g",
01296       block->line_size, block->line_spacing, block->baseline_offset);
01297   if (rowcount > 0) {
01298     row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
01299       sizeof (TO_ROW *), row_spacing_order);
01300     iqr = rows[row_index]->spacing;
01301     row_index = choose_nth_item (rowcount / 4, rows, rowcount,
01302       sizeof (TO_ROW *), row_spacing_order);
01303     iqr -= rows[row_index]->spacing;
01304     row_index = choose_nth_item (rowcount / 2, rows, rowcount,
01305       sizeof (TO_ROW *), row_spacing_order);
01306     block->key_row = rows[row_index];
01307     if (testing_on)
01308       tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
01309     if (rowcount > 2
01310     && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
01311       if (!textord_new_initial_xheight) {
01312         if (rows[row_index]->spacing < block->line_spacing
01313           && rows[row_index]->spacing > block->line_size)
01314           //within range
01315           block->line_size = rows[row_index]->spacing;
01316         //spacing=size
01317         else if (rows[row_index]->spacing > block->line_spacing)
01318           block->line_size = block->line_spacing;
01319         //too big so use max
01320       }
01321       else {
01322         if (rows[row_index]->spacing < block->line_spacing)
01323           block->line_size = rows[row_index]->spacing;
01324         else
01325           block->line_size = block->line_spacing;
01326         //too big so use max
01327       }
01328       if (block->line_size < textord_min_xheight)
01329         block->line_size = (float) textord_min_xheight;
01330       block->line_spacing = rows[row_index]->spacing;
01331       block->max_blob_size =
01332         block->line_spacing * textord_excess_blobsize;
01333     }
01334     block->baseline_offset = fmod (rows[row_index]->intercept (),
01335       block->line_spacing);
01336   }
01337   if (testing_on)
01338     tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
01339       block->line_size, block->line_spacing, block->baseline_offset);
01340   free_mem(rows);
01341 }
01342 
01343 
01373 namespace tesseract {
01374 void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
01375   TO_ROW *row;                          // current row
01376   float asc_frac_xheight = CCStruct::kAscenderFraction /
01377       CCStruct::kXHeightFraction;
01378   float desc_frac_xheight = CCStruct::kDescenderFraction /
01379       CCStruct::kXHeightFraction;
01380   inT32 min_height, max_height;         // limits on xheight
01381   TO_ROW_IT row_it = block->get_rows();
01382   if (row_it.empty()) return;  // no rows
01383 
01384   // Compute the best guess of xheight of each row individually.
01385   // Use xheight and ascrise values of the rows where ascenders were found.
01386   get_min_max_xheight(block->line_size, &min_height, &max_height);
01387   STATS row_asc_xheights(min_height, max_height + 1);
01388   STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
01389                         static_cast<int>(max_height * asc_frac_xheight) + 1);
01390   int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
01391   int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
01392   STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
01393   STATS row_desc_xheights(min_height, max_height + 1);
01394   STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
01395   STATS row_cap_xheights(min_height, max_height + 1);
01396   STATS row_cap_floating_xheights(min_height, max_height + 1);
01397   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01398     row = row_it.data();
01399     // Compute the xheight of this row if it has not been computed before.
01400     if (row->xheight <= 0.0) {
01401       compute_row_xheight(row, block->block->classify_rotation(),
01402                           gradient, block->line_size);
01403     }
01404     ROW_CATEGORY row_category = get_row_category(row);
01405     if (row_category == ROW_ASCENDERS_FOUND) {
01406       row_asc_xheights.add(static_cast<inT32>(row->xheight),
01407                            row->xheight_evidence);
01408       row_asc_ascrise.add(static_cast<inT32>(row->ascrise),
01409                           row->xheight_evidence);
01410       row_asc_descdrop.add(static_cast<inT32>(-row->descdrop),
01411                            row->xheight_evidence);
01412     } else if (row_category == ROW_DESCENDERS_FOUND) {
01413       row_desc_xheights.add(static_cast<inT32>(row->xheight),
01414                             row->xheight_evidence);
01415       row_desc_descdrop.add(static_cast<inT32>(-row->descdrop),
01416                             row->xheight_evidence);
01417     } else if (row_category == ROW_UNKNOWN) {
01418       fill_heights(row, gradient, min_height, max_height,
01419                    &row_cap_xheights, &row_cap_floating_xheights);
01420     }
01421   }
01422 
01423   float xheight = 0.0;
01424   float ascrise = 0.0;
01425   float descdrop = 0.0;
01426   // Compute our best guess of xheight of this block.
01427   if (row_asc_xheights.get_total() > 0) {
01428     // Determine xheight from rows where ascenders were found.
01429     xheight = row_asc_xheights.median();
01430     ascrise = row_asc_ascrise.median();
01431     descdrop = -row_asc_descdrop.median();
01432   } else if (row_desc_xheights.get_total() > 0) {
01433     // Determine xheight from rows where descenders were found.
01434     xheight = row_desc_xheights.median();
01435     descdrop = -row_desc_descdrop.median();
01436   } else if (row_cap_xheights.get_total() > 0) {
01437     // All the rows in the block were (a/de)scenderless.
01438     // Try to search for two modes in row_cap_heights that could
01439     // be the xheight and the capheight (e.g. some of the rows
01440     // were lowercase, but did not have enough (a/de)scenders.
01441     // If such two modes can not be found, this block is most
01442     // likely all caps (or all small caps, in which case the code
01443     // still works as intended).
01444     compute_xheight_from_modes(&row_cap_xheights, &row_cap_floating_xheights,
01445                                textord_single_height_mode &&
01446                                block->block->classify_rotation().y() == 0.0,
01447                                min_height, max_height, &(xheight), &(ascrise));
01448     if (ascrise == 0) {  // assume only caps in the whole block
01449       xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
01450     }
01451   } else {  // default block sizes
01452     xheight = block->line_size * CCStruct::kXHeightFraction;
01453   }
01454   // Correct xheight, ascrise and descdrop if necessary.
01455   bool corrected_xheight = false;
01456   if (xheight < textord_min_xheight) {
01457     xheight = static_cast<float>(textord_min_xheight);
01458     corrected_xheight = true;
01459   }
01460   if (corrected_xheight || ascrise <= 0.0) {
01461     ascrise = xheight * asc_frac_xheight;
01462   }
01463   if (corrected_xheight || descdrop >= 0.0) {
01464     descdrop = -(xheight * desc_frac_xheight);
01465   }
01466   block->xheight = xheight;
01467 
01468   if (textord_debug_xheights) {
01469     tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
01470             xheight, ascrise, descdrop);
01471   }
01472   // Correct xheight, ascrise, descdrop of rows based on block averages.
01473   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01474     correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
01475   }
01476 }
01477 
01486 void Textord::compute_row_xheight(TO_ROW *row,          // row to do
01487                                   const FCOORD& rotation,
01488                                   float gradient,       // global skew
01489                                   int block_line_size) {
01490   // Find blobs representing repeated characters in rows and mark them.
01491   // This information is used for computing row xheight and at a later
01492   // stage when words are formed by make_words.
01493   if (!row->rep_chars_marked()) {
01494     mark_repeated_chars(row);
01495   }
01496 
01497   int min_height, max_height;
01498   get_min_max_xheight(block_line_size, &min_height, &max_height);
01499   STATS heights(min_height, max_height + 1);
01500   STATS floating_heights(min_height, max_height + 1);
01501   fill_heights(row, gradient, min_height, max_height,
01502                &heights, &floating_heights);
01503   row->ascrise = 0.0f;
01504   row->xheight = 0.0f;
01505   row->xheight_evidence =
01506     compute_xheight_from_modes(&heights, &floating_heights,
01507                                textord_single_height_mode &&
01508                                rotation.y() == 0.0,
01509                                min_height, max_height,
01510                                &(row->xheight), &(row->ascrise));
01511   row->descdrop = 0.0f;
01512   if (row->xheight > 0.0) {
01513     row->descdrop = static_cast<float>(
01514         compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
01515   }
01516 }
01517 
01518 }  // namespace tesseract.
01519 
01526 void fill_heights(TO_ROW *row, float gradient, int min_height,
01527                   int max_height, STATS *heights, STATS *floating_heights) {
01528   float xcentre;                 // centre of blob
01529   float top;                     // top y coord of blob
01530   float height;                  // height of blob
01531   BLOBNBOX *blob;                // current blob
01532   int repeated_set;
01533   BLOBNBOX_IT blob_it = row->blob_list();
01534   if (blob_it.empty()) return;  // no blobs in this row
01535   bool has_rep_chars =
01536     row->rep_chars_marked() && row->num_repeated_sets() > 0;
01537   do {
01538     blob = blob_it.data();
01539     if (!blob->joined_to_prev()) {
01540       xcentre = (blob->bounding_box().left() +
01541                  blob->bounding_box().right()) / 2.0f;
01542       top = blob->bounding_box().top();
01543       height = blob->bounding_box().height();
01544       if (textord_fix_xheight_bug)
01545         top -= row->baseline.y(xcentre);
01546       else
01547         top -= gradient * xcentre + row->parallel_c();
01548       if (top >= min_height && top <= max_height) {
01549         heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01550         if (height / top < textord_min_blob_height_fraction) {
01551           floating_heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01552         }
01553       }
01554     }
01555     // Skip repeated chars, since they are likely to skew the height stats.
01556     if (has_rep_chars && blob->repeated_set() != 0) {
01557       repeated_set = blob->repeated_set();
01558       blob_it.forward();
01559       while (!blob_it.at_first() &&
01560              blob_it.data()->repeated_set() == repeated_set) {
01561         blob_it.forward();
01562         if (textord_debug_xheights)
01563           tprintf("Skipping repeated char when computing xheight\n");
01564       }
01565     } else {
01566       blob_it.forward();
01567     }
01568   } while (!blob_it.at_first());
01569 }
01570 
01587 int compute_xheight_from_modes(
01588     STATS *heights, STATS *floating_heights, bool cap_only, int min_height,
01589     int max_height, float *xheight, float *ascrise) {
01590   int blob_index = heights->mode();  // find mode
01591   int blob_count = heights->pile_count(blob_index);  // get count of mode
01592   if (textord_debug_xheights) {
01593     tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
01594             min_height, max_height, blob_index, blob_count,
01595             heights->get_total());
01596     heights->print();
01597     floating_heights->print();
01598   }
01599   if (blob_count == 0) return 0;
01600   int modes[MAX_HEIGHT_MODES];  // biggest piles
01601   bool in_best_pile = FALSE;
01602   int prev_size = -MAX_INT32;
01603   int best_count = 0;
01604   int mode_count = compute_height_modes(heights, min_height, max_height,
01605                                         modes, MAX_HEIGHT_MODES);
01606   if (cap_only && mode_count > 1)
01607     mode_count = 1;
01608   int x;
01609   if (textord_debug_xheights) {
01610     tprintf("found %d modes: ", mode_count);
01611     for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
01612     tprintf("\n");
01613   }
01614 
01615   for (x = 0; x < mode_count - 1; x++) {
01616     if (modes[x] != prev_size + 1)
01617       in_best_pile = FALSE;    // had empty height
01618     int modes_x_count = heights->pile_count(modes[x]) -
01619       floating_heights->pile_count(modes[x]);
01620     if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
01621         (in_best_pile || modes_x_count > best_count)) {
01622       for (int asc = x + 1; asc < mode_count; asc++) {
01623         float ratio =
01624           static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
01625         if (textord_ascx_ratio_min < ratio &&
01626             ratio < textord_ascx_ratio_max &&
01627             (heights->pile_count(modes[asc]) >=
01628              blob_count * textord_ascheight_mode_fraction)) {
01629           if (modes_x_count > best_count) {
01630             in_best_pile = true;
01631             best_count = modes_x_count;
01632           }
01633           if (textord_debug_xheights) {
01634             tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
01635                     modes[x], modes[asc]-modes[x], modes_x_count, ratio);
01636           }
01637           prev_size = modes[x];
01638           *xheight = static_cast<float>(modes[x]);
01639           *ascrise = static_cast<float>(modes[asc] - modes[x]);
01640         }
01641       }
01642     }
01643   }
01644   if (*xheight == 0) {  // single mode
01645     // Remove counts of the "floating" blobs (the one whose height is too
01646     // small in relation to it's top end of the bounding box) from heights
01647     // before computing the single-mode xheight.
01648     // Restore the counts in heights after the mode is found, since
01649     // floating blobs might be useful for determining potential ascenders
01650     // in compute_row_descdrop().
01651     if (floating_heights->get_total() > 0) {
01652       for (x = min_height; x < max_height; ++x) {
01653         heights->add(x, -(floating_heights->pile_count(x)));
01654       }
01655       blob_index = heights->mode();  // find the modified mode
01656       for (x = min_height; x < max_height; ++x) {
01657         heights->add(x, floating_heights->pile_count(x));
01658       }
01659     }
01660     *xheight = static_cast<float>(blob_index);
01661     *ascrise = 0.0f;
01662     best_count = heights->pile_count(blob_index);
01663     if (textord_debug_xheights)
01664       tprintf("Single mode xheight set to %g\n", *xheight);
01665   } else if (textord_debug_xheights) {
01666     tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
01667   }
01668   return best_count;
01669 }
01670 
01683 inT32 compute_row_descdrop(TO_ROW *row, float gradient,
01684                            int xheight_blob_count, STATS *asc_heights) {
01685   // Count how many potential ascenders are in this row.
01686   int i_min = asc_heights->min_bucket();
01687   if ((i_min / row->xheight) < textord_ascx_ratio_min) {
01688     i_min = static_cast<int>(
01689         floor(row->xheight * textord_ascx_ratio_min + 0.5));
01690   }
01691   int i_max = asc_heights->max_bucket();
01692   if ((i_max / row->xheight) > textord_ascx_ratio_max) {
01693     i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
01694   }
01695   int num_potential_asc = 0;
01696   for (int i = i_min; i <= i_max; ++i) {
01697     num_potential_asc += asc_heights->pile_count(i);
01698   }
01699   inT32 min_height =
01700     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_min + 0.5));
01701   inT32 max_height =
01702     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_max));
01703   float xcentre;                 // centre of blob
01704   float height;                  // height of blob
01705   BLOBNBOX_IT blob_it = row->blob_list();
01706   BLOBNBOX *blob;                // current blob
01707   STATS heights (min_height, max_height + 1);
01708   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01709     blob = blob_it.data();
01710     if (!blob->joined_to_prev()) {
01711       xcentre = (blob->bounding_box().left() +
01712                  blob->bounding_box().right()) / 2.0f;
01713       height = (gradient * xcentre + row->parallel_c() -
01714                 blob->bounding_box().bottom());
01715       if (height >= min_height && height <= max_height)
01716         heights.add(static_cast<int>(floor(height + 0.5)), 1);
01717     }
01718   }
01719   int blob_index = heights.mode();  // find mode
01720   int blob_count = heights.pile_count(blob_index);  // get count of mode
01721   float total_fraction =
01722     (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
01723   if (static_cast<float>(blob_count + num_potential_asc) <
01724       xheight_blob_count * total_fraction) {
01725     blob_count = 0;
01726   }
01727   int descdrop = blob_count > 0 ? -blob_index : 0;
01728   if (textord_debug_xheights) {
01729     tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
01730             descdrop, num_potential_asc, blob_count);
01731     heights.print();
01732   }
01733   return descdrop;
01734 }
01735 
01736 
01743 inT32 compute_height_modes(STATS *heights,    // stats to search
01744                            inT32 min_height,  // bottom of range
01745                            inT32 max_height,  // top of range
01746                            inT32 *modes,      // output array
01747                            inT32 maxmodes) {  // size of modes
01748   inT32 pile_count;              // no in source pile
01749   inT32 src_count;               // no of source entries
01750   inT32 src_index;               // current entry
01751   inT32 least_count;             // height of smalllest
01752   inT32 least_index;             // index of least
01753   inT32 dest_count;              // index in modes
01754 
01755   src_count = max_height + 1 - min_height;
01756   dest_count = 0;
01757   least_count = MAX_INT32;
01758   least_index = -1;
01759   for (src_index = 0; src_index < src_count; src_index++) {
01760     pile_count = heights->pile_count(min_height + src_index);
01761     if (pile_count > 0) {
01762       if (dest_count < maxmodes) {
01763         if (pile_count < least_count) {
01764           // find smallest in array
01765           least_count = pile_count;
01766           least_index = dest_count;
01767         }
01768         modes[dest_count++] = min_height + src_index;
01769       } else if (pile_count >= least_count) {
01770         while (least_index < maxmodes - 1) {
01771           modes[least_index] = modes[least_index + 1];
01772           // shuffle up
01773           least_index++;
01774         }
01775         // new one on end
01776         modes[maxmodes - 1] = min_height + src_index;
01777         if (pile_count == least_count) {
01778           // new smallest
01779           least_index = maxmodes - 1;
01780         } else {
01781           least_count = heights->pile_count(modes[0]);
01782           least_index = 0;
01783           for (dest_count = 1; dest_count < maxmodes; dest_count++) {
01784             pile_count = heights->pile_count(modes[dest_count]);
01785             if (pile_count < least_count) {
01786               // find smallest
01787               least_count = pile_count;
01788               least_index = dest_count;
01789             }
01790           }
01791         }
01792       }
01793     }
01794   }
01795   return dest_count;
01796 }
01797 
01798 
01805 void correct_row_xheight(TO_ROW *row, float xheight,
01806                          float ascrise, float descdrop) {
01807   ROW_CATEGORY row_category = get_row_category(row);
01808   if (textord_debug_xheights) {
01809     tprintf("correcting row xheight: row->xheight %.4f"
01810             ", row->acrise %.4f row->descdrop %.4f\n",
01811             row->xheight, row->ascrise, row->descdrop);
01812   }
01813   bool normal_xheight =
01814     within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
01815   bool cap_xheight =
01816     within_error_margin(row->xheight, xheight + ascrise,
01817                         textord_xheight_error_margin);
01818   // Use the average xheight/ascrise for the following cases:
01819   // -- the xheight of the row could not be determined at all
01820   // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
01821   //    and its xheight is close to either cap height or average xheight
01822   // -- the row does not have ascenders or descenders, but its xheight
01823   //    is close to the average block xheight (e.g. row with "www.mmm.com")
01824   if (row_category == ROW_ASCENDERS_FOUND) {
01825     if (row->descdrop >= 0.0) {
01826       row->descdrop = row->xheight * (descdrop / xheight);
01827     }
01828   } else if (row_category == ROW_INVALID ||
01829              (row_category == ROW_DESCENDERS_FOUND &&
01830               (normal_xheight || cap_xheight)) ||
01831               (row_category == ROW_UNKNOWN && normal_xheight)) {
01832     if (textord_debug_xheights) tprintf("using average xheight\n");
01833     row->xheight = xheight;
01834     row->ascrise = ascrise;
01835     row->descdrop = descdrop;
01836   } else if (row_category == ROW_DESCENDERS_FOUND) {
01837     // Assume this is a row with mostly lowercase letters and it's xheight
01838     // is computed correctly (unfortunately there is no way to distinguish
01839     // this from the case when descenders are found, but the most common
01840     // height is capheight).
01841     if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
01842     row->ascrise = row->xheight * (ascrise / xheight);
01843   } else if (row_category == ROW_UNKNOWN) {
01844   // Otherwise assume this row is an all-caps or small-caps row
01845   // and adjust xheight and ascrise of the row.
01846 
01847     row->all_caps = true;
01848     if (cap_xheight) { // regular all caps
01849       if (textord_debug_xheights) tprintf("all caps\n");
01850       row->xheight = xheight;
01851       row->ascrise = ascrise;
01852       row->descdrop = descdrop;
01853     } else {  // small caps or caps with an odd xheight
01854       if (textord_debug_xheights) {
01855         if (row->xheight < xheight + ascrise && row->xheight > xheight) {
01856           tprintf("small caps\n");
01857         } else {
01858           tprintf("all caps with irregular xheight\n");
01859         }
01860       }
01861       row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
01862       row->xheight -= row->ascrise;
01863       row->descdrop = row->xheight * (descdrop / xheight);
01864     }
01865   }
01866   if (textord_debug_xheights) {
01867     tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
01868             " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
01869   }
01870 }
01871 
01872 static int CountOverlaps(const TBOX& box, int min_height,
01873                          BLOBNBOX_LIST* blobs) {
01874   int overlaps = 0;
01875   BLOBNBOX_IT blob_it(blobs);
01876   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01877     BLOBNBOX* blob = blob_it.data();
01878     TBOX blob_box = blob->bounding_box();
01879     if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
01880       ++overlaps;
01881     }
01882   }
01883   return overlaps;
01884 }
01885 
01892 void separate_underlines(TO_BLOCK *block,  // block to do
01893                          float gradient,   // skew angle
01894                          FCOORD rotation,  // inverse landscape
01895                          BOOL8 testing_on) {  // correct orientation
01896   BLOBNBOX *blob;                // current blob
01897   C_BLOB *rotated_blob;          // rotated blob
01898   TO_ROW *row;                   // current row
01899   float length;                  // of g_vec
01900   TBOX blob_box;
01901   FCOORD blob_rotation;          // inverse of rotation
01902   FCOORD g_vec;                  // skew rotation
01903   BLOBNBOX_IT blob_it;           // iterator
01904                                  // iterator
01905   BLOBNBOX_IT under_it = &block->underlines;
01906   BLOBNBOX_IT large_it = &block->large_blobs;
01907   TO_ROW_IT row_it = block->get_rows();
01908   int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
01909                                          block->line_size + 0.5);
01910 
01911                                  // length of vector
01912   length = sqrt(1 + gradient * gradient);
01913   g_vec = FCOORD(1 / length, -gradient / length);
01914   blob_rotation = FCOORD(rotation.x(), -rotation.y());
01915   blob_rotation.rotate(g_vec);  // undoing everything
01916   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01917     row = row_it.data();
01918                                  // get blobs
01919     blob_it.set_to_list(row->blob_list());
01920     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
01921          blob_it.forward()) {
01922       blob = blob_it.data();
01923       blob_box = blob->bounding_box();
01924       if (blob_box.width() > block->line_size * textord_underline_width) {
01925         ASSERT_HOST(blob->cblob() != NULL);
01926         rotated_blob = crotate_cblob (blob->cblob(),
01927           blob_rotation);
01928         if (test_underline(
01929             testing_on && textord_show_final_rows,
01930             rotated_blob, static_cast<inT16>(row->intercept()),
01931             static_cast<inT16>(
01932                 block->line_size *
01933                 (tesseract::CCStruct::kXHeightFraction +
01934                  tesseract::CCStruct::kAscenderFraction / 2.0f)))) {
01935           under_it.add_after_then_move(blob_it.extract());
01936           if (testing_on && textord_show_final_rows) {
01937             tprintf("Underlined blob at:");
01938               rotated_blob->bounding_box().print();
01939             tprintf("Was:");
01940               blob_box.print();
01941           }
01942         } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
01943                                  row->blob_list()) >
01944                    textord_max_blob_overlaps) {
01945           large_it.add_after_then_move(blob_it.extract());
01946           if (testing_on && textord_show_final_rows) {
01947             tprintf("Large blob overlaps %d blobs at:",
01948                     CountOverlaps(blob_box, min_blob_height,
01949                                   row->blob_list()));
01950             blob_box.print();
01951           }
01952         }
01953         delete rotated_blob;
01954       }
01955     }
01956   }
01957 }
01958 
01959 
01965 void pre_associate_blobs(                  //make rough chars
01966                          ICOORD page_tr,   //top right
01967                          TO_BLOCK *block,  //block to do
01968                          FCOORD rotation,  //inverse landscape
01969                          BOOL8 testing_on  //correct orientation
01970                         ) {
01971 #ifndef GRAPHICS_DISABLED
01972   ScrollView::Color colour;                 //of boxes
01973 #endif
01974   BLOBNBOX *blob;                //current blob
01975   BLOBNBOX *nextblob;            //next in list
01976   TBOX blob_box;
01977   FCOORD blob_rotation;          //inverse of rotation
01978   BLOBNBOX_IT blob_it;           //iterator
01979   BLOBNBOX_IT start_it;          //iterator
01980   TO_ROW_IT row_it = block->get_rows ();
01981 
01982 #ifndef GRAPHICS_DISABLED
01983   colour = ScrollView::RED;
01984 #endif
01985 
01986   blob_rotation = FCOORD (rotation.x (), -rotation.y ());
01987   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01988                                  //get blobs
01989     blob_it.set_to_list (row_it.data ()->blob_list ());
01990     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01991     blob_it.forward ()) {
01992       blob = blob_it.data ();
01993       blob_box = blob->bounding_box ();
01994       start_it = blob_it;        //save start point
01995       //                      if (testing_on && textord_show_final_blobs)
01996       //                      {
01997       //                              tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
01998       //                                      blob_box.left(),blob_box.bottom(),
01999       //                                      blob_box.right(),blob_box.top(),
02000       //                                      (void*)blob,blob_it.length());
02001       //                      }
02002       bool overlap;
02003       do {
02004         overlap = false;
02005         if (!blob_it.at_last ()) {
02006           nextblob = blob_it.data_relative(1);
02007           overlap = blob_box.major_x_overlap(nextblob->bounding_box());
02008           if (overlap) {
02009             blob->merge(nextblob); // merge new blob
02010             blob_box = blob->bounding_box(); // get bigger box
02011             blob_it.forward();
02012           }
02013         }
02014       }
02015       while (overlap);
02016       blob->chop (&start_it, &blob_it,
02017         blob_rotation,
02018         block->line_size * tesseract::CCStruct::kXHeightFraction *
02019         textord_chop_width);
02020       //attempt chop
02021     }
02022 #ifndef GRAPHICS_DISABLED
02023     if (testing_on && textord_show_final_blobs) {
02024       if (to_win == NULL)
02025         create_to_win(page_tr);
02026       to_win->Pen(colour);
02027       for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
02028       blob_it.forward ()) {
02029         blob = blob_it.data ();
02030         blob_box = blob->bounding_box ();
02031         blob_box.rotate (rotation);
02032         if (!blob->joined_to_prev ()) {
02033           to_win->Rectangle (blob_box.left (), blob_box.bottom (),
02034             blob_box.right (), blob_box.top ());
02035         }
02036       }
02037       colour = (ScrollView::Color) (colour + 1);
02038       if (colour > ScrollView::MAGENTA)
02039         colour = ScrollView::RED;
02040     }
02041 #endif
02042   }
02043 }
02044 
02045 
02051 void fit_parallel_rows(                   //find lines
02052                        TO_BLOCK *block,   //block to do
02053                        float gradient,    //gradient to fit
02054                        FCOORD rotation,   //for drawing
02055                        inT32 block_edge,  //edge of block
02056                        BOOL8 testing_on   //correct orientation
02057                       ) {
02058 #ifndef GRAPHICS_DISABLED
02059   ScrollView::Color colour;                 //of row
02060 #endif
02061   TO_ROW_IT row_it = block->get_rows ();
02062 
02063   row_it.move_to_first ();
02064   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02065     if (row_it.data ()->blob_list ()->empty ())
02066       delete row_it.extract ();  //nothing in it
02067     else
02068       fit_parallel_lms (gradient, row_it.data ());
02069   }
02070 #ifndef GRAPHICS_DISABLED
02071   if (testing_on) {
02072     colour = ScrollView::RED;
02073     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02074       plot_parallel_row (row_it.data (), gradient,
02075         block_edge, colour, rotation);
02076       colour = (ScrollView::Color) (colour + 1);
02077       if (colour > ScrollView::MAGENTA)
02078         colour = ScrollView::RED;
02079     }
02080   }
02081 #endif
02082   row_it.sort (row_y_order);     //may have gone out of order
02083 }
02084 
02085 
02093 void fit_parallel_lms(float gradient, TO_ROW *row) {
02094   float c;                       // fitted line
02095   int blobcount;                 // no of blobs
02096    tesseract::DetLineFit lms;
02097   BLOBNBOX_IT blob_it = row->blob_list();
02098 
02099   blobcount = 0;
02100   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
02101     if (!blob_it.data()->joined_to_prev()) {
02102       const TBOX& box = blob_it.data()->bounding_box();
02103       lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
02104       blobcount++;
02105     }
02106   }
02107   double error = lms.ConstrainedFit(gradient, &c);
02108   row->set_parallel_line(gradient, c, error);
02109   if (textord_straight_baselines && blobcount > textord_lms_line_trials) {
02110     error = lms.Fit(&gradient, &c);
02111   }
02112                                  //set the other too
02113   row->set_line(gradient, c, error);
02114 }
02115 
02116 
02122 namespace tesseract {
02123 void Textord::make_spline_rows(TO_BLOCK *block,   // block to do
02124                                float gradient,    // gradient to fit
02125                                FCOORD rotation,   // for drawing
02126                                inT32 block_edge,  // edge of block
02127                                BOOL8 testing_on) {
02128 #ifndef GRAPHICS_DISABLED
02129   ScrollView::Color colour;       //of row
02130 #endif
02131   TO_ROW_IT row_it = block->get_rows ();
02132 
02133   row_it.move_to_first ();
02134   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02135     if (row_it.data ()->blob_list ()->empty ())
02136       delete row_it.extract ();  //nothing in it
02137     else
02138       make_baseline_spline (row_it.data (), block);
02139   }
02140   if (textord_old_baselines) {
02141 #ifndef GRAPHICS_DISABLED
02142     if (testing_on) {
02143       colour = ScrollView::RED;
02144       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
02145       row_it.forward ()) {
02146         row_it.data ()->baseline.plot (to_win, colour);
02147         colour = (ScrollView::Color) (colour + 1);
02148         if (colour > ScrollView::MAGENTA)
02149           colour = ScrollView::RED;
02150       }
02151     }
02152 #endif
02153     make_old_baselines(block, testing_on, gradient);
02154   }
02155 #ifndef GRAPHICS_DISABLED
02156   if (testing_on) {
02157     colour = ScrollView::RED;
02158     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02159       row_it.data ()->baseline.plot (to_win, colour);
02160       colour = (ScrollView::Color) (colour + 1);
02161       if (colour > ScrollView::MAGENTA)
02162         colour = ScrollView::RED;
02163     }
02164   }
02165 #endif
02166 }
02167 
02168 }  // namespace tesseract.
02169 
02170 
02178 void make_baseline_spline(TO_ROW *row,     //row to fit
02179                           TO_BLOCK *block) {
02180   BLOBNBOX_IT blob_it = row->blob_list ();
02181   inT32 *xstarts;                // spline boundaries
02182   double *coeffs;                // quadratic coeffs
02183   inT32 segments;                // no of segments
02184 
02185   xstarts =
02186     (inT32 *) alloc_mem((row->blob_list()->length() + 1) * sizeof(inT32));
02187   if (segment_baseline(row, block, segments, xstarts)
02188   && !textord_straight_baselines && !textord_parallel_baselines) {
02189     coeffs = linear_spline_baseline(row, block, segments, xstarts);
02190   } else {
02191     xstarts[1] = xstarts[segments];
02192     segments = 1;
02193     coeffs = (double *) alloc_mem (3 * sizeof (double));
02194     coeffs[0] = 0;
02195     coeffs[1] = row->line_m ();
02196     coeffs[2] = row->line_c ();
02197   }
02198   row->baseline = QSPLINE (segments, xstarts, coeffs);
02199   free_mem(coeffs);
02200   free_mem(xstarts);
02201 }
02202 
02203 
// Divides the baseline of a row into segments. A window of
// textord_spline_medianwin vertical offsets between blob bottoms and the
// row's fitted line (row->line_m(), row->line_c()) is slid along the row;
// the window's middle entry is classified as above, on or below the line,
// and a new segment is started where that classification changes.
//   row       row to segment; its blob list must not be empty.
//   block     block the row came from; block->line_size scales the
//             threshold textord_spline_shift_fraction.
//   segments  output: number of segments found (1 on the early-exit paths).
//   xstarts   output: xstarts[0..segments] receive the x coordinates of
//             the segment boundaries.
// Returns TRUE if any window produced a non-zero classification, FALSE
// otherwise, including when the row has too few blobs to be segmented.
// NOTE(review): box_next_pre_chopped() is defined elsewhere; it appears to
// return the box of the iterator's current blob and advance the iterator -
// confirm. yshifts[i] is presumably the i-th smallest stored value - confirm.
02211 BOOL8
02212 segment_baseline (               //split baseline
02213 TO_ROW * row,                    //row to fit
02214 TO_BLOCK * block,                //block it came from
02215 inT32 & segments,                //no of segments
02216 inT32 xstarts[]                  //coords of segments
02217 ) {
02218   BOOL8 needs_curve;             //needs curved line
02219   int blobcount;                 //no of blobs
02220   int blobindex;                 //current blob
02221   int last_state;                //above, on, below
02222   int state;                     //of current blob
02223   float yshift;                  //from baseline
02224   TBOX box;                       //blob box
02225   TBOX new_box;                   //new_it box
02226   float middle;                  //xcentre of blob
02227                                  //blobs
02228   BLOBNBOX_IT blob_it = row->blob_list ();
02229   BLOBNBOX_IT new_it = blob_it;  //front end
02230   SORTED_FLOATS yshifts;         //shifts from baseline
02231
02232   needs_curve = FALSE;
02233   box = box_next_pre_chopped (&blob_it);
02234   xstarts[0] = box.left ();
02235   segments = 1;
02236   blobcount = row->blob_list ()->length ();
02237   if (textord_oldbl_debug)
02238     tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
02239       blobcount, box.left (), box.bottom ());
        // Too few blobs to fill a window: one segment ending at the right
        // edge of the last blob in the list.
02240   if (blobcount <= textord_spline_medianwin
02241   || blobcount < textord_spline_minblobs) {
02242     blob_it.move_to_last ();
02243     box = blob_it.data ()->bounding_box ();
02244     xstarts[1] = box.right ();
02245     return FALSE;
02246   }
02247   last_state = 0;
02248   new_it.mark_cycle_pt ();
        // Prime the window with the offsets of the first
        // textord_spline_medianwin boxes, keyed by blobindex.
02249   for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
02250     new_box = box_next_pre_chopped (&new_it);
02251     middle = (new_box.left () + new_box.right ()) / 2.0;
02252     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02253                                  //record shift
02254     yshifts.add (yshift, blobindex);
          // The front iterator wrapped before the window was full: give up
          // and report a single segment.
02255     if (new_it.cycled_list ()) {
02256       xstarts[1] = new_box.right ();
02257       return FALSE;
02258     }
02259   }
        // Advance the trailing iterator by half a window. From here on
        // blobcount is reused as the number of steps since the last segment
        // boundary.
02260   for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
02261     box = box_next_pre_chopped (&blob_it);
02262   do {
02263     new_box = box_next_pre_chopped (&new_it);
02264                                  //get middle one
02265     yshift = yshifts[textord_spline_medianwin / 2];
          // Classify the middle offset: 1 above, -1 below, 0 within
          // textord_spline_shift_fraction * line_size of the fitted line.
02266     if (yshift > textord_spline_shift_fraction * block->line_size)
02267       state = 1;
02268     else if (-yshift > textord_spline_shift_fraction * block->line_size)
02269       state = -1;
02270     else
02271       state = 0;
02272     if (state != 0)
02273       needs_curve = TRUE;
02274     //              tprintf("State=%d, prev=%d, shift=%g\n",
02275     //                      state,last_state,yshift);
          // Start a new segment at the trailing box, but only if the
          // current segment already has more than textord_spline_minblobs
          // steps.
02276     if (state != last_state && blobcount > textord_spline_minblobs) {
02277       xstarts[segments++] = box.left ();
02278       blobcount = 0;
02279     }
02280     last_state = state;
          // Slide the window: drop the oldest entry, add the newest.
02281     yshifts.remove (blobindex - textord_spline_medianwin);
02282     box = box_next_pre_chopped (&blob_it);
02283     middle = (new_box.left () + new_box.right ()) / 2.0;
02284     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02285     yshifts.add (yshift, blobindex);
02286     blobindex++;
02287     blobcount++;
02288   }
02289   while (!new_it.cycled_list ());
        // Close the last segment at the right edge of the last front box.
        // If the final segment is too short (and is not the only one), drop
        // its start boundary so that it merges into the previous segment.
02290   if (blobcount > textord_spline_minblobs || segments == 1) {
02291     xstarts[segments] = new_box.right ();
02292   }
02293   else {
02294     xstarts[--segments] = new_box.right ();
02295   }
02296   if (textord_oldbl_debug)
02297     tprintf ("Made %d segments on row at (%d,%d)\n",
02298       segments, box.right (), box.bottom ());
02299   return needs_curve;
02300 }
02301 
02302 
// Fits a sequence of straight lines to the blob bottoms of a row.
// The blobs are divided into blobcount / textord_spline_medianwin segments
// (at least one) and a line is fitted to each with DetLineFit. Two
// iterators are used alternately: blob_it starts at the first blob and
// new_it half a segment further on.
//   row       row to fit; its blob list must not be empty.
//   block     not used by this function.
//   segments  output: number of segments fitted; any value passed in is
//             overwritten.
//   xstarts   output: xstarts[0] is the left edge of the first box and
//             xstarts[1..segments] are set as segment boundaries.
// Returns a buffer from alloc_mem() holding 3 doubles per segment in the
// order (0, gradient, intercept). The caller is responsible for releasing
// it with free_mem().
// NOTE(review): box_next_pre_chopped() is defined elsewhere; it appears to
// return the box of the iterator's current blob and advance the iterator -
// confirm.
02310 double *
02311 linear_spline_baseline (         //split baseline
02312 TO_ROW * row,                    //row to fit
02313 TO_BLOCK * block,                //block it came from
02314 inT32 & segments,                //no of segments
02315 inT32 xstarts[]                  //coords of segments
02316 ) {
02317   int blobcount;                 //no of blobs
02318   int blobindex;                 //current blob
02319   int index1, index2;            //blob numbers
02320   int blobs_per_segment;         //blobs in each
02321   TBOX box;                       //blob box
02322   TBOX new_box;                   //new_it box
02323                                  //blobs
02324   BLOBNBOX_IT blob_it = row->blob_list ();
02325   BLOBNBOX_IT new_it = blob_it;  //front end
02326   float b, c;                    //fitted curve
02327   tesseract::DetLineFit lms;
02328   double *coeffs;                //quadratic coeffs
02329   inT32 segment;                 //current segment
02330
        // Count the boxes by walking blob_it once round the list; it ends
        // up back at the first blob.
02331   box = box_next_pre_chopped (&blob_it);
02332   xstarts[0] = box.left ();
02333   blobcount = 1;
02334   while (!blob_it.at_first ()) {
02335     blobcount++;
02336     box = box_next_pre_chopped (&blob_it);
02337   }
02338   segments = blobcount / textord_spline_medianwin;
02339   if (segments < 1)
02340     segments = 1;
02341   blobs_per_segment = blobcount / segments;
02342   coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
02343   if (textord_oldbl_debug)
02344     tprintf
02345       ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
02346       blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
02347   segment = 1;
        // Put new_it half a segment ahead of blob_it; index2 counts the
        // boxes it has consumed.
02348   for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
02349     box_next_pre_chopped(&new_it);
02350   index1 = 0;
02351   blobindex = index2;
        // Each pass fits one segment from blob_it and then one from new_it.
        // blobindex is the box count up to which the current fit collects
        // points; the last segment also takes all remaining boxes.
02352   do {
02353     blobindex += blobs_per_segment;
02354     lms.Clear();
02355     while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
02356       box = box_next_pre_chopped (&blob_it);
02357       int middle = (box.left() + box.right()) / 2;
02358       lms.Add(ICOORD(middle, box.bottom()));
02359       index1++;
            // Record this segment's boundary half a segment before the
            // end of the collection range, or at the last-but-one box.
02360       if (index1 == blobindex - blobs_per_segment / 2
02361       || index1 == blobcount - 1) {
02362         xstarts[segment] = box.left ();
02363       }
02364     }
02365     lms.Fit(&b, &c);
          // Store as (quadratic term, gradient, intercept); the quadratic
          // term is always 0 because each piece is a straight line.
02366     coeffs[segment * 3 - 3] = 0;
02367     coeffs[segment * 3 - 2] = b;
02368     coeffs[segment * 3 - 1] = c;
02369     segment++;
02370     if (segment > segments)
02371       break;
02372
          // Same again for the next segment, using the front iterator.
02373     blobindex += blobs_per_segment;
02374     lms.Clear();
02375     while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
02376       new_box = box_next_pre_chopped (&new_it);
02377       int middle = (new_box.left() + new_box.right()) / 2;
02378       lms.Add(ICOORD (middle, new_box.bottom()));
02379       index2++;
02380       if (index2 == blobindex - blobs_per_segment / 2
02381       || index2 == blobcount - 1) {
02382         xstarts[segment] = new_box.left ();
02383       }
02384     }
02385     lms.Fit(&b, &c);
02386     coeffs[segment * 3 - 3] = 0;
02387     coeffs[segment * 3 - 2] = b;
02388     coeffs[segment * 3 - 1] = c;
02389     segment++;
02390   }
02391   while (segment <= segments);
02392   return coeffs;
02393 }
02394 
02395 
/**
 * assign_blobs_to_rows
 *
 * Visit the blobs of the block in order of increasing left edge (the blob
 * list is sorted with blob_x_order) and for each one decide, after removing
 * the current skew estimate from its top and bottom, whether to
 *   - ASSIGN it to an existing row (it is extracted from block->blobs and
 *     added to that row),
 *   - start a NEW_ROW with it (only when make_new_rows is set and the blob is
 *     shorter than block->max_blob_size), or
 *   - REJECT it, in which case it stays in block->blobs.
 * The row list is kept in order of decreasing min_y, and rows that end up
 * with no blobs are deleted before returning.
 *
 * block          block whose blobs are distributed among its rows
 * gradient       if non-NULL, the skew of every blob is computed directly
 *                from *gradient; if NULL a running skew estimate is smoothed
 *                from the blobs assigned so far
 * pass           only used in the test-blob debug message
 * reject_misses  when FALSE, a NEW_ROW verdict from most_overlapping_row is
 *                turned into ASSIGN
 * make_new_rows  when TRUE unmatched blobs may start new rows; when FALSE
 *                they may instead be attached to a nearby row
 * drawing_skew   when TRUE (and graphics are compiled in) the skew estimate
 *                is drawn to to_win
 */
02402 void assign_blobs_to_rows(                      //find lines
02403                           TO_BLOCK *block,      //block to do
02404                           float *gradient,      //block skew
02405                           int pass,             //identification
02406                           BOOL8 reject_misses,  //chuck big ones out
02407                           BOOL8 make_new_rows,  //add rows for unmatched
02408                           BOOL8 drawing_skew    //draw smoothed skew
02409                          ) {
02410   OVERLAP_STATE overlap_result;  //what to do with it
02411   float ycoord;                  //current y
02412   float top, bottom;             //of blob
02413   float g_length = 1.0f;         //from gradient
02414   inT16 row_count;               //no of rows
02415   inT16 left_x;                  //left edge
02416   inT16 last_x;                  //previous edge
02417   float block_skew;              //y delta
02418   float smooth_factor;           //for new coords
02419   float near_dist;               //dist to nearest row
02420   ICOORD testpt;                 //testing only
02421   BLOBNBOX *blob;                //current blob
02422   TO_ROW *row;                   //current row
02423   TO_ROW *dest_row = NULL;       //row to put blob in
02424                                  //iterators
02425   BLOBNBOX_IT blob_it = &block->blobs;
02426   TO_ROW_IT row_it = block->get_rows ();
02427 
// Vertical middle of the block; only used as the origin of the skew drawing.
02428   ycoord =
02429     (block->block->bounding_box ().bottom () +
02430     block->block->bounding_box ().top ()) / 2.0f;
// Length of the vector (1, gradient); stays 1 when no gradient is supplied.
02431   if (gradient != NULL)
02432     g_length = sqrt (1 + *gradient * *gradient);
02433 #ifndef GRAPHICS_DISABLED
02434   if (drawing_skew)
02435     to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
02436 #endif
// Blobs containing this point trigger the debug output below.
02437   testpt = ICOORD (textord_test_x, textord_test_y);
02438   blob_it.sort (blob_x_order);
02439   smooth_factor = 1.0;
02440   block_skew = 0.0f;
02441   row_count = row_it.length ();  //might have rows
// left_x is the left edge of the leftmost blob, or of the block when there
// are no blobs at all.
02442   if (!blob_it.empty ()) {
02443     left_x = blob_it.data ()->bounding_box ().left ();
02444   }
02445   else {
02446     left_x = block->block->bounding_box ().left ();
02447   }
02448   last_x = left_x;
02449   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
02450     blob = blob_it.data ();
// With a gradient the skew is recomputed from scratch for every blob, so the
// smoothed update at the bottom of the loop has no influence on the next blob.
02451     if (gradient != NULL) {
02452       block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
02453         + *gradient / g_length * blob->bounding_box ().left ();
02454     }
// Without a gradient: after a horizontal gap of more than half a line size,
// once more than two line sizes have been covered, scale the running skew by
// the ratio of the new to the old distance from the left edge.
02455     else if (blob->bounding_box ().left () - last_x > block->line_size / 2
02456       && last_x - left_x > block->line_size * 2
02457     && textord_interpolating_skew) {
02458       //                      tprintf("Interpolating skew from %g",block_skew);
02459       block_skew *= (float) (blob->bounding_box ().left () - left_x)
02460         / (last_x - left_x);
02461       //                      tprintf("to %g\n",block_skew);
02462     }
02463     last_x = blob->bounding_box ().left ();
// Skew-corrected vertical extent of the blob; all row tests use these.
02464     top = blob->bounding_box ().top () - block_skew;
02465     bottom = blob->bounding_box ().bottom () - block_skew;
02466 #ifndef GRAPHICS_DISABLED
02467     if (drawing_skew)
02468       to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
02469 #endif
02470     if (!row_it.empty ()) {
// Walk from the first row to the first one whose min_y is not above the
// blob's top, stopping at the last row if there is none.
02471       for (row_it.move_to_first ();
02472         !row_it.at_last () && row_it.data ()->min_y () > top;
02473         row_it.forward ());
02474       row = row_it.data ();
02475       if (row->min_y () <= top && row->max_y () >= bottom) {
02476       //any overlap
// most_overlapping_row may pick a later row (and may merge rows); it leaves
// row_it pointing at the row it returns in dest_row.
02477         dest_row = row;
02478         overlap_result = most_overlapping_row (&row_it, dest_row,
02479           top, bottom,
02480           block->line_size,
02481           blob->bounding_box ().
02482           contains (testpt));
02483         if (overlap_result == NEW_ROW && !reject_misses)
02484           overlap_result = ASSIGN;
02485       }
02486       else {
// No vertical overlap with the row found.  Unless new rows are allowed, try
// to attach the blob to a neighbouring row if it is close enough.
02487         overlap_result = NEW_ROW;
02488         if (!make_new_rows) {
// Distance from the blob's top up to the bottom of the preceding row.
// NOTE(review): when row_it is at the first row, data_relative(-1)
// presumably wraps round to the last row -- confirm against the list class.
02489           near_dist = row_it.data_relative (-1)->min_y () - top;
02490                                  //below bottom
// Blob lies below this row: accept it if the gap is within the descender
// allowance.
02491           if (bottom < row->min_y ()) {
02492             if (row->min_y () - bottom <=
02493               (block->line_spacing -
02494             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02495                                  //done it
02496               overlap_result = ASSIGN;
02497               dest_row = row;
02498             }
02499           }
// Blob lies above this row but is nearer to the preceding row: step back and
// test against that row with the descender allowance.
02500           else if (near_dist > 0
02501           && near_dist < bottom - row->max_y ()) {
02502             row_it.backward ();
02503             dest_row = row_it.data ();
02504             if (dest_row->min_y () - bottom <=
02505               (block->line_spacing -
02506             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02507                                  //done it
02508               overlap_result = ASSIGN;
02509             }
02510           }
// Otherwise the blob lies above this row: accept it if the part sticking out
// above max_y is within the ascender allowance plus textord_overlap_x.
02511           else {
02512             if (top - row->max_y () <=
02513               (block->line_spacing -
02514               block->line_size) * (textord_overlap_x +
02515             tesseract::CCStruct::kAscenderFraction)) {
02516                                  //done it
02517               overlap_result = ASSIGN;
02518               dest_row = row;
02519             }
02520           }
02521         }
02522       }
// Move the blob out of the block's list into the chosen row.
02523       if (overlap_result == ASSIGN)
02524         dest_row->add_blob (blob_it.extract (), top, bottom,
02525           block->line_size);
02526       if (overlap_result == NEW_ROW) {
// A new row is only started for blobs shorter than max_blob_size; anything
// else is rejected and stays in the block's blob list.
02527         if (make_new_rows && top - bottom < block->max_blob_size) {
02528           dest_row =
02529             new TO_ROW (blob_it.extract (), top, bottom,
02530             block->line_size);
02531           row_count++;
02532           if (bottom > row_it.data ()->min_y ())
02533             row_it.add_before_then_move (dest_row);
02534           //insert in right place
02535           else
02536             row_it.add_after_then_move (dest_row);
// The more rows there are, the smaller the weight of each new blob in the
// running skew estimate.
02537           smooth_factor =
02538             1.0 / (row_count * textord_skew_lag +
02539             textord_skewsmooth_offset);
02540         }
02541         else
02542           overlap_result = REJECT;
02543       }
02544     }
// No rows exist yet: the blob starts the first row if that is permitted.
// Note this path uses textord_skewsmooth_offset2, not textord_skewsmooth_offset.
02545     else if (make_new_rows && top - bottom < block->max_blob_size) {
02546       overlap_result = NEW_ROW;
02547       dest_row =
02548         new TO_ROW (blob_it.extract (), top, bottom, block->line_size);
02549       row_count++;
02550       row_it.add_after_then_move (dest_row);
02551       smooth_factor = 1.0 / (row_count * textord_skew_lag +
02552                              textord_skewsmooth_offset2);
02553     }
02554     else
02555       overlap_result = REJECT;
// Debug report for the blob that contains the test point.
02556     if (blob->bounding_box ().contains (testpt)) {
02557       if (overlap_result != REJECT) {
02558         tprintf ("Test blob assigned to row at (%g,%g) on pass %d\n",
02559           dest_row->min_y (), dest_row->max_y (), pass);
02560       }
02561       else {
02562         tprintf ("Test blob assigned to no row on pass %d\n", pass);
02563       }
02564     }
02565     if (overlap_result != REJECT) {
// Restore decreasing-min_y order: move the row under the iterator towards
// the front while it is higher than its predecessor ...
02566       while (!row_it.at_first ()
02567         && row_it.data ()->min_y () >
02568       row_it.data_relative (-1)->min_y ()) {
02569         row = row_it.extract ();
02570         row_it.backward ();
02571         row_it.add_before_then_move (row);
02572       }
// ... and towards the back while it is lower than its successor.
02573       while (!row_it.at_last ()
02574         && row_it.data ()->min_y () <
02575       row_it.data_relative (1)->min_y ()) {
02576         row = row_it.extract ();
02577         row_it.forward ();
02578                                  //keep rows in order
02579         row_it.add_after_then_move (row);
02580       }
// Blend the offset of this blob's (uncorrected) bottom from the row's
// initial_min_y into the running skew estimate.
02581       block_skew = (1 - smooth_factor) * block_skew
02582         + smooth_factor * (blob->bounding_box ().bottom () -
02583         dest_row->initial_min_y ());
02584     }
02585   }
// Finally drop rows that hold no blobs.
02586   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02587     if (row_it.data ()->blob_list ()->empty ())
02588       delete row_it.extract ();  //discard empty rows
02589   }
02590 }
02591 
02592 
02598 OVERLAP_STATE most_overlapping_row(                    //find best row
02599                                    TO_ROW_IT *row_it,  //iterator
02600                                    TO_ROW *&best_row,  //output row
02601                                    float top,          //top of blob
02602                                    float bottom,       //bottom of blob
02603                                    float rowsize,      //max row size
02604                                    BOOL8 testing_blob  //test stuff
02605                                   ) {
02606   OVERLAP_STATE result;          //result of tests
02607   float overlap;                 //of blob & row
02608   float bestover;                //nearest row
02609   float merge_top, merge_bottom; //size of merged row
02610   ICOORD testpt;                 //testing only
02611   TO_ROW *row;                   //current row
02612   TO_ROW *test_row;              //for multiple overlaps
02613   BLOBNBOX_IT blob_it;           //for merging rows
02614 
02615   result = ASSIGN;
02616   row = row_it->data ();
02617   bestover = top - bottom;
02618   if (top > row->max_y ())
02619     bestover -= top - row->max_y ();
02620   if (bottom < row->min_y ())
02621                                  //compute overlap
02622     bestover -= row->min_y () - bottom;
02623   if (testing_blob) {
02624     tprintf ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f\n",
02625       bottom, top, row->min_y (), row->max_y (), bestover);
02626   }
02627   test_row = row;
02628   do {
02629     if (!row_it->at_last ()) {
02630       row_it->forward ();
02631       test_row = row_it->data ();
02632       if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
02633         merge_top =
02634           test_row->max_y () >
02635           row->max_y ()? test_row->max_y () : row->max_y ();
02636         merge_bottom =
02637           test_row->min_y () <
02638           row->min_y ()? test_row->min_y () : row->min_y ();
02639         if (merge_top - merge_bottom <= rowsize) {
02640           if (testing_blob) {
02641             tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
02642               row->min_y (), row->max_y (),
02643               test_row->min_y (), test_row->max_y ());
02644           }
02645           test_row->set_limits (merge_bottom, merge_top);
02646           blob_it.set_to_list (test_row->blob_list ());
02647           blob_it.add_list_after (row->blob_list ());
02648           blob_it.sort (blob_x_order);
02649           row_it->backward ();
02650           delete row_it->extract ();
02651           row_it->forward ();
02652           bestover = -1.0f;      //force replacement
02653         }
02654         overlap = top - bottom;
02655         if (top > test_row->max_y ())
02656           overlap -= top - test_row->max_y ();
02657         if (bottom < test_row->min_y ())
02658           overlap -= test_row->min_y () - bottom;
02659         if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
02660           result = REJECT;
02661         }
02662         if (overlap > bestover) {
02663           bestover = overlap;    //find biggest overlap
02664           row = test_row;
02665         }
02666         if (testing_blob) {
02667           tprintf
02668             ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f->%f\n",
02669             bottom, top, test_row->min_y (), test_row->max_y (),
02670             overlap, bestover);
02671         }
02672       }
02673     }
02674   }
02675   while (!row_it->at_last ()
02676     && test_row->min_y () <= top && test_row->max_y () >= bottom);
02677   while (row_it->data () != row)
02678     row_it->backward ();         //make it point to row
02679                                  //doesn't overlap much
02680   if (top - bottom - bestover > rowsize * textord_overlap_x &&
02681       (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
02682     && result == ASSIGN)
02683     result = NEW_ROW;            //doesn't overlap enough
02684   best_row = row;
02685   return result;
02686 }
02687 
02688 
02694 int blob_x_order(                    //sort function
02695                  const void *item1,  //items to compare
02696                  const void *item2) {
02697                                  //converted ptr
02698   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
02699                                  //converted ptr
02700   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
02701 
02702   if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
02703     return -1;
02704   else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
02705     return 1;
02706   else
02707     return 0;
02708 }
02709 
02710 
02716 int row_y_order(                    //sort function
02717                 const void *item1,  //items to compare
02718                 const void *item2) {
02719                                  //converted ptr
02720   TO_ROW *row1 = *(TO_ROW **) item1;
02721                                  //converted ptr
02722   TO_ROW *row2 = *(TO_ROW **) item2;
02723 
02724   if (row1->parallel_c () > row2->parallel_c ())
02725     return -1;
02726   else if (row1->parallel_c () < row2->parallel_c ())
02727     return 1;
02728   else
02729     return 0;
02730 }
02731 
02732 
02738 int row_spacing_order(                    //sort function
02739                       const void *item1,  //items to compare
02740                       const void *item2) {
02741                                  //converted ptr
02742   TO_ROW *row1 = *(TO_ROW **) item1;
02743                                  //converted ptr
02744   TO_ROW *row2 = *(TO_ROW **) item2;
02745 
02746   if (row1->spacing < row2->spacing)
02747     return -1;
02748   else if (row1->spacing > row2->spacing)
02749     return 1;
02750   else
02751     return 0;
02752 }
02753 
02760 void mark_repeated_chars(TO_ROW *row) {
02761   BLOBNBOX_IT box_it(row->blob_list());            // Iterator.
02762   int num_repeated_sets = 0;
02763   if (!box_it.empty()) {
02764     do {
02765       BLOBNBOX* bblob = box_it.data();
02766       int repeat_length = 0;
02767       if (bblob->flow() == BTFT_LEADER &&
02768           !bblob->joined_to_prev() && bblob->cblob() != NULL) {
02769         BLOBNBOX_IT test_it(box_it);
02770         for (test_it.forward(); !test_it.at_first(); test_it.forward()) {
02771           bblob = test_it.data();
02772           if (bblob->flow() != BTFT_LEADER)
02773             break;
02774           if (bblob->joined_to_prev() || bblob->cblob() == NULL) {
02775             tprintf("Cancelled repeat of length %d due to %s\n",
02776                     repeat_length,
02777                     bblob->joined_to_prev() ? "Joined" : "Null");
02778             repeat_length = 0;
02779             break;
02780           }
02781           ++repeat_length;
02782         }
02783       }
02784       if (repeat_length >= kMinLeaderCount) {
02785         num_repeated_sets++;
02786         for (; repeat_length > 0; box_it.forward(), --repeat_length) {
02787           bblob = box_it.data();
02788           bblob->set_repeated_set(num_repeated_sets);
02789         }
02790         if (!box_it.at_first())
02791           bblob->set_repeated_set(0);
02792      } else {
02793         box_it.forward();
02794         bblob->set_repeated_set(0);
02795       }
02796     } while (!box_it.at_first());  // until all done
02797   }
02798   row->set_num_repeated_sets(num_repeated_sets);
02799 }