Tesseract  3.02
tesseract-ocr/textord/wordseg.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        wordseg.cpp  (Formerly wspace.c)
00003  * Description: Code to segment the blobs into words.
00004  * Author:              Ray Smith
00005  * Created:             Fri Oct 16 11:32:28 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include          <assert.h>
00023 #endif
00024 #include          "stderr.h"
00025 #include          "blobbox.h"
00026 #include          "statistc.h"
00027 #include          "drawtord.h"
00028 #include          "makerow.h"
00029 #include          "pitsync1.h"
00030 #include          "tovars.h"
00031 #include          "topitch.h"
00032 #include          "cjkpitch.h"
00033 #include          "textord.h"
00034 #include          "fpchop.h"
00035 #include          "wordseg.h"
00036 
00037 // Include automatically generated configuration file if running autoconf.
00038 #ifdef HAVE_CONFIG_H
00039 #include "config_auto.h"
00040 #endif
00041 
// EXTERN expands to nothing in this translation unit.
// NOTE(review): presumably this makes the BOOL_VAR lines below definitions
// rather than extern declarations - confirm against wordseg.h.
00042 #define EXTERN
00043 
// Boolean flags, each declared as BOOL_VAR(name, default, description).
// textord_fp_chopping is not referenced in the visible part of this file.
00044 EXTERN BOOL_VAR(textord_fp_chopping, TRUE, "Do fixed pitch chopping");
// When set, make_real_words builds rows with make_prop_words regardless of
// the row's pitch decision (unless textord_chopper_test is also set).
00045 EXTERN BOOL_VAR(textord_force_make_prop_words, FALSE,
00046                 "Force proportional word segmentation on all rows");
// When set, make_real_words builds rows with make_blob_words; it is tested
// before every other option there.
00047 EXTERN BOOL_VAR(textord_chopper_test, FALSE,
00048                 "Chopper is being tested.");
00049 
// FIXED_WIDTH_MULTIPLE is not referenced in the visible part of this file.
00050 #define FIXED_WIDTH_MULTIPLE  5
// Maximum number of gap clusters requested by row_words2; also sizes its
// gaps[] array and (plus one) its cluster_stats[] array.
00051 #define BLOCK_STATS_CLUSTERS  10
00052 
00053 
00061 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
00062   TO_ROW_IT to_row_it(rows);
00063   ROW_IT row_it(real_rows);
00064   for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list();
00065        to_row_it.forward()) {
00066     TO_ROW* row = to_row_it.data();
00067     // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
00068     // to create the word.
00069     C_BLOB_LIST cblobs;
00070     C_BLOB_IT cblob_it(&cblobs);
00071     BLOBNBOX_IT box_it(row->blob_list());
00072     for (;!box_it.empty(); box_it.forward()) {
00073       BLOBNBOX* bblob= box_it.extract();
00074       if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
00075         if (bblob->cblob() != NULL) {
00076           C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
00077           cout_it.move_to_last();
00078           cout_it.add_list_after(bblob->cblob()->out_list());
00079           delete bblob->cblob();
00080         }
00081       } else {
00082         if (bblob->cblob() != NULL)
00083           cblob_it.add_after_then_move(bblob->cblob());
00084       }
00085       delete bblob;
00086     }
00087     // Convert the TO_ROW to a ROW.
00088     ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
00089                             static_cast<inT16>(row->space_size));
00090     WERD_IT word_it(real_row->word_list());
00091     WERD* word = new WERD(&cblobs, 0, NULL);
00092     word->set_flag(W_BOL, TRUE);
00093     word->set_flag(W_EOL, TRUE);
00094     word->set_flag(W_DONT_CHOP, one_blob);
00095     word_it.add_after_then_move(word);
00096     row_it.add_after_then_move(real_row);
00097   }
00098 }
00099 
00105 void make_words(tesseract::Textord *textord,
00106                 ICOORD page_tr,                // top right
00107                 float gradient,                // page skew
00108                 BLOCK_LIST *blocks,            // block list
00109                 TO_BLOCK_LIST *port_blocks) {  // output list
00110   TO_BLOCK_IT block_it;          // iterator
00111   TO_BLOCK *block;               // current block
00112 
00113   if (textord->use_cjk_fp_model()) {
00114     compute_fixed_pitch_cjk(page_tr, port_blocks);
00115   } else {
00116     compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
00117                         !(BOOL8) textord_test_landscape);
00118   }
00119   textord->to_spacing(page_tr, port_blocks);
00120   block_it.set_to_list(port_blocks);
00121   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00122     block = block_it.data();
00123     make_real_words(textord, block, FCOORD(1.0f, 0.0f));
00124   }
00125 }
00126 
00127 
00135 void set_row_spaces(                  //find space sizes
00136                     TO_BLOCK *block,  //block to do
00137                     FCOORD rotation,  //for drawing
00138                     BOOL8 testing_on  //correct orientation
00139                    ) {
00140   inT32 maxwidth;                //of widest space
00141   TO_ROW *row;                   //current row
00142   TO_ROW_IT row_it = block->get_rows ();
00143 
00144   if (row_it.empty ())
00145     return;                      //empty block
00146   maxwidth = (inT32) ceil (block->xheight * textord_words_maxspace);
00147   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00148     row = row_it.data ();
00149     if (row->fixed_pitch == 0) {
00150       //                      if (!textord_test_mode
00151       //                      && row_words(block,row,maxwidth,rotation,testing_on)==0
00152       //                      || textord_test_mode
00153       //                      && row_words2(block,row,maxwidth,rotation,testing_on)==0)
00154       //                      {
00155       row->min_space =
00156         (inT32) ceil (row->pr_space -
00157         (row->pr_space -
00158         row->pr_nonsp) * textord_words_definite_spread);
00159       row->max_nonspace =
00160         (inT32) floor (row->pr_nonsp +
00161         (row->pr_space -
00162         row->pr_nonsp) * textord_words_definite_spread);
00163       if (testing_on && textord_show_initial_words) {
00164         tprintf ("Assigning defaults %d non, %d space to row at %g\n",
00165           row->max_nonspace, row->min_space, row->intercept ());
00166       }
00167       row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00168       row->space_size = row->pr_space;
00169       row->kern_size = row->pr_nonsp;
00170       //                      }
00171     }
00172 #ifndef GRAPHICS_DISABLED
00173     if (textord_show_initial_words && testing_on) {
00174       plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
00175     }
00176 #endif
00177   }
00178 }
00179 
00180 
00187 inT32 row_words(                  //compute space size
00188                 TO_BLOCK *block,  //block it came from
00189                 TO_ROW *row,      //row to operate on
00190                 inT32 maxwidth,   //max expected space size
00191                 FCOORD rotation,  //for drawing
00192                 BOOL8 testing_on  //for debug
00193                ) {
00194   BOOL8 testing_row;             //contains testpt
00195   BOOL8 prev_valid;              //if decent size
00196   BOOL8 this_valid;              //current blob big enough
00197   inT32 prev_x;                  //end of prev blob
00198   inT32 min_gap;                 //min interesting gap
00199   inT32 cluster_count;           //no of clusters
00200   inT32 gap_index;               //which cluster
00201   inT32 smooth_factor;           //for smoothing stats
00202   BLOBNBOX *blob;                //current blob
00203   float lower, upper;            //clustering parameters
00204   float gaps[3];                 //gap clusers
00205   ICOORD testpt;
00206   TBOX blob_box;                  //bounding box
00207                                  //iterator
00208   BLOBNBOX_IT blob_it = row->blob_list ();
00209   STATS gap_stats (0, maxwidth);
00210   STATS cluster_stats[4];        //clusters
00211 
00212   testpt = ICOORD (textord_test_x, textord_test_y);
00213   smooth_factor =
00214     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
00215   //      if (testing_on)
00216   //              tprintf("Row smooth factor=%d\n",smooth_factor);
00217   prev_valid = FALSE;
00218   prev_x = -MAX_INT32;
00219   testing_row = FALSE;
00220   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00221     blob = blob_it.data ();
00222     blob_box = blob->bounding_box ();
00223     if (blob_box.contains (testpt))
00224       testing_row = TRUE;
00225     gap_stats.add (blob_box.width (), 1);
00226   }
00227   min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
00228   gap_stats.clear ();
00229   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00230     blob = blob_it.data ();
00231     if (!blob->joined_to_prev ()) {
00232       blob_box = blob->bounding_box ();
00233       //                      this_valid=blob_box.width()>=min_gap;
00234       this_valid = TRUE;
00235       if (this_valid && prev_valid
00236       && blob_box.left () - prev_x < maxwidth) {
00237         gap_stats.add (blob_box.left () - prev_x, 1);
00238       }
00239       prev_x = blob_box.right ();
00240       prev_valid = this_valid;
00241     }
00242   }
00243   if (gap_stats.get_total () == 0) {
00244     row->min_space = 0;          //no evidence
00245     row->max_nonspace = 0;
00246     return 0;
00247   }
00248   gap_stats.smooth (smooth_factor);
00249   lower = row->xheight * textord_words_initial_lower;
00250   upper = row->xheight * textord_words_initial_upper;
00251   cluster_count = gap_stats.cluster (lower, upper,
00252     textord_spacesize_ratioprop, 3,
00253     cluster_stats);
00254   while (cluster_count < 2 && ceil (lower) < floor (upper)) {
00255                                  //shrink gap
00256     upper = (upper * 3 + lower) / 4;
00257     lower = (lower * 3 + upper) / 4;
00258     cluster_count = gap_stats.cluster (lower, upper,
00259       textord_spacesize_ratioprop, 3,
00260       cluster_stats);
00261   }
00262   if (cluster_count < 2) {
00263     row->min_space = 0;          //no evidence
00264     row->max_nonspace = 0;
00265     return 0;
00266   }
00267   for (gap_index = 0; gap_index < cluster_count; gap_index++)
00268     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00269   //get medians
00270   if (cluster_count > 2) {
00271     if (testing_on && textord_show_initial_words) {
00272       tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
00273         row->intercept (),
00274         cluster_stats[1].ile (0.5),
00275         cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
00276     }
00277     lower = gaps[0];
00278     if (gaps[1] > lower) {
00279       upper = gaps[1];           //prefer most frequent
00280       if (upper < block->xheight * textord_words_min_minspace
00281       && gaps[2] > gaps[1]) {
00282         upper = gaps[2];
00283       }
00284     }
00285     else if (gaps[2] > lower
00286       && gaps[2] >= block->xheight * textord_words_min_minspace)
00287       upper = gaps[2];
00288     else if (lower >= block->xheight * textord_words_min_minspace) {
00289       upper = lower;             //not nice
00290       lower = gaps[1];
00291       if (testing_on && textord_show_initial_words) {
00292         tprintf ("Had to switch most common from lower to upper!!\n");
00293         gap_stats.print();
00294       }
00295     }
00296     else {
00297       row->min_space = 0;        //no evidence
00298       row->max_nonspace = 0;
00299       return 0;
00300     }
00301   }
00302   else {
00303     if (gaps[1] < gaps[0]) {
00304       if (testing_on && textord_show_initial_words) {
00305         tprintf ("Had to switch most common from lower to upper!!\n");
00306         gap_stats.print();
00307       }
00308       lower = gaps[1];
00309       upper = gaps[0];
00310     }
00311     else {
00312       upper = gaps[1];
00313       lower = gaps[0];
00314     }
00315   }
00316   if (upper < block->xheight * textord_words_min_minspace) {
00317     row->min_space = 0;          //no evidence
00318     row->max_nonspace = 0;
00319     return 0;
00320   }
00321   if (upper * 3 < block->min_space * 2 + block->max_nonspace
00322   || lower * 3 > block->min_space * 2 + block->max_nonspace) {
00323     if (testing_on && textord_show_initial_words) {
00324       tprintf ("Disagreement between block and row at %g!!\n",
00325         row->intercept ());
00326       tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
00327       gap_stats.print();
00328     }
00329   }
00330   row->min_space =
00331     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
00332   row->max_nonspace =
00333     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
00334   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00335   row->space_size = upper;
00336   row->kern_size = lower;
00337   if (testing_on && textord_show_initial_words) {
00338     if (testing_row) {
00339       tprintf ("GAP STATS\n");
00340       gap_stats.print();
00341       tprintf ("SPACE stats\n");
00342       cluster_stats[2].print_summary();
00343       tprintf ("NONSPACE stats\n");
00344       cluster_stats[1].print_summary();
00345     }
00346     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
00347       row->intercept (), row->min_space, upper,
00348       row->max_nonspace, lower);
00349   }
00350   return cluster_stats[2].get_total ();
00351 }
00352 
00353 
00360 inT32 row_words2(                  //compute space size
00361                  TO_BLOCK *block,  //block it came from
00362                  TO_ROW *row,      //row to operate on
00363                  inT32 maxwidth,   //max expected space size
00364                  FCOORD rotation,  //for drawing
00365                  BOOL8 testing_on  //for debug
00366                 ) {
00367   BOOL8 testing_row;             //contains testpt
00368   BOOL8 prev_valid;              //if decent size
00369   BOOL8 this_valid;              //current blob big enough
00370   inT32 prev_x;                  //end of prev blob
00371   inT32 min_width;               //min interesting width
00372   inT32 valid_count;             //good gaps
00373   inT32 total_count;             //total gaps
00374   inT32 cluster_count;           //no of clusters
00375   inT32 prev_count;              //previous cluster_count
00376   inT32 gap_index;               //which cluster
00377   inT32 smooth_factor;           //for smoothing stats
00378   BLOBNBOX *blob;                //current blob
00379   float lower, upper;            //clustering parameters
00380   ICOORD testpt;
00381   TBOX blob_box;                  //bounding box
00382                                  //iterator
00383   BLOBNBOX_IT blob_it = row->blob_list ();
00384   STATS gap_stats (0, maxwidth);
00385                                  //gap sizes
00386   float gaps[BLOCK_STATS_CLUSTERS];
00387   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
00388   //clusters
00389 
00390   testpt = ICOORD (textord_test_x, textord_test_y);
00391   smooth_factor =
00392     (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
00393   //      if (testing_on)
00394   //              tprintf("Row smooth factor=%d\n",smooth_factor);
00395   prev_valid = FALSE;
00396   prev_x = -MAX_INT16;
00397   testing_row = FALSE;
00398                                  //min blob size
00399   min_width = (inT32) block->pr_space;
00400   total_count = 0;
00401   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00402     blob = blob_it.data ();
00403     if (!blob->joined_to_prev ()) {
00404       blob_box = blob->bounding_box ();
00405       this_valid = blob_box.width () >= min_width;
00406       this_valid = TRUE;
00407       if (this_valid && prev_valid
00408       && blob_box.left () - prev_x < maxwidth) {
00409         gap_stats.add (blob_box.left () - prev_x, 1);
00410       }
00411       total_count++;             //count possibles
00412       prev_x = blob_box.right ();
00413       prev_valid = this_valid;
00414     }
00415   }
00416   valid_count = gap_stats.get_total ();
00417   if (valid_count < total_count * textord_words_minlarge) {
00418     gap_stats.clear ();
00419     prev_x = -MAX_INT16;
00420     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00421     blob_it.forward ()) {
00422       blob = blob_it.data ();
00423       if (!blob->joined_to_prev ()) {
00424         blob_box = blob->bounding_box ();
00425         if (blob_box.left () - prev_x < maxwidth) {
00426           gap_stats.add (blob_box.left () - prev_x, 1);
00427         }
00428         prev_x = blob_box.right ();
00429       }
00430     }
00431   }
00432   if (gap_stats.get_total () == 0) {
00433     row->min_space = 0;          //no evidence
00434     row->max_nonspace = 0;
00435     return 0;
00436   }
00437 
00438   cluster_count = 0;
00439   lower = block->xheight * words_initial_lower;
00440   upper = block->xheight * words_initial_upper;
00441   gap_stats.smooth (smooth_factor);
00442   do {
00443     prev_count = cluster_count;
00444     cluster_count = gap_stats.cluster (lower, upper,
00445       textord_spacesize_ratioprop,
00446       BLOCK_STATS_CLUSTERS, cluster_stats);
00447   }
00448   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
00449   if (cluster_count < 1) {
00450     row->min_space = 0;
00451     row->max_nonspace = 0;
00452     return 0;
00453   }
00454   for (gap_index = 0; gap_index < cluster_count; gap_index++)
00455     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00456   //get medians
00457   if (testing_on) {
00458     tprintf ("cluster_count=%d:", cluster_count);
00459     for (gap_index = 0; gap_index < cluster_count; gap_index++)
00460       tprintf (" %g(%d)", gaps[gap_index],
00461         cluster_stats[gap_index + 1].get_total ());
00462     tprintf ("\n");
00463   }
00464 
00465   //Try to find proportional non-space and space for row.
00466   for (gap_index = 0; gap_index < cluster_count
00467     && gaps[gap_index] > block->max_nonspace; gap_index++);
00468   if (gap_index < cluster_count)
00469     lower = gaps[gap_index];     //most frequent below
00470   else {
00471     if (testing_on)
00472       tprintf ("No cluster below block threshold!, using default=%g\n",
00473         block->pr_nonsp);
00474     lower = block->pr_nonsp;
00475   }
00476   for (gap_index = 0; gap_index < cluster_count
00477     && gaps[gap_index] <= block->max_nonspace; gap_index++);
00478   if (gap_index < cluster_count)
00479     upper = gaps[gap_index];     //most frequent above
00480   else {
00481     if (testing_on)
00482       tprintf ("No cluster above block threshold!, using default=%g\n",
00483         block->pr_space);
00484     upper = block->pr_space;
00485   }
00486   row->min_space =
00487     (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
00488   row->max_nonspace =
00489     (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
00490   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00491   row->space_size = upper;
00492   row->kern_size = lower;
00493   if (testing_on) {
00494     if (testing_row) {
00495       tprintf ("GAP STATS\n");
00496       gap_stats.print();
00497       tprintf ("SPACE stats\n");
00498       cluster_stats[2].print_summary();
00499       tprintf ("NONSPACE stats\n");
00500       cluster_stats[1].print_summary();
00501     }
00502     tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
00503       row->intercept (), row->min_space, upper,
00504       row->max_nonspace, lower);
00505   }
00506   return 1;
00507 }
00508 
00509 
00516 void make_real_words(
00517                      tesseract::Textord *textord,
00518                      TO_BLOCK *block,  //block to do
00519                      FCOORD rotation   //for drawing
00520                     ) {
00521   TO_ROW *row;                   //current row
00522   TO_ROW_IT row_it = block->get_rows ();
00523   ROW *real_row = NULL;          //output row
00524   ROW_IT real_row_it = block->block->row_list ();
00525 
00526   if (row_it.empty ())
00527     return;                      //empty block
00528   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00529     row = row_it.data ();
00530     if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
00531       real_row = make_rep_words (row, block);
00532     } else if (!row->blob_list()->empty()) {
00533       // In a fixed pitch document, some lines may be detected as fixed pitch
00534       // while others don't, and will go through different path.
00535       // For non-space delimited language like CJK, fixed pitch chop always
00536       // leave the entire line as one word.  We can force consistent chopping
00537       // with force_make_prop_words flag.
00538       POLY_BLOCK* pb = block->block->poly_block();
00539       if (textord_chopper_test) {
00540         real_row = textord->make_blob_words (row, rotation);
00541       } else if (textord_force_make_prop_words ||
00542                  (pb != NULL && !pb->IsText()) ||
00543                  row->pitch_decision == PITCH_DEF_PROP ||
00544                  row->pitch_decision == PITCH_CORR_PROP) {
00545         real_row = textord->make_prop_words (row, rotation);
00546       } else if (row->pitch_decision == PITCH_DEF_FIXED ||
00547                  row->pitch_decision == PITCH_CORR_FIXED) {
00548         real_row = fixed_pitch_words (row, rotation);
00549       } else {
00550         ASSERT_HOST(FALSE);
00551       }
00552     }
00553     if (real_row != NULL) {
00554                                  //put row in block
00555       real_row_it.add_after_then_move (real_row);
00556     }
00557   }
00558   block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
00559     (inT16) block->space_size,
00560     (inT16) block->fixed_pitch);
00561   block->block->check_pitch ();
00562 }
00563 
00564 
00572 ROW *make_rep_words(                 //make a row
00573                     TO_ROW *row,     //row to convert
00574                     TO_BLOCK *block  //block it lives in
00575                    ) {
00576   inT32 xstarts[2];              //ends of row
00577   ROW *real_row;                 //output row
00578   TBOX word_box;                  //bounding box
00579   double coeffs[3];              //spline
00580                                  //iterator
00581   WERD_IT word_it = &row->rep_words;
00582 
00583   if (word_it.empty ())
00584     return NULL;
00585   word_box = word_it.data ()->bounding_box ();
00586   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
00587     word_box += word_it.data ()->bounding_box ();
00588   xstarts[0] = word_box.left ();
00589   xstarts[1] = word_box.right ();
00590   coeffs[0] = 0;
00591   coeffs[1] = row->line_m ();
00592   coeffs[2] = row->line_c ();
00593   row->xheight = block->xheight;
00594   real_row = new ROW(row,
00595     (inT16) block->kern_size, (inT16) block->space_size);
00596   word_it.set_to_list (real_row->word_list ());
00597                                  //put words in row
00598   word_it.add_list_after (&row->rep_words);
00599   real_row->recalc_bounding_box ();
00600   return real_row;
00601 }
00602 
00603 
00611 WERD *make_real_word(BLOBNBOX_IT *box_it,  //iterator
00612                      inT32 blobcount,      //no of blobs to use
00613                      BOOL8 bol,            //start of line
00614                      uinT8 blanks          //no of blanks
00615                     ) {
00616   C_OUTLINE_IT cout_it;
00617   C_BLOB_LIST cblobs;
00618   C_BLOB_IT cblob_it = &cblobs;
00619   WERD *word;                    // new word
00620   BLOBNBOX *bblob;               // current blob
00621   inT32 blobindex;               // in row
00622 
00623   for (blobindex = 0; blobindex < blobcount; blobindex++) {
00624     bblob = box_it->extract();
00625     if (bblob->joined_to_prev()) {
00626       if (bblob->cblob() != NULL) {
00627         cout_it.set_to_list(cblob_it.data()->out_list());
00628         cout_it.move_to_last();
00629         cout_it.add_list_after(bblob->cblob()->out_list());
00630         delete bblob->cblob();
00631       }
00632     }
00633     else {
00634       if (bblob->cblob() != NULL)
00635         cblob_it.add_after_then_move(bblob->cblob());
00636     }
00637     delete bblob;
00638     box_it->forward();          // next one
00639   }
00640 
00641   if (blanks < 1)
00642     blanks = 1;
00643 
00644   word = new WERD(&cblobs, blanks, NULL);
00645 
00646   if (bol)
00647     word->set_flag(W_BOL, TRUE);
00648   if (box_it->at_first())
00649     word->set_flag(W_EOL, TRUE);  // at end of line
00650 
00651   return word;
00652 }