Tesseract  3.02
tesseract-ocr/textord/gap_map.cpp
Go to the documentation of this file.
00001 #include "mfcpch.h"
00002 #include          "statistc.h"
00003 #include          "gap_map.h"
00004 
00005 #define EXTERN
00006 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
00007 EXTERN BOOL_VAR (gapmap_use_ends, FALSE,
00008 "Use large space at start and end of rows");
00009 EXTERN BOOL_VAR (gapmap_no_isolated_quanta, FALSE,
00010 "Ensure gaps not less than 2quanta wide");
00011 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
00012 
00013 /*************************************************************************
00014  * A block gap map is a quantised histogram of whitespace regions in the
00015  * block. It is a vertical projection of wide gaps WITHIN lines
00016  *
00017  * The map is held as an array of counts of rows which have a wide gap
00018  * covering that region of the row. Each bucket in the map represents a width
00019  * of about half an xheight - (The median of the xhts in the rows is used.)
00020  *
00021  * The block is considered RECTANGULAR - delimited by the left and right
00022  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
00023  * counted.
00024  *
00025  *************************************************************************/
00026 
00027 GAPMAP::GAPMAP(                 //Constructor
00028                TO_BLOCK *block  //block
00029               ) {
00030   TO_ROW_IT row_it;              //row iterator
00031   TO_ROW *row;                   //current row
00032   BLOBNBOX_IT blob_it;           //iterator
00033   TBOX blob_box;
00034   TBOX prev_blob_box;
00035   inT16 gap_width;
00036   inT16 start_of_row;
00037   inT16 end_of_row;
00038   STATS xht_stats (0, 128);
00039   inT16 min_quantum;
00040   inT16 max_quantum;
00041   inT16 i;
00042 
00043   row_it.set_to_list (block->get_rows ());
00044   /*
00045     Find left and right extremes and bucket size
00046   */
00047   map = NULL;
00048   min_left = MAX_INT16;
00049   max_right = -MAX_INT16;
00050   total_rows = 0;
00051   any_tabs = FALSE;
00052   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00053     row = row_it.data ();
00054     if (!row->blob_list ()->empty ()) {
00055       total_rows++;
00056       xht_stats.add ((inT16) floor (row->xheight + 0.5), 1);
00057       blob_it.set_to_list (row->blob_list ());
00058       start_of_row = blob_it.data ()->bounding_box ().left ();
00059       end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00060       if (min_left > start_of_row)
00061         min_left = start_of_row;
00062       if (max_right < end_of_row)
00063         max_right = end_of_row;
00064     }
00065   }
00066   if ((total_rows < 3) || (min_left >= max_right)) {
00067     total_rows = 0;
00068     min_left = max_right = 0;
00069     return;
00070   }
00071   bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2;
00072   map_max = (max_right - min_left) / bucket_size;
00073   map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16));
00074   for (i = 0; i <= map_max; i++)
00075     map[i] = 0;
00076 
00077   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00078     row = row_it.data ();
00079     if (!row->blob_list ()->empty ()) {
00080       blob_it.set_to_list (row->blob_list ());
00081       blob_it.mark_cycle_pt ();
00082       blob_box = box_next (&blob_it);
00083       prev_blob_box = blob_box;
00084       if (gapmap_use_ends) {
00085         /* Leading space */
00086         gap_width = blob_box.left () - min_left;
00087         if ((gap_width > gapmap_big_gaps * row->xheight)
00088         && gap_width > 2) {
00089           max_quantum = (blob_box.left () - min_left) / bucket_size;
00090           for (i = 0; i <= max_quantum; i++)
00091             map[i]++;
00092         }
00093       }
00094       while (!blob_it.cycled_list ()) {
00095         blob_box = box_next (&blob_it);
00096         gap_width = blob_box.left () - prev_blob_box.right ();
00097         if ((gap_width > gapmap_big_gaps * row->xheight)
00098         && gap_width > 2) {
00099           min_quantum =
00100             (prev_blob_box.right () - min_left) / bucket_size;
00101           max_quantum = (blob_box.left () - min_left) / bucket_size;
00102           for (i = min_quantum; i <= max_quantum; i++)
00103             map[i]++;
00104         }
00105         prev_blob_box = blob_box;
00106       }
00107       if (gapmap_use_ends) {
00108         /* Trailing space */
00109         gap_width = max_right - prev_blob_box.right ();
00110         if ((gap_width > gapmap_big_gaps * row->xheight)
00111         && gap_width > 2) {
00112           min_quantum =
00113             (prev_blob_box.right () - min_left) / bucket_size;
00114           for (i = min_quantum; i <= map_max; i++)
00115             map[i]++;
00116         }
00117       }
00118     }
00119   }
00120   for (i = 0; i <= map_max; i++) {
00121     if (map[i] > total_rows / 2) {
00122       if (gapmap_no_isolated_quanta &&
00123         (((i == 0) &&
00124         (map[i + 1] <= total_rows / 2)) ||
00125         ((i == map_max) &&
00126         (map[i - 1] <= total_rows / 2)) ||
00127         ((i > 0) &&
00128         (i < map_max) &&
00129         (map[i - 1] <= total_rows / 2) &&
00130       (map[i + 1] <= total_rows / 2)))) {
00131         map[i] = 0;              //prevent isolated quantum
00132       }
00133       else
00134         any_tabs = TRUE;
00135     }
00136   }
00137   if (gapmap_debug && any_tabs)
00138     tprintf ("Table found\n");
00139 }
00140 
00141 
00142 /*************************************************************************
00143  * GAPMAP::table_gap()
00144  * Is there a bucket in the specified range where more than half the rows in the
00145  * block have a wide gap?
00146  *************************************************************************/
00147 
00148 BOOL8 GAPMAP::table_gap(             //Is gap a table?
00149                         inT16 left,  //From here
00150                         inT16 right  //To here
00151                        ) {
00152   inT16 min_quantum;
00153   inT16 max_quantum;
00154   inT16 i;
00155   BOOL8 tab_found = FALSE;
00156 
00157   if (!any_tabs)
00158     return FALSE;
00159 
00160   min_quantum = (left - min_left) / bucket_size;
00161   max_quantum = (right - min_left) / bucket_size;
00162   for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
00163     if (map[i] > total_rows / 2)
00164       tab_found = TRUE;
00165   return tab_found;
00166 }