// Tesseract 3.02
00001 #include "mfcpch.h" 00002 #include "statistc.h" 00003 #include "gap_map.h" 00004 00005 #define EXTERN 00006 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables"); 00007 EXTERN BOOL_VAR (gapmap_use_ends, FALSE, 00008 "Use large space at start and end of rows"); 00009 EXTERN BOOL_VAR (gapmap_no_isolated_quanta, FALSE, 00010 "Ensure gaps not less than 2quanta wide"); 00011 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier"); 00012 00013 /************************************************************************* 00014 * A block gap map is a quantised histogram of whitespace regions in the 00015 * block. It is a vertical projection of wide gaps WITHIN lines 00016 * 00017 * The map is held as an array of counts of rows which have a wide gap 00018 * covering that region of the row. Each bucket in the map represents a width 00019 * of about half an xheight - (The median of the xhts in the rows is used.) 00020 * 00021 * The block is considered RECTANGULAR - delimited by the left and right 00022 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are 00023 * counted. 
00024 * 00025 *************************************************************************/ 00026 00027 GAPMAP::GAPMAP( //Constructor 00028 TO_BLOCK *block //block 00029 ) { 00030 TO_ROW_IT row_it; //row iterator 00031 TO_ROW *row; //current row 00032 BLOBNBOX_IT blob_it; //iterator 00033 TBOX blob_box; 00034 TBOX prev_blob_box; 00035 inT16 gap_width; 00036 inT16 start_of_row; 00037 inT16 end_of_row; 00038 STATS xht_stats (0, 128); 00039 inT16 min_quantum; 00040 inT16 max_quantum; 00041 inT16 i; 00042 00043 row_it.set_to_list (block->get_rows ()); 00044 /* 00045 Find left and right extremes and bucket size 00046 */ 00047 map = NULL; 00048 min_left = MAX_INT16; 00049 max_right = -MAX_INT16; 00050 total_rows = 0; 00051 any_tabs = FALSE; 00052 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00053 row = row_it.data (); 00054 if (!row->blob_list ()->empty ()) { 00055 total_rows++; 00056 xht_stats.add ((inT16) floor (row->xheight + 0.5), 1); 00057 blob_it.set_to_list (row->blob_list ()); 00058 start_of_row = blob_it.data ()->bounding_box ().left (); 00059 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00060 if (min_left > start_of_row) 00061 min_left = start_of_row; 00062 if (max_right < end_of_row) 00063 max_right = end_of_row; 00064 } 00065 } 00066 if ((total_rows < 3) || (min_left >= max_right)) { 00067 total_rows = 0; 00068 min_left = max_right = 0; 00069 return; 00070 } 00071 bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2; 00072 map_max = (max_right - min_left) / bucket_size; 00073 map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16)); 00074 for (i = 0; i <= map_max; i++) 00075 map[i] = 0; 00076 00077 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00078 row = row_it.data (); 00079 if (!row->blob_list ()->empty ()) { 00080 blob_it.set_to_list (row->blob_list ()); 00081 blob_it.mark_cycle_pt (); 00082 blob_box = box_next (&blob_it); 00083 prev_blob_box = blob_box; 00084 if 
(gapmap_use_ends) { 00085 /* Leading space */ 00086 gap_width = blob_box.left () - min_left; 00087 if ((gap_width > gapmap_big_gaps * row->xheight) 00088 && gap_width > 2) { 00089 max_quantum = (blob_box.left () - min_left) / bucket_size; 00090 for (i = 0; i <= max_quantum; i++) 00091 map[i]++; 00092 } 00093 } 00094 while (!blob_it.cycled_list ()) { 00095 blob_box = box_next (&blob_it); 00096 gap_width = blob_box.left () - prev_blob_box.right (); 00097 if ((gap_width > gapmap_big_gaps * row->xheight) 00098 && gap_width > 2) { 00099 min_quantum = 00100 (prev_blob_box.right () - min_left) / bucket_size; 00101 max_quantum = (blob_box.left () - min_left) / bucket_size; 00102 for (i = min_quantum; i <= max_quantum; i++) 00103 map[i]++; 00104 } 00105 prev_blob_box = blob_box; 00106 } 00107 if (gapmap_use_ends) { 00108 /* Trailing space */ 00109 gap_width = max_right - prev_blob_box.right (); 00110 if ((gap_width > gapmap_big_gaps * row->xheight) 00111 && gap_width > 2) { 00112 min_quantum = 00113 (prev_blob_box.right () - min_left) / bucket_size; 00114 for (i = min_quantum; i <= map_max; i++) 00115 map[i]++; 00116 } 00117 } 00118 } 00119 } 00120 for (i = 0; i <= map_max; i++) { 00121 if (map[i] > total_rows / 2) { 00122 if (gapmap_no_isolated_quanta && 00123 (((i == 0) && 00124 (map[i + 1] <= total_rows / 2)) || 00125 ((i == map_max) && 00126 (map[i - 1] <= total_rows / 2)) || 00127 ((i > 0) && 00128 (i < map_max) && 00129 (map[i - 1] <= total_rows / 2) && 00130 (map[i + 1] <= total_rows / 2)))) { 00131 map[i] = 0; //prevent isolated quantum 00132 } 00133 else 00134 any_tabs = TRUE; 00135 } 00136 } 00137 if (gapmap_debug && any_tabs) 00138 tprintf ("Table found\n"); 00139 } 00140 00141 00142 /************************************************************************* 00143 * GAPMAP::table_gap() 00144 * Is there a bucket in the specified range where more than half the rows in the 00145 * block have a wide gap? 
00146 *************************************************************************/ 00147 00148 BOOL8 GAPMAP::table_gap( //Is gap a table? 00149 inT16 left, //From here 00150 inT16 right //To here 00151 ) { 00152 inT16 min_quantum; 00153 inT16 max_quantum; 00154 inT16 i; 00155 BOOL8 tab_found = FALSE; 00156 00157 if (!any_tabs) 00158 return FALSE; 00159 00160 min_quantum = (left - min_left) / bucket_size; 00161 max_quantum = (right - min_left) / bucket_size; 00162 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) 00163 if (map[i] > total_rows / 2) 00164 tab_found = TRUE; 00165 return tab_found; 00166 }