Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: boxread.cpp 00003 * Description: Read data from a box file. 00004 * Author: Ray Smith 00005 * Created: Fri Aug 24 17:47:23 PDT 2007 00006 * 00007 * (C) Copyright 2007, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "mfcpch.h" 00021 #include "boxread.h" 00022 #include <string.h> 00023 00024 #include "rect.h" 00025 #include "strngs.h" 00026 #include "tprintf.h" 00027 #include "unichar.h" 00028 00029 // Special char code used to identify multi-blob labels. 00030 static const char* kMultiBlobLabelCode = "WordStr"; 00031 00032 // Open the boxfile based on the given image filename. 00033 FILE* OpenBoxFile(const STRING& fname) { 00034 STRING filename = fname; 00035 const char *lastdot = strrchr(filename.string(), '.'); 00036 if (lastdot != NULL) 00037 filename[lastdot - filename.string()] = '\0'; 00038 00039 filename += ".box"; 00040 FILE* box_file = NULL; 00041 if (!(box_file = fopen(filename.string(), "rb"))) { 00042 CANTOPENFILE.error("read_next_box", TESSEXIT, 00043 "Cant open box file %s", 00044 filename.string()); 00045 } 00046 return box_file; 00047 } 00048 00049 // Box files are used ONLY DURING TRAINING, but by both processes of 00050 // creating tr files with tesseract, and unicharset_extractor. 00051 // ReadNextBox factors out the code to interpret a line of a box 00052 // file so that applybox and unicharset_extractor interpret the same way. 00053 // This function returns the next valid box file utf8 string and coords 00054 // and returns true, or false on eof (and closes the file). 00055 // It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks 00056 // for valid utf-8 and allows space or tab between fields. 00057 // utf8_str is set with the unichar string, and bounding box with the box. 00058 // If there are page numbers in the file, it reads them all. 00059 bool ReadNextBox(int *line_number, FILE* box_file, 00060 STRING* utf8_str, TBOX* bounding_box) { 00061 return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box); 00062 } 00063 00064 // As ReadNextBox above, but get a specific page number. (0-based) 00065 // Use -1 to read any page number. Files without page number all 00066 // read as if they are page 0. 00067 bool ReadNextBox(int target_page, int *line_number, FILE* box_file, 00068 STRING* utf8_str, TBOX* bounding_box) { 00069 int page = 0; 00070 char buff[kBoxReadBufSize]; // boxfile read buffer 00071 char *buffptr = buff; 00072 00073 while (fgets(buff, sizeof(buff) - 1, box_file)) { 00074 (*line_number)++; 00075 00076 buffptr = buff; 00077 const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr); 00078 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) 00079 buffptr += 3; // Skip unicode file designation. 00080 // Check for blank lines in box file 00081 while (*buffptr == ' ' || *buffptr == '\t') 00082 buffptr++; 00083 if (*buffptr != '\0') { 00084 if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) { 00085 tprintf("Box file format error on line %i; ignored\n", *line_number); 00086 continue; 00087 } 00088 if (target_page >= 0 && target_page != page) 00089 continue; // Not on the appropriate page. 00090 return true; // Successfully read a box. 00091 } 00092 } 00093 fclose(box_file); 00094 return false; // EOF 00095 } 00096 00097 // Parses the given box file string into a page_number, utf8_str, and 00098 // bounding_box. Returns true on a successful parse. 00099 // The box file is assumed to contain box definitions, one per line, of the 00100 // following format for blob-level boxes: 00101 // <UTF8 str> <left> <bottom> <right> <top> <page id> 00102 // and for word/line-level boxes: 00103 // WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> 00104 // See applyybox.cpp for more information. 00105 bool ParseBoxFileStr(const char* boxfile_str, int* page_number, 00106 STRING* utf8_str, TBOX* bounding_box) { 00107 *bounding_box = TBOX(); // Initialize it to empty. 00108 *utf8_str = ""; 00109 char uch[kBoxReadBufSize]; 00110 const char *buffptr = boxfile_str; 00111 // Read the unichar without messing up on Tibetan. 00112 // According to issue 253 the utf-8 surrogates 85 and A0 are treated 00113 // as whitespace by sscanf, so it is more reliable to just find 00114 // ascii space and tab. 00115 int uch_len = 0; 00116 while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' && 00117 uch_len < kBoxReadBufSize - 1) { 00118 uch[uch_len++] = *buffptr++; 00119 } 00120 uch[uch_len] = '\0'; 00121 if (*buffptr != '\0') ++buffptr; 00122 int x_min, y_min, x_max, y_max; 00123 *page_number = 0; 00124 int count = sscanf(buffptr, "%d %d %d %d %d", 00125 &x_min, &y_min, &x_max, &y_max, page_number); 00126 if (count != 5 && count != 4) { 00127 tprintf("Bad box coordinates in boxfile string!\n"); 00128 return false; 00129 } 00130 // Test for long space-delimited string label. 00131 if (strcmp(uch, kMultiBlobLabelCode) == 0 && 00132 (buffptr = strchr(buffptr, '#')) != NULL) { 00133 strncpy(uch, buffptr + 1, kBoxReadBufSize); 00134 chomp_string(uch); 00135 uch_len = strlen(uch); 00136 } 00137 // Validate UTF8 by making unichars with it. 00138 int used = 0; 00139 while (used < uch_len) { 00140 UNICHAR ch(uch + used, uch_len - used); 00141 int new_used = ch.utf8_len(); 00142 if (new_used == 0) { 00143 tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", 00144 uch + used, uch[used], used + 1); 00145 return false; 00146 } 00147 used += new_used; 00148 } 00149 *utf8_str = uch; 00150 bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max); 00151 return true; // Successfully read a box. 00152 } 00153 00154 // Creates a box file string from a unichar string, TBOX and page number. 00155 void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num, 00156 STRING* box_str) { 00157 *box_str = unichar_str; 00158 box_str->add_str_int(" ", box.left()); 00159 box_str->add_str_int(" ", box.bottom()); 00160 box_str->add_str_int(" ", box.right()); 00161 box_str->add_str_int(" ", box.top()); 00162 box_str->add_str_int(" ", page_num); 00163 } 00164