Tesseract
3.02
|
#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"
Go to the source code of this file.
Functions | |
UNICHAR_ID | wc_to_unichar_id (const UNICHARSET &unicharset, int wc) |
void | set_properties (UNICHARSET *unicharset, const char *const c_string) |
int | main (int argc, char **argv) |
int main(int argc, char **argv)
---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------
Definition at line 102 of file unicharset_extractor.cpp.
{
  // Builds a unicharset from the box files named on the command line and
  // saves it as <output directory>/<kUnicharsetFileName>.
  //
  // Usage: unicharset_extractor [-D DIRECTORY] FILE...
  //
  // Returns 0 on success and -1 if a box file cannot be opened or the
  // unicharset cannot be saved. Calls exit(1) after printing the usage
  // message when no arguments are given.
  int option;
  const char* output_directory = ".";
  STRING unicharset_file_name;
  UNICHARSET unicharset;

  setlocale(LC_ALL, "");

  // Space character needed to represent NIL classification
  unicharset.unichar_insert(" ");

  // Print usage
  if (argc <= 1) {
    printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
    exit(1);
  }

  // Parse arguments
  // NOTE(review): the option string is "D" with no ':', and the directory is
  // read from tessoptarg with tessoptind advanced by hand -- presumably
  // tessopt leaves tessoptarg pointing at the following argument; confirm
  // against tessopt's implementation.
  while ((option = tessopt(argc, argv, "D" )) != EOF) {
    switch (option) {
      case 'D':
        output_directory = tessoptarg;
        ++tessoptind;
        break;
    }
  }

  // Save file name
  unicharset_file_name = output_directory;
  unicharset_file_name += "/";
  unicharset_file_name += kUnicharsetFileName;

  // Load box files
  for (; tessoptind < argc; ++tessoptind) {
    printf("Extracting unicharset from %s\n", argv[tessoptind]);

    FILE* box_file = fopen(argv[tessoptind], "rb");
    if (box_file == NULL) {
      printf("Cannot open box file %s\n", argv[tessoptind]);
      return -1;
    }

    TBOX box;
    STRING unichar_string;
    int line_number = 0;
    // Insert every unichar found in the box file and record its properties.
    while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
      unicharset.unichar_insert(unichar_string.string());
      set_properties(&unicharset, unichar_string.string());
    }

    // Release the stream before moving on; otherwise one FILE handle is
    // leaked per box file given on the command line.
    fclose(box_file);
  }

  // Write unicharset file
  if (unicharset.save_to_file(unicharset_file_name.string())) {
    printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
  } else {
    printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
    return -1;
  }
  return 0;
}
void set_properties(UNICHARSET *unicharset, const char *const c_string)
Definition at line 61 of file unicharset_extractor.cpp.
{
#ifdef USING_WCTYPE
  // Copies the wide-character class of the first code point of c_string
  // (alpha / lower / upper / digit / punctuation, plus other-case mapping)
  // onto the matching entry of *unicharset. Compiles to an empty function
  // when USING_WCTYPE is not defined.

  // Look up the unichar id for the string; until proven otherwise a unichar
  // is its own other case.
  const UNICHAR_ID unichar_id = unicharset->unichar_to_id(c_string);
  unicharset->set_other_case(unichar_id, unichar_id);

  // A step of 0 means the string is not valid UTF-8: nothing more to set.
  const int utf8_len = UNICHAR::utf8_step(c_string);
  if (utf8_len == 0) {
    return;
  }

  // Get the next Unicode code point in the string.
  UNICHAR first_char(c_string, utf8_len);
  const int code_point = first_char.first_uni();

  // Alphabetic characters also get their case flags and other-case link.
  if (iswalpha(code_point)) {
    unicharset->set_isalpha(unichar_id, 1);

    if (iswlower(code_point)) {
      unicharset->set_islower(unichar_id, 1);
      const UNICHAR_ID upper_id =
          wc_to_unichar_id(*unicharset, towupper(code_point));
      unicharset->set_other_case(unichar_id, upper_id);
    }

    if (iswupper(code_point)) {
      unicharset->set_isupper(unichar_id, 1);
      const UNICHAR_ID lower_id =
          wc_to_unichar_id(*unicharset, towlower(code_point));
      unicharset->set_other_case(unichar_id, lower_id);
    }
  }

  if (iswdigit(code_point)) {
    unicharset->set_isdigit(unichar_id, 1);
  }

  if (iswpunct(code_point)) {
    unicharset->set_ispunctuation(unichar_id, 1);
  }
#endif
}
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc)
Definition at line 49 of file unicharset_extractor.cpp.
{
  // Maps the code point wc to its id in unicharset by way of its UTF-8
  // encoding; returns whatever unichar_to_id() yields for that string.
  UNICHAR code_point(wc);
  char *utf8 = code_point.utf8_str();

  const UNICHAR_ID id = unicharset.unichar_to_id(utf8);

  // The buffer obtained from utf8_str() is released here with delete[].
  delete[] utf8;
  return id;
}