#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"

Functions
UNICHAR_ID	wc_to_unichar_id (const UNICHARSET &unicharset, int wc)
void	set_properties (UNICHARSET *unicharset, const char *const c_string)
int	main (int argc, char **argv)

Function Documentation

int main	(	int	argc,
		char **	argv
	)

---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------

Definition at line 102 of file unicharset_extractor.cpp.

                                {
  // Entry point: builds a unicharset from the box files named on the command
  // line and saves it as kUnicharsetFileName inside the output directory
  // ("." unless overridden with -D).
  // Returns 0 on success, 1 when no arguments are given (after printing the
  // usage line), and -1 when a box file cannot be opened or the unicharset
  // cannot be saved.
  int option;
  const char* output_directory = ".";
  STRING unicharset_file_name;
  UNICHARSET unicharset;

  setlocale(LC_ALL, "");
  // Space character needed to represent NIL classification
  unicharset.unichar_insert(" ");

  // Print usage
  if (argc <= 1) {
    printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
    // Return rather than exit() so that the locals above are destroyed the
    // same way as on every other exit path; the exit status is unchanged.
    return 1;
  }

  // Parse arguments
  // NOTE(review): only 'D' is handled; any other value returned by tessopt
  // falls through the switch without a diagnostic.
  while ((option = tessopt(argc, argv, "D" )) != EOF) {
    switch (option) {
      case 'D':
        output_directory = tessoptarg;
        // Step over the directory argument that was just consumed.
        ++tessoptind;
        break;
    }
  }

  // Save file name
  unicharset_file_name = output_directory;
  unicharset_file_name += "/";
  unicharset_file_name += kUnicharsetFileName;

  // Load box files
  for (; tessoptind < argc; ++tessoptind) {
    printf("Extracting unicharset from %s\n", argv[tessoptind]);

    FILE* box_file = fopen(argv[tessoptind], "rb");
    if (box_file == NULL) {
      printf("Cannot open box file %s\n", argv[tessoptind]);
      return -1;
    }

    TBOX box;
    STRING unichar_string;
    int line_number = 0;
    // Insert every unichar read from the box file and record its properties.
    while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
      unicharset.unichar_insert(unichar_string.string());
      set_properties(&unicharset, unichar_string.string());
    }

    // Release the stream before moving on to the next box file; without this
    // one FILE handle would stay open per input file.
    fclose(box_file);
  }

  // Write unicharset file
  if (unicharset.save_to_file(unicharset_file_name.string())) {
    printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
  }
  else {
    printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
    return -1;
  }
  return 0;
}

void set_properties	(	UNICHARSET *	unicharset,
		const char *const	c_string
	)

Definition at line 61 of file unicharset_extractor.cpp.

                                                                        {
  // Records the character-class properties (alpha, lower, upper, digit,
  // punctuation) and the other-case mapping of the unichar c_string in
  // *unicharset. The whole body is compiled only when USING_WCTYPE is
  // defined; otherwise this function does nothing.
#ifdef USING_WCTYPE
  // Look up the id of the unichar and make it its own other-case by default.
  const UNICHAR_ID id = unicharset->unichar_to_id(c_string);
  unicharset->set_other_case(id, id);

  // A step of zero means the string is not valid utf-8: nothing more to set.
  const int step = UNICHAR::utf8_step(c_string);
  if (step != 0) {
    // Classify using the first Unicode code point of the string.
    UNICHAR ch(c_string, step);
    const int wc = ch.first_uni();

    if (iswalpha(wc)) {
      unicharset->set_isalpha(id, 1);
      if (iswlower(wc)) {
        unicharset->set_islower(id, 1);
        // The other case of a lower-case letter is its upper-case form.
        const UNICHAR_ID upper_id = wc_to_unichar_id(*unicharset,
                                                     towupper(wc));
        unicharset->set_other_case(id, upper_id);
      }
      if (iswupper(wc)) {
        unicharset->set_isupper(id, 1);
        // The other case of an upper-case letter is its lower-case form.
        const UNICHAR_ID lower_id = wc_to_unichar_id(*unicharset,
                                                     towlower(wc));
        unicharset->set_other_case(id, lower_id);
      }
    }
    if (iswdigit(wc)) {
      unicharset->set_isdigit(id, 1);
    }
    if (iswpunct(wc)) {
      unicharset->set_ispunctuation(id, 1);
    }
  }
#endif
}

UNICHAR_ID wc_to_unichar_id	(	const UNICHARSET &	unicharset,
		int	wc
	)

Definition at line 49 of file unicharset_extractor.cpp.

                                                                  {
  // Converts the code point wc to its utf-8 string form and returns the id
  // that unicharset.unichar_to_id() reports for that string.
  char *const utf8 = UNICHAR(wc).utf8_str();
  const UNICHAR_ID result = unicharset.unichar_to_id(utf8);
  // The buffer handed back by utf8_str() is released here with delete[].
  delete[] utf8;
  return result;
}

Functions

Function Documentation