tesseract-doc/context_8cpp_source.html

00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        context.c  (Formerly context.c)
00005  * Description:  Context checking functions
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Thu Feb 15 11:18:24 1990
00008  * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Experimental (Do Not Distribute)
00012  *
00013  * (c) Copyright 1990, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025
00026 #include "dict.h"
00027 #include "tprintf.h"
00028 #include "unicharset.h"
00029
00030 namespace tesseract {
00031
namespace {

// Words with fewer unichars than this are never reported as garbage by
// Dict::absolute_garbage().
const int kMinAbsoluteGarbageWordLength = 10;

// Dict::absolute_garbage() reports a (sufficiently long) word as garbage when
// the fraction of its unichars that are alphabetic or digits is strictly
// below this value.
const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

}  // namespace
00034
// State-transition table walked by Dict::case_ok() to validate the
// capitalization pattern of a word.
//
// The row is the current state. The column is chosen by the class of the
// next character, as tested by Dict::case_ok():
//   column 0 (P): anything that is not upper case, lower case or a digit
//   column 1 (U): upper case
//   column 2 (L): lower case
//   column 3 (D): digit
// Each entry is the next state; -1 means "error on case".
const int case_state_table[6][4] = {
    /*                                   P   U   L   D  */
    /* 0. Beginning of word         */ { 0,  1,  5,  4},
    /* 1. After initial capital     */ { 0,  3,  2,  4},
    /* 2. After lower case          */ { 0, -1,  2, -1},
    /* 3. After upper case          */ { 0,  3, -1,  4},
    /* 4. After a digit             */ { 0, -1, -1,  4},
    /* 5. After initial lower case  */ { 5, -1,  2, -1},
};
00057
00058 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
00059   int last_state = 0;
00060   int state = 0;
00061   int x;
00062   for (x = 0; x < word.length(); ++x) {
00063     UNICHAR_ID ch_id = word.unichar_id(x);
00064     if (unicharset.get_isupper(ch_id))
00065       state = case_state_table[state][1];
00066     else if (unicharset.get_islower(ch_id))
00067       state = case_state_table[state][2];
00068     else if (unicharset.get_isdigit(ch_id))
00069       state = case_state_table[state][3];
00070     else
00071       state = case_state_table[state][0];
00072     if (state == -1) return false;
00073     last_state = state;
00074   }
00075   return state != 5; // single lower is bad
00076 }
00077
00078 bool Dict::absolute_garbage(const WERD_CHOICE &word,
00079                             const UNICHARSET &unicharset) {
00080   if (word.length() < kMinAbsoluteGarbageWordLength) return false;
00081   int num_alphanum = 0;
00082   for (int x = 0; x < word.length(); ++x) {
00083     num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
00084                      unicharset.get_isdigit(word.unichar_id(x)));
00085   }
00086   return (static_cast<float>(num_alphanum) /
00087           static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
00088 }
00089
00090 }  // namespace tesseract