// Tesseract 3.02
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: context.c (Formerly context.c) 00005 * Description: Context checking functions 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Thu Feb 15 11:18:24 1990 00008 * Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Experimental (Do Not Distribute) 00012 * 00013 * (c) Copyright 1990, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 00026 #include "dict.h" 00027 #include "tprintf.h" 00028 #include "unicharset.h" 00029 00030 namespace tesseract { 00031 00032 static const int kMinAbsoluteGarbageWordLength = 10; 00033 static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; 00034 00035 const int case_state_table[6][4] = { { 00036 /* 0. Begining of word */ 00037 /* P U L D */ 00038 /* -1. Error on case */ 00039 0, 1, 5, 4 00040 }, 00041 { /* 1. After initial capital */ 00042 0, 3, 2, 4 00043 }, 00044 { /* 2. After lower case */ 00045 0, -1, 2, -1 00046 }, 00047 { /* 3. After upper case */ 00048 0, 3, -1, 4 00049 }, 00050 { /* 4. After a digit */ 00051 0, -1, -1, 4 00052 }, 00053 { /* 5. 
After initial lower case */ 00054 5, -1, 2, -1 00055 }, 00056 }; 00057 00058 int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { 00059 int last_state = 0; 00060 int state = 0; 00061 int x; 00062 for (x = 0; x < word.length(); ++x) { 00063 UNICHAR_ID ch_id = word.unichar_id(x); 00064 if (unicharset.get_isupper(ch_id)) 00065 state = case_state_table[state][1]; 00066 else if (unicharset.get_islower(ch_id)) 00067 state = case_state_table[state][2]; 00068 else if (unicharset.get_isdigit(ch_id)) 00069 state = case_state_table[state][3]; 00070 else 00071 state = case_state_table[state][0]; 00072 if (state == -1) return false; 00073 last_state = state; 00074 } 00075 return state != 5; // single lower is bad 00076 } 00077 00078 bool Dict::absolute_garbage(const WERD_CHOICE &word, 00079 const UNICHARSET &unicharset) { 00080 if (word.length() < kMinAbsoluteGarbageWordLength) return false; 00081 int num_alphanum = 0; 00082 for (int x = 0; x < word.length(); ++x) { 00083 num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) || 00084 unicharset.get_isdigit(word.unichar_id(x))); 00085 } 00086 return (static_cast<float>(num_alphanum) / 00087 static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac); 00088 } 00089 00090 } // namespace tesseract