|
Tesseract
3.02
|
#include <tess_lang_model.h>
Public Member Functions | |
| TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt) | |
| ~TessLangModel () | |
| TessLangModEdge * | Root () |
| LangModEdge ** | GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt) |
| bool | IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL) |
| bool | IsLeadingPunc (char_32 ch) |
| bool | IsTrailingPunc (char_32 ch) |
| bool | IsDigit (char_32 ch) |
| void | RemoveInvalidCharacters (string *lm_str) |
Definition at line 38 of file tess_lang_model.h.
| tesseract::TessLangModel::TessLangModel | ( | const string & | lm_params, |
| const string & | data_file_path, | ||
| bool | load_system_dawg, | ||
| TessdataManager * | tessdata_manager, | ||
| CubeRecoContext * | cntxt | ||
| ) |
Definition at line 60 of file tess_lang_model.cpp.
{
  // Remember the recognition context and cache its HasCase() answer.
  cntxt_ = cntxt;
  has_case_ = cntxt_->HasCase();

  // Load the remaining language model elements described by lm_params.
  LoadLangModelElements(lm_params);

  // word_dawgs_ stays NULL unless SeekToStart(TESSDATA_CUBE_UNICHARSET)
  // succeeds on the tessdata manager.
  word_dawgs_ = NULL;
  if (!tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
    return;
  }
  word_dawgs_ = new DawgVector();

  // The system dawg is added only when the caller asked for it and the
  // manager can seek to it (the seek is skipped when it was not requested).
  const bool use_system_dawg =
      load_system_dawg &&
      tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG);
  if (use_system_dawg) {
    // The last parameter to the Dawg constructor (the debug level) is set to
    // false, until Cube has a way to express its preferred debug level.
    *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(),
                                     DAWG_TYPE_WORD,
                                     cntxt_->Lang().c_str(),
                                     SYSTEM_DAWG_PERM, false);
  }
}
| tesseract::TessLangModel::~TessLangModel | ( | ) | [inline] |
Definition at line 45 of file tess_lang_model.h.
{
  // Nothing to release when no dawg vector was ever created.
  if (word_dawgs_ == NULL) {
    return;
  }
  // Free the dawgs held by the vector first, then the vector itself.
  word_dawgs_->delete_data_pointers();
  delete word_dawgs_;
}
| LangModEdge ** tesseract::TessLangModel::GetEdges | ( | CharAltList * | alt_list, |
| LangModEdge * | edge, | ||
| int * | edge_cnt | ||
| ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 169 of file tess_lang_model.cpp.
{
  // Builds a newly allocated array with the edges that FanOut() produces for
  // alt_list, and stores the number of edges written in *edge_cnt.
  // A NULL starting edge means "start from the root": every word dawg, the
  // number dawg and the OOD dawg are fanned out and the resulting edges are
  // flagged as root edges. Otherwise only the given edge is fanned out.
  // Returns NULL (with *edge_cnt == 0) if the array could not be allocated.
  // NOTE(review): the array comes from new[]; presumably the caller releases
  // it -- confirm against the callers.
  TessLangModEdge *tess_lm_edge =
      reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
  LangModEdge **edge_array = NULL;
  (*edge_cnt) = 0;

  if (tess_lm_edge == NULL) {
    // Starting from the root: instantiate every DAWG and get all the edges
    // that emerge from the root.
    const int dawg_cnt = NumDawgs();

    // Preallocate the edge buffer. Besides the (up to) dawg_cnt word dawgs,
    // the number dawg and the OOD dawg are fanned out into the same buffer,
    // so room is reserved for dawg_cnt + 2 fan-outs. Sizing it for dawg_cnt
    // only would leave no space for the last two (e.g. when dawg_cnt is 0).
    // NOTE(review): assumes a single FanOut() call writes at most max_edge_
    // entries -- confirm against FanOut().
    const int root_capacity = (dawg_cnt + 2) * max_edge_;
    edge_array = new LangModEdge *[root_capacity];
    if (edge_array == NULL) {
      return NULL;  // *edge_cnt is still 0 here
    }

    for (int dawg_idx = 0; dawg_idx < dawg_cnt; dawg_idx++) {
      const Dawg *curr_dawg = GetDawg(dawg_idx);
      // Only look through word Dawgs (since there is a special way of
      // handling numbers and punctuation).
      if (curr_dawg->type() == DAWG_TYPE_WORD) {
        (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true,
                              edge_array + (*edge_cnt));
      }
    }  // dawg

    (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
                          edge_array + (*edge_cnt));

    // OOD: it is intentionally not added to the list to make sure it comes
    // at the end
    (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
                          edge_array + (*edge_cnt));

    // set the root flag for all root edges
    for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
      edge_array[edge_idx]->SetRoot(true);
    }
  } else {  // not starting at the root
    // Preallocate room for a single fan-out.
    edge_array = new LangModEdge *[max_edge_];
    if (edge_array == NULL) {
      return NULL;  // *edge_cnt is still 0 here
    }
    // Fan out from the given edge, within that edge's own dawg.
    (*edge_cnt) = FanOut(alt_list,
                         tess_lm_edge->GetDawg(),
                         tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(),
                         tess_lm_edge->EdgeString(), false, edge_array);
  }
  return edge_array;
}
| bool tesseract::TessLangModel::IsDigit | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 162 of file tess_lang_model.cpp.
{
  // ch counts as a digit when it occurs anywhere in digits_.
  const bool is_digit = (digits_.find(ch) != string::npos);
  return is_digit;
}
| bool tesseract::TessLangModel::IsLeadingPunc | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 154 of file tess_lang_model.cpp.
{
  // ch is leading punctuation when it occurs anywhere in lead_punc_.
  const bool is_lead_punc = (lead_punc_.find(ch) != string::npos);
  return is_lead_punc;
}
| bool tesseract::TessLangModel::IsTrailingPunc | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 158 of file tess_lang_model.cpp.
{
  // ch is trailing punctuation when it occurs anywhere in trail_punc_.
  const bool is_trail_punc = (trail_punc_.find(ch) != string::npos);
  return is_trail_punc;
}
| bool tesseract::TessLangModel::IsValidSequence | ( | const char_32 * | sequence, |
| bool | eow_flag, | ||
| LangModEdge ** | final_edge = NULL | ||
| ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 145 of file tess_lang_model.cpp.
{
  // Reset the optional out-parameter before delegating, then validate the
  // sequence through the overload that takes a starting edge, passing NULL
  // for that edge.
  if (final_edge) {
    *final_edge = NULL;
  }
  const bool is_valid = IsValidSequence(NULL, sequence, eow_flag, final_edge);
  return is_valid;
}
| void tesseract::TessLangModel::RemoveInvalidCharacters | ( | string * | lm_str | ) |
Definition at line 482 of file tess_lang_model.cpp.
{
  // Removes from *lm_str every character whose class ID in the context's
  // character set is INVALID_UNICHAR_ID. The string is converted to UTF-32,
  // filtered, and written back as UTF-8 only if at least one character was
  // dropped; otherwise *lm_str is left untouched. A NULL lm_str is ignored.
  if (lm_str == NULL) {
    return;
  }

  CharSet *char_set = cntxt_->CharacterSet();
  tesseract::string_32 lm_str32;
  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &lm_str32);
  const int len = CubeUtils::StrLen(lm_str32.c_str());

  // The filtered copy lives in a string_32 so its storage is released
  // automatically on every exit path; the previous new[]/delete[] pair
  // would leak if the UTF-8 conversion below threw.
  tesseract::string_32 clean_str32;
  if (len > 0) {
    clean_str32.reserve(len);
  }
  for (int i = 0; i < len; ++i) {
    const char_32 ch = static_cast<char_32>(lm_str32[i]);
    if (char_set->ClassID(ch) != INVALID_UNICHAR_ID) {
      clean_str32.push_back(ch);
    }
  }

  // Only rewrite the caller's string when something was actually removed.
  if (static_cast<int>(clean_str32.size()) < len) {
    lm_str->clear();
    CubeUtils::UTF32ToUTF8(clean_str32.c_str(), lm_str);
  }
}
| TessLangModEdge* tesseract::TessLangModel::Root | ( | ) | [inline, virtual] |