Tesseract
3.02
|
#include <tess_lang_model.h>
Public Member Functions | |
TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt) | |
~TessLangModel () | |
TessLangModEdge * | Root () |
LangModEdge ** | GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt) |
bool | IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL) |
bool | IsLeadingPunc (char_32 ch) |
bool | IsTrailingPunc (char_32 ch) |
bool | IsDigit (char_32 ch) |
void | RemoveInvalidCharacters (string *lm_str) |
Definition at line 38 of file tess_lang_model.h.
tesseract::TessLangModel::TessLangModel | ( | const string & | lm_params, |
const string & | data_file_path, | ||
bool | load_system_dawg, | ||
TessdataManager * | tessdata_manager, | ||
CubeRecoContext * | cntxt | ||
) |
Definition at line 60 of file tess_lang_model.cpp.
{
  // Keep the recognition context and cache its HasCase() answer.
  // (data_file_path is not referenced anywhere in this body.)
  cntxt_ = cntxt;
  has_case_ = cntxt_->HasCase();

  // Load the rest of the language model elements from file
  LoadLangModelElements(lm_params);

  // Load word_dawgs_ if needed: when the tessdata manager cannot seek to the
  // TESSDATA_CUBE_UNICHARSET entry, no DAWG vector is created at all.
  if (!tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
    word_dawgs_ = NULL;
    return;
  }
  word_dawgs_ = new DawgVector();

  // The system DAWG is only looked up when the caller asked for it; the
  // seek is skipped entirely otherwise.
  const bool have_system_dawg =
      load_system_dawg &&
      tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG);
  if (have_system_dawg) {
    // The last parameter to the Dawg constructor (the debug level) is set to
    // false, until Cube has a way to express its preferred debug level.
    *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(),
                                     DAWG_TYPE_WORD,
                                     cntxt_->Lang().c_str(),
                                     SYSTEM_DAWG_PERM,
                                     false);
  }
}
tesseract::TessLangModel::~TessLangModel | ( | ) | [inline] |
Definition at line 45 of file tess_lang_model.h.
{
  // Nothing to release when no word DAWG vector was ever created.
  if (word_dawgs_ == NULL) {
    return;
  }
  // delete_data_pointers() runs first, then the vector object itself is
  // deleted.
  word_dawgs_->delete_data_pointers();
  delete word_dawgs_;
}
LangModEdge ** tesseract::TessLangModel::GetEdges | ( | CharAltList * | alt_list, |
LangModEdge * | edge, | ||
int * | edge_cnt | ||
) | [virtual] |
Implements tesseract::LangModel.
Definition at line 169 of file tess_lang_model.cpp.
{
  // Returns a newly allocated array of edges fanning out of lang_mod_edge
  // (or out of the root of every DAWG when it is NULL) and stores the number
  // of valid entries in *edge_cnt. The array is allocated with new[] here;
  // NOTE(review): presumably the caller is responsible for releasing it --
  // confirm against the call sites.
  TessLangModEdge *parent_edge =
      reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
  (*edge_cnt) = 0;

  if (parent_edge != NULL) {
    // Not starting at the root: a single fan-out from the given edge.
    // Preallocate the edge buffer.
    (*edge_cnt) = max_edge_;
    LangModEdge **child_edges = new LangModEdge *[(*edge_cnt)];
    // NOTE(review): a throwing operator new never yields NULL, so this check
    // only matters with a non-throwing allocator -- confirm build settings.
    if (child_edges == NULL) {
      return NULL;
    }
    (*edge_cnt) = FanOut(alt_list,
                         parent_edge->GetDawg(),
                         parent_edge->EndEdge(),
                         parent_edge->EdgeMask(),
                         parent_edge->EdgeString(),
                         false,
                         child_edges);
    return child_edges;
  }

  // Starting from the root: instantiate every DAWG and collect all the edges
  // that emerge from its root.
  const int dawg_total = NumDawgs();

  // Preallocate the edge buffer.
  // NOTE(review): the buffer holds dawg_total * max_edge_ entries, while the
  // number and OOD fan-outs below are appended on top of the word DAWGs --
  // confirm that FanOut's output bound makes this large enough.
  (*edge_cnt) = dawg_total * max_edge_;
  LangModEdge **root_edges = new LangModEdge *[(*edge_cnt)];
  if (root_edges == NULL) {
    return NULL;
  }

  (*edge_cnt) = 0;
  for (int dawg_pos = 0; dawg_pos < dawg_total; dawg_pos++) {
    const Dawg *dawg = GetDawg(dawg_pos);
    // Only look through word Dawgs (since there is a special way of
    // handling numbers and punctuation).
    if (dawg->type() != DAWG_TYPE_WORD) {
      continue;
    }
    (*edge_cnt) += FanOut(alt_list, dawg, 0, 0, NULL, true,
                          root_edges + (*edge_cnt));
  }

  // Number DAWG edges follow the word DAWG edges.
  (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
                        root_edges + (*edge_cnt));

  // OOD: it is intentionally not added to the list to make sure it comes
  // at the end
  (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
                        root_edges + (*edge_cnt));

  // Set the root flag for all root edges.
  const int root_total = (*edge_cnt);
  for (int pos = 0; pos < root_total; pos++) {
    root_edges[pos]->SetRoot(true);
  }
  return root_edges;
}
bool tesseract::TessLangModel::IsDigit | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 162 of file tess_lang_model.cpp.
{
  // ch is reported as a digit exactly when it occurs in digits_.
  const bool found = (digits_.find(ch) != string::npos);
  return found;
}
bool tesseract::TessLangModel::IsLeadingPunc | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 154 of file tess_lang_model.cpp.
{
  // ch is reported as leading punctuation exactly when it occurs in
  // lead_punc_.
  const bool found = (lead_punc_.find(ch) != string::npos);
  return found;
}
bool tesseract::TessLangModel::IsTrailingPunc | ( | char_32 | ch | ) | [virtual] |
Implements tesseract::LangModel.
Definition at line 158 of file tess_lang_model.cpp.
{
  // ch is reported as trailing punctuation exactly when it occurs in
  // trail_punc_.
  const bool found = (trail_punc_.find(ch) != string::npos);
  return found;
}
bool tesseract::TessLangModel::IsValidSequence | ( | const char_32 * | sequence, |
bool | eow_flag, | ||
LangModEdge ** | final_edge = NULL | ||
) | [virtual] |
Implements tesseract::LangModel.
Definition at line 145 of file tess_lang_model.cpp.
{
  // Reset the caller's output edge (when one was supplied) before
  // delegating.
  if (final_edge != NULL) {
    *final_edge = NULL;
  }
  // Forward to the four-argument overload, passing NULL as its first
  // argument.
  const bool is_valid = IsValidSequence(NULL, sequence, eow_flag, final_edge);
  return is_valid;
}
void tesseract::TessLangModel::RemoveInvalidCharacters | ( | string * | lm_str | ) |
Definition at line 482 of file tess_lang_model.cpp.
{
  // Rewrites *lm_str in place, dropping every character whose class ID in
  // the context's character set is INVALID_UNICHAR_ID. The string is left
  // untouched when nothing had to be dropped.
  CharSet *char_set = cntxt_->CharacterSet();

  // Convert to UTF-32 and filter element by element.
  tesseract::string_32 wide_str;
  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &wide_str);
  const int wide_len = CubeUtils::StrLen(wide_str.c_str());

  // Scratch buffer with room for every character plus the terminator.
  char_32 *kept = new char_32[wide_len + 1];
  if (kept == NULL) {
    return;
  }

  int kept_len = 0;
  for (int pos = 0; pos < wide_len; ++pos) {
    const char_32 ch = static_cast<char_32>(wide_str[pos]);
    if (char_set->ClassID(ch) == INVALID_UNICHAR_ID) {
      continue;  // not in the character set: skip it
    }
    kept[kept_len] = ch;
    ++kept_len;
  }
  kept[kept_len] = 0;

  // Only rebuild the UTF-8 string when at least one character was removed.
  if (kept_len < wide_len) {
    lm_str->clear();
    CubeUtils::UTF32ToUTF8(kept, lm_str);
  }
  delete [] kept;
}
TessLangModEdge* tesseract::TessLangModel::Root | ( | ) | [inline, virtual] |