Tesseract  3.02
tesseract::TessLangModel Class Reference

#include <tess_lang_model.h>

Inheritance diagram for tesseract::TessLangModel:
tesseract::LangModel

List of all members.

Public Member Functions

 TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
 ~TessLangModel ()
TessLangModEdge * Root ()
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
bool IsLeadingPunc (char_32 ch)
bool IsTrailingPunc (char_32 ch)
bool IsDigit (char_32 ch)
void RemoveInvalidCharacters (string *lm_str)

Detailed Description

Definition at line 38 of file tess_lang_model.h.


Constructor & Destructor Documentation

tesseract::TessLangModel::TessLangModel ( const string &  lm_params,
const string &  data_file_path,
bool  load_system_dawg,
TessdataManager *  tessdata_manager,
CubeRecoContext *  cntxt 
)

Definition at line 60 of file tess_lang_model.cpp.

                                                     {
  // Remember the recognition context and cache whether it reports case.
  cntxt_ = cntxt;
  has_case_ = cntxt_->HasCase();

  // The remaining language model elements are read from the params string.
  LoadLangModelElements(lm_params);

  // word_dawgs_ is only created when the tessdata manager can seek to the
  // Cube unicharset entry; otherwise it stays NULL.
  if (!tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
    word_dawgs_ = NULL;
  } else {
    word_dawgs_ = new DawgVector();
    if (load_system_dawg) {
      if (tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG)) {
        // The final constructor argument (the debug level) is passed as
        // false until Cube has a way to express its preferred debug level.
        SquishedDawg *system_dawg =
            new SquishedDawg(tessdata_manager->GetDataFilePtr(),
                             DAWG_TYPE_WORD,
                             cntxt_->Lang().c_str(),
                             SYSTEM_DAWG_PERM, false);
        *word_dawgs_ += system_dawg;
      }
    }
  }
}
tesseract::TessLangModel::~TessLangModel ( ) [inline]

Definition at line 45 of file tess_lang_model.h.

                   {
    // Nothing to release when no DAWG vector was created.
    if (word_dawgs_ == NULL) {
      return;
    }
    // Release what the vector's entries point to, then the vector itself.
    word_dawgs_->delete_data_pointers();
    delete word_dawgs_;
  }

Member Function Documentation

LangModEdge ** tesseract::TessLangModel::GetEdges ( CharAltList *  alt_list,
LangModEdge *  edge,
int *  edge_cnt 
) [virtual]

Implements tesseract::LangModel.

Definition at line 169 of file tess_lang_model.cpp.

                                                      {
  // Builds and returns a newly allocated array of the edges that fan out
  // from lang_mod_edge, restricted by alt_list; the number of edges written
  // is stored in *edge_cnt. A NULL lang_mod_edge means "start at the root".
  // Returns NULL (with *edge_cnt == 0) if the array could not be allocated.
  // The array is allocated with new[] and handed to the caller.
  TessLangModEdge *tess_lm_edge =
      reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
  (*edge_cnt) = 0;

  if (tess_lm_edge != NULL) {
    // Not starting at the root: a single fan-out from the given edge,
    // budgeted at max_edge_ entries.
    LangModEdge **edge_array = new LangModEdge *[max_edge_];
    if (edge_array == NULL) {
      return NULL;
    }
    (*edge_cnt) = FanOut(alt_list,
                         tess_lm_edge->GetDawg(),
                         tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(),
                         tess_lm_edge->EdgeString(), false, edge_array);
    return edge_array;
  }

  // Starting from the root: fan out from the root of every word DAWG, then
  // from the number DAWG and finally from the OOD DAWG.
  const int dawg_cnt = NumDawgs();

  // The number and OOD fan-outs below are appended in addition to the
  // per-DAWG fan-outs, so reserve room for them as well. Sizing by dawg_cnt
  // alone leaves no space for them when every DAWG is a word DAWG (or when
  // there are no DAWGs at all). This assumes a single FanOut call yields at
  // most max_edge_ edges, the same budget the non-root branch uses.
  const int kExtraFanOuts = 2;
  LangModEdge **edge_array =
      new LangModEdge *[(dawg_cnt + kExtraFanOuts) * max_edge_];
  if (edge_array == NULL) {
    return NULL;
  }

  // FanOut's return value is accumulated as the number of edges appended, so
  // each call writes at the current end of the array.
  for (int dawg_idx = 0; dawg_idx < dawg_cnt; dawg_idx++) {
    const Dawg *curr_dawg = GetDawg(dawg_idx);
    // Only look through word Dawgs (since there is a special way of
    // handling numbers and punctuation).
    if (curr_dawg->type() == DAWG_TYPE_WORD) {
      (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true,
                            edge_array + (*edge_cnt));
    }
  }

  (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
                        edge_array + (*edge_cnt));

  // OOD: it is intentionally not added to the list to make sure it comes
  // at the end
  (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
                        edge_array + (*edge_cnt));

  // set the root flag for all root edges
  for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
    edge_array[edge_idx]->SetRoot(true);
  }
  return edge_array;
}
bool tesseract::TessLangModel::IsDigit ( char_32  ch) [virtual]

Implements tesseract::LangModel.

Definition at line 162 of file tess_lang_model.cpp.

                                            {
  // ch is treated as a digit exactly when it occurs in digits_.
  const bool in_digits = digits_.find(ch) != string::npos;
  return in_digits;
}
bool tesseract::TessLangModel::IsLeadingPunc ( char_32  ch) [virtual]

Implements tesseract::LangModel.

Definition at line 154 of file tess_lang_model.cpp.

                                                  {
  // ch is leading punctuation exactly when it occurs in lead_punc_.
  const bool in_lead_punc = lead_punc_.find(ch) != string::npos;
  return in_lead_punc;
}
bool tesseract::TessLangModel::IsTrailingPunc ( char_32  ch) [virtual]

Implements tesseract::LangModel.

Definition at line 158 of file tess_lang_model.cpp.

                                                   {
  // ch is trailing punctuation exactly when it occurs in trail_punc_.
  const bool in_trail_punc = trail_punc_.find(ch) != string::npos;
  return in_trail_punc;
}
bool tesseract::TessLangModel::IsValidSequence ( const char_32 *  sequence,
bool  eow_flag,
LangModEdge **  final_edge = NULL 
) [virtual]

Implements tesseract::LangModel.

Definition at line 145 of file tess_lang_model.cpp.

                                                              {
  // Clear the caller's output edge (when one was requested) before
  // delegating, then validate the sequence with a NULL first argument.
  if (final_edge) {
    *final_edge = NULL;
  }

  const bool is_valid = IsValidSequence(NULL, sequence, eow_flag, final_edge);
  return is_valid;
}
void tesseract::TessLangModel::RemoveInvalidCharacters ( string *  lm_str)

Definition at line 482 of file tess_lang_model.cpp.

                                                          {
  // Rewrites *lm_str in place, dropping every character for which the
  // context's character set returns INVALID_UNICHAR_ID. The string is left
  // untouched when nothing had to be dropped.
  CharSet *char_set = cntxt_->CharacterSet();

  // Convert to UTF-32 so the string can be filtered one char_32 at a time.
  tesseract::string_32 wide_str;
  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &wide_str);
  const int wide_len = CubeUtils::StrLen(wide_str.c_str());

  // Scratch buffer for the characters that are kept, plus the terminator.
  char_32 *kept = new char_32[wide_len + 1];
  if (kept == NULL) {
    return;
  }

  int kept_len = 0;
  for (int idx = 0; idx < wide_len; ++idx) {
    const int class_id = char_set->ClassID(static_cast<char_32>(wide_str[idx]));
    if (class_id == INVALID_UNICHAR_ID) {
      continue;
    }
    kept[kept_len++] = wide_str[idx];
  }
  kept[kept_len] = 0;

  // Only rewrite the caller's string when something was actually removed.
  if (kept_len < wide_len) {
    lm_str->clear();
    CubeUtils::UTF32ToUTF8(kept, lm_str);
  }
  delete [] kept;
}
TessLangModEdge* tesseract::TessLangModel::Root ( ) [inline, virtual]

Implements tesseract::LangModel.

Definition at line 53 of file tess_lang_model.h.

                                 {
    // This language model has no explicit root edge object; the root is
    // represented by NULL.
    TessLangModEdge *root_edge = NULL;
    return root_edge;
  }

The documentation for this class was generated from the following files: