Classes
struct	TESS_CHAR
class	TessBaseAPI
class	CubeRecoContext
class	CubeClassifier
class	CubeTessClassifier
struct	DocQualCallbacks
class	EquationDetect
class	LTRResultIterator
class	ChoiceIterator
class	MutableIterator
class	PageIterator
class	UnicodeSpanSkipper
struct	Cluster
class	SimpleClusterer
struct	GeometricClassifierState
struct	Interval
class	RowInfo
struct	LineHypothesis
class	RowScratchRegisters
class	ParagraphTheory
class	ParagraphModelSmearer
class	ResultIterator
class	TesseractCubeCombiner
struct	TesseractStats
class	Tesseract
class	ImageThresholder
class	BoxWord
class	CCStruct
class	DetLineFit
class	DPPoint
struct	FontSpacingInfo
struct	FontInfo
struct	FontSet
struct	ParamsTrainingHypothesis
class	ParamsTrainingBundle
class	UnicharIdArrayUtils
class	AmbigSpec
class	UnicharAmbigs
class	BitVector
class	CCUtilMutex
class	CCUtil
class	PointerVector
class	IndexMap
class	IndexMapBiDi
struct	ParamsVectors
class	ParamUtils
class	Param
class	IntParam
class	BoolParam
class	StringParam
class	DoubleParam
class	TessdataManager
class	Classify
class	ErrorCounter
class	IntFeatureDist
class	IntFeatureMap
class	IntFeatureSpace
class	ClassPruner
struct	ShapeDist
class	MasterTrainer
class	SampleIterator
struct	ShapeRating
class	ShapeClassifier
struct	UnicharAndFonts
class	Shape
class	ShapeTable
class	TessClassifier
class	TrainingSample
class	TrainingSampleSet
class	AltList
class	BeamSearch
class	Bmp8
class	CachedFile
class	CharAltList
struct	Bigram
struct	CharBigram
struct	CharBigramTable
class	CharBigrams
class	CharSamp
class	CharSampEnum
class	CharSampSet
class	CharSet
class	CharClassifier
class	CharClassifierFactory
class	ConCompPt
class	ConComp
class	ConvNetCharClassifier
class	CubeLineObject
class	CubeLineSegmenter
class	CubeObject
class	CubeSearchObject
class	CubeTuningParams
class	CubeUtils
class	FeatureBase
class	FeatureBmp
class	FeatureChebyshev
class	FeatureHybrid
class	HybridNeuralNetCharClassifier
class	LangModEdge
class	LangModel
class	SearchColumn
class	SearchNode
class	SearchNodeHashTable
class	SearchObject
class	TessLangModEdge
class	TessLangModel
class	TuningParams
class	WordAltList
class	WordListLangModel
struct	PairSizeInfo
struct	FontPairSizeInfo
class	WordSizeModel
class	WordUnigrams
class	CUtil
struct	NodeChild
class	Dawg
struct	DawgInfo
class	DawgInfoVector
class	SquishedDawg
struct	DawgArgs
class	Dict
class	PermuterState
class	Trie
class	Image
class	InputFileBuffer
class	NeuralNet
class	Neuron
struct	AlignedBlobParams
class	AlignedBlob
class	GridBase
class	IntGrid
class	BBGrid
class	GridSearch
class	TabEventHandler
class	BlobGrid
class	CCNonTextDetect
class	ColumnFinder
class	ColPartition
class	ColPartitionGrid
class	ColPartitionSet
class	PixelHistogram
class	ShiroRekhaSplitter
class	EquationDetectBase
class	ImageFind
class	LineFinder
class	StrokeWidth
class	TabFind
class	ColSegment
class	TableFinder
class	StructuredTable
class	TableRecognizer
class	TabConstraint
class	TabVector
class	TextlineProjection
class	Textord
class	WorkingPartSet
struct	AssociateStats
class	AssociateUtils
struct	LanguageModelConsistencyInfo
struct	LanguageModelDawgInfo
struct	LanguageModelNgramInfo
struct	ViterbiStateEntry
struct	LanguageModelState
struct	BestChoiceBundle
struct	BestPathByColumn
class	LanguageModel
struct	MATCH
class	BlobMatchTable
class	FRAGMENT
class	Wordrec
Typedefs
typedef int(Dict::*	DictFunc )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
typedef double(Dict::*	ProbabilityInContextFunc )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
typedef void(Wordrec::*	FillLatticeFunc )(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
typedef TessCallback3< const UNICHARSET &, int, PAGE_RES * >	TruthCallback
typedef GenericVectorEqEq < const ParagraphModel * >	SetOfModels
typedef void(Tesseract::*	WordRecognizer )(BLOCK *block, ROW *row, WERD_RES *word)
typedef GenericVector < ParamsTrainingHypothesis >	ParamsTrainingHypothesisList
typedef GenericVector< UNICHAR_ID >	UnicharIdVector
typedef GenericVector < AmbigSpec_LIST * >	UnicharAmbigsVector
typedef signed int	char_32
typedef basic_string< char_32 >	string_32
typedef GenericVector< NodeChild >	NodeChildVector
typedef GenericVector< int >	SuccessorList
typedef GenericVector < SuccessorList * >	SuccessorListsVector
typedef GenericVector< Dawg * >	DawgVector
typedef GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT >	BlobGridSearch
typedef GridSearch < ColPartition, ColPartition_CLIST, ColPartition_C_IT >	ColPartitionGridSearch
typedef GenericVector < ColPartitionSet * >	PartSetVector
typedef TessResultCallback1 < bool, int >	WidthCallback
typedef BBGrid< ColSegment, ColSegment_CLIST, ColSegment_C_IT >	ColSegmentGrid
typedef GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT >	ColSegmentGridSearch
typedef unsigned char	LanguageModelFlagsType
Enumerations
enum	LineType { LT_START = 'S', LT_BODY = 'C', LT_UNKNOWN = 'U', LT_MULTIPLE = 'M' }
enum	CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT }
enum	ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }
enum	NormalizationMode { NM_BASELINE = -3, NM_CHAR_ISOTROPIC = -2, NM_CHAR_ANISOTROPIC = -1 }
enum	ParamsTrainingRawFeatureType { PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE, PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH, PTRAIN_RAW_FEATURE_SHAPE_COST, PTRAIN_RAW_FEATURE_NGRAM_PROB, PTRAIN_RAW_FEATURE_NUM_BAD_PUNC, PTRAIN_RAW_FEATURE_NUM_BAD_CASE, PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE, PTRAIN_RAW_FEATURE_NUM_BAD_SPACING, PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT, PTRAIN_RAW_FEATURE_NUM_BAD_FONT, PTRAIN_RAW_FEATURE_WORST_CERT, PTRAIN_RAW_FEATURE_RATING, PTRAIN_RAW_FEATURE_ADAPTED, PTRAIN_RAW_FEATURE_NUM_UNICHARS, PTRAIN_RAW_FEATURE_OUTLINE_LEN, PTRAIN_NUM_RAW_FEATURE_TYPES }
enum	Orientation { ORIENTATION_PAGE_UP = 0, ORIENTATION_PAGE_RIGHT = 1, ORIENTATION_PAGE_DOWN = 2, ORIENTATION_PAGE_LEFT = 3 }
enum	WritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT = 0, WRITING_DIRECTION_RIGHT_TO_LEFT = 1, WRITING_DIRECTION_TOP_TO_BOTTOM = 2 }
enum	TextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, TEXTLINE_ORDER_TOP_TO_BOTTOM = 2 }
enum	PageSegMode { PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO, PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT, PSM_SINGLE_BLOCK, PSM_SINGLE_LINE, PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_COUNT }
enum	PageIteratorLevel { RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD, RIL_SYMBOL }
enum	ParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT }
enum	OcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT }
enum	AmbigType { NOT_AMBIG, REPLACE_AMBIG, DEFINITE_AMBIG, SIMILAR_AMBIG, CASE_AMBIG, AMBIG_TYPE_COUNT }
enum	SetParamConstraint { SET_PARAM_CONSTRAINT_NONE, SET_PARAM_CONSTRAINT_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_INIT_ONLY }
enum	TessdataType { TESSDATA_LANG_CONFIG, TESSDATA_UNICHARSET, TESSDATA_AMBIGS, TESSDATA_INTTEMP, TESSDATA_PFFMTABLE, TESSDATA_NORMPROTO, TESSDATA_PUNC_DAWG, TESSDATA_SYSTEM_DAWG, TESSDATA_NUMBER_DAWG, TESSDATA_FREQ_DAWG, TESSDATA_FIXED_LENGTH_DAWGS, TESSDATA_CUBE_UNICHARSET, TESSDATA_CUBE_SYSTEM_DAWG, TESSDATA_SHAPE_TABLE, TESSDATA_BIGRAM_DAWG, TESSDATA_UNAMBIG_DAWG, TESSDATA_PARAMS_TRAINING_MODEL, TESSDATA_NUM_ENTRIES }
enum	CharSegmentationType { CST_FRAGMENT, CST_WHOLE, CST_IMPROPER, CST_NGRAM }
enum	CountTypes { CT_SHAPE_TOP_CORRECT, CT_SHAPE_TOP_ERR, CT_FONT_ATTR_ERR, CT_UNICHAR_TOP1_ERR, CT_UNICHAR_TOP2_ERR, CT_UNICHAR_TOPN_ERR, CT_OK_MULTI_UNICHAR, CT_REJECT, CT_NUM_RESULTS, CT_RANK, CT_REJECTED_JUNK, CT_ACCEPTED_JUNK, CT_SIZE }
enum	DawgType { DAWG_TYPE_PUNCTUATION, DAWG_TYPE_WORD, DAWG_TYPE_NUMBER, DAWG_TYPE_PATTERN, DAWG_TYPE_COUNT }
enum	ColumnSpanningType { CST_NOISE, CST_FLOWING, CST_HEADING, CST_PULLOUT, CST_COUNT }
enum	NeighbourPartitionType { NPT_HTEXT, NPT_VTEXT, NPT_WEAK_HTEXT, NPT_WEAK_VTEXT, NPT_IMAGE, NPT_COUNT }
enum	LeftOrRight { LR_LEFT, LR_RIGHT }
enum	ColSegType { COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED, COL_COUNT }
enum	TabAlignment { TA_LEFT_ALIGNED, TA_LEFT_RAGGED, TA_CENTER_JUSTIFIED, TA_RIGHT_ALIGNED, TA_RIGHT_RAGGED, TA_SEPARATOR, TA_COUNT }
Functions
int	CubeAPITest (Boxa *boxa_blocks, Pixa *pixa_blocks, Boxa *boxa_words, Pixa *pixa_words, const FCOORD &reskew, Pix *page_pix, PAGE_RES *page_res)
TBLOB *	make_tesseract_blob (float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix)
TBOX	char_box_to_tbox (Box *char_box, TBOX word_box, int x_offset)
bool	IsTextOrEquationType (PolyBlockType type)
bool	IsLeftIndented (const EquationDetect::IndentType type)
bool	IsRightIndented (const EquationDetect::IndentType type)
template<typename T >
void	SimpleSwap (T &a, T &b)
STRING	RtlEmbed (const STRING &word, bool rtlify)
bool	IsLatinLetter (int ch)
bool	IsDigitLike (int ch)
bool	IsOpeningPunct (int ch)
bool	IsTerminalPunct (int ch)
const char *	SkipChars (const char *str, const char *toskip)
const char *	SkipChars (const char *str, bool(*skip)(int))
const char *	SkipOne (const char *str, const char *toskip)
bool	LikelyListNumeral (const STRING &word)
bool	LikelyListMark (const STRING &word)
bool	AsciiLikelyListItem (const STRING &word)
int	UnicodeFor (const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
bool	LikelyListMarkUnicode (int ch)
bool	UniLikelyListItem (const UNICHARSET *u, const WERD_CHOICE *werd)
void	LeftWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void	RightWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
int	ClosestCluster (const GenericVector< Cluster > &clusters, int value)
void	CalculateTabStops (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
void	MarkRowsWithModel (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
void	GeometricClassifyThreeTabStopTextBlock (int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
void	GeometricClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool	ValidFirstLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool	ValidBodyLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool	CrownCompatible (const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
void	DiscardUnusedModels (const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
void	DowngradeWeakestToCrowns (int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
void	RecomputeMarginsAndClearHypotheses (GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
int	InterwordSpace (const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
bool	FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool	FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool	TextSupportsBreak (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool	LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool	LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j)
ParagraphModel	InternalParagraphModelByOutline (const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
ParagraphModel	ParagraphModelByOutline (int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
bool	RowsFitModel (const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void	MarkStrongEvidence (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
void	ModelStrongEvidence (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
void	StrongEvidenceClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void	SeparateSimpleLeaderLines (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void	ConvertHypothesizedModelRunsToParagraphs (int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
bool	RowIsStranded (const GenericVector< RowScratchRegisters > &rows, int row)
void	LeftoverSegments (const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
void	CanonicalizeDetectionResults (GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
void	DetectParagraphs (int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
void	InitializeRowInfo (const MutableIterator &it, RowInfo *info)
void	DetectParagraphs (int debug_level, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models)
bool	StrongModel (const ParagraphModel *model)
bool	read_t (PAGE_RES_IT *page_res_it, TBOX *tbox)
ICOORD	ComputeEndFromGradient (const ICOORD &start, double m)
bool	CompareFontInfo (const FontInfo &fi1, const FontInfo &fi2)
bool	CompareFontSet (const FontSet &fs1, const FontSet &fs2)
void	FontInfoDeleteCallback (FontInfo f)
void	FontSetDeleteCallback (FontSet fs)
bool	read_info (FILE *f, FontInfo *fi, bool swap)
bool	write_info (FILE *f, const FontInfo &fi)
bool	read_spacing_info (FILE *f, FontInfo *fi, bool swap)
bool	write_spacing_info (FILE *f, const FontInfo &fi)
bool	read_set (FILE *f, FontSet *fs, bool swap)
bool	write_set (FILE *f, const FontSet &fs)
void	OtsuThreshold (const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height, int **thresholds, int **hi_values)
void	HistogramRect (const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height, int *histogram)
int	OtsuStats (const int *histogram, int *H_out, int *omega0_out)
	ELISTIZE (AmbigSpec)
	ELISTIZEH (AmbigSpec)
template<typename T >
bool	cmp_eq (T const &t1, T const &t2)
template<typename T >
int	sort_cmp (const void *t1, const void *t2)
template<typename T >
int	sort_ptr_cmp (const void *t1, const void *t2)
void	ClearFeatureSpaceWindow (NORM_METHOD norm_method, ScrollView *window)
WERD_CHOICE *	get_best_delete_other (WERD_CHOICE *choice1, WERD_CHOICE *choice2)
BLOB_CHOICE *	get_nth_choice (BLOB_CHOICE_LIST *blob_list, int n)
UNICHAR_ID	get_top_choice_uid (BLOB_CHOICE_LIST *blob_list)
int	find_choice_by_uid (BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid)
WERD_CHOICE *	get_choice_from_posstr (const UNICHARSET *unicharset, const BLOB_CHOICE_LIST_VECTOR &char_choices, int start_pos, const char *pos_str, float *certainties)
void	get_posstr_from_choice (const BLOB_CHOICE_LIST_VECTOR &char_choices, WERD_CHOICE *word_choice, int start_pos, char *pos_str)
BLOB_CHOICE *	find_choice_by_type (BLOB_CHOICE_LIST *blob_choices, char target_type, const UNICHARSET &unicharset)
BLOB_CHOICE *	find_choice_by_script (BLOB_CHOICE_LIST *blob_choices, int target_sid, int backup_sid, int secondary_sid)
Pix *	GridReducedPix (const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom)
Pix *	TraceOutlineOnReducedPix (C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left, int *bottom)
Pix *	TraceBlockOnReducedPix (BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom)
template<class BBC >
int	SortByBoxLeft (const void *void1, const void *void2)
template<class BBC >
int	SortRightToLeft (const void *void1, const void *void2)
template<class BBC >
int	SortByBoxBottom (const void *void1, const void *void2)
template<typename T >
void	DeleteObject (T *object)
ShapeTable *	LoadShapeTable (const STRING &file_prefix)
void	WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
MasterTrainer *	LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
	ELISTIZE (ViterbiStateEntry)
	ELISTIZEH (ViterbiStateEntry)
template<class BLOB_CHOICE >
int	SortByUnicharID (const void *void1, const void *void2)
template<class BLOB_CHOICE >
int	SortByRating (const void *void1, const void *void2)
Variables
const int	kMinRectSize = 10
const char	kTesseractReject = '~'
const char	kUNLVReject = '~'
const char	kUNLVSuspect = '^'
const char *	kInputFile = "noname.tif"
const char *	kOldVarsFile = "failed_vars.txt"
const int	kMaxIntSize = 22
const int	kMinCredibleResolution = 70
	Minimum believable resolution.
const int	kMaxCredibleResolution = 2400
const int	kNumbersPerBlob = 5
const int	kBytesPerNumber = 5
const int	kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1
const int	kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1
const int	kBytesPer64BitNumber = 20
const int	kMaxBytesPerLine
const int	kUniChs []
const int	kLatinChs []
const float	kMathDigitDensityTh1 = 0.25
const float	kMathDigitDensityTh2 = 0.1
const float	kMathItalicDensityTh = 0.5
const float	kUnclearDensityTh = 0.25
const int	kSeedBlobsCountTh = 10
const int	kLeftIndentAlignmentCountTh = 1
const int	kMaxCharTopRange = 48
const int	kDefaultResolution = 300
	Default resolution used if input is not believable.
const int	kMaxCircleErosions = 8
const int	kStrayLinePer = 6
const ParagraphModel *	kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)
const ParagraphModel *	kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)
const inT16	kMaxBoxEdgeDiff = 2
const int	kBoxClipTolerance = 2
const int	kMinSubscriptOffset = 20
const int	kMinSuperscriptOffset = 20
const int	kMaxDropCapBottom = -128
const int	kNumEndPoints = 3
const int	kHistogramSize = 256
CCUtilMutex	tprintfMutex
const char *	kUTF8LineSeparator = "\u2028"
const char *	kUTF8ParagraphSeparator = "\u2029"
const char *	kLRM = "\u200E"
const char *	kRLM = "\u200F"
const char *	kRLE = "\u202A"
const char *	kPDF = "\u202C"
const char *	kHyphenLikeUTF8 []
const char *	kApostropheLikeUTF8 []
const int	kMaxOffsetDist = 32
const double	kMinPCLengthIncrease = 1.0 / 1024
const int	kMinClusteredShapes = 1
const int	kMaxUnicharsPerCluster = 2000
const float	kFontMergeDistance = 0.025
const float	kInfiniteDist = 999.0f
const int	kRandomizingCenter = 128
const int	kTestChar = -1
const int	kSquareLimit = 25
const int	kPrime1 = 17
const int	kPrime2 = 13
const int	kMinOutlierSamples = 5
const int	kStateCnt = 4
const int	kNumLiteralCnt = 5
const int	case_state_table [6][4]
const char	kDoNotReverse [] = "RRP_DO_NO_REVERSE"
const char	kReverseIfHasRTL [] = "RRP_REVERSE_IF_HAS_RTL"
const char	kForceReverse [] = "RRP_FORCE_REVERSE"
const char *const	RTLReversePolicyNames []
const double	kAlignedFraction = 0.03125
const double	kRaggedFraction = 2.5
const double	kAlignedGapFraction = 0.75
const double	kRaggedGapFraction = 1.0
const int	kVLineAlignment = 3
const int	kVLineGutter = 1
const int	kVLineSearchSize = 150
const int	kMinRaggedTabs = 5
const int	kMinAlignedTabs = 4
const int	kVLineMinLength = 500
const double	kMinTabGradient = 4.0
const int	kMaxSkewFactor = 15
const char *	kTextordDebugPix = "psdebug_pix"
const double	kMaxSmallNeighboursPerPix = 1.0 / 32
const int	kMaxLargeOverlapsWithSmall = 3
const int	kMaxMediumOverlapsWithSmall = 12
const int	kMaxLargeOverlapsWithMedium = 12
const int	kOriginalNoiseMultiple = 8
const int	kNoisePadding = 4
const double	kPhotoOffsetFraction = 0.375
const double	kMinGoodTextPARatio = 1.5
const int	kMinColumnWidth = 100
const int	kMaxIncompatibleColumnCount = 2
const double	kMarginOverlapFraction = 0.25
const double	kHorizontalGapMergeFraction = 0.5
const double	kMinNonNoiseFraction = 0.5
const double	kMinGutterWidthGrid = 0.5
const double	kMaxDistToPartSizeRatio = 1.5
bool	textord_tabfind_show_initial_partitions = false
bool	textord_tabfind_show_reject_blobs = false
int	textord_tabfind_show_partitions = 0
bool	textord_tabfind_show_columns = false
bool	textord_tabfind_show_blocks = false
bool	textord_tabfind_find_tables = true
const int	kMaxPartnerDepth = 4
const double	kMaxSpacingDrift = 1.0 / 72
const double	kMaxTopSpacingFraction = 0.25
const double	kMaxSameBlockLineSpacing = 3
const double	kMaxSizeRatio = 1.5
const double	kMaxLeaderGapFractionOfMax = 0.25
const double	kMaxLeaderGapFractionOfMin = 0.5
const int	kMinLeaderCount = 5
const int	kLeaderCutCost = 8
const int	kMinStrongTextValue = 6
const int	kMinChainTextValue = 3
const int	kHorzStrongTextlineCount = 8
const int	kHorzStrongTextlineHeight = 10
const int	kHorzStrongTextlineAspect = 5
const double	kMaxBaselineError = 0.4375
const double	kMinBaselineCoverage = 0.5
const int	kMaxRMSColorNoise = 128
const int	kMaxColorDistance = 900
const int	kRGBRMSColors = 4
bool	textord_tabfind_show_color_fit = false
const int	kMaxPadFactor = 6
const int	kMaxNeighbourDistFactor = 4
const int	kMaxCaptionLines = 7
const double	kMinCaptionGapRatio = 2.0
const double	kMinCaptionGapHeightRatio = 0.5
const double	kBigPartSizeRatio = 1.75
const double	kStrokeWidthFractionTolerance = 0.25
const double	kStrokeWidthConstantTolerance = 2.0
const double	kTinyEnoughTextlineOverlapFraction = 0.25
const double	kMaxPartitionSpacing = 1.75
const int	kSmoothDecisionMargin = 4
const double	kMinRectangularFraction = 0.125
const double	kMaxRectangularFraction = 0.75
const double	kMaxRectangularGradient = 0.1
const int	kMinImageFindSize = 100
const double	kRMSFitScaling = 8.0
const int	kMinColorDifference = 16
const int	kThinLineFraction = 20
	Denominator of resolution makes max pixel width to allow thin lines.
const int	kMinLineLengthFraction = 4
	Denominator of resolution makes min pixels to demand line lengths to be.
const int	kCrackSpacing = 100
	Spacing of cracks across the page to break up tall vertical lines.
const int	kLineFindGridSize = 50
	Grid size used by line finder. Not very critical.
const int	kMinThickLineWidth = 12
const int	kMaxLineResidue = 6
const double	kThickLengthMultiple = 0.75
const double	kMaxNonLineDensity = 0.25
const double	kMaxStaveHeight = 1.0
const double	kMinMusicPixelFraction = 0.75
int	textord_tabfind_show_strokewidths = 0
bool	textord_tabfind_only_strokewidths = false
bool	textord_tabfind_vertical_text = true
bool	textord_tabfind_force_vertical_text = false
bool	textord_tabfind_vertical_horizontal_mix = true
double	textord_tabfind_vertical_text_ratio = 0.5
const double	kStrokeWidthTolerance = 1.5
const double	kStrokeWidthFractionCJK = 0.25
const double	kStrokeWidthCJK = 2.0
const int	kCJKRadius = 2
const double	kCJKBrokenDistanceFraction = 0.25
const int	kCJKMaxComponents = 8
const double	kCJKAspectRatio = 1.25
const double	kCJKAspectRatioIncrease = 1.0625
const int	kMaxCJKSizeRatio = 5
const double	kBrokenCJKIterationFraction = 0.125
const double	kDiacriticXPadRatio = 7.0
const double	kDiacriticYPadRatio = 1.75
const double	kMinDiacriticSizeRatio = 1.0625
const double	kMaxDiacriticDistanceRatio = 1.25
const double	kMaxDiacriticGapToBaseCharHeight = 1.0
const int	kSearchRadius = 2
const int	kLineTrapLongest = 4
const int	kLineTrapShortest = 2
const int	kMostlyOneDirRatio = 3
const double	kLineResidueAspectRatio = 8.0
const int	kLineResiduePadRatio = 3
const double	kLineResidueSizeRatio = 1.75
const float	kSizeRatioToReject = 2.0
const int	kMaxLargeOverlaps = 3
const double	kNeighbourSearchFactor = 2.5
const int	kTabRadiusFactor = 5
const int	kMinVerticalSearch = 3
const int	kMaxVerticalSearch = 12
const int	kMaxRaggedSearch = 25
const int	kMinLinesInColumn = 10
const double	kMinFractionalLinesInColumn = 0.125
const double	kMinGutterWidthAbsolute = 0.02
const double	kMaxGutterWidthAbsolute = 2.00
const int	kRaggedGutterMultiple = 5
const double	kLineFragmentAspectRatio = 10.0
const double	kSmoothFactor = 0.25
const double	kCharVerticalOverlapFraction = 0.375
const double	kMaxHorizontalGap = 3.0
const int	kMinEvaluatedTabs = 3
const int	kMaxTextLineBlobRatio = 5
const int	kMinTextLineBlobRatio = 3
const double	kMinImageArea = 0.5
const double	kCosMaxSkewAngle = 0.866025
bool	textord_tabfind_show_initialtabs = false
bool	textord_tabfind_show_finaltabs = false
double	textord_tabfind_aligned_gap_fraction = 0.75
const int	kColumnWidthFactor = 20
const int	kMaxVerticalSpacing = 500
const int	kMaxBlobWidth = 500
const double	kSplitPartitionSize = 2.0
const double	kAllowTextHeight = 0.5
const double	kAllowTextWidth = 0.6
const double	kAllowTextArea = 0.8
const double	kAllowBlobHeight = 0.3
const double	kAllowBlobWidth = 0.4
const double	kAllowBlobArea = 0.05
const int	kMinBoxesInTextPartition = 10
const int	kMaxBoxesInDataPartition = 20
const double	kMaxGapInTextPartition = 4.0
const double	kMinMaxGapInTextPartition = 0.5
const double	kMaxBlobOverlapFactor = 4.0
const double	kMaxTableCellXheight = 2.0
const int	kMaxColumnHeaderDistance = 4
const double	kTableColumnThreshold = 3.0
const int	kRulingVerticalMargin = 3
const double	kMinOverlapWithTable = 0.6
const int	kSideSpaceMargin = 10
const double	kSmallTableProjectionThreshold = 0.35
const double	kLargeTableProjectionThreshold = 0.45
const int	kLargeTableRowCount = 6
const int	kMinRowsInTable = 3
const double	kRequiredFullJustifiedSpacing = 4.0
const int	kAdjacentLeaderSearchPadding = 2
const double	kParagraphEndingPreviousLineRatio = 1.3
const double	kMaxParagraphEndingLeftSpaceMultiple = 3.0
const double	kMinParagraphEndingTextToWhitespaceRatio = 3.0
const double	kMaxXProjectionGapFactor = 2.0
const double	kStrokeWidthFractionalTolerance = 0.25
bool	textord_dump_table_images = false
bool	textord_show_tables = false
bool	textord_tablefind_show_mark = false
bool	textord_tablefind_show_stats = false
bool	textord_tablefind_recognize_tables = false
const double	kHorizontalSpacing = 0.30
const double	kVerticalSpacing = -0.2
const int	kCellSplitRowThreshold = 0
const int	kCellSplitColumnThreshold = 0
const int	kLinedTableMinVerticalLines = 3
const int	kLinedTableMinHorizontalLines = 3
const double	kRequiredColumns = 0.7
const double	kMarginFactor = 1.1
const double	kMaxRowSize = 2.5
const double	kGoodRowNumberOfColumnsSmall [] = { 2, 2, 2, 2, 2, 3, 3 }
const int	kGoodRowNumberOfColumnsSmallSize
const double	kGoodRowNumberOfColumnsLarge = 0.7
const double	kMinFilledArea = 0.35
const int	kGutterMultiple = 4
const int	kGutterToNeighbourRatio = 3
const int	kSimilarVectorDist = 10
const int	kSimilarRaggedDist = 50
const int	kMaxFillinMultiple = 11
const double	kMinGutterFraction = 0.5
const double	kLineCountReciprocal = 4.0
const double	kMinAlignedGutter = 0.25
const double	kMinRaggedGutter = 1.5
double	textord_tabvector_vertical_gap_fraction = 0.5
double	textord_tabvector_vertical_box_ratio = 0.5
const char *	kAlignmentNames []

Detailed Description

recog_pseudo_word

Make a word from the selected blobs and run Tess on them.

Parameters:

page_res	recognise blobs
selection_box	within this box

fp_eval_word_spacing() Evaluation function for fixed pitch word lists.

Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars.

process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

build_menu()

Construct the menu tree used by the command window

process_cmd_win_event()

Process a command returned from the command window (Just call the appropriate command handler)

word_blank_and_set_display() Word processor

Blank display of word then redisplay word according to current display mode settings

---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Include Files and Type Defines ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------

Typedef Documentation

typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> tesseract::BlobGridSearch

Definition at line 31 of file blobgrid.h.

typedef signed int tesseract::char_32

Definition at line 40 of file string_32.h.

typedef GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> tesseract::ColPartitionGridSearch

Definition at line 893 of file colpartition.h.

typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGrid

Definition at line 118 of file tablefind.h.

typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGridSearch

Definition at line 121 of file tablefind.h.

typedef GenericVector<Dawg *> tesseract::DawgVector

Definition at line 47 of file dict.h.

typedef int(Dict::* tesseract::DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 81 of file baseapi.h.

typedef void(Wordrec::* tesseract::FillLatticeFunc)(const MATRIX &ratings, const LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 88 of file baseapi.h.

typedef unsigned char tesseract::LanguageModelFlagsType

Definition at line 37 of file language_model.h.

typedef GenericVector<NodeChild> tesseract::NodeChildVector

Definition at line 67 of file dawg.h.

typedef GenericVector<ParamsTrainingHypothesis> tesseract::ParamsTrainingHypothesisList

Definition at line 92 of file params_training_featdef.h.

typedef GenericVector<ColPartitionSet*> tesseract::PartSetVector

Definition at line 33 of file colpartitionset.h.

typedef double(Dict::* tesseract::ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Definition at line 83 of file baseapi.h.

typedef GenericVectorEqEq<const ParagraphModel *> tesseract::SetOfModels

Definition at line 94 of file paragraphs_internal.h.

typedef basic_string<char_32> tesseract::string_32

Definition at line 41 of file string_32.h.

typedef GenericVector<int> tesseract::SuccessorList

Definition at line 68 of file dawg.h.

typedef GenericVector<SuccessorList *> tesseract::SuccessorListsVector

Definition at line 69 of file dawg.h.

typedef TessCallback3<const UNICHARSET &, int, PAGE_RES *> tesseract::TruthCallback

Definition at line 92 of file baseapi.h.

typedef GenericVector<AmbigSpec_LIST *> tesseract::UnicharAmbigsVector

Definition at line 139 of file ambigs.h.

typedef GenericVector<UNICHAR_ID> tesseract::UnicharIdVector

Definition at line 34 of file ambigs.h.

typedef TessResultCallback1<bool, int> tesseract::WidthCallback

Definition at line 45 of file tabfind.h.

typedef void(Tesseract::* tesseract::WordRecognizer)(BLOCK *block, ROW *row, WERD_RES *word)

Definition at line 108 of file tesseractclass.h.

Enumeration Type Documentation

enum tesseract::AmbigType

Enumerator:

NOT_AMBIG
REPLACE_AMBIG
DEFINITE_AMBIG
SIMILAR_AMBIG
CASE_AMBIG
AMBIG_TYPE_COUNT

Definition at line 44 of file ambigs.h.

               {
  NOT_AMBIG,        // the ngram pair is not ambiguous
  REPLACE_AMBIG,    // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG,    // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG,       // this is a case ambiguity (1-1)

  AMBIG_TYPE_COUNT  // number of enum entries
};

enum tesseract::CharSegmentationType

Enumerator:

CST_FRAGMENT
CST_WHOLE
CST_IMPROPER
CST_NGRAM

Definition at line 51 of file classify.h.

                          {
  CST_FRAGMENT,  // A partial character.
  CST_WHOLE,     // A correctly segmented character.
  CST_IMPROPER,  // More than one but less than 2 characters.
  CST_NGRAM      // Multiple characters.
};

enum tesseract::CMD_EVENTS

Enumerator:

ACTION_1_CMD_EVENT
RECOG_WERDS
RECOG_PSEUDO
ACTION_2_CMD_EVENT

Definition at line 437 of file tessedit.cpp.

{
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};

enum tesseract::ColSegType

Enumerator:

COL_UNKNOWN
COL_TEXT
COL_TABLE
COL_MIXED
COL_COUNT

Definition at line 30 of file tablefind.h.

                {
  COL_UNKNOWN,
  COL_TEXT,
  COL_TABLE,
  COL_MIXED,
  COL_COUNT
};

enum tesseract::ColumnSpanningType

Enumerator:

CST_NOISE
CST_FLOWING
CST_HEADING
CST_PULLOUT
CST_COUNT

Definition at line 47 of file colpartition.h.

                        {
  CST_NOISE,        // Strictly between columns.
  CST_FLOWING,      // Strictly within a single column.
  CST_HEADING,      // Spans multiple columns.
  CST_PULLOUT,      // Touches multiple columns, but doesn't span them.
  CST_COUNT         // Number of entries.
};

enum tesseract::CountTypes

Enumerator:

CT_SHAPE_TOP_CORRECT
CT_SHAPE_TOP_ERR
CT_FONT_ATTR_ERR
CT_UNICHAR_TOP1_ERR
CT_UNICHAR_TOP2_ERR
CT_UNICHAR_TOPN_ERR
CT_OK_MULTI_UNICHAR
CT_REJECT
CT_NUM_RESULTS
CT_RANK
CT_REJECTED_JUNK
CT_ACCEPTED_JUNK
CT_SIZE

Definition at line 69 of file errorcounter.h.

                {
  CT_SHAPE_TOP_CORRECT,  // Top shape id is actually correct.
  CT_SHAPE_TOP_ERR,      // Top shape id is not correct.
  CT_FONT_ATTR_ERR,      // Font attributes incorrect, ignoring unichar.
  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
  CT_REJECT,             // Classifier hates this.
  CT_NUM_RESULTS,        // Number of answers produced.
  CT_RANK,               // Rank of correct answer.
  CT_REJECTED_JUNK,      // Junk that was correctly rejected.
  CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.

  CT_SIZE                // Number of types for array sizing.
};

enum tesseract::DawgType

Enumerator:

DAWG_TYPE_PUNCTUATION
DAWG_TYPE_WORD
DAWG_TYPE_NUMBER
DAWG_TYPE_PATTERN
DAWG_TYPE_COUNT

Definition at line 71 of file dawg.h.

              {
  DAWG_TYPE_PUNCTUATION,
  DAWG_TYPE_WORD,
  DAWG_TYPE_NUMBER,
  DAWG_TYPE_PATTERN,

  DAWG_TYPE_COUNT  // number of enum entries
};

enum tesseract::LeftOrRight

Enumerator:

LR_LEFT
LR_RIGHT

Definition at line 39 of file strokewidth.h.

                 {
  LR_LEFT,
  LR_RIGHT
};

enum tesseract::LineType

Enumerator:

LT_START
LT_BODY
LT_UNKNOWN
LT_MULTIPLE

Definition at line 54 of file paragraphs_internal.h.

              {
  LT_START = 'S',     // First line of a paragraph.
  LT_BODY = 'C',      // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',   // No clues.
  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
};

enum tesseract::NeighbourPartitionType

Enumerator:

NPT_HTEXT
NPT_VTEXT
NPT_WEAK_HTEXT
NPT_WEAK_VTEXT
NPT_IMAGE
NPT_COUNT

Definition at line 1431 of file colpartitiongrid.cpp.

                            {
  NPT_HTEXT,       // Definite horizontal text.
  NPT_VTEXT,       // Definite vertical text.
  NPT_WEAK_HTEXT,  // Weakly horizontal text. Counts as HTEXT for HTEXT, but
                   // image for image and VTEXT.
  NPT_WEAK_VTEXT,  // Weakly vertical text. Counts as VTEXT for VTEXT, but
                   // image for image and HTEXT.
  NPT_IMAGE,       // Definite non-text.
  NPT_COUNT        // Number of array elements.
};

enum tesseract::NormalizationMode

Enumerator:

NM_BASELINE
NM_CHAR_ISOTROPIC
NM_CHAR_ANISOTROPIC

Definition at line 42 of file normalis.h.

                       {
  NM_BASELINE = -3,         // The original BL normalization mode.
  NM_CHAR_ISOTROPIC = -2,   // Character normalization but isotropic.
  NM_CHAR_ANISOTROPIC = -1  // The original CN normalization mode.
};

enum tesseract::OcrEngineMode

When Tesseract/Cube is initialized we can choose to instantiate/load/run only the Tesseract part, only the Cube part or both along with the combiner. The preference of which engine to use is stored in tessedit_ocr_engine_mode.

ATTENTION: When modifying this enum, please make sure to make the appropriate changes to all the enums mirroring it (e.g. OCREngine in cityblock/workflow/detection/detection_storage.proto). Such enums will mention the connection to OcrEngineMode in the comments.

Enumerator:

OEM_TESSERACT_ONLY
OEM_CUBE_ONLY
OEM_TESSERACT_CUBE_COMBINED
OEM_DEFAULT

Definition at line 234 of file publictypes.h.

                   {
  OEM_TESSERACT_ONLY,           // Run Tesseract only - fastest
  OEM_CUBE_ONLY,                // Run Cube only - better accuracy, but slower
  OEM_TESSERACT_CUBE_COMBINED,  // Run both and combine results - best accuracy
  OEM_DEFAULT                   // Specify this mode when calling init_*(),
                                // to indicate that any of the above modes
                                // should be automatically inferred from the
                                // variables in the language-specific config,
                                // command-line configs, or if not specified
                                // in any of the above should be set to the
                                // default OEM_TESSERACT_ONLY.
};

enum tesseract::Orientation

+------------------+  Orientation Example:
| 1 Aaaa Aaaa Aaaa |  ====================
| Aaa aa aaa aa    |  To left is a diagram of some (1) English and
| aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
|                2 |
|   #######  c c C |  Upright Latin characters are represented as A and a.
|   #######  c c c |  '<' represents a latin character rotated
| < #######  c c c |      anti-clockwise 90 degrees.
| < #######  c   c |
| < #######  .   c |  Upright Chinese characters are represented C and c.
| 3 #######      c |
+------------------+  NOTA BENE: enum values here should match goodoc.proto

If you orient your head so that "up" aligns with Orientation, then the characters will appear "right side up" and readable.

In the example above, both the English and Chinese paragraphs are oriented so their "up" is the top of the page (page up). The photo credit is read with one's head turned leftward ("up" is to page left).

The values of this enum match the convention of Tesseract's osdetect.h

Enumerator:

ORIENTATION_PAGE_UP
ORIENTATION_PAGE_RIGHT
ORIENTATION_PAGE_DOWN
ORIENTATION_PAGE_LEFT

Definition at line 104 of file publictypes.h.

                 {
  ORIENTATION_PAGE_UP = 0,
  ORIENTATION_PAGE_RIGHT = 1,
  ORIENTATION_PAGE_DOWN = 2,
  ORIENTATION_PAGE_LEFT = 3,
};

enum tesseract::PageIteratorLevel

enum of the elements of the page hierarchy, used in ResultIterator to provide functions that operate on each level without having to have 5x as many functions.

Enumerator:

RIL_BLOCK
RIL_PARA
RIL_TEXTLINE
RIL_WORD
RIL_SYMBOL

Definition at line 185 of file publictypes.h.

                       {
  RIL_BLOCK,     // Block of text/image/separator line.
  RIL_PARA,      // Paragraph within a block.
  RIL_TEXTLINE,  // Line within a paragraph.
  RIL_WORD,      // Word within a textline.
  RIL_SYMBOL     // Symbol/character within a word.
};

enum tesseract::PageSegMode

Possible modes for page layout analysis. These *must* be kept in order of decreasing amount of layout analysis to be done, except for OSD_ONLY, so that the inequality test macros below work.

Enumerator:

PSM_OSD_ONLY	Orientation and script detection only.
PSM_AUTO_OSD	Automatic page segmentation with orientation and script detection. (OSD)
PSM_AUTO_ONLY	Automatic page segmentation, but no OSD, or OCR.
PSM_AUTO	Fully automatic page segmentation, but no OSD.
PSM_SINGLE_COLUMN	Assume a single column of text of variable sizes.
PSM_SINGLE_BLOCK_VERT_TEXT	Assume a single uniform block of vertically aligned text.
PSM_SINGLE_BLOCK	Assume a single uniform block of text. (Default.)
PSM_SINGLE_LINE	Treat the image as a single text line.
PSM_SINGLE_WORD	Treat the image as a single word.
PSM_CIRCLE_WORD	Treat the image as a single word in a circle.
PSM_SINGLE_CHAR	Treat the image as a single character.
PSM_COUNT	Number of enum entries.

Definition at line 147 of file publictypes.h.

                 {
  PSM_OSD_ONLY,       ///< Orientation and script detection only.
  PSM_AUTO_OSD,       ///< Automatic page segmentation with orientation and
                      ///< script detection. (OSD)
  PSM_AUTO_ONLY,      ///< Automatic page segmentation, but no OSD, or OCR.
  PSM_AUTO,           ///< Fully automatic page segmentation, but no OSD.
  PSM_SINGLE_COLUMN,  ///< Assume a single column of text of variable sizes.
  PSM_SINGLE_BLOCK_VERT_TEXT,  ///< Assume a single uniform block of vertically
                               ///< aligned text.
  PSM_SINGLE_BLOCK,   ///< Assume a single uniform block of text. (Default.)
  PSM_SINGLE_LINE,    ///< Treat the image as a single text line.
  PSM_SINGLE_WORD,    ///< Treat the image as a single word.
  PSM_CIRCLE_WORD,    ///< Treat the image as a single word in a circle.
  PSM_SINGLE_CHAR,    ///< Treat the image as a single character.

  PSM_COUNT           ///< Number of enum entries.
};

enum tesseract::ParagraphJustification

JUSTIFICATION_UNKNOWN The alignment is not clearly one of the other options. This could happen for example if there are only one or two lines of text or the text looks like source code or poetry.

NOTA BENE: Fully justified paragraphs (text aligned to both left and right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text is written with a left-to-right script and with JUSTIFICATION_RIGHT if their text is written in a right-to-left script.

Interpretation for text read in vertical lines: "Left" is wherever the starting reading position is.

JUSTIFICATION_LEFT Each line, except possibly the first, is flush to the same left tab stop.

JUSTIFICATION_CENTER The text lines of the paragraph are centered about a line going down through the middle of the text lines.

JUSTIFICATION_RIGHT Each line, except possibly the first, is flush to the same right tab stop.

Enumerator:

JUSTIFICATION_UNKNOWN
JUSTIFICATION_LEFT
JUSTIFICATION_CENTER
JUSTIFICATION_RIGHT

Definition at line 217 of file publictypes.h.

                            {
  // The alignment is not clearly one of the other options (e.g. only one or
  // two lines of text, or text that looks like source code or poetry).
  JUSTIFICATION_UNKNOWN,
  // Each line, except possibly the first, is flush to the same left tab stop.
  JUSTIFICATION_LEFT,
  // The text lines are centered about a line going down through their middle.
  JUSTIFICATION_CENTER,
  // Each line, except possibly the first, is flush to the same right tab stop.
  JUSTIFICATION_RIGHT,
};

enum tesseract::ParamsTrainingRawFeatureType

Enumerator:

PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE
PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH
PTRAIN_RAW_FEATURE_SHAPE_COST
PTRAIN_RAW_FEATURE_NGRAM_PROB
PTRAIN_RAW_FEATURE_NUM_BAD_PUNC
PTRAIN_RAW_FEATURE_NUM_BAD_CASE
PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE
PTRAIN_RAW_FEATURE_NUM_BAD_SPACING
PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT
PTRAIN_RAW_FEATURE_NUM_BAD_FONT
PTRAIN_RAW_FEATURE_WORST_CERT
PTRAIN_RAW_FEATURE_RATING
PTRAIN_RAW_FEATURE_ADAPTED
PTRAIN_RAW_FEATURE_NUM_UNICHARS
PTRAIN_RAW_FEATURE_OUTLINE_LEN
PTRAIN_NUM_RAW_FEATURE_TYPES

Definition at line 34 of file params_training_featdef.h.

                                  {
  // What dictionary (if any) was this hypothesis found in.
  // See PermuterType enum in ccstruct/ratngs.h for interpretation.
  PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE,     // 0
  // Boolean indicator of whether this hypothesis is ambiguous to a known
  // dictionary word (or a valid number pattern).
  PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH,  // 1
  // Shape cost of the segmentation path for this hypothesis.
  PTRAIN_RAW_FEATURE_SHAPE_COST,          // 2
  // Character ngram probability of the string of unichars of this hypothesis.
  PTRAIN_RAW_FEATURE_NGRAM_PROB,          // 3
  // Number of bad/inconsistent spots in this hypothesis.
  PTRAIN_RAW_FEATURE_NUM_BAD_PUNC,        // 4
  PTRAIN_RAW_FEATURE_NUM_BAD_CASE,        // 5
  PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE,   // 6
  PTRAIN_RAW_FEATURE_NUM_BAD_SPACING,     // 7
  PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT,      // 8
  PTRAIN_RAW_FEATURE_NUM_BAD_FONT,        // 9
  // Classifier-related features.
  PTRAIN_RAW_FEATURE_WORST_CERT,          // 10
  PTRAIN_RAW_FEATURE_RATING,              // 11
  // Number of classifier results that came from adapted templates.
  PTRAIN_RAW_FEATURE_ADAPTED,   // 12
  // Features potentially useful for normalization.
  PTRAIN_RAW_FEATURE_NUM_UNICHARS,        // 13
  PTRAIN_RAW_FEATURE_OUTLINE_LEN,         // 14

  PTRAIN_NUM_RAW_FEATURE_TYPES
};

enum tesseract::ScriptPos

Enumerator:

SP_NORMAL
SP_SUBSCRIPT
SP_SUPERSCRIPT
SP_DROPCAP

Definition at line 38 of file boxword.h.

               {
  SP_NORMAL,
  SP_SUBSCRIPT,
  SP_SUPERSCRIPT,
  SP_DROPCAP
};

enum tesseract::SetParamConstraint

Enumerator:

SET_PARAM_CONSTRAINT_NONE
SET_PARAM_CONSTRAINT_DEBUG_ONLY
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
SET_PARAM_CONSTRAINT_NON_INIT_ONLY

Definition at line 36 of file params.h.

                        {
  SET_PARAM_CONSTRAINT_NONE,
  SET_PARAM_CONSTRAINT_DEBUG_ONLY,
  SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY,
  SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
};

enum tesseract::TabAlignment

Enumerator:

TA_LEFT_ALIGNED
TA_LEFT_RAGGED
TA_CENTER_JUSTIFIED
TA_RIGHT_ALIGNED
TA_RIGHT_RAGGED
TA_SEPARATOR
TA_COUNT

Definition at line 43 of file tabvector.h.

                  {
  TA_LEFT_ALIGNED,
  TA_LEFT_RAGGED,
  TA_CENTER_JUSTIFIED,
  TA_RIGHT_ALIGNED,
  TA_RIGHT_RAGGED,
  TA_SEPARATOR,
  TA_COUNT
};

enum tesseract::TessdataType

Enumerator:

TESSDATA_LANG_CONFIG
TESSDATA_UNICHARSET
TESSDATA_AMBIGS
TESSDATA_INTTEMP
TESSDATA_PFFMTABLE
TESSDATA_NORMPROTO
TESSDATA_PUNC_DAWG
TESSDATA_SYSTEM_DAWG
TESSDATA_NUMBER_DAWG
TESSDATA_FREQ_DAWG
TESSDATA_FIXED_LENGTH_DAWGS
TESSDATA_CUBE_UNICHARSET
TESSDATA_CUBE_SYSTEM_DAWG
TESSDATA_SHAPE_TABLE
TESSDATA_BIGRAM_DAWG
TESSDATA_UNAMBIG_DAWG
TESSDATA_PARAMS_TRAINING_MODEL
TESSDATA_NUM_ENTRIES

Definition at line 51 of file tessdatamanager.h.

                  {
  TESSDATA_LANG_CONFIG,         // 0
  TESSDATA_UNICHARSET,          // 1
  TESSDATA_AMBIGS,              // 2
  TESSDATA_INTTEMP,             // 3
  TESSDATA_PFFMTABLE,           // 4
  TESSDATA_NORMPROTO,           // 5
  TESSDATA_PUNC_DAWG,           // 6
  TESSDATA_SYSTEM_DAWG,         // 7
  TESSDATA_NUMBER_DAWG,         // 8
  TESSDATA_FREQ_DAWG,           // 9
  TESSDATA_FIXED_LENGTH_DAWGS,  // 10
  TESSDATA_CUBE_UNICHARSET,     // 11
  TESSDATA_CUBE_SYSTEM_DAWG,    // 12
  TESSDATA_SHAPE_TABLE,         // 13
  TESSDATA_BIGRAM_DAWG,         // 14
  TESSDATA_UNAMBIG_DAWG,        // 15
  TESSDATA_PARAMS_TRAINING_MODEL,  // 16

  TESSDATA_NUM_ENTRIES
};

enum tesseract::TextlineOrder

The text lines are read in the given sequence.

In English, the order is top-to-bottom. In Chinese, vertical text lines are read right-to-left. Mongolian is written in vertical columns top to bottom like Chinese, but the lines are ordered left-to-right.

Note that only some combinations make sense. For example, WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM

Enumerator:

TEXTLINE_ORDER_LEFT_TO_RIGHT
TEXTLINE_ORDER_RIGHT_TO_LEFT
TEXTLINE_ORDER_TOP_TO_BOTTOM

Definition at line 136 of file publictypes.h.

                   {
  TEXTLINE_ORDER_LEFT_TO_RIGHT = 0,
  TEXTLINE_ORDER_RIGHT_TO_LEFT = 1,
  TEXTLINE_ORDER_TOP_TO_BOTTOM = 2,
};

enum tesseract::WritingDirection

The grapheme clusters within a line of text are laid out logically in this direction, judged when looking at the text line rotated so that its Orientation is "page up".

For English text, the writing direction is left-to-right. For the Chinese text in the above example, the writing direction is top-to-bottom.

Enumerator:

WRITING_DIRECTION_LEFT_TO_RIGHT
WRITING_DIRECTION_RIGHT_TO_LEFT
WRITING_DIRECTION_TOP_TO_BOTTOM

Definition at line 119 of file publictypes.h.

                      {
  WRITING_DIRECTION_LEFT_TO_RIGHT = 0,
  WRITING_DIRECTION_RIGHT_TO_LEFT = 1,
  WRITING_DIRECTION_TOP_TO_BOTTOM = 2,
};

Function Documentation

bool tesseract::AsciiLikelyListItem ( const STRING & word )

Definition at line 279 of file paragraphs.cpp.

                                             {
  // A word is a likely list item when either helper predicate accepts it:
  // LikelyListMark or LikelyListNumeral.
  return LikelyListMark(word) || LikelyListNumeral(word);
}

void tesseract::CalculateTabStops	(	GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		int	tolerance,
		GenericVector< Cluster > *	left_tabs,
		GenericVector< Cluster > *	right_tabs
	)

Definition at line 703 of file paragraphs.cpp.

                                                           {
  // Clusters the left and right indents of rows [row_start, row_end) and
  // writes the resulting clusters to *left_tabs and *right_tabs.
  // `tolerance` is handed to each SimpleClusterer unchanged.
  // Returns without touching the outputs if AcceptableRowArgs rejects the
  // requested row range.
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
    return;
  // First pass: toss all left and right indents into clusterers.
  SimpleClusterer initial_lefts(tolerance);
  SimpleClusterer initial_rights(tolerance);
  GenericVector<Cluster> initial_left_tabs;
  GenericVector<Cluster> initial_right_tabs;
  for (int i = row_start; i < row_end; i++) {
    initial_lefts.Add((*rows)[i].lindent_);
    initial_rights.Add((*rows)[i].rindent_);
  }
  initial_lefts.GetClusters(&initial_left_tabs);
  initial_rights.GetClusters(&initial_right_tabs);

  // Second pass: cluster only lines that are not "stray"
  //   An example of a stray line is a page number -- a line whose start
  //   and end tab-stops are far outside the typical start and end tab-stops
  //   for the block.
  //   Put another way, we only cluster data from lines whose start or end
  //   tab stop is frequent.
  SimpleClusterer lefts(tolerance);
  SimpleClusterer rights(tolerance);
  // Integer division: a first-pass cluster must hold MORE than this many rows
  // to count as frequent.
  int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
  for (int i = row_start; i < row_end; i++) {
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
    // A row is kept when EITHER of its sides lands in a frequent cluster;
    // both of its indents are then added, even the infrequent one.
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
      lefts.Add((*rows)[i].lindent_);
      rights.Add((*rows)[i].rindent_);
    }
  }
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);
}

void tesseract::CanonicalizeDetectionResults	(	GenericVector< PARA * > *	row_owners,
		PARA_LIST *	paragraphs
	)

Definition at line 2179 of file paragraphs.cpp.

                           {
  // Rebuilds *paragraphs from *row_owners:
  //   - every run of consecutive NULL owners is replaced by one freshly
  //     allocated PARA shared by all rows of that run;
  //   - each run of consecutive rows sharing the same PARA contributes that
  //     PARA to *paragraphs exactly once, in row order.
  // *paragraphs is cleared first, so earlier contents are discarded.
  GenericVector<PARA *> &rows = *row_owners;
  paragraphs->clear();
  PARA_IT out(paragraphs);
  // The PARA most recently created to stand in for a run of NULL owners.
  PARA *formerly_null = NULL;
  for (int i = 0; i < rows.size(); i++) {
    if (rows[i] == NULL) {
      if (i == 0 || rows[i - 1] != formerly_null) {
        // Start of a new NULL run: allocate its placeholder paragraph.
        rows[i] = formerly_null = new PARA();
      } else {
        // Continuation of the current NULL run: reuse the placeholder and
        // skip the append below (it was already added).
        rows[i] = formerly_null;
        continue;
      }
    } else if (i > 0 && rows[i - 1] == rows[i]) {
      // Same paragraph as the previous row; already appended.
      continue;
    }
    out.add_after_then_move(rows[i]);
  }
}

TBOX tesseract::char_box_to_tbox	(	Box *	char_box,
		TBOX	word_box,
		int	x_offset
	)

Definition at line 42 of file cube_control.cpp.

                                                                  {
  // Builds a TBOX for char_box positioned relative to word_box.
  // x: shifted by word_box.left() and reduced by x_offset.
  // y: measured down from the top edge of word_box
  //    (word_box.bottom() + word_box.height()).
  // NOTE(review): this looks like a conversion from a top-down image
  // coordinate system to a bottom-up one -- confirm against callers.
  l_int32 left;
  l_int32 top;
  l_int32 width;
  l_int32 height;
  l_int32 right;
  l_int32 bottom;

  boxGetGeometry(char_box, &left, &top, &width, &height);
  left += word_box.left() - x_offset;
  right = left + width;
  // Flip the vertical axis: distance from the word's top edge becomes an
  // absolute y that decreases as the original `top` grows.
  top = word_box.bottom() + word_box.height() - top;
  bottom = top - height;
  return TBOX(left, bottom, right, top);
}

void tesseract::ClearFeatureSpaceWindow	(	NORM_METHOD	norm_method,
		ScrollView *	window
	)

Definition at line 1132 of file intproto.cpp.

                                                                          {
  // Clears `window` and redraws the static feature-space guides in grey.
  // What is drawn inside the bounding rectangle depends on norm_method.
  window->Clear();

  window->Pen(ScrollView::GREY);
  // Draw the feature space limit rectangle.
  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);
  if (norm_method == baseline) {
    // Baseline normalization: four full-width horizontal guide lines at the
    // descender, baseline, x-height and cap-height levels.
    window->SetCursor(0, INT_DESCENDER);
    window->DrawTo(INT_MAX_X, INT_DESCENDER);
    window->SetCursor(0, INT_BASELINE);
    window->DrawTo(INT_MAX_X, INT_BASELINE);
    window->SetCursor(0, INT_XHEIGHT);
    window->DrawTo(INT_MAX_X, INT_XHEIGHT);
    window->SetCursor(0, INT_CAPHEIGHT);
    window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);
  } else {
    // Any other normalization: one rectangle centered on
    // (INT_XCENTER, INT_YCENTER) with half-extents INT_XRADIUS / INT_YRADIUS.
    window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,
                      INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);
  }
}

int tesseract::ClosestCluster	(	const GenericVector< Cluster > &	clusters,
		int	value
	)

Definition at line 677 of file paragraphs.cpp.

                                                                      {
  // Returns the index of the cluster whose center is nearest to `value`.
  // The comparison is strict, so ties keep the lowest index.
  // Returns 0 when `clusters` is empty; callers must not index an empty
  // vector with the result.
  int best_index = 0;
  for (int i = 0; i < clusters.size(); i++) {
    if (abs(value - clusters[i].center) <
        abs(value - clusters[best_index].center))
        best_index = i;
  }
  return best_index;
}

template<typename T >

bool tesseract::cmp_eq	(	T const &	t1,
		T const &	t2
	)

Definition at line 285 of file genericvector.h.

                                        {
  // Equality predicate: forwards to T's operator==.
  return t1 == t2;
}

bool tesseract::CompareFontInfo	(	const FontInfo &	fi1,
		const FontInfo &	fi2
	)

Definition at line 25 of file fontinfo.cpp.

                                                               {
  // The font properties are required to be the same for two fonts with the
  // same name, so there is no need to test them.
  // Consequently, querying the table with only its font name as information is
  // enough to retrieve its properties.
  // NOTE(review): strcmp is called unconditionally, so both name pointers are
  // assumed to be non-null -- confirm against callers.
  return strcmp(fi1.name, fi2.name) == 0;
}

bool tesseract::CompareFontSet	(	const FontSet &	fs1,
		const FontSet &	fs2
	)

Definition at line 33 of file fontinfo.cpp.

                                                            {
  // Two font sets are equal when they have the same size and identical
  // config entries at every index (order matters).
  if (fs1.size != fs2.size)
    return false;
  for (int i = 0; i < fs1.size; ++i) {
    if (fs1.configs[i] != fs2.configs[i])
      return false;
  }
  return true;
}

ICOORD tesseract::ComputeEndFromGradient	(	const ICOORD &	start,
		double	m
	)

Definition at line 124 of file detlinefit.cpp.

                                                             {
  // Returns a second point which, together with `start`, describes a line of
  // gradient m, chosen as far from `start` as the MAX_INT16 coordinate limit
  // permits. The dominant axis is decided by |m| > 1 versus |m| <= 1.
  // When the clamped offset collapses to -1..1, a one-unit step along the
  // dominant axis is returned instead.
  if (m > 1.0 || m < -1.0) {
    // dy dominates. Force it to have the opposite sign of start.y() and
    // compute dx based on dy being as large as possible
    int dx = static_cast<int>(floor(MAX_INT16 / m));
    if (dx < 0) ++dx;  // Truncate towards 0.
    if (start.y() > 0) dx = - dx;  // Force dy to be opposite to start.y().
    // Constrain dx so the result fits in an inT16.
    while (start.x() + dx > MAX_INT16 || start.x() + dx < -MAX_INT16)
      dx /= 2;
    if (-1 <= dx && dx <= 1) {
      return ICOORD(start.x(), start.y() + 1);  // Too steep for anything else.
    }
    // Round dx * m to the nearest integer to get the y offset.
    int y = start.y() + static_cast<int>(floor(dx * m + 0.5));
    ASSERT_HOST(-MAX_INT16 <= y && y <= MAX_INT16);
    return ICOORD(start.x() + dx, y);
  } else {
    // dx dominates. Force it to have the opposite sign of start.x() and
    // compute dy based on dx being as large as possible.
    int dy = static_cast<int>(floor(MAX_INT16 * m));
    if (dy < 0) ++dy;  // Truncate towards 0.
    if (start.x() > 0) dy = - dy;  // Force dx to be opposite to start.x().
    // Constrain dy so the result fits in an inT16.
    while (start.y() + dy > MAX_INT16 || start.y() + dy < -MAX_INT16)
      dy /= 2;
    // This early return also covers m == 0 (dy == 0), so the division by m
    // below is never reached with a zero gradient.
    if (-1 <= dy && dy <= 1) {
      return ICOORD(start.x() + 1, start.y());  // Too flat for anything else.
    }
    // Round dy / m to the nearest integer to get the x offset.
    int x = start.x() + static_cast<int>(floor(dy / m + 0.5));
    ASSERT_HOST(-MAX_INT16 <= x && x <= MAX_INT16);
    return ICOORD(x, start.y() + dy);
  }
}

void tesseract::ConvertHypothesizedModelRunsToParagraphs	(	int	debug_level,
		const GenericVector< RowScratchRegisters > &	rows,
		GenericVector< PARA * > *	row_owners,
		ParagraphTheory *	theory
	)

Definition at line 1988 of file paragraphs.cpp.

                             {
  // Walks the rows from last to first, turning each run of rows that is
  // consistently explained by a single ParagraphModel into one PARA and
  // recording that PARA in (*row_owners) for every row of the run.
  //   rows        - per-row hypotheses; rows that end up in a paragraph have
  //                 their hypotheses reset to a single start/body line.
  //   row_owners  - output; entries for rows that fit no model are untouched.
  //   theory      - receives any model synthesized for a crown paragraph.
  // debug_level is not consulted here.
  int end = rows.size();
  int start;
  for (; end > 0; end = start) {
    start = end - 1;
    const ParagraphModel *model = NULL;
    // TODO(eger): Be smarter about dealing with multiple hypotheses.
    bool single_line_paragraph = false;
    SetOfModels models;
    rows[start].NonNullHypotheses(&models);
    if (models.size() > 0) {
      // Only the first hypothesis is considered.
      model = models[0];
      if (rows[start].GetLineType(model) != LT_BODY)
        single_line_paragraph = true;
    }
    if (model && !single_line_paragraph) {
      // walk back looking for more body lines and then a start line.
      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
        // do nothing
      }
      // start < 0 happens when the last body line was row 0, i.e. no start
      // line exists for this run.
      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
        model = NULL;
      }
    }
    if (model == NULL) {
      continue;
    }
    // rows[start, end) should be a paragraph.
    PARA *p = new PARA();
    if (model == kCrownLeft || model == kCrownRight) {
      p->is_very_first_or_continuation = true;
      // Crown paragraph.
      //   If we can find an existing ParagraphModel that fits, use it,
      //   else create a new one.
      for (int row = end; row < rows.size(); row++) {
        if ((*row_owners)[row] &&
            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
            (start == 0 ||
             ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
          model = (*row_owners)[row]->model;
          break;
        }
      }
      if (model == kCrownLeft) {
        // No subsequent model fits, so cons one up.
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      } else if (model == kCrownRight) {
        // No subsequent model fits, so cons one up.
        // The margin is the right margin plus the right indent, mirroring the
        // left-justified case above (it previously added rmargin_ twice and
        // ignored rindent_).
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      }
    }
    // Collapse the hypotheses of the run to exactly one model: a start line
    // followed by body lines.
    rows[start].SetUnknown();
    rows[start].AddStartLine(model);
    for (int i = start + 1; i < end; i++) {
      rows[i].SetUnknown();
      rows[i].AddBodyLine(model);
    }
    p->model = model;
    p->has_drop_cap = rows[start].ri_->has_drop_cap;
    // List-item evidence is read from the word on the justified side.
    p->is_list_item =
        model->justification() == JUSTIFICATION_RIGHT
            ? rows[start].ri_->rword_indicates_list_item
            : rows[start].ri_->lword_indicates_list_item;
    for (int row = start; row < end; row++) {
      if ((*row_owners)[row] != NULL) {
        // The previous owner is overwritten without being freed.
        tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
                "more than once!\n");
      }
      (*row_owners)[row] = p;
    }
  }
}

bool tesseract::CrownCompatible	(	const GenericVector< RowScratchRegisters > *	rows,
		int	a,
		int	b,
		const ParagraphModel *	model
	)

Definition at line 1237 of file paragraphs.cpp.

                                                                {
  // Returns true when rows a and b line up on the side relevant to the given
  // crown model: right indent + right margin for kCrownRight, left indent +
  // left margin for kCrownLeft. The tolerance is derived from row a's average
  // inter-word space only.
  // Any non-crown model is reported via tprintf and yields false.
  if (model != kCrownRight && model != kCrownLeft) {
    tprintf("CrownCompatible() should only be called with crown models!\n");
    return false;
  }
  RowScratchRegisters &row_a = (*rows)[a];
  RowScratchRegisters &row_b = (*rows)[b];
  if (model == kCrownRight) {
    return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
                       row_b.rindent_ + row_b.rmargin_,
                       Epsilon(row_a.ri_->average_interword_space));
  }
  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
                     row_b.lindent_ + row_b.lmargin_,
                     Epsilon(row_a.ri_->average_interword_space));
}

int tesseract::CubeAPITest	(	Boxa *	boxa_blocks,
		Pixa *	pixa_blocks,
		Boxa *	boxa_words,
		Pixa *	pixa_words,
		const FCOORD &	reskew,
		Pix *	page_pix,
		PAGE_RES *	page_res
	)

Placeholder for call to Cube and test that the input data is correct. reskew is the direction of baselines in the skewed image in normalized (cos theta, sin theta) form, so (0.866, 0.5) would represent a 30 degree anticlockwise skew.

Definition at line 580 of file baseapi.cpp.

                                    {
  // Sanity-checks the block/word image arrays against their box arrays and
  // against page_res, dumping images to disk along the way.
  // Asserts (ASSERT_HOST) that:
  //   - boxa_blocks and pixa_blocks have the same count,
  //   - boxa_words and pixa_words have the same count,
  //   - page_res holds exactly as many words as boxa_words.
  // reskew and page_pix are not used by this placeholder.
  // Always returns 0.
  int block_count = boxaGetCount(boxa_blocks);
  ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
  // Write each block to the current directory as junk_write_display.nnn.png.
  for (int i = 0; i < block_count; ++i) {
    Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
    pixDisplayWrite(pix, 1);
    // pixaGetPix(..., L_CLONE) hands back a new reference that the caller
    // owns; release it or every block image leaks.
    pixDestroy(&pix);
  }
  int word_count = boxaGetCount(boxa_words);
  ASSERT_HOST(word_count == pixaGetCount(pixa_words));
  int pr_word = 0;
  PAGE_RES_IT page_res_it(page_res);
  for (page_res_it.restart_page(); page_res_it.word () != NULL;
       page_res_it.forward(), ++pr_word) {
    WERD_RES *word = page_res_it.word();
    WERD_CHOICE* choice = word->best_choice;
    // Write the first 100 words to files names wordims/<wordstring>.tif.
    if (pr_word < 100) {
      STRING filename("wordims/");
      if (choice != NULL) {
        filename += choice->unichar_string();
      } else {
        // No recognition result: fall back to "unclassifiedNNN".
        char numbuf[32];
        filename += "unclassified";
        snprintf(numbuf, sizeof(numbuf), "%03d", pr_word);
        filename += numbuf;
      }
      filename += ".tif";
      Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
      pixWrite(filename.string(), pix, IFF_TIFF_G4);
      // Release the cloned reference (pixDestroy tolerates a NULL pix).
      pixDestroy(&pix);
    }
  }
  ASSERT_HOST(pr_word == word_count);
  return 0;
}

template<typename T >

void tesseract::DeleteObject ( T * object )

Definition at line 164 of file tablefind.cpp.

                                                   {
  // Frees a single heap object with `delete` (not `delete[]`).
  delete object;
}

void tesseract::DetectParagraphs	(	int	debug_level,
		GenericVector< RowInfo > *	row_infos,
		GenericVector< PARA * > *	row_owners,
		PARA_LIST *	paragraphs,
		GenericVector< ParagraphModel * > *	models
	)

Definition at line 2211 of file paragraphs.cpp.

                                                               {
  // Runs the multi-pass paragraph detection over row_infos.
  //   row_infos   - one RowInfo per text line, in order.
  //   row_owners  - output; resized to row_infos->size() and filled with the
  //                 PARA that owns each row.
  //   paragraphs  - output; the distinct PARAs in row order.
  //   models      - passed to the ParagraphTheory that collects models.
  //   debug_level - > 1 dumps state after each pass, > 0 dumps the final
  //                 segmentation.
  GenericVector<RowScratchRegisters> rows;
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of NULL pointers.
  row_owners->init_to_size(row_infos->size(), NULL);

  // Set up row scratch registers for the main algorithm.
  rows.init_to_size(row_infos->size(), RowScratchRegisters());
  for (int i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  GenericVector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows,
                           leftovers[i].begin, leftovers[i].end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    GenericVector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
    // Pass 2a counts as useful unless exactly one segment is left over and it
    // spans every row, i.e. [0, rows.size()).
    bool pass2a_was_useful = leftovers2.size() > 1 ||
        (leftovers2.size() == 1 &&
         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
    if (pass2a_was_useful) {
      for (int j = 0; j < leftovers2.size(); j++) {
        StrongEvidenceClassify(debug_level, &rows,
                               leftovers2[j].begin, leftovers2[j].end,
                               &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    GeometricClassify(debug_level, &rows,
                      leftovers[i].begin, leftovers[i].end, &theory);
  }
  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
                                           &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling NULL row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}

void tesseract::DetectParagraphs	(	int	debug_level,
		const MutableIterator *	block_start,
		GenericVector< ParagraphModel * > *	models
	)

Definition at line 2401 of file paragraphs.cpp.

                                                               {
  // Per-block driver.  Builds one RowInfo for every non-empty text row
  // reachable from block_start, runs the row-based DetectParagraphs()
  // overload on them (unless the block is a non-text poly block), and then
  // stores the resulting PARA owner of each row back into its ROW.
  //
  //   debug_level: forwarded to the row-based detection.
  //   block_start: iterator positioned in the block to process.
  //   models:      forwarded to the row-based detection.

  // Nothing to do when there is no text line at the iterator.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  // Clear out any preconceived notions.
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();

  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE))
    return;  // end of input already.

  GenericVector<RowInfo> row_infos;
  do {
    if (!row.PageResIt()->row())
      continue;  // empty row.
    // Forget any paragraph assigned by an earlier run.
    row.PageResIt()->row()->row->set_para(NULL);
    row_infos.push_back(RowInfo());
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
           row.Next(RIL_TEXTLINE));

  // Run the paragraph detection algorithm.
  GenericVector<PARA *> row_owners;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
                     models);
  } else {
    // Non-text block: every row starts with a NULL owner and
    // CanonicalizeDetectionResults() is left to tidy that up.
    row_owners.init_to_size(row_infos.size(), NULL);
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }

  // Now stitch in the row_owners into the rows.  Rows that were skipped
  // above (no ROW_RES) are skipped again here so the indices line up.
  row = *block_start;
  for (int i = 0; i < row_owners.size(); i++) {
    while (!row.PageResIt()->row())
      row.Next(RIL_TEXTLINE);
    row.PageResIt()->row()->row->set_para(row_owners[i]);
    row.Next(RIL_TEXTLINE);
  }
}

void tesseract::DiscardUnusedModels	(	const GenericVector< RowScratchRegisters > &	rows,
		ParagraphTheory *	theory
	)

Definition at line 1404 of file paragraphs.cpp.

                                                  {
  // Collect every model reported by StrongHypotheses() on any row, then let
  // the theory drop whatever is not in that set.
  SetOfModels models_in_use;
  const int num_rows = rows.size();
  for (int r = 0; r < num_rows; ++r)
    rows[r].StrongHypotheses(&models_in_use);
  theory->DiscardUnusedModels(models_in_use);
}

void tesseract::DowngradeWeakestToCrowns	(	int	debug_level,
		ParagraphTheory *	theory,
		GenericVector< RowScratchRegisters > *	rows
	)

Definition at line 1437 of file paragraphs.cpp.

                                                                        {
  // Scans rows from the last one backwards, one run of rows sharing the same
  // unique body hypothesis at a time.  A run is re-marked ("crownified")
  // when it reaches row 0, when its model is not strong, or when the row
  // before it is not a valid first line for the model.  Runs of a strong,
  // centered model are left alone.  Models no longer referenced by any row
  // are discarded at the end.
  // NOTE(review): debug_level is not used in this body.
  int start;
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = NULL;
    while (end > 0 &&
           (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
      end--;
    }
    if (end == 0) break;
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--;  // walk back to the first line that is not the same body type.
    }
    // Also absorb the start line of the run when the model is strong and its
    // first-line and body indents are nearly equal (within its tolerance).
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
        StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(),
                    model->tolerance())) {
        start--;
    }
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
      continue;
    // For a model that is not strong, extend the run backwards over rows
    // that CrownCompatible() accepts.
    if (!StrongModel(model)) {
      while (start > 0 &&
             CrownCompatible(rows, start - 1, start, model))
        start--;
    }
    if (start == 0 ||
        (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      // A strong model is replaced by the crown constant matching its
      // justification; any other model is kept as the crown model.
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
          if (model->justification() == JUSTIFICATION_LEFT)
            crown_model = kCrownLeft;
          else
            crown_model = kCrownRight;
      }
      // First row of the run becomes the start line, the rest body lines.
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  DiscardUnusedModels(*rows, theory);
}

tesseract::ELISTIZE ( ViterbiStateEntry )

tesseract::ELISTIZE ( AmbigSpec )

tesseract::ELISTIZEH ( AmbigSpec )

tesseract::ELISTIZEH ( ViterbiStateEntry )

BLOB_CHOICE* tesseract::find_choice_by_script	(	BLOB_CHOICE_LIST *	blob_choices,
		int	target_sid,
		int	backup_sid,
		int	secondary_sid
	)

Iterate through all the character choices (for a single blob) and return the first that matches the target script ID. If backup_sid is not 0, then a match on either the target or backup sid is allowed. Note that there is no preference between a target or backup sid. To search for another sid only if no target_sid matched, use secondary_sid. So for example, to find first Han or Common char choice, do find_choice_by_script(cchoice, han_sid, common_sid, 0); To find first Han choice, but allow Common if none is found, do find_choice_by_script(cchoice, han_sid, 0, common_sid);

Definition at line 206 of file permute.cpp.

                       {
  // First sweep: the earliest choice whose script is target_sid, or
  // backup_sid when that is positive.  Choices with script id 0 never match.
  BLOB_CHOICE_IT choice_it(blob_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    BLOB_CHOICE *candidate = choice_it.data();
    const int sid = candidate->script_id();
    if (sid == 0) continue;
    if (sid == target_sid || (backup_sid > 0 && sid == backup_sid))
      return candidate;
  }

  // Second sweep, only when a positive secondary_sid was supplied.
  if (secondary_sid <= 0) return NULL;
  choice_it.set_to_list(blob_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    BLOB_CHOICE *candidate = choice_it.data();
    // secondary_sid is positive here, so a script id of 0 cannot match.
    if (candidate->script_id() == secondary_sid)
      return candidate;
  }
  return NULL;
}

BLOB_CHOICE* tesseract::find_choice_by_type	(	BLOB_CHOICE_LIST *	blob_choices,
		char	target_type,
		const UNICHARSET &	unicharset
	)

Iterate through all the character choices (for a single blob) and return the first that matches the given type, which is one of 'aA0px*', for lower, upper, digit, punctuation, other, and 'any', respectively. If no match is found, NULL is returned.

Definition at line 181 of file permute.cpp.

                                  {
  // Walk the list once and hand back the first non-NULL choice whose
  // unicharset char type equals target_type.
  BLOB_CHOICE_IT choice_it(blob_choices);
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
       choice_it.forward()) {
    BLOB_CHOICE *candidate = choice_it.data();
    if (candidate == NULL) continue;
    if (unicharset.get_chartype(candidate->unichar_id()) == target_type)
      return candidate;
  }
  return NULL;
}

int tesseract::find_choice_by_uid	(	BLOB_CHOICE_LIST *	blob_list,
		UNICHAR_ID	target_uid
	)

Returns the rank (starting at 0) of a given unichar ID in the char choice list, or -1 if not found.

Definition at line 110 of file permute.cpp.

                                                                           {
  // Step through the list counting positions until target_uid is seen or
  // the last element has been examined.
  BLOB_CHOICE_IT choice_it(blob_list);
  for (int rank = 0; ; ++rank) {
    if (choice_it.data()->unichar_id() == target_uid) return rank;
    if (choice_it.at_last()) break;
    choice_it.forward();
  }
  return -1;
}

bool tesseract::FirstWordWouldHaveFit	(	const RowScratchRegisters &	before,
		const RowScratchRegisters &	after,
		tesseract::ParagraphJustification	justification
	)

Definition at line 1568 of file paragraphs.cpp.

                                                                          {
  // Would the first word of `after` have fit into the space left on
  // `before` under the given justification?  Rows without words always
  // answer true.
  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
    return true;

  if (justification == JUSTIFICATION_UNKNOWN) {
    tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
  }

  // Centered text can use the indent on both sides; otherwise only the
  // indent on the offside.
  int available_space = (justification == JUSTIFICATION_CENTER)
      ? before.lindent_ + before.rindent_
      : before.OffsideIndent(justification);
  available_space -= before.ri_->average_interword_space;

  // The "first" word is the left word for ltr rows, the right word otherwise.
  const int first_word_width = before.ri_->ltr
      ? after.ri_->lword_box.width()
      : after.ri_->rword_box.width();
  return first_word_width < available_space;
}

bool tesseract::FirstWordWouldHaveFit	(	const RowScratchRegisters &	before,
		const RowScratchRegisters &	after
	)

Definition at line 1593 of file paragraphs.cpp.

                                                             {
  // Justification-agnostic variant: uses the larger of the two indents of
  // `before` as the space that was available.  Rows without words always
  // answer true.
  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
    return true;

  int available_space = (before.rindent_ > before.lindent_)
      ? before.rindent_
      : before.lindent_;
  available_space -= before.ri_->average_interword_space;

  // The "first" word is the left word for ltr rows, the right word otherwise.
  const int first_word_width = before.ri_->ltr
      ? after.ri_->lword_box.width()
      : after.ri_->rword_box.width();
  return first_word_width < available_space;
}

void tesseract::FontInfoDeleteCallback ( FontInfo f )

Definition at line 44 of file fontinfo.cpp.

                                        {
  // f arrives by value, so only the heap storage it points at is released.
  delete[] f.name;
  if (f.spacing_vec == NULL) return;
  // Free the elements held by the spacing vector, then the vector itself.
  f.spacing_vec->delete_data_pointers();
  delete f.spacing_vec;
}

void tesseract::FontSetDeleteCallback ( FontSet fs )

Definition at line 51 of file fontinfo.cpp.

                                       {
  // fs arrives by value; release the configs array it points at.
  delete[] fs.configs;
}

void tesseract::GeometricClassify	(	int	debug_level,
		GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		ParagraphTheory *	theory
	)

Definition at line 1028 of file paragraphs.cpp.

                                                {
  // Tries to model rows[row_start, row_end) purely from the tab stops found
  // on the left and right edges.  On success a model is added to theory and
  // the rows are marked with it; on failure the function returns early after
  // the margins were recomputed and the hypotheses cleared.
  // NOTE(review): the 4 passed to AcceptableRowArgs() is presumably the
  // minimum number of rows required - confirm against that helper.
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
    return;
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n",
            row_start, row_end);
    tprintf("###############################################\n");
  }
  // NOTE(review): meaning of the trailing 10 is defined by
  // RecomputeMarginsAndClearHypotheses() - confirm there.
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  // More than two tab stops on both sides: give up.
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  // At most one tab stop on both sides: give up.
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  // Exactly three tab stops in total is handled by a dedicated routine.
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) {  // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines?  [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block.  That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph.  If so, mark the *other* indent as probably a first.
    if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // Share of rows at each aligned-side tab stop that look like first
    // lines, as an integer percentage of that tab stop's count.
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) ||
        percent0firsts + 30 < percent1firsts) {
      // Tab stop 1 is clearly the first-line indent.
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      // Tab stop 0 is clearly the first-line indent.
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  //    See description on MarkRowsWithModel()
  s.eop_threshold =
      (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      // Row i is followed by a valid first line, yet its offside indent is
      // not near the first offside tab stop.
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      // The next row's first word would not have fit on row i, yet row i's
      // offside indent is not near the first offside tab stop.
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}

void tesseract::GeometricClassifyThreeTabStopTextBlock	(	int	debug_level,
		GeometricClassifierState &	s,
		ParagraphTheory *	theory
	)

Definition at line 936 of file paragraphs.cpp.

                             {
  // Handles the case where the two edges of the text block have exactly
  // three tab stops between them.  Either marks the rows as a crown
  // paragraph, or adds a model to theory and marks the rows with it.

  // Count the rows IsFullRow() accepts and note whether the last row of the
  // range is one of them.
  const int total_rows = s.row_end - s.row_start;
  int full_rows = 0;
  int final_row_is_full = 0;
  for (int r = s.row_start; r < s.row_end; r++) {
    if (!s.IsFullRow(r)) continue;
    full_rows++;
    if (r == s.row_end - 1) final_row_is_full = 1;
  }

  if (full_rows < 0.7 * total_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  // Alignment is guessed from the script direction alone.
  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf("# Not enough variety for clear outline classification. "
            "Guessing these are %s aligned based on script.\n",
            s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() != 2) {  // case B1 or B2
    if (total_rows - 1 == full_rows - final_row_is_full) {
      // case B2: every row except possibly the last is full, so mark the
      // whole range as one crown paragraph and stop.
      const ParagraphModel *crown = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(crown);
      for (int r = s.row_start + 1; r < s.row_end; r++) {
        (*s.rows)[r].AddBodyLine(crown);
      }
      return;
    }
    // case B1: a single aligned tab stop serves as both indents, and the
    // end-of-paragraph threshold is the midpoint of the two offside tabs.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
    s.eop_threshold = (s.OffsideTabs()[0].center +
                       s.OffsideTabs()[1].center) / 2;
  } else {  // case A1 or A2
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  }

  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
                    s.ltr, s.eop_threshold);
}

WERD_CHOICE* tesseract::get_best_delete_other	(	WERD_CHOICE *	choice1,
		WERD_CHOICE *	choice2
	)

get_best_delete_other

Returns the best of two choices and deletes the other (worse) choice. If either choice is NULL, the other one is returned and nothing is deleted. Otherwise choice1 is kept when its rating is lower than that of choice2 or when choice2 is an empty string; in every other case, including equal ratings, choice2 is kept.

Definition at line 74 of file permute.cpp.

                                                         {
  // A missing choice loses by default and nothing is deleted.
  if (choice1 == NULL) return choice2;
  if (choice2 == NULL) return choice1;

  // choice1 wins on a strictly lower rating, or when choice2 is empty;
  // otherwise (ties included) choice2 wins.  The loser is deleted.
  const bool keep_first =
      choice1->rating() < choice2->rating() || choice2->length() == 0;
  WERD_CHOICE *winner = keep_first ? choice1 : choice2;
  WERD_CHOICE *loser = keep_first ? choice2 : choice1;
  delete loser;
  return winner;
}

WERD_CHOICE* tesseract::get_choice_from_posstr	(	const UNICHARSET *	unicharset,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		int	start_pos,
		const char *	pos_str,
		float *	certainties
	)

Returns a WERD_CHOICE formed by taking the specified position (nth choice) string from char_choices starting at the given position. For example, if start_pos=2, pos_str="0121" will form a word using the 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice of char 6. If n exceeds the number of choices, the closest (last) one is used.

Definition at line 129 of file permute.cpp.

                                                        {
  // The caller receives a newly allocated WERD_CHOICE in every case.
  const int num_positions = strlen(pos_str);
  WERD_CHOICE *word = new WERD_CHOICE(unicharset);

  // Not enough blobs after start_pos to cover pos_str: return a bad word.
  if (start_pos + num_positions > char_choices.length()) {
    word->make_bad();
    return word;
  }

  for (int i = 0; i < num_positions; ++i) {
    const char digit = pos_str[i];
    int rank = digit - '0';
    if (rank < 0) rank = 0;   // use the top choice by default, eg. '.'
    if (rank >= 10)
      tprintf("PosStr[%d](%d)=%c  %d\n", i, num_positions, digit, rank);
    ASSERT_HOST(rank < 10);
    BLOB_CHOICE *chosen =
        get_nth_choice(char_choices.get(start_pos + i), rank);
    word->set_permuter(NO_PERM);
    word->append_unichar_id(chosen->unichar_id(), 1,
                            chosen->rating(),
                            chosen->certainty());
    // Optionally report the certainty of each picked choice.
    if (certainties != NULL) certainties[i] = chosen->certainty();
  }
  return word;
}

BLOB_CHOICE* tesseract::get_nth_choice	(	BLOB_CHOICE_LIST *	blob_list,
		int	n
	)

Returns the n-th choice (counting from 0) in the given blob_list (top-K choices). If n >= K, the last choice is returned.

Definition at line 91 of file permute.cpp.

                                                                {
  // Advance at most n steps from the front, stopping early at the last
  // element, and return whatever the iterator rests on.
  BLOB_CHOICE_IT choice_it(blob_list);
  for (int steps_left = n; steps_left > 0 && !choice_it.at_last();
       --steps_left) {
    choice_it.forward();
  }
  return choice_it.data();
}

void tesseract::get_posstr_from_choice	(	const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		WERD_CHOICE *	word_choice,
		int	start_pos,
		char *	pos_str
	)

Given a WERD_CHOICE, find the corresponding position string from char_choices. Pos_str must have been allocated already. This is the reverse of get_choice_from_posstr.

Definition at line 161 of file permute.cpp.

                                           {
  // One output character per unichar of word_choice: the rank of that
  // unichar in the matching blob's choice list, written as '0' + rank.
  // A unichar that is not found is recorded as rank 0.
  const int num_chars = word_choice->length();
  for (int i = 0; i < num_chars; ++i) {
    const int rank = find_choice_by_uid(char_choices.get(start_pos + i),
                                        word_choice->unichar_id(i));
    pos_str[i] = (rank < 0 ? 0 : rank) + '0';
  }
  pos_str[num_chars] = '\0';
}

UNICHAR_ID tesseract::get_top_choice_uid ( BLOB_CHOICE_LIST * blob_list )

Returns the top choice char id. A helper function to make code cleaner.

Definition at line 99 of file permute.cpp.

                                                           {
  // INVALID_UNICHAR_ID stands in for a missing list or a NULL first entry.
  if (blob_list == NULL) return INVALID_UNICHAR_ID;
  BLOB_CHOICE_IT top_it(blob_list);
  BLOB_CHOICE *top = top_it.data();
  if (top == NULL) return INVALID_UNICHAR_ID;
  return top->unichar_id();
}

Pix* tesseract::GridReducedPix	(	const TBOX &	box,
		int	gridsize,
		ICOORD	bleft,
		int *	left,
		int *	bottom
	)

Definition at line 212 of file bbgrid.cpp.

                                                          {
  // Express the box in grid cells relative to bleft, then pad by one cell
  // on every side.
  const int cell_left = (box.left() - bleft.x()) / gridsize - 1;
  const int cell_right = (box.right() - bleft.x()) / gridsize + 1;
  const int cell_bottom = (box.bottom() - bleft.y()) / gridsize - 1;
  const int cell_top = (box.top() - bleft.y()) / gridsize + 1;

  // Report the grid coordinates of the image's bottom-left cell.
  *left = cell_left;
  *bottom = cell_bottom;

  // One pixel per grid cell; the last pixCreate argument is the depth.
  const int cells_wide = cell_right - cell_left + 1;
  const int cells_high = cell_top - cell_bottom + 1;
  return pixCreate(cells_wide, cells_high, 1);
}

void tesseract::HistogramRect	(	const unsigned char *	imagedata,
		int	bytes_per_pixel,
		int	bytes_per_line,
		int	left,
		int	top,
		int	width,
		int	height,
		int *	histogram
	)

Definition at line 93 of file otsuthr.cpp.

                                   {
  // Start from an all-zero histogram of kHistogramSize bins.
  memset(histogram, 0, sizeof(*histogram) * kHistogramSize);

  // Address of the first byte of the rectangle's top-left pixel.
  const unsigned char* scanline = imagedata +
                                  top * bytes_per_line +
                                  left * bytes_per_pixel;
  // Count the first byte of every pixel, one scanline at a time.
  const int bottom = top + height;
  int y = top;
  while (y < bottom) {
    for (int col = 0; col < width; ++col) {
      ++histogram[scanline[col * bytes_per_pixel]];
    }
    scanline += bytes_per_line;
    ++y;
  }
}

void tesseract::InitializeRowInfo	(	const MutableIterator &	it,
		RowInfo *	info
	)

Definition at line 2307 of file paragraphs.cpp.

                                                                 {
  // Fills *info from the text line the iterator `it` currently points at:
  // geometry taken from the ROW, the line's UTF-8 text, the number of
  // non-empty words, the attributes of the leftmost and rightmost word, and
  // the dominant reading direction.
  if (it.PageResIt()->row() != NULL) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    // Fall back to the x-height (at least 1) when the row reports no
    // positive space width.
    info->average_interword_space =
        row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true;  // set below depending on word scripts
  } else {
    // No row: use neutral defaults.
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  info->text = "";
  char *text = it.GetUTF8Text(RIL_TEXTLINE);
  int trailing_ws_idx = strlen(text);  // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
         isspace(text[trailing_ws_idx - 1]))
    trailing_ws_idx--;
  if (trailing_ws_idx > 0) {
    // Prefix the text with one space per interword-space unit of left
    // distance.
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++)
      info->text += ' ';
    for (int i = 0; i < trailing_ws_idx; i++)
      info->text += text[i];
  }
  delete []text;

  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;

  // An empty line keeps the cleared word attributes set just above.
  if (info->text.size() == 0) {
    return;
  }

  int ltr = 0;
  int rtl = 0;

  // Walk the words of this row on a copy of the page iterator, keeping
  // those whose best choice is a non-empty string.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  GenericVector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);

  // More than three repeated-character words count as leaders.
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (werds.size() > 0) {
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().string();
    info->rword_text = rword->best_choice->unichar_string().string();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice,
                       info->lword_text,
                       &info->lword_indicates_list_item,
                       &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice,
                        info->rword_text,
                        &info->rword_indicates_list_item,
                        &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
  // Ties between ltr and rtl word counts resolve to ltr.
  info->ltr = ltr >= rtl;
}

ParagraphModel tesseract::InternalParagraphModelByOutline	(	const GenericVector< RowScratchRegisters > *	rows,
		int	start,
		int	end,
		int	tolerance,
		bool *	consistent
	)

Definition at line 1639 of file paragraphs.cpp.

                                                         {
  // Derives a paragraph model for rows[start, end) from the outline of the
  // text alone (left/right indents of the rows).
  //   tolerance:  how much an edge may vary and still count as aligned.
  //   consistent: set to false when the rows cannot be one paragraph under
  //               any outline model; left true when no model is returned
  //               merely because there is too little to go on.
  // Returns a default-constructed ParagraphModel when no model is found.
  *consistent = true;
  // Validate the range before touching any row in it.
  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
    return ParagraphModel();

  // The range counts as ltr when at least half of its rows are ltr.
  int ltr_line_count = 0;
  for (int i = start; i < end; i++) {
    ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
  }
  bool ltr = (ltr_line_count >= (end - start) / 2);

  // Ensure the caller only passed us a region with a common rmargin and
  // lmargin.
  int lmargin = (*rows)[start].lmargin_;
  int rmargin = (*rows)[start].rmargin_;
  // Ranges of the left indent, right indent and their difference over the
  // body rows, i.e. every row after the first.
  int lmin, lmax, rmin, rmax, cmin, cmax;
  lmin = lmax = (*rows)[start + 1].lindent_;
  rmin = rmax = (*rows)[start + 1].rindent_;
  cmin = cmax = 0;
  for (int i = start + 1; i < end; i++) {
    if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
      tprintf("Margins don't match! Software error.\n");
      *consistent = false;
      return ParagraphModel();
    }
    UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
    UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
    UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
  }
  int ldiff = lmax - lmin;
  int rdiff = rmax - rmin;
  int cdiff = cmax - cmin;
  // Both edges ragged: either centered text or no consistent outline.
  if (rdiff > tolerance && ldiff > tolerance) {
    if (cdiff < tolerance * 2) {
      if (end - start < 3)
        return ParagraphModel();
      return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
    }
    *consistent = false;
    return ParagraphModel();
  }
  if (end - start < 3)  // Don't return a model for two line paras.
    return ParagraphModel();

  // These booleans keep us from saying something is aligned left when the body
  // left variance is too large.
  bool body_admits_left_alignment = ldiff < tolerance;
  bool body_admits_right_alignment = rdiff < tolerance;

  // Candidate models: first-line indent from the first row, body indent
  // from the midpoint of the body rows' range.
  ParagraphModel left_model =
      ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
                     (lmin + lmax) / 2, tolerance);
  ParagraphModel right_model =
      ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
                     (rmin + rmax) / 2, tolerance);

  // These booleans keep us from having an indent on the "wrong side" for the
  // first line.
  bool text_admits_left_alignment = ltr || left_model.is_flush();
  bool text_admits_right_alignment = !ltr || right_model.is_flush();

  // At least one of the edges is less than tolerance in variance.
  // If the other is obviously ragged, it can't be the one aligned to.
  // [Note the last line is included in this raggedness.]
  if (tolerance < rdiff) {
    if (body_admits_left_alignment && text_admits_left_alignment)
      return left_model;
    *consistent = false;
    return ParagraphModel();
  }
  if (tolerance < ldiff) {
    if (body_admits_right_alignment && text_admits_right_alignment)
      return right_model;
    *consistent = false;
    return ParagraphModel();
  }

  // At this point, we know the body text doesn't vary much on either side.

  // If the first line juts out oddly in one direction or the other,
  // that likely indicates the side aligned to.
  int first_left = (*rows)[start].lindent_;
  int first_right = (*rows)[start].rindent_;

  if (ltr && body_admits_left_alignment &&
      (first_left < lmin || first_left > lmax))
    return left_model;
  if (!ltr && body_admits_right_alignment &&
      (first_right < rmin || first_right > rmax))
    return right_model;

  *consistent = false;
  return ParagraphModel();
}

int tesseract::InterwordSpace	(	const GenericVector< RowScratchRegisters > &	rows,
		int	row_start,
		int	row_end
	)

Definition at line 1547 of file paragraphs.cpp.

                                               {
  // Smallest average interword space among the rows of [row_start, row_end)
  // that have more than one word.  When no row qualifies, the value of the
  // first row is returned; an empty range yields 1.
  if (row_end <= row_start) return 1;

  int narrowest = rows[row_start].ri_->average_interword_space;
  bool seen_multiword_row = false;
  for (int r = row_start; r < row_end; r++) {
    if (rows[r].ri_->num_words <= 1) continue;
    if (!seen_multiword_row ||
        rows[r].ri_->average_interword_space < narrowest) {
      narrowest = rows[r].ri_->average_interword_space;
    }
    seen_multiword_row = true;
  }
  return narrowest;
}

bool tesseract::IsDigitLike ( int ch )

Definition at line 209 of file paragraphs.cpp.

                         {
  // Letters that are accepted here in place of digits.
  switch (ch) {
    case 'o':
    case 'O':
    case 'l':
    case 'I':
      return true;
    default:
      return false;
  }
}

bool tesseract::IsLatinLetter ( int ch )

Definition at line 205 of file paragraphs.cpp.

                           {
  // True for the 26 unaccented ASCII letters in either case.
  const bool is_lower = 'a' <= ch && ch <= 'z';
  const bool is_upper = 'A' <= ch && ch <= 'Z';
  return is_lower || is_upper;
}

bool tesseract::IsLeftIndented ( const EquationDetect::IndentType type ) [inline]

Definition at line 95 of file equationdetect.cpp.

                                                                {
  // BOTH_INDENT includes a left indent.
  if (type == EquationDetect::BOTH_INDENT) return true;
  return type == EquationDetect::LEFT_INDENT;
}

bool tesseract::IsOpeningPunct ( int ch )

Definition at line 213 of file paragraphs.cpp.

                            {
  // True when ch is one of the ASCII characters ' " ( { [.
  // strchr() converts its second argument to char and also matches the
  // string's terminating NUL, so 0 and any value outside ASCII are rejected
  // up front; otherwise 0, or a larger code point whose low byte happens to
  // equal one of the listed characters, would be reported as a match.
  if (ch <= 0 || ch >= 0x80) return false;
  return strchr("'\"({[", ch) != NULL;
}

bool tesseract::IsRightIndented ( const EquationDetect::IndentType type ) [inline]

Definition at line 100 of file equationdetect.cpp.

                                                                 {
  // BOTH_INDENT includes a right indent.
  if (type == EquationDetect::BOTH_INDENT) return true;
  return type == EquationDetect::RIGHT_INDENT;
}

bool tesseract::IsTerminalPunct ( int ch )

Definition at line 217 of file paragraphs.cpp.

                             {
  // True when ch is one of the ASCII characters : ' " . ? ! ] } ).
  // strchr() converts its second argument to char and also matches the
  // string's terminating NUL, so 0 and any value outside ASCII are rejected
  // up front; otherwise 0, or a larger code point whose low byte happens to
  // equal one of the listed characters, would be reported as a match.
  if (ch <= 0 || ch >= 0x80) return false;
  return strchr(":'\".?!]})", ch) != NULL;
}

bool tesseract::IsTextOrEquationType ( PolyBlockType type ) [inline]

Definition at line 91 of file equationdetect.cpp.

                                                     {
  // Equations are accepted alongside whatever PTIsTextType() accepts.
  if (PTIsTextType(type)) return true;
  return type == PT_EQUATION;
}

void tesseract::LeftoverSegments	(	const GenericVector< RowScratchRegisters > &	rows,
		GenericVector< Interval > *	to_fix,
		int	row_start,
		int	row_end
	)

Definition at line 2128 of file paragraphs.cpp.

                                                  {
  // Fills *to_fix with the maximal half-open intervals of rows in
  // [row_start, row_end) that still need work: rows without any strong
  // model, crown rows that run into an unmodeled row, and rows that
  // RowIsStranded() reports.
  to_fix->clear();
  for (int r = row_start; r < row_end; r++) {
    SetOfModels strong;
    SetOfModels non_null;
    rows[r].StrongHypotheses(&strong);
    rows[r].NonNullHypotheses(&non_null);

    bool needs_fixing = false;
    if (!strong.empty()) {
      // Modeled row: only a problem when it is stranded.
      needs_fixing = RowIsStranded(rows, r);
    } else if (non_null.size() > 0) {
      // Crown paragraph.  Is it followed by a modeled line?
      // Scan forward until a row with no hypotheses at all (needs fixing)
      // or a row with a strong hypothesis (fine) is met.
      for (int next = r + 1; next < rows.size(); next++) {
        SetOfModels next_non_null;
        SetOfModels next_strong;
        rows[next].NonNullHypotheses(&next_non_null);
        rows[next].StrongHypotheses(&next_strong);
        if (next_non_null.size() == 0) {
          needs_fixing = true;
          break;
        }
        if (next_strong.size() > 0) break;
      }
    } else if (rows[r].ri_->num_words > 0) {
      // No models at all.
      needs_fixing = true;
    }

    if (!needs_fixing) continue;
    // Grow the previous interval when this row directly follows it,
    // otherwise open a new one.  Intervals are half-open: [begin, end).
    if (!to_fix->empty() && to_fix->back().end == r)
      to_fix->back().end = r + 1;
    else
      to_fix->push_back(Interval(r, r + 1));
  }
}

void tesseract::LeftWordAttributes	(	const UNICHARSET *	unicharset,
		const WERD_CHOICE *	werd,
		const STRING &	utf8,
		bool *	is_list,
		bool *	starts_idea,
		bool *	ends_idea
	)

Definition at line 406 of file paragraphs.cpp.

                                                                           {
  // Classifies the leftmost word of a line: whether it looks like a list
  // item, and whether it is likely to start and/or end an idea.
  // All three outputs are always written.
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;

  // An empty word only tells us that an idea ended.
  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {
    *ends_idea = true;
    return;
  }

  if (unicharset && werd) {
    // A proper werd and unicharset are available, so use them.
    if (UniLikelyListItem(unicharset, werd)) {
      *is_list = true;
      *starts_idea = true;
      *ends_idea = true;
    }
    const UNICHAR_ID first_id = werd->unichar_id(0);
    if (unicharset->get_isupper(first_id)) {
      *starts_idea = true;
    }
    if (unicharset->get_ispunctuation(first_id)) {
      *starts_idea = true;
      *ends_idea = true;
    }
    return;
  }

  // Fallback: assume utf8 is mostly ASCII and inspect its first byte.
  if (AsciiLikelyListItem(utf8)) {
    *is_list = true;
    *starts_idea = true;
  }
  const int first_ch = utf8[0];
  if (IsOpeningPunct(first_ch)) {
    *starts_idea = true;
  }
  if (IsTerminalPunct(first_ch)) {
    *ends_idea = true;
  }
  if ('A' <= first_ch && first_ch <= 'Z') {
    *starts_idea = true;
  }
}

bool tesseract::LikelyListMark ( const STRING & word )

Definition at line 274 of file paragraphs.cpp.

                                        {
  // Only a single-character word can be a list mark.
  if (word.size() != 1)
    return false;
  const char *kListMarks = "0Oo*.,+.";
  return strchr(kListMarks, word[0]) != NULL;
}

bool tesseract::LikelyListMarkUnicode ( int ch )

Definition at line 340 of file paragraphs.cpp.

                                   {
  // Code points below 0x80 are delegated to the single-character check.
  if (ch < 0x80) {
    STRING single_ch;
    single_ch += ch;
    return LikelyListMark(single_ch);
  }
  // TODO(eger) expand this list of unicodes as needed.
  static const int kListMarkCodepoints[] = {
    0x00B0,  // degree sign
    0x2022,  // bullet
    0x25E6,  // white bullet
    0x00B7,  // middle dot
    0x25A1,  // white square
    0x25A0,  // black square
    0x25AA,  // black small square
    0x2B1D,  // black very small square
    0x25BA,  // black right-pointing pointer
    0x25CF,  // black circle
    0x25CB,  // white circle
  };
  const int kNumCodepoints =
      sizeof(kListMarkCodepoints) / sizeof(kListMarkCodepoints[0]);
  for (int i = 0; i < kNumCodepoints; ++i) {
    if (ch == kListMarkCodepoints[i])
      return true;
  }
  return false;
}

bool tesseract::LikelyListNumeral ( const STRING & word )

Definition at line 240 of file paragraphs.cpp.

                                           {
  // Returns true if the whole of word can be consumed as at most three
  // numeral segments.  Each segment is: up to two opening brackets, then a
  // run of Roman-numeral letters, or a run of decimal digits, or exactly one
  // Latin letter, then any closing brackets followed by any separators.
  // Scanning stops after a segment with no trailing bracket/separator; the
  // result is true only if the end of the string was reached by then (which
  // makes an empty word return true).
  const char *kRomans = "ivxlmdIVXLMD";
  // All ten decimal digits.  The list previously omitted '6', which made
  // labels such as "6." or "16)" fail to parse as numerals.
  const char *kDigits = "0123456789";
  const char *kOpen = "[{(";
  const char *kSep = ":;-.,";
  const char *kClose = "]})";

  int num_segments = 0;
  const char *pos = word.string();
  while (*pos != '\0' && num_segments < 3) {
    // skip up to two open parens.
    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
    const char *numeral_end = SkipChars(numeral_start, kRomans);
    if (numeral_end != numeral_start) {
      // Got Roman Numeral. Great.
    } else {
      numeral_end = SkipChars(numeral_start, kDigits);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = SkipChars(numeral_start, IsLatinLetter);
        if (numeral_end - numeral_start != 1)
          break;  // pos still points at the unparsed segment.
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing parens or punctuation.
    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
    if (pos == numeral_end)
      break;  // Nothing separates this numeral from whatever follows.
  }
  return *pos == '\0';
}

bool tesseract::LikelyParagraphStart	(	const RowScratchRegisters &	before,
		const RowScratchRegisters &	after
	)

Definition at line 1619 of file paragraphs.cpp.

                                                            {
  // A row that follows an empty row always counts as a likely start.
  if (before.ri_->num_words == 0)
    return true;
  // Otherwise the first word of `after` must have fit on `before`, and the
  // text must support a break between the two rows.
  if (!FirstWordWouldHaveFit(before, after))
    return false;
  return TextSupportsBreak(before, after);
}

bool tesseract::LikelyParagraphStart	(	const RowScratchRegisters &	before,
		const RowScratchRegisters &	after,
		tesseract::ParagraphJustification	j
	)

Definition at line 1626 of file paragraphs.cpp.

                                                             {
  // A row that follows an empty row always counts as a likely start.
  if (before.ri_->num_words == 0)
    return true;
  // Otherwise the first word of `after` must have fit on `before` under
  // justification j, and the text must support a break between the rows.
  if (!FirstWordWouldHaveFit(before, after, j))
    return false;
  return TextSupportsBreak(before, after);
}

ShapeTable * tesseract::LoadShapeTable ( const STRING & file_prefix )

Definition at line 183 of file commontraining.cpp.

                                                      {
  // Reads the shape table stored at file_prefix + kShapeTableFileSuffix.
  // Returns a newly allocated table, or NULL if the file cannot be opened
  // or fails to deserialize.
  STRING table_path = file_prefix;
  table_path += kShapeTableFileSuffix;

  FILE* table_fp = fopen(table_path.string(), "rb");
  if (table_fp == NULL) {
    tprintf("Warning: No shape table file present: %s\n",
            table_path.string());
    return NULL;
  }

  ShapeTable* table = new ShapeTable;
  if (table->DeSerialize(false, table_fp)) {
    int shape_count = table->NumShapes();
    tprintf("Read shape table %s of %d shapes\n",
            table_path.string(), shape_count);
  } else {
    delete table;
    table = NULL;
    tprintf("Error: Failed to read shape table %s\n",
            table_path.string());
  }
  fclose(table_fp);
  return table;
}

MasterTrainer * tesseract::LoadTrainingData	(	int	argc,
		const char *const *	argv,
		bool	replication,
		ShapeTable **	shape_table,
		STRING *	file_prefix
	)

Definition at line 238 of file commontraining.cpp.

                                                     {
  // Creates a MasterTrainer and fills it either from the .tr files named on
  // the command line or from a previously serialized trainer
  // (FLAGS_input_trainer).  *file_prefix is set to FLAGS_D plus "/" (or left
  // empty).  If shape_table is non-NULL, *shape_table receives the table
  // loaded from disk or, failing that, a newly built flat shape table.
  // Returns NULL (after deleting the trainer) on any load or save failure.
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (NULL shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != NULL) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != NULL)
      shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  if (FLAGS_input_trainer.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    IntFeatureSpace fs;
    fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != NULL) {
      tprintf("Reading %s ...\n", page_name);
      FILE* fp = Efopen(page_name, "rb");
      trainer->ReadTrainingSamples(fp, feature_defs, false);
      fclose(fp);

      // Both companion file names are the page name with its trailing "tr"
      // replaced.  Clamp the stem length so a name shorter than two
      // characters cannot produce a negative index.
      int stem_len = static_cast<int>(strlen(page_name)) - 2;
      if (stem_len < 0)
        stem_len = 0;

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      STRING fontinfo_file_name = page_name;
      fontinfo_file_name.truncate_at(stem_len);  // remove "tr"
      fontinfo_file_name += "fontinfo";          // +"fontinfo"
      trainer->AddSpacingInfo(fontinfo_file_name.string());

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(stem_len);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == NULL) {
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_input_trainer.c_str());
    FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb");
    if (fp == NULL) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_input_trainer.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return NULL;
    }
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return NULL;
  }
  if (shape_table != NULL) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == NULL) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}

TBLOB* tesseract::make_tesseract_blob	(	float	baseline,
		float	xheight,
		float	descender,
		float	ascender,
		bool	numeric_mode,
		Pix *	pix
	)

Return a TBLOB * from the whole pix. To be freed later with delete.

Definition at line 1858 of file baseapi.cpp.

                                                        {
  // Build the blob from the image first.
  TBLOB *blob = TessBaseAPI::MakeTBLOB(pix);

  // A temporary row carrying the line metrics is only needed while the
  // blob is being normalized; it is released before returning.
  ROW *metrics_row =
      TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
  TessBaseAPI::NormalizeTBLOB(blob, metrics_row, numeric_mode, NULL);
  delete metrics_row;

  return blob;
}

void tesseract::MarkRowsWithModel	(	GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		const ParagraphModel *	model,
		bool	ltr,
		int	eop_threshold
	)

Definition at line 763 of file paragraphs.cpp.

                                          {
  // Adds a start-line or body-line hypothesis for `model` to every row in
  // [row_start, row_end) that is valid for that model; rows valid as
  // neither are left untouched.
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
    return;
  for (int row = row_start; row < row_end; row++) {
    const bool valid_first = ValidFirstLine(rows, row, model);
    const bool valid_body = ValidBodyLine(rows, row, model);
    if (!valid_first && !valid_body)
      continue;  // Stray row: nothing to record.

    bool starts_paragraph;
    if (valid_first != valid_body) {
      // Only one interpretation is valid, so take it.
      starts_paragraph = valid_first;
    } else if (row == row_start) {
      // Ambiguous, but the first row of the range is treated as a start.
      starts_paragraph = true;
    } else if (eop_threshold > 0) {
      // Ambiguous: start a paragraph if the previous row's indent on the
      // side opposite the justification exceeds the threshold.
      if (model->justification() == JUSTIFICATION_LEFT) {
        starts_paragraph = (*rows)[row - 1].rindent_ > eop_threshold;
      } else {
        starts_paragraph = (*rows)[row - 1].lindent_ > eop_threshold;
      }
    } else {
      // Ambiguous and no threshold given: start a paragraph if this row's
      // first word would have fit on the previous row.
      starts_paragraph = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
                                               model->justification());
    }

    if (starts_paragraph) {
      (*rows)[row].AddStartLine(model);
    } else {
      (*rows)[row].AddBodyLine(model);
    }
  }
}

void tesseract::MarkStrongEvidence	(	GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end
	)

Definition at line 1777 of file paragraphs.cpp.

                                                    {
  // Marks rows in [row_start, row_end) as body lines or paragraph start
  // lines where the evidence is patently obvious.
  //
  // The first-row and last-row blocks below index rows[row_start + 1] and
  // rows[row_end - 2], so at least two rows are required.  Check that here
  // rather than relying on every caller to have done so.
  if (!AcceptableRowArgs(0, 2, __func__, rows, row_start, row_end))
    return;

  // Record patently obvious body text.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea &&
        !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines.  Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row:
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea ||
         curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row
  {  // the check at the top guarantees we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    // There is no following row, so the row is compared against itself.
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}

void tesseract::ModelStrongEvidence	(	int	debug_level,
		GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		bool	allow_flush_models,
		ParagraphTheory *	theory
	)

Definition at line 1847 of file paragraphs.cpp.

                                                  {
  // Walks rows [row_start, row_end) looking for rows already typed LT_START.
  // From each such row it grows a run of following rows that stay
  // consistent, fits a ParagraphModel to the run, and (if a usable model
  // results) marks the first row as a start line and the rest as body lines
  // of that model.  Requires at least two rows.
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  int start = row_start;
  while (start < row_end) {
    // Advance to the next row marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START)
      start++;
    // Stop if none was found or it is the last row (no row can follow it).
    if (start >= row_end - 1)
      break;

    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    do {
      ++end;
      // rows[start, end) was consistent.
      // If rows[start, end + 1) is not consistent,
      //   just model rows[start, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // The row continues the run if it is a known body line, or is
        // untyped and its first word would not have fit on the previous row.
        next_consistent = lt == LT_BODY ||
            (lt == LT_UNKNOWN &&
             !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        // Re-fit the outline model including the candidate row; the call may
        // clear next_consistent through its last argument.
        ParagraphModel next_model = InternalParagraphModelByOutline(
            rows, start, end + 1, tolerance, &next_consistent);
        // Reject the extension if it flips the justification away from the
        // one matching the text direction of the starting row.
        if (((*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        // Already false on this path; the assignment has no effect.
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph.  If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = NULL;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end,
          Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          // Resume scanning from the second of the two rows.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      if (model) {
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    // Continue the scan from the first row not consumed by this run.
    start = end;
  }
}

int tesseract::OtsuStats	(	const int *	histogram,
		int *	H_out,
		int *	omega0_out
	)

Definition at line 113 of file otsuthr.cpp.

                                                                 {
  // Finds the threshold t that maximizes (mu_1 - mu_0)^2 * omega_0 * omega_1
  // over the kHistogramSize-entry histogram.
  // Returns the best t, or -1 if no t split the histogram into two non-empty
  // parts.  If non-NULL, *H_out receives the total count and *omega0_out the
  // count of entries at or below the returned t (0 if none was found).

  // H is the total count; mu_T is the sum of value * count over all bins.
  int H = 0;
  double mu_T = 0.0;
  for (int i = 0; i < kHistogramSize; ++i) {
    H += histogram[i];
    mu_T += static_cast<double>(i) * histogram[i];
  }

  // Now maximize sig_sq_B over t.
  // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
  int best_t = -1;
  int omega_0, omega_1;
  int best_omega_0 = 0;
  double best_sig_sq_B = 0.0;
  double mu_0, mu_1, mu_t;
  omega_0 = 0;
  mu_t = 0.0;
  for (int t = 0; t < kHistogramSize - 1; ++t) {
    // omega_0 / mu_t are running totals of count and value * count for
    // bins 0..t.
    omega_0 += histogram[t];
    mu_t += t * static_cast<double>(histogram[t]);
    if (omega_0 == 0)
      continue;  // Nothing at or below t yet.
    omega_1 = H - omega_0;
    if (omega_1 == 0)
      break;  // Nothing left above t, so no later t can split either.
    mu_0 = mu_t / omega_0;
    mu_1 = (mu_T - mu_t) / omega_1;
    double sig_sq_B = mu_1 - mu_0;
    sig_sq_B *= sig_sq_B * omega_0 * omega_1;
    // The first valid t is always accepted; later ones only if strictly
    // better, so ties keep the lowest t.
    if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
      best_sig_sq_B = sig_sq_B;
      best_t = t;
      best_omega_0 = omega_0;
    }
  }
  if (H_out != NULL) *H_out = H;
  if (omega0_out != NULL) *omega0_out = best_omega_0;
  return best_t;
}

void tesseract::OtsuThreshold	(	const unsigned char *	imagedata,
		int	bytes_per_pixel,
		int	bytes_per_line,
		int	left,
		int	top,
		int	width,
		int	height,
		int **	thresholds,
		int **	hi_values
	)

Definition at line 32 of file otsuthr.cpp.

                                                      {
  // Computes a threshold and a hi_value for each of the bytes_per_pixel
  // channels of the given image rectangle.  *thresholds and *hi_values are
  // allocated here with new[]; entries stay at -1 for channels that yield
  // no answer.
  //
  // Among the channels without a convincing hi_value, remember the best one
  // so that at least one answer can always be produced.
  int fallback_hi_value = 1;
  int fallback_channel = 0;
  double fallback_dist = 0.0;
  bool found_good_hi_value = false;
  *thresholds = new int[bytes_per_pixel];
  *hi_values = new int[bytes_per_pixel];

  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
    (*thresholds)[ch] = -1;
    (*hi_values)[ch] = -1;

    // Histogram of this channel over the image rectangle.
    int histogram[kHistogramSize];
    HistogramRect(imagedata + ch, bytes_per_pixel, bytes_per_line,
                  left, top, width, height, histogram);
    int total_count;
    int low_count;
    const int threshold = OtsuStats(histogram, &total_count, &low_count);
    if (low_count == 0 || low_count == total_count) {
      continue;  // This channel is empty.
    }
    (*thresholds)[ch] = threshold;

    // A convincing foreground is a small fraction of the total and a
    // convincing background a large fraction.  Anything in between is
    // assumed to carry no thresholding information.
    if (low_count > total_count * 0.75) {
      found_good_hi_value = true;
      (*hi_values)[ch] = 0;
    } else if (low_count < total_count * 0.25) {
      found_good_hi_value = true;
      (*hi_values)[ch] = 1;
    } else {
      // In case all channels are like this, keep the best of the bad lot.
      const int hi_value = low_count < total_count * 0.5;
      const double hi_dist = hi_value ? (total_count - low_count) : low_count;
      if (hi_dist > fallback_dist) {
        fallback_dist = hi_dist;
        fallback_hi_value = hi_value;
        fallback_channel = ch;
      }
    }
  }

  if (!found_good_hi_value) {
    // Use the best of the ones that were not good enough.
    (*hi_values)[fallback_channel] = fallback_hi_value;
  }
}

ParagraphModel tesseract::ParagraphModelByOutline	(	int	debug_level,
		const GenericVector< RowScratchRegisters > *	rows,
		int	start,
		int	end,
		int	tolerance
	)

Definition at line 1740 of file paragraphs.cpp.

                                       {
  // Thin wrapper over InternalParagraphModelByOutline() that discards the
  // consistency flag and, at debug_level >= 2, reports rows for which no
  // justification could be determined.
  bool ignored_consistency;
  ParagraphModel model = InternalParagraphModelByOutline(
      rows, start, end, tolerance, &ignored_consistency);
  if (debug_level < 2)
    return model;
  if (model.justification() == JUSTIFICATION_UNKNOWN) {
    tprintf("Could not determine a model for this paragraph:\n");
    PrintRowRange(*rows, start, end);
  }
  return model;
}

bool tesseract::read_info	(	FILE *	f,
		FontInfo *	fi,
		bool	swap
	)

Definition at line 57 of file fontinfo.cpp.

                                                 {
  // Reads a length-prefixed font name followed by the properties word from
  // f into fi, byte-swapping the 32-bit fields when swap is set.
  // Returns false on a short read or an invalid length.  Once the name
  // buffer has been allocated it is stored in fi->name, even on failure.
  inT32 size;
  if (fread(&size, sizeof(size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&size);
  // The length comes straight from the file; a negative value would turn
  // into an invalid allocation size and a huge read count below.
  if (size < 0) return false;
  // Value-initialize so fi->name is a valid (possibly truncated) C string
  // even if the read below comes up short.
  char* font_name = new char[size + 1]();
  fi->name = font_name;
  if (fread(font_name, sizeof(*font_name), size, f) !=
      static_cast<size_t>(size)) {
    return false;
  }
  font_name[size] = '\0';
  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fi->properties);
  return true;
}

bool tesseract::read_set	(	FILE *	f,
		FontSet *	fs,
		bool	swap
	)

Definition at line 140 of file fontinfo.cpp.

                                               {
  // Reads a count followed by that many config entries from f into fs,
  // byte-swapping each 32-bit value when swap is set.
  // Returns false on a short read or an invalid count.
  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fs->size);
  // The count comes straight from the file; a negative value would become
  // an invalid allocation size.  Leave fs as an empty set in that case.
  if (fs->size < 0) {
    fs->size = 0;
    fs->configs = NULL;
    return false;
  }
  fs->configs = new int[fs->size];
  for (int i = 0; i < fs->size; ++i) {
    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
    if (swap)
      Reverse32(&fs->configs[i]);
  }
  return true;
}

bool tesseract::read_spacing_info	(	FILE *	f,
		FontInfo *	fi,
		bool	swap
	)

Definition at line 80 of file fontinfo.cpp.

                                                         {
  // Reads the spacing vector of fi from f: a count, then per entry the two
  // x gaps, a kerning count and (if that count is positive) the kerning
  // tables.  Values are byte-swapped when swap is set.
  // Returns false on any short read or failed deserialization.
  inT32 vec_size, kern_size;
  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
  if (swap) Reverse32(&vec_size);
  ASSERT_HOST(vec_size >= 0);
  if (vec_size == 0) return true;
  fi->init_spacing(vec_size);
  for (int i = 0; i < vec_size; ++i) {
    FontSpacingInfo *fs = new FontSpacingInfo();
    if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
        fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
        fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
      delete fs;  // Not yet handed to fi, so it must be released here.
      return false;
    }
    if (swap) {
      ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
      ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
      Reverse32(&kern_size);
    }
    if (kern_size < 0) {  // indication of a NULL entry in fi->spacing_vec
      delete fs;
      continue;
    }
    if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
                          !fs->kerned_x_gaps.DeSerialize(swap, f))) {
      delete fs;  // Not yet handed to fi, so it must be released here.
      return false;
    }
    fi->add_spacing(i, fs);
  }
  return true;
}

bool tesseract::read_t	(	PAGE_RES_IT *	page_res_it,
		TBOX *	tbox
	)

Definition at line 58 of file recogtraining.cpp.

                                                  {
  // Copies the bounding box of the next word into *tbox and steps the
  // iterator past that word.  Returns false when no word is left.

  // Skip forward until the iterator sits on a word or runs out of blocks.
  while (page_res_it->block() != NULL && page_res_it->word() == NULL) {
    page_res_it->forward();
  }
  if (page_res_it->word() == NULL) {
    return false;
  }

  *tbox = page_res_it->word()->word->bounding_box();
  page_res_it->forward();

  // If tbox->left() is negative, the training image has vertical text and
  // all the coordinates of bounding boxes of page_res are rotated by 90
  // degrees in a counterclockwise direction. We need to rotate the TBOX back
  // in order to compare with the TBOXes of box files.
  if (tbox->left() < 0) {
    tbox->rotate(FCOORD(0.0, -1.0));
  }
  return true;
}

void tesseract::RecomputeMarginsAndClearHypotheses	(	GenericVector< RowScratchRegisters > *	rows,
		int	start,
		int	end,
		int	percentile
	)

Definition at line 1507 of file paragraphs.cpp.

                    {
  // Resets every row in [start, end) to unknown and re-splits each row's
  // margin + indent so that the margin becomes the given percentile of the
  // margin + indent totals observed on the non-empty rows.
  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
    return;

  // Pass 1: clear hypotheses and find the range of left/right totals.
  int lmin, lmax, rmin, rmax;
  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
  for (int r = start; r < end; r++) {
    RowScratchRegisters &row = (*rows)[r];
    row.SetUnknown();
    if (row.ri_->num_words != 0) {
      UpdateRange(row.lmargin_ + row.lindent_, &lmin, &lmax);
      UpdateRange(row.rmargin_ + row.rindent_, &rmin, &rmax);
    }
  }

  // Pass 2: collect the totals of the non-empty rows.
  STATS lefts(lmin, lmax + 1);
  STATS rights(rmin, rmax + 1);
  for (int r = start; r < end; r++) {
    RowScratchRegisters &row = (*rows)[r];
    if (row.ri_->num_words != 0) {
      lefts.add(row.lmargin_ + row.lindent_, 1);
      rights.add(row.rmargin_ + row.rindent_, 1);
    }
  }

  // Pass 3: move each margin to the percentile value, adjusting the indent
  // by the same amount so that margin + indent is unchanged.
  const double fraction = ClipToRange(percentile, 0, 100) / 100.0;
  int ignorable_left = lefts.ile(fraction);
  int ignorable_right = rights.ile(fraction);
  for (int r = start; r < end; r++) {
    RowScratchRegisters &row = (*rows)[r];
    const int left_shift = ignorable_left - row.lmargin_;
    row.lmargin_ += left_shift;
    row.lindent_ -= left_shift;
    const int right_shift = ignorable_right - row.rmargin_;
    row.rmargin_ += right_shift;
    row.rindent_ -= right_shift;
  }
}

void tesseract::RightWordAttributes	(	const UNICHARSET *	unicharset,
		const WERD_CHOICE *	werd,
		const STRING &	utf8,
		bool *	is_list,
		bool *	starts_idea,
		bool *	ends_idea
	)

Definition at line 453 of file paragraphs.cpp.

                                                                            {
  // Classifies the rightmost word of a line: whether it looks like a list
  // item, and whether it is likely to start and/or end an idea.
  // All three outputs are always written.
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;

  // An empty word only tells us that an idea ended.
  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {
    *ends_idea = true;
    return;
  }

  if (unicharset && werd) {
    // A proper werd and unicharset are available, so use them.
    if (UniLikelyListItem(unicharset, werd)) {
      *is_list = true;
      *starts_idea = true;
    }
    const UNICHAR_ID final_id = werd->unichar_id(werd->length() - 1);
    if (unicharset->get_ispunctuation(final_id)) {
      *ends_idea = true;
    }
    return;
  }

  // Fallback: assume utf8 is mostly ASCII and inspect its last byte.
  if (AsciiLikelyListItem(utf8)) {
    *is_list = true;
    *starts_idea = true;
  }
  const int final_ch = utf8[utf8.size() - 1];
  if (IsOpeningPunct(final_ch) || IsTerminalPunct(final_ch)) {
    *ends_idea = true;
  }
}

bool tesseract::RowIsStranded	(	const GenericVector< RowScratchRegisters > &	rows,
		int	row
	)

Definition at line 2086 of file paragraphs.cpp.

                                                                            {
  // Returns false if, for at least one of the row's strong models, the row
  // belongs to a long enough run of adjacent rows typed under that model;
  // returns true otherwise (including when the row has no strong models).
  SetOfModels row_models;
  rows[row].StrongHypotheses(&row_models);

  for (int m = 0; m < row_models.size(); m++) {
    // NOTE(review): this converts a LineType to bool rather than comparing
    // it with LT_START; if every LineType value is non-zero this is always
    // true.  Confirm whether `== LT_START` was intended.
    bool all_starts = rows[row].GetLineType();
    int run_length = 1;
    bool continues = true;
    // Extend the run upwards while rows keep a start/body type for model m.
    for (int i = row - 1; i >= 0 && continues; i--) {
      // NOTE(review): `models` is filled but never read in this loop.
      SetOfModels models;
      rows[i].NonNullHypotheses(&models);
      switch (rows[i].GetLineType(row_models[m])) {
        case LT_START: run_length++; break;
        case LT_MULTIPLE:  // explicit fall-through
        case LT_BODY: run_length++; all_starts = false; break;
        case LT_UNKNOWN:  // explicit fall-through
        default: continues = false;
      }
    }
    continues = true;
    // Extend the run downwards in the same way.
    for (int i = row + 1; i < rows.size() && continues; i++) {
      // NOTE(review): `models` is filled but never read in this loop.
      SetOfModels models;
      rows[i].NonNullHypotheses(&models);
      switch (rows[i].GetLineType(row_models[m])) {
        case LT_START: run_length++; break;
        case LT_MULTIPLE:  // explicit fall-through
        case LT_BODY: run_length++; all_starts = false; break;
        case LT_UNKNOWN:  // explicit fall-through
        default: continues = false;
      }
    }
    // Not stranded: a run of three or more rows, or a run of two or more
    // that contains at least one body/multiple row.
    if (run_length > 2 || (!all_starts && run_length > 1)) return false;
  }
  return true;
}

bool tesseract::RowsFitModel	(	const GenericVector< RowScratchRegisters > *	rows,
		int	start,
		int	end,
		const ParagraphModel *	model
	)

Definition at line 1755 of file paragraphs.cpp.

                                                                   {
  // True when rows[start] is a valid first line of `model` and every row in
  // (start, end) is a valid body line of it.  Needs at least one row.
  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
    return false;
  bool fits = ValidFirstLine(rows, start, model);
  for (int row = start + 1; fits && row < end; row++) {
    fits = ValidBodyLine(rows, row, model);
  }
  return fits;
}

STRING tesseract::RtlEmbed	(	const STRING &	word,
		bool	rtlify
	)

Definition at line 133 of file paragraphs.cpp.

                                                 {
  // Returns word unchanged unless rtlify is set, in which case the word is
  // wrapped between kRLE and kPDF.
  if (!rtlify)
    return word;
  return STRING(kRLE) + word + STRING(kPDF);
}

void tesseract::SeparateSimpleLeaderLines	(	GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		ParagraphTheory *	theory
	)

Definition at line 1972 of file paragraphs.cpp.

                                                        {
  // Every interior row that has leaders and whose two neighbours also have
  // leaders is marked as a start line of a fresh model added to theory.
  for (int row = row_start + 1; row < row_end - 1; row++) {
    const bool surrounded_by_leaders = (*rows)[row - 1].ri_->has_leaders &&
                                       (*rows)[row].ri_->has_leaders &&
                                       (*rows)[row + 1].ri_->has_leaders;
    if (!surrounded_by_leaders)
      continue;
    const ParagraphModel *leader_model = theory->AddModel(
        ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
    (*rows)[row].AddStartLine(leader_model);
  }
}

template<typename T >

void tesseract::SimpleSwap	(	T &	a,
		T &	b
	)

Definition at line 62 of file paragraphs.cpp.

                            {
  // Exchanges a and b through a temporary copy of a.
  T saved_a = a;
  a = b;
  b = saved_a;
}

const char* tesseract::SkipChars	(	const char *	str,
		const char *	toskip
	)

Definition at line 222 of file paragraphs.cpp.

                                                           {
  // Returns a pointer to the first character of str that is not in toskip
  // (or to the terminating NUL if every character is).
  const char *cursor = str;
  for (; *cursor != '\0'; ++cursor) {
    if (!strchr(toskip, *cursor))
      break;
  }
  return cursor;
}

const char* tesseract::SkipChars	(	const char *	str,
		bool(*)(int)	skip
	)

Definition at line 227 of file paragraphs.cpp.

                                                          {
  // Returns a pointer to the first character of str for which skip()
  // returns false (or to the terminating NUL if there is none).
  const char *cursor = str;
  for (; *cursor != '\0'; ++cursor) {
    if (!skip(*cursor))
      break;
  }
  return cursor;
}

const char* tesseract::SkipOne	(	const char *	str,
		const char *	toskip
	)

Definition at line 232 of file paragraphs.cpp.

                                                         {
  // Steps over at most one leading character of str, and only if that
  // character occurs in toskip.
  const bool at_end = (*str == '\0');
  if (at_end || !strchr(toskip, *str))
    return str;
  return str + 1;
}

template<typename T >

int tesseract::sort_cmp	(	const void *	t1,
		const void *	t2
	)

Definition at line 294 of file genericvector.h.

                                             {
  // Three-way comparison of two T objects passed as untyped pointers,
  // built from T's operator< only.
  const T &lhs = *static_cast<const T *>(t1);
  const T &rhs = *static_cast<const T *>(t2);
  if (lhs < rhs)
    return -1;
  if (rhs < lhs)
    return 1;
  return 0;
}

template<typename T >

int tesseract::sort_ptr_cmp	(	const void *	t1,
		const void *	t2
	)

Definition at line 311 of file genericvector.h.

                                                 {
  // Three-way comparison for arrays of T*: each argument points at a T*
  // element, and the pointed-to objects are compared with operator<.
  const T *lhs = *reinterpret_cast<T * const *>(t1);
  const T *rhs = *reinterpret_cast<T * const *>(t2);
  if (*lhs < *rhs)
    return -1;
  if (*rhs < *lhs)
    return 1;
  return 0;
}

template<class BBC >

int tesseract::SortByBoxBottom	(	const void *	void1,
		const void *	void2
	)

Definition at line 405 of file bbgrid.h.

                                                          {
  // Comparator over pointers to BBC* elements. Orders by ascending
  // bounding_box().bottom(); ties are broken by top(), then left(), then
  // right(), all ascending. The return value is the difference of the first
  // coordinate that differs (0 only when all four edges are equal).
  // The void*s are actually doubly indirected, so get rid of one level.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (result != 0)
    return result;
  result =  p1->bounding_box().top() - p2->bounding_box().top();
  if (result != 0)
    return result;
  result = p1->bounding_box().left() - p2->bounding_box().left();
  if (result != 0)
    return result;
  return p1->bounding_box().right() - p2->bounding_box().right();
}

template<class BBC >

int tesseract::SortByBoxLeft	(	const void *	void1,
		const void *	void2
	)

Definition at line 369 of file bbgrid.h.

                                                        {
  // Comparator over pointers to BBC* elements. Orders by ascending
  // bounding_box().left(); ties are broken by right(), then bottom(), then
  // top(), all ascending. The return value is the difference of the first
  // coordinate that differs (0 only when all four edges are equal).
  // The void*s are actually doubly indirected, so get rid of one level.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int result = p1->bounding_box().left() - p2->bounding_box().left();
  if (result != 0)
    return result;
  result = p1->bounding_box().right() - p2->bounding_box().right();
  if (result != 0)
    return result;
  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (result != 0)
    return result;
  return p1->bounding_box().top() - p2->bounding_box().top();
}

template<class BLOB_CHOICE >

int tesseract::SortByRating	(	const void *	void1,
		const void *	void2
	)

Definition at line 116 of file pieces.cpp.

                                                       {
  // Comparator over pointers to BLOB_CHOICE* elements (one level of
  // indirection is removed first). Sorts by descending rating(): the element
  // with the lower rating compares as greater.
  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);

  if (p1->rating() < p2->rating())
    return 1;
  if (p2->rating() < p1->rating())
    return -1;
  // Equal ratings must compare equal. Returning -1 for a tie would make both
  // cmp(a, b) and cmp(b, a) negative, which is not a consistent ordering for
  // a qsort-style sort.
  return 0;
}

template<class BLOB_CHOICE >

int tesseract::SortByUnicharID	(	const void *	void1,
		const void *	void2
	)

Definition at line 108 of file pieces.cpp.

                                                          {
  // Comparator over pointers to BLOB_CHOICE* elements (one level of
  // indirection is removed first). Orders by ascending unichar_id(); the
  // return value is the difference of the two ids.
  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);

  return p1->unichar_id() - p2->unichar_id();
}

template<class BBC >

int tesseract::SortRightToLeft	(	const void *	void1,
		const void *	void2
	)

Definition at line 387 of file bbgrid.h.

                                                          {
  // Comparator over pointers to BBC* elements. Orders by descending
  // bounding_box().right() (note the p2 - p1 operand order), then by
  // descending left(); remaining ties are broken by ascending bottom() and
  // then ascending top().
  // The void*s are actually doubly indirected, so get rid of one level.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int result = p2->bounding_box().right() - p1->bounding_box().right();
  if (result != 0)
    return result;
  result = p2->bounding_box().left() - p1->bounding_box().left();
  if (result != 0)
    return result;
  result = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (result != 0)
    return result;
  return p1->bounding_box().top() - p2->bounding_box().top();
}

void tesseract::StrongEvidenceClassify	(	int	debug_level,
		GenericVector< RowScratchRegisters > *	rows,
		int	row_start,
		int	row_end,
		ParagraphTheory *	theory
	)

Definition at line 1942 of file paragraphs.cpp.

                                                     {
  // Runs the strong-evidence pass over rows[row_start, row_end): marks the
  // strong signals, builds paragraph models from them into *theory, and then
  // smears the resulting hypotheses across the row range.
  // Does nothing when AcceptableRowArgs() rejects the arguments.
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  if (debug_level > 1) {
    tprintf("#############################################\n");
    tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
    tprintf("#############################################\n");
  }

  // NOTE(review): the meaning of the literal 10 is defined by
  // RecomputeMarginsAndClearHypotheses() -- not visible from here.
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
  MarkStrongEvidence(rows, row_start, row_end);

  // State dumps are only produced at debug_level > 2.
  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);

  // Create paragraph models.
  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);

  DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);

  // At this point, some rows are marked up as paragraphs with model numbers,
  // and some rows are marked up as either LT_START or LT_BODY.  Now let's
  // smear any good paragraph hypotheses forward and backward.
  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
  smearer.Smear();
}

bool tesseract::StrongModel ( const ParagraphModel * model ) [inline]

Definition at line 75 of file paragraphs_internal.h.

                                                     {
  // A model counts as strong when it is neither NULL nor one of the two
  // sentinel pointers kCrownLeft / kCrownRight.
  return model != NULL && model != kCrownLeft && model != kCrownRight;
}

bool tesseract::TextSupportsBreak	(	const RowScratchRegisters &	before,
		const RowScratchRegisters &	after
	)

Definition at line 1608 of file paragraphs.cpp.

                                                         {
  // Combines the per-row word flags of two rows into a single yes/no answer.
  // Only the ltr flag of `before` selects which pair of flags is consulted:
  //   ltr:     before.rword_likely_ends_idea && after.lword_likely_starts_idea
  //   not ltr: before.lword_likely_ends_idea && after.rword_likely_starts_idea
  if (before.ri_->ltr) {
    return before.ri_->rword_likely_ends_idea &&
           after.ri_->lword_likely_starts_idea;
  } else {
    return before.ri_->lword_likely_ends_idea &&
           after.ri_->rword_likely_starts_idea;
  }
}

Pix * tesseract::TraceBlockOnReducedPix	(	BLOCK *	block,
		int	gridsize,
		ICOORD	bleft,
		int *	left,
		int *	bottom
	)

Definition at line 258 of file bbgrid.cpp.

                                                                  {
  // Obtains a Pix from GridReducedPix() for the block's bounding box and sets
  // one bit in it for every grid cell visited while walking the edges of the
  // block's polygon. *left and *bottom are passed through to GridReducedPix()
  // and then used as the grid offsets of the Pix. Returns the Pix.
  // NOTE(review): ownership of the returned Pix presumably passes to the
  // caller -- confirm.
  TBOX box = block->bounding_box();
  Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
  int wpl = pixGetWpl(pix);
  l_uint32* data = pixGetData(pix);
  // NOTE(review): poly_block() is dereferenced without a NULL check --
  // confirm callers only pass blocks that have a polygon.
  ICOORDELT_IT it(block->poly_block()->points());
  for (it.mark_cycle_pt(); !it.cycled_list();) {
    // Each iteration handles the edge from the current vertex to the next
    // one; the iterator is advanced here rather than in the for header.
    ICOORD pos = *it.data();
    it.forward();
    ICOORD next_pos = *it.data();
    ICOORD line_vector = next_pos - pos;
    int major, minor;
    ICOORD major_step, minor_step;
    line_vector.setup_render(&major_step, &minor_step, &major, &minor);
    // Integer error accumulator: always take a major step, and add a minor
    // step whenever the accumulated minor amount reaches major.
    int accumulator = major / 2;
    while (pos != next_pos) {
      // Convert the image position to grid coordinates relative to the Pix.
      int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
      int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
      SET_DATA_BIT(data + grid_y * wpl, grid_x);
      pos += major_step;
      accumulator += minor;
      if (accumulator >= major) {
        accumulator -= major;
        pos += minor_step;
      }
    }
  }
  return pix;
}

Pix * tesseract::TraceOutlineOnReducedPix	(	C_OUTLINE *	outline,
		int	gridsize,
		ICOORD	bleft,
		int *	left,
		int *	bottom
	)

Definition at line 232 of file bbgrid.cpp.

                                                                    {
  // Obtains a Pix from GridReducedPix() for the outline's bounding box and
  // sets one bit in it for the grid cell of every position along the outline.
  // *left and *bottom are passed through to GridReducedPix() and then used as
  // the grid offsets of the Pix. Returns the Pix.
  // NOTE(review): ownership of the returned Pix presumably passes to the
  // caller -- confirm.
  TBOX box = outline->bounding_box();
  Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
  int wpl = pixGetWpl(pix);
  l_uint32* data = pixGetData(pix);
  int length = outline->pathlength();
  ICOORD pos = outline->start_pos();
  // Walk the outline one step at a time, starting from start_pos().
  for (int i = 0; i < length; ++i) {
    // Convert the image position to grid coordinates relative to the Pix.
    int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
    int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
    SET_DATA_BIT(data + grid_y * wpl, grid_x);
    pos += outline->step(i);
  }
  return pix;
}

int tesseract::UnicodeFor	(	const UNICHARSET *	u,
		const WERD_CHOICE *	werd,
		int	pos
	)

Definition at line 286 of file paragraphs.cpp.

                                                                      {
  // Returns first_uni() of the UNICHAR built from the unichar at index pos
  // of werd, or 0 when u or werd is NULL or pos is not a valid index.
  // Valid indices are 0 .. werd->length() - 1: pos == werd->length() is one
  // past the last unichar and must be rejected as well, as must negative
  // positions, so that unichar_id() is never asked for an element that does
  // not exist.
  if (!u || !werd || pos < 0 || pos >= werd->length())
    return 0;
  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
}

bool tesseract::UniLikelyListItem	(	const UNICHARSET *	u,
		const WERD_CHOICE *	werd
	)

Definition at line 369 of file paragraphs.cpp.

                                                                     {
  // Decides whether werd looks like a list-item marker.
  // Case 1: a single unichar that LikelyListMarkUnicode() accepts.
  // Case 2: the whole word consists of at most three "numeral" segments,
  // where each segment is a run of roman numerals, a run of digits, or
  // exactly one alphabetic unichar; a segment may be preceded by at most one
  // unichar skipped by SkipPunc(), and every segment except the last has to be
  // followed by something SkipPunc() skips.
  // NOTE(review): the description of what the Skip*() helpers consume is
  // inferred from their names -- confirm against UnicodeSpanSkipper.
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
    return true;

  UnicodeSpanSkipper m(u, werd);
  int num_segments = 0;
  int pos = 0;
  while (pos < werd->length() && num_segments < 3) {
    int numeral_start = m.SkipPunc(pos);
    // More than one leading unichar was skipped: give up.
    if (numeral_start > pos + 1) break;
    int numeral_end = m.SkipRomans(numeral_start);
    if (numeral_end == numeral_start) {
      // No roman numerals here; try digits next.
      numeral_end = m.SkipDigits(numeral_start);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = m.SkipAlpha(numeral_start);
        if (numeral_end - numeral_start != 1)
          break;
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing punctuation.
    pos = m.SkipPunc(numeral_end);
    // Nothing was skipped after the numeral, so no further segment can start.
    if (pos == numeral_end)
      break;
  }
  // Only a word that was consumed completely qualifies.
  return pos == werd->length();
}

bool tesseract::ValidBodyLine	(	const GenericVector< RowScratchRegisters > *	rows,
		int	row,
		const ParagraphModel *	model
	)

Definition at line 1226 of file paragraphs.cpp.

                                                         {
  // Asks model whether row `row` of *rows is acceptable as a body line,
  // passing the row's lmargin_, lindent_, rindent_ and rmargin_.
  // A model that is not strong is reported through tprintf() and yields
  // false without being dereferenced.
  if (!StrongModel(model)) {
    tprintf("ValidBodyLine() should only be called with strong models!\n");
  }
  return StrongModel(model) &&
      model->ValidBodyLine(
          (*rows)[row].lmargin_, (*rows)[row].lindent_,
          (*rows)[row].rindent_, (*rows)[row].rmargin_);
}

bool tesseract::ValidFirstLine	(	const GenericVector< RowScratchRegisters > *	rows,
		int	row,
		const ParagraphModel *	model
	)

Definition at line 1215 of file paragraphs.cpp.

                                                          {
  // Asks model whether row `row` of *rows is acceptable as a first line,
  // passing the row's lmargin_, lindent_, rindent_ and rmargin_.
  // A model that is not strong is reported through tprintf() and yields
  // false without being dereferenced.
  if (!StrongModel(model)) {
    tprintf("ValidFirstLine() should only be called with strong models!\n");
  }
  return StrongModel(model) &&
      model->ValidFirstLine(
          (*rows)[row].lmargin_, (*rows)[row].lindent_,
          (*rows)[row].rindent_, (*rows)[row].rmargin_);
}

bool tesseract::write_info	(	FILE *	f,
		const FontInfo &	fi
	)

Definition at line 72 of file fontinfo.cpp.

                                             {
  // Writes fi to f as: an inT32 name length, the name bytes (no terminating
  // NUL), then the raw bytes of fi.properties.
  // Returns false as soon as any fwrite() writes fewer items than requested,
  // true otherwise.
  // NOTE(review): fi.name goes straight into strlen(); presumably it is never
  // NULL here -- confirm against callers.
  inT32 size = strlen(fi.name);
  if (fwrite(&size, sizeof(size), 1, f) != 1) return false;
  // NOTE(review): compares fwrite()'s size_t result with the signed inT32
  // size; harmless while size is non-negative.
  if (fwrite(fi.name, sizeof(*fi.name), size, f) != size) return false;
  if (fwrite(&fi.properties, sizeof(fi.properties), 1, f) != 1) return false;
  return true;
}

bool tesseract::write_set	(	FILE *	f,
		const FontSet &	fs
	)

Definition at line 153 of file fontinfo.cpp.

                                           {
  // Writes fs to f as: the raw bytes of fs.size followed by fs.size entries
  // of fs.configs, each written as raw bytes.
  // Returns false as soon as any fwrite() fails, true otherwise.
  if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1) return false;
  for (int i = 0; i < fs.size; ++i) {
    if (fwrite(&fs.configs[i], sizeof(fs.configs[i]), 1, f) != 1) return false;
  }
  return true;
}

bool tesseract::write_spacing_info	(	FILE *	f,
		const FontInfo &	fi
	)

Definition at line 112 of file fontinfo.cpp.

                                                     {
  // Writes the spacing vector of fi to f. Layout:
  //   inT32 vec_size (0 when fi.spacing_vec is NULL), then per entry:
  //     x_gap_before, x_gap_after, inT32 kern_size,
  //     and, only when kern_size > 0, the serialized kerned_unichar_ids
  //     followed by the serialized kerned_x_gaps.
  // A NULL entry is written as two inT16 values of -1 and a kern_size of -1.
  // Returns false as soon as any write fails, true otherwise.
  inT32 vec_size = (fi.spacing_vec == NULL) ? 0 : fi.spacing_vec->size();
  if (fwrite(&vec_size,  sizeof(vec_size), 1, f) != 1) return false;
  // Placeholder written for both gaps of a missing entry.
  inT16 x_gap_invalid = -1;
  for (int i = 0; i < vec_size; ++i) {
    FontSpacingInfo *fs = fi.spacing_vec->get(i);
    inT32 kern_size = (fs == NULL) ? -1 : fs->kerned_x_gaps.size();
    if (fs == NULL) {
      if (fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
          fwrite(&(x_gap_invalid), sizeof(x_gap_invalid), 1, f) != 1 ||
          fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
        return false;
      }
    } else {
      // NOTE(review): the record only has a fixed size if x_gap_before and
      // x_gap_after are as wide as the inT16 placeholder -- confirm in
      // FontSpacingInfo.
      if (fwrite(&(fs->x_gap_before), sizeof(fs->x_gap_before), 1, f) != 1 ||
          fwrite(&(fs->x_gap_after), sizeof(fs->x_gap_after), 1, f) != 1 ||
          fwrite(&kern_size, sizeof(kern_size), 1, f) != 1) {
        return false;
      }
    }
    // kern_size > 0 implies fs != NULL (a NULL entry has kern_size == -1),
    // so fs is safe to dereference here.
    if (kern_size > 0 && (!fs->kerned_unichar_ids.Serialize(f) ||
                          !fs->kerned_x_gaps.Serialize(f))) {
      return false;
    }
  }
  return true;
}

void tesseract::WriteShapeTable	(	const STRING &	file_prefix,
		const ShapeTable &	shape_table
	)

Definition at line 209 of file commontraining.cpp.

                                                                               {
  // Serializes shape_table into the file named file_prefix followed by
  // kShapeTableFileSuffix, opened in binary write mode.
  // Failures (file cannot be created, Serialize() returns false) are only
  // reported on stderr; the function returns nothing to the caller.
  STRING shape_table_file = file_prefix;
  shape_table_file += kShapeTableFileSuffix;
  FILE* fp = fopen(shape_table_file.string(), "wb");
  if (fp != NULL) {
    if (!shape_table.Serialize(fp)) {
      fprintf(stderr, "Error writing shape table: %s\n",
              shape_table_file.string());
    }
    // The file is closed whether or not serialization succeeded.
    fclose(fp);
  } else {
    fprintf(stderr, "Error creating shape table: %s\n",
            shape_table_file.string());
  }
}

Variable Documentation

const int tesseract::case_state_table[6][4]

Initial value:

 { {
                                  
    
                                  
      0, 1, 5, 4
    },
    {                            
      0, 3, 2, 4
    },
    {                            
      0, -1, 2, -1
    },
    {                            
      0, 3, -1, 4
    },
    {                            
      0, -1, -1, 4
    },
    {                            
      5, -1, 2, -1
    },
  }

Definition at line 35 of file context.cpp.

const int tesseract::kAdjacentLeaderSearchPadding = 2

Definition at line 124 of file tablefind.cpp.

const double tesseract::kAlignedFraction = 0.03125

Definition at line 40 of file alignedblob.cpp.

const double tesseract::kAlignedGapFraction = 0.75

Definition at line 44 of file alignedblob.cpp.

const char* tesseract::kAlignmentNames[]

Initial value:

 {
  "Left Aligned",
  "Left Ragged",
  "Center",
  "Right Aligned",
  "Right Ragged",
  "Separator"
}

Definition at line 516 of file tabvector.cpp.

const double tesseract::kAllowBlobArea = 0.05

Definition at line 60 of file tablefind.cpp.

const double tesseract::kAllowBlobHeight = 0.3

Definition at line 58 of file tablefind.cpp.

const double tesseract::kAllowBlobWidth = 0.4

Definition at line 59 of file tablefind.cpp.

const double tesseract::kAllowTextArea = 0.8

Definition at line 53 of file tablefind.cpp.

const double tesseract::kAllowTextHeight = 0.5

Definition at line 51 of file tablefind.cpp.

const double tesseract::kAllowTextWidth = 0.6

Definition at line 52 of file tablefind.cpp.

const char * tesseract::kApostropheLikeUTF8[]

Initial value:

 {
  "'",       
  "`",       
  "\u2018",  
  "\u2019",  
  "\u2032",  
  NULL,      
}

Definition at line 48 of file unicodes.cpp.

const double tesseract::kBigPartSizeRatio = 1.75

Definition at line 47 of file colpartitiongrid.cpp.

const int tesseract::kBoxClipTolerance = 2

Definition at line 31 of file boxword.cpp.

const double tesseract::kBrokenCJKIterationFraction = 0.125

Definition at line 79 of file strokewidth.cpp.

const int tesseract::kBytesPer64BitNumber = 20

Max bytes in the decimal representation of inT64.

Definition at line 1121 of file baseapi.cpp.

const int tesseract::kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1

Multiplier for the maximum expected text length: assumes (kBytesPerNumber + space) * kNumbersPerBlob plus the newline. Add to this the original UTF8 characters, and one kMaxBytesPerLine for safety.

Definition at line 1118 of file baseapi.cpp.

const int tesseract::kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1

Definition at line 1119 of file baseapi.cpp.

const int tesseract::kBytesPerNumber = 5

The number of bytes taken by each number. Since we use inT16 for ICOORD, assume only 5 digits max.

Definition at line 1112 of file baseapi.cpp.

const int tesseract::kCellSplitColumnThreshold = 0

Definition at line 36 of file tablerecog.cpp.

const int tesseract::kCellSplitRowThreshold = 0

Definition at line 35 of file tablerecog.cpp.

const double tesseract::kCharVerticalOverlapFraction = 0.375

Definition at line 63 of file tabfind.cpp.

const double tesseract::kCJKAspectRatio = 1.25

Definition at line 73 of file strokewidth.cpp.

const double tesseract::kCJKAspectRatioIncrease = 1.0625

Definition at line 75 of file strokewidth.cpp.

const double tesseract::kCJKBrokenDistanceFraction = 0.25

Definition at line 69 of file strokewidth.cpp.

const int tesseract::kCJKMaxComponents = 8

Definition at line 71 of file strokewidth.cpp.

const int tesseract::kCJKRadius = 2

Definition at line 67 of file strokewidth.cpp.

const int tesseract::kColumnWidthFactor = 20

Pixel resolution of column width estimates.

Definition at line 51 of file tabfind.h.

const double tesseract::kCosMaxSkewAngle = 0.866025

Definition at line 82 of file tabfind.cpp.

const int tesseract::kCrackSpacing = 100

Spacing of cracks across the page to break up tall vertical lines.

Definition at line 44 of file linefind.cpp.

const ParagraphModel * tesseract::kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)

Definition at line 50 of file paragraphs.cpp.

const ParagraphModel * tesseract::kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)

Definition at line 52 of file paragraphs.cpp.

const int tesseract::kDefaultResolution = 300

Default resolution used if input is not believable.

Definition at line 58 of file pagesegmain.cpp.

const double tesseract::kDiacriticXPadRatio = 7.0

Definition at line 82 of file strokewidth.cpp.

const double tesseract::kDiacriticYPadRatio = 1.75

Definition at line 85 of file strokewidth.cpp.

const char tesseract::kDoNotReverse[] = "RRP_DO_NO_REVERSE"

Definition at line 43 of file trie.cpp.

const float tesseract::kFontMergeDistance = 0.025

Definition at line 44 of file mastertrainer.cpp.

const char tesseract::kForceReverse[] = "RRP_FORCE_REVERSE"

Definition at line 45 of file trie.cpp.

const double tesseract::kGoodRowNumberOfColumnsLarge = 0.7

Definition at line 54 of file tablerecog.cpp.

const double tesseract::kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }

Definition at line 50 of file tablerecog.cpp.

const int tesseract::kGoodRowNumberOfColumnsSmallSize

Initial value:

 
    sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1

Definition at line 51 of file tablerecog.cpp.

const int tesseract::kGutterMultiple = 4

Definition at line 39 of file tabvector.cpp.

const int tesseract::kGutterToNeighbourRatio = 3

Definition at line 41 of file tabvector.cpp.

const int tesseract::kHistogramSize = 256

Definition at line 25 of file otsuthr.h.

const double tesseract::kHorizontalGapMergeFraction = 0.5

Definition at line 57 of file colfind.cpp.

const double tesseract::kHorizontalSpacing = 0.30

Definition at line 29 of file tablerecog.cpp.

const int tesseract::kHorzStrongTextlineAspect = 5

Definition at line 70 of file colpartition.cpp.

const int tesseract::kHorzStrongTextlineCount = 8

Definition at line 66 of file colpartition.cpp.

const int tesseract::kHorzStrongTextlineHeight = 10

Definition at line 68 of file colpartition.cpp.

const char * tesseract::kHyphenLikeUTF8[]

Initial value:

 {
  "-",       
  "\u05BE",  
  "\u2010",  
  "\u2011",  
  "\u2012",  
  "\u2013",  
  "\u2014",  
  "\u2015",  
  "\u2212",  
  "\uFE58",  
  "\uFE63",  
  "\uFF0D",  
  NULL,      
}

Definition at line 32 of file unicodes.cpp.

const float tesseract::kInfiniteDist = 999.0f

Definition at line 891 of file mastertrainer.cpp.

const char* tesseract::kInputFile = "noname.tif"

Filename used for input image file, from which to derive a name to search for a possible UNLV zone file, if none is specified by SetInputName.

Definition at line 78 of file baseapi.cpp.

const double tesseract::kLargeTableProjectionThreshold = 0.45

Definition at line 109 of file tablefind.cpp.

const int tesseract::kLargeTableRowCount = 6

Definition at line 111 of file tablefind.cpp.

const int tesseract::kLatinChs[]

Initial value:

 {
  0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
}

Latin chars corresponding to the unicode chars above.

Definition at line 1181 of file baseapi.cpp.

const int tesseract::kLeaderCutCost = 8

Definition at line 60 of file colpartition.cpp.

const int tesseract::kLeftIndentAlignmentCountTh = 1

Definition at line 88 of file equationdetect.cpp.

const double tesseract::kLineCountReciprocal = 4.0

Definition at line 52 of file tabvector.cpp.

const int tesseract::kLinedTableMinHorizontalLines = 3

Definition at line 39 of file tablerecog.cpp.

const int tesseract::kLinedTableMinVerticalLines = 3

Definition at line 38 of file tablerecog.cpp.

const int tesseract::kLineFindGridSize = 50

Grid size used by line finder. Not very critical.

Definition at line 46 of file linefind.cpp.

const double tesseract::kLineFragmentAspectRatio = 10.0

Definition at line 57 of file tabfind.cpp.

const double tesseract::kLineResidueAspectRatio = 8.0

Definition at line 108 of file strokewidth.cpp.

const int tesseract::kLineResiduePadRatio = 3

Definition at line 110 of file strokewidth.cpp.

const double tesseract::kLineResidueSizeRatio = 1.75

Definition at line 112 of file strokewidth.cpp.

const int tesseract::kLineTrapLongest = 4

Definition at line 101 of file strokewidth.cpp.

const int tesseract::kLineTrapShortest = 2

Definition at line 103 of file strokewidth.cpp.

const char * tesseract::kLRM = "\u200E"

Definition at line 27 of file unicodes.cpp.

const double tesseract::kMarginFactor = 1.1

Definition at line 44 of file tablerecog.cpp.

const double tesseract::kMarginOverlapFraction = 0.25

Definition at line 54 of file colfind.cpp.

const float tesseract::kMathDigitDensityTh1 = 0.25

Definition at line 83 of file equationdetect.cpp.

const float tesseract::kMathDigitDensityTh2 = 0.1

Definition at line 84 of file equationdetect.cpp.

const float tesseract::kMathItalicDensityTh = 0.5

Definition at line 85 of file equationdetect.cpp.

const double tesseract::kMaxBaselineError = 0.4375

Definition at line 73 of file colpartition.cpp.

const double tesseract::kMaxBlobOverlapFactor = 4.0

Definition at line 79 of file tablefind.cpp.

const int tesseract::kMaxBlobWidth = 500

Definition at line 42 of file tablefind.cpp.

const inT16 tesseract::kMaxBoxEdgeDiff = 2

Definition at line 33 of file recogtraining.cpp.

const int tesseract::kMaxBoxesInDataPartition = 20

Definition at line 68 of file tablefind.cpp.

const int tesseract::kMaxBytesPerLine

Initial value:

 kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
    UNICHAR_LEN

A maximal single box could occupy kNumbersPerBlob numbers at kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a space plus the newline and the maximum length of a UNICHAR. Test against this on each iteration for safety.

Definition at line 1128 of file baseapi.cpp.

const int tesseract::kMaxCaptionLines = 7

Definition at line 39 of file colpartitiongrid.cpp.

const int tesseract::kMaxCharTopRange = 48

Definition at line 61 of file fixxht.cpp.

const int tesseract::kMaxCircleErosions = 8

Definition at line 60 of file pagesegmain.cpp.

const int tesseract::kMaxCJKSizeRatio = 5

Definition at line 77 of file strokewidth.cpp.

const int tesseract::kMaxColorDistance = 900

Definition at line 80 of file colpartition.cpp.

const int tesseract::kMaxColumnHeaderDistance = 4

Definition at line 87 of file tablefind.cpp.

const int tesseract::kMaxCredibleResolution = 2400

Maximum believable resolution.

Definition at line 89 of file baseapi.cpp.

const double tesseract::kMaxDiacriticDistanceRatio = 1.25

Definition at line 91 of file strokewidth.cpp.

const double tesseract::kMaxDiacriticGapToBaseCharHeight = 1.0

Definition at line 94 of file strokewidth.cpp.

const double tesseract::kMaxDistToPartSizeRatio = 1.5

Definition at line 64 of file colfind.cpp.

const int tesseract::kMaxDropCapBottom = -128

Definition at line 37 of file boxword.cpp.

const int tesseract::kMaxFillinMultiple = 11

Definition at line 48 of file tabvector.cpp.

const double tesseract::kMaxGapInTextPartition = 4.0

Definition at line 71 of file tablefind.cpp.

const double tesseract::kMaxGutterWidthAbsolute = 2.00

Definition at line 52 of file tabfind.cpp.

const double tesseract::kMaxHorizontalGap = 3.0

Definition at line 65 of file tabfind.cpp.

const int tesseract::kMaxIncompatibleColumnCount = 2

Definition at line 52 of file colfind.cpp.

const int tesseract::kMaxIntSize = 22

Max string length of an int.

Definition at line 82 of file baseapi.cpp.

const int tesseract::kMaxLargeOverlaps = 3

Definition at line 117 of file strokewidth.cpp.

const int tesseract::kMaxLargeOverlapsWithMedium = 12

Definition at line 40 of file ccnontextdetect.cpp.

const int tesseract::kMaxLargeOverlapsWithSmall = 3

Definition at line 31 of file ccnontextdetect.cpp.

const double tesseract::kMaxLeaderGapFractionOfMax = 0.25

Definition at line 54 of file colpartition.cpp.

const double tesseract::kMaxLeaderGapFractionOfMin = 0.5

Definition at line 56 of file colpartition.cpp.

const int tesseract::kMaxLineResidue = 6

Definition at line 52 of file linefind.cpp.

const int tesseract::kMaxMediumOverlapsWithSmall = 12

Definition at line 36 of file ccnontextdetect.cpp.

const int tesseract::kMaxNeighbourDistFactor = 4

Definition at line 33 of file colpartitiongrid.cpp.

const double tesseract::kMaxNonLineDensity = 0.25

Definition at line 57 of file linefind.cpp.

const int tesseract::kMaxOffsetDist = 32

Definition at line 32 of file intfeaturemap.cpp.

const int tesseract::kMaxPadFactor = 6

Definition at line 30 of file colpartitiongrid.cpp.

const double tesseract::kMaxParagraphEndingLeftSpaceMultiple = 3.0

Definition at line 133 of file tablefind.cpp.

const double tesseract::kMaxPartitionSpacing = 1.75

Definition at line 66 of file colpartitiongrid.cpp.

const int tesseract::kMaxPartnerDepth = 4

Definition at line 42 of file colpartition.cpp.

const int tesseract::kMaxRaggedSearch = 25

Definition at line 40 of file tabfind.cpp.

const double tesseract::kMaxRectangularFraction = 0.75

Definition at line 47 of file imagefind.cpp.

const double tesseract::kMaxRectangularGradient = 0.1

Definition at line 50 of file imagefind.cpp.

const int tesseract::kMaxRMSColorNoise = 128

Definition at line 77 of file colpartition.cpp.

const double tesseract::kMaxRowSize = 2.5

Definition at line 47 of file tablerecog.cpp.

const double tesseract::kMaxSameBlockLineSpacing = 3

Definition at line 50 of file colpartition.cpp.

const double tesseract::kMaxSizeRatio = 1.5

Definition at line 52 of file colpartition.cpp.

const int tesseract::kMaxSkewFactor = 15

Definition at line 66 of file alignedblob.cpp.

const double tesseract::kMaxSmallNeighboursPerPix = 1.0 / 32

Definition at line 28 of file ccnontextdetect.cpp.

const double tesseract::kMaxSpacingDrift = 1.0 / 72

Definition at line 44 of file colpartition.cpp.

const double tesseract::kMaxStaveHeight = 1.0

Definition at line 59 of file linefind.cpp.

const double tesseract::kMaxTableCellXheight = 2.0

Definition at line 83 of file tablefind.cpp.

const int tesseract::kMaxTextLineBlobRatio = 5

Definition at line 73 of file tabfind.cpp.

const double tesseract::kMaxTopSpacingFraction = 0.25

Definition at line 47 of file colpartition.cpp.

const int tesseract::kMaxUnicharsPerCluster = 2000

Definition at line 42 of file mastertrainer.cpp.

const int tesseract::kMaxVerticalSearch = 12

Definition at line 39 of file tabfind.cpp.

const int tesseract::kMaxVerticalSpacing = 500

Definition at line 40 of file tablefind.cpp.

const double tesseract::kMaxXProjectionGapFactor = 2.0

Definition at line 143 of file tablefind.cpp.

const double tesseract::kMinAlignedGutter = 0.25

Definition at line 54 of file tabvector.cpp.

const int tesseract::kMinAlignedTabs = 4

Definition at line 56 of file alignedblob.cpp.

const double tesseract::kMinBaselineCoverage = 0.5

Definition at line 75 of file colpartition.cpp.

const int tesseract::kMinBoxesInTextPartition = 10

Definition at line 65 of file tablefind.cpp.

const double tesseract::kMinCaptionGapHeightRatio = 0.5

Definition at line 43 of file colpartitiongrid.cpp.

const double tesseract::kMinCaptionGapRatio = 2.0

Definition at line 41 of file colpartitiongrid.cpp.

const int tesseract::kMinChainTextValue = 3

Definition at line 64 of file colpartition.cpp.

const int tesseract::kMinClusteredShapes = 1

Definition at line 40 of file mastertrainer.cpp.

const int tesseract::kMinColorDifference = 16

Definition at line 56 of file imagefind.cpp.

const int tesseract::kMinColumnWidth = 100

Definition at line 49 of file colfind.cpp.

const int tesseract::kMinCredibleResolution = 70

Minimum believable resolution. Used as a default if there is no other information, as it is safer to under-estimate than over-estimate.

Definition at line 87 of file baseapi.cpp.

const double tesseract::kMinDiacriticSizeRatio = 1.0625

Definition at line 88 of file strokewidth.cpp.

const int tesseract::kMinEvaluatedTabs = 3

Definition at line 70 of file tabfind.cpp.

const double tesseract::kMinFilledArea = 0.35

Definition at line 57 of file tablerecog.cpp.

const double tesseract::kMinFractionalLinesInColumn = 0.125

Definition at line 46 of file tabfind.cpp.

const double tesseract::kMinGoodTextPARatio = 1.5

Definition at line 56 of file ccnontextdetect.cpp.

const double tesseract::kMinGutterFraction = 0.5

Definition at line 50 of file tabvector.cpp.

const double tesseract::kMinGutterWidthAbsolute = 0.02

Definition at line 50 of file tabfind.cpp.

const double tesseract::kMinGutterWidthGrid = 0.5

Definition at line 61 of file colfind.cpp.

const double tesseract::kMinImageArea = 0.5

Definition at line 78 of file tabfind.cpp.

const int tesseract::kMinImageFindSize = 100

Definition at line 52 of file imagefind.cpp.

const int tesseract::kMinLeaderCount = 5

Definition at line 58 of file colpartition.cpp.

const int tesseract::kMinLineLengthFraction = 4

Denominator applied to the resolution to obtain the minimum length, in pixels, that a line is required to have.

Definition at line 42 of file linefind.cpp.

const int tesseract::kMinLinesInColumn = 10

Definition at line 42 of file tabfind.cpp.

const double tesseract::kMinMaxGapInTextPartition = 0.5

Definition at line 75 of file tablefind.cpp.

const double tesseract::kMinMusicPixelFraction = 0.75

Definition at line 61 of file linefind.cpp.

const double tesseract::kMinNonNoiseFraction = 0.5

Definition at line 59 of file colfind.cpp.

const int tesseract::kMinOutlierSamples = 5

Definition at line 37 of file trainingsampleset.cpp.

const double tesseract::kMinOverlapWithTable = 0.6

Definition at line 99 of file tablefind.cpp.

const double tesseract::kMinParagraphEndingTextToWhitespaceRatio = 3.0

Definition at line 139 of file tablefind.cpp.

const double tesseract::kMinPCLengthIncrease = 1.0 / 1024

Definition at line 33 of file intfeaturemap.cpp.

const double tesseract::kMinRaggedGutter = 1.5

Definition at line 56 of file tabvector.cpp.

const int tesseract::kMinRaggedTabs = 5

Definition at line 54 of file alignedblob.cpp.

const double tesseract::kMinRectangularFraction = 0.125

Definition at line 45 of file imagefind.cpp.

const int tesseract::kMinRectSize = 10

Minimum sensible image size to be worth running tesseract.

Definition at line 67 of file baseapi.cpp.

const int tesseract::kMinRowsInTable = 3

Definition at line 114 of file tablefind.cpp.

const int tesseract::kMinStrongTextValue = 6

Definition at line 62 of file colpartition.cpp.

const int tesseract::kMinSubscriptOffset = 20

Definition at line 33 of file boxword.cpp.

const int tesseract::kMinSuperscriptOffset = 20

Definition at line 35 of file boxword.cpp.

const double tesseract::kMinTabGradient = 4.0

Definition at line 62 of file alignedblob.cpp.

const int tesseract::kMinTextLineBlobRatio = 3

Definition at line 76 of file tabfind.cpp.

const int tesseract::kMinThickLineWidth = 12

Definition at line 48 of file linefind.cpp.

const int tesseract::kMinVerticalSearch = 3

Definition at line 38 of file tabfind.cpp.

const int tesseract::kMostlyOneDirRatio = 3

Definition at line 106 of file strokewidth.cpp.

const double tesseract::kNeighbourSearchFactor = 2.5

Definition at line 119 of file strokewidth.cpp.

const int tesseract::kNoisePadding = 4

Definition at line 47 of file ccnontextdetect.cpp.

const int tesseract::kNumbersPerBlob = 5

The 5 numbers output for each box (the usual 4 and a page number.)

Definition at line 1107 of file baseapi.cpp.

const int tesseract::kNumEndPoints = 3

Definition at line 27 of file detlinefit.cpp.

const int tesseract::kNumLiteralCnt = 5

Definition at line 36 of file tess_lang_model.h.

const char* tesseract::kOldVarsFile = "failed_vars.txt"

Temp file used for storing current parameters before applying retry values.

Definition at line 80 of file baseapi.cpp.

const int tesseract::kOriginalNoiseMultiple = 8

Definition at line 43 of file ccnontextdetect.cpp.

const double tesseract::kParagraphEndingPreviousLineRatio = 1.3

Definition at line 129 of file tablefind.cpp.

const char * tesseract::kPDF = "\u202C"

Definition at line 30 of file unicodes.cpp.

const double tesseract::kPhotoOffsetFraction = 0.375

Definition at line 50 of file ccnontextdetect.cpp.

const int tesseract::kPrime1 = 17

Definition at line 34 of file trainingsampleset.cpp.

const int tesseract::kPrime2 = 13

Definition at line 35 of file trainingsampleset.cpp.

const double tesseract::kRaggedFraction = 2.5

Definition at line 42 of file alignedblob.cpp.

const double tesseract::kRaggedGapFraction = 1.0

Definition at line 46 of file alignedblob.cpp.

const int tesseract::kRaggedGutterMultiple = 5

Definition at line 54 of file tabfind.cpp.

const int tesseract::kRandomizingCenter = 128

Definition at line 30 of file trainingsample.cpp.

const double tesseract::kRequiredColumns = 0.7

Definition at line 42 of file tablerecog.cpp.

const double tesseract::kRequiredFullJustifiedSpacing = 4.0

Definition at line 119 of file tablefind.cpp.

const char tesseract::kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"

Definition at line 44 of file trie.cpp.

const int tesseract::kRGBRMSColors = 4

Definition at line 36 of file colpartition.h.

const char * tesseract::kRLE = "\u202A"

Definition at line 29 of file unicodes.cpp.

const char * tesseract::kRLM = "\u200F"

Definition at line 28 of file unicodes.cpp.

const double tesseract::kRMSFitScaling = 8.0

Definition at line 54 of file imagefind.cpp.

const int tesseract::kRulingVerticalMargin = 3

Definition at line 95 of file tablefind.cpp.

const int tesseract::kSearchRadius = 2

Definition at line 96 of file strokewidth.cpp.

const int tesseract::kSeedBlobsCountTh = 10

Definition at line 87 of file equationdetect.cpp.

const int tesseract::kSideSpaceMargin = 10

Definition at line 104 of file tablefind.cpp.

const int tesseract::kSimilarRaggedDist = 50

Definition at line 46 of file tabvector.cpp.

const int tesseract::kSimilarVectorDist = 10

Definition at line 43 of file tabvector.cpp.

const float tesseract::kSizeRatioToReject = 2.0

Definition at line 114 of file strokewidth.cpp.

const double tesseract::kSmallTableProjectionThreshold = 0.35

Definition at line 108 of file tablefind.cpp.

const int tesseract::kSmoothDecisionMargin = 4

Definition at line 69 of file colpartitiongrid.cpp.

const double tesseract::kSmoothFactor = 0.25

Definition at line 59 of file tabfind.cpp.

const double tesseract::kSplitPartitionSize = 2.0

Definition at line 46 of file tablefind.cpp.

const int tesseract::kSquareLimit = 25

Definition at line 32 of file trainingsampleset.cpp.

const int tesseract::kStateCnt = 4

Definition at line 35 of file tess_lang_model.h.

const int tesseract::kStrayLinePer = 6

Definition at line 46 of file paragraphs.cpp.

const double tesseract::kStrokeWidthCJK = 2.0

Definition at line 64 of file strokewidth.cpp.

const double tesseract::kStrokeWidthConstantTolerance = 2.0

Definition at line 51 of file colpartitiongrid.cpp.

const double tesseract::kStrokeWidthFractionalTolerance = 0.25

Definition at line 147 of file tablefind.cpp.

const double tesseract::kStrokeWidthFractionCJK = 0.25

Definition at line 63 of file strokewidth.cpp.

const double tesseract::kStrokeWidthFractionTolerance = 0.25

Allowed proportional change in stroke width to be the same font.

Definition at line 49 of file colpartitiongrid.cpp.

const double tesseract::kStrokeWidthTolerance = 1.5

Allowed constant change in stroke width to be the same font. Really 1.5 pixels.

Definition at line 61 of file strokewidth.cpp.

const double tesseract::kTableColumnThreshold = 3.0

Definition at line 91 of file tablefind.cpp.

const int tesseract::kTabRadiusFactor = 5

Definition at line 36 of file tabfind.cpp.

const char tesseract::kTesseractReject = '~'

Character returned when Tesseract couldn't recognize the input as anything.

Definition at line 69 of file baseapi.cpp.

const int tesseract::kTestChar = -1

Definition at line 30 of file trainingsampleset.cpp.

const char* tesseract::kTextordDebugPix = "psdebug_pix"

Definition at line 69 of file alignedblob.cpp.

const double tesseract::kThickLengthMultiple = 0.75

Definition at line 55 of file linefind.cpp.

const int tesseract::kThinLineFraction = 20

Denominator applied to the resolution to give the maximum pixel width allowed for thin lines.

Definition at line 40 of file linefind.cpp.

const double tesseract::kTinyEnoughTextlineOverlapFraction = 0.25

Definition at line 53 of file colpartitiongrid.cpp.

const float tesseract::kUnclearDensityTh = 0.25

Definition at line 86 of file equationdetect.cpp.

const int tesseract::kUniChs[]

Initial value:

 {
  0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
}

Conversion table for non-Latin characters. Maps characters outside the Latin set into the Latin set. TODO(rays): incorporate this translation into unicharset.

Definition at line 1177 of file baseapi.cpp.

const char tesseract::kUNLVReject = '~'

Character used by UNLV error counter as a reject.

Definition at line 71 of file baseapi.cpp.

const char tesseract::kUNLVSuspect = '^'

Character used by UNLV as a suspect marker.

Definition at line 73 of file baseapi.cpp.

const char * tesseract::kUTF8LineSeparator = "\u2028"

Definition at line 25 of file unicodes.cpp.

const char * tesseract::kUTF8ParagraphSeparator = "\u2029"

Definition at line 26 of file unicodes.cpp.

const double tesseract::kVerticalSpacing = -0.2

Definition at line 32 of file tablerecog.cpp.

const int tesseract::kVLineAlignment = 3

Definition at line 48 of file alignedblob.cpp.

const int tesseract::kVLineGutter = 1

Definition at line 50 of file alignedblob.cpp.

const int tesseract::kVLineMinLength = 500

Definition at line 58 of file alignedblob.cpp.

const int tesseract::kVLineSearchSize = 150

Definition at line 52 of file alignedblob.cpp.

const char* const tesseract::RTLReversePolicyNames[]

Initial value:

 {
  kDoNotReverse,
  kReverseIfHasRTL,
  kForceReverse
}

Definition at line 47 of file trie.cpp.

bool tesseract::textord_dump_table_images = false

"Paint table detection output"

Definition at line 150 of file tablefind.cpp.

bool tesseract::textord_show_tables = false

"Show table regions"

Definition at line 151 of file tablefind.cpp.

double tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 87 of file tabfind.cpp.

bool tesseract::textord_tabfind_find_tables = true

"run table detection"

Definition at line 74 of file colfind.cpp.

bool tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 49 of file strokewidth.cpp.

bool tesseract::textord_tabfind_only_strokewidths = false

"Only run stroke widths"

Definition at line 46 of file strokewidth.cpp.

bool tesseract::textord_tabfind_show_blocks = false

"Show final block bounds"

Definition at line 73 of file colfind.cpp.

bool tesseract::textord_tabfind_show_color_fit = false

"Show stroke widths"

Definition at line 26 of file colpartitiongrid.cpp.

bool tesseract::textord_tabfind_show_columns = false

"Show column bounds"

Definition at line 72 of file colfind.cpp.

bool tesseract::textord_tabfind_show_finaltabs = false

"Show tab vectors"

Definition at line 85 of file tabfind.cpp.

bool tesseract::textord_tabfind_show_initial_partitions = false

"Show partition bounds"

Definition at line 67 of file colfind.cpp.

bool tesseract::textord_tabfind_show_initialtabs = false

"Show tab candidates"

Definition at line 84 of file tabfind.cpp.

int tesseract::textord_tabfind_show_partitions = 0

"Show partition bounds, waiting if >1"

Definition at line 71 of file colfind.cpp.

bool tesseract::textord_tabfind_show_reject_blobs = false

"Show blobs rejected as noise"

Definition at line 69 of file colfind.cpp.

int tesseract::textord_tabfind_show_strokewidths = 0

"Show stroke widths"

Definition at line 45 of file strokewidth.cpp.

bool tesseract::textord_tabfind_vertical_horizontal_mix = true

"find horizontal lines such as headers in vertical page mode"

Definition at line 51 of file strokewidth.cpp.

bool tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 47 of file strokewidth.cpp.

double tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page mode"

Definition at line 53 of file strokewidth.cpp.

bool tesseract::textord_tablefind_recognize_tables = false

"Enables the table recognizer for table layout and filtering."

Definition at line 157 of file tablefind.cpp.

bool tesseract::textord_tablefind_show_mark = false

"Debug table marking steps in detail"

Definition at line 153 of file tablefind.cpp.

bool tesseract::textord_tablefind_show_stats = false

"Show page stats used in table finding"

Definition at line 155 of file tablefind.cpp.

double tesseract::textord_tabvector_vertical_box_ratio = 0.5

"Fraction of box matches required to declare a line vertical"

Definition at line 62 of file tabvector.cpp.

double tesseract::textord_tabvector_vertical_gap_fraction = 0.5

"Max fraction of mean blob width allowed for vertical gaps in vertical text"

Definition at line 59 of file tabvector.cpp.

CCUtilMutex tesseract::tprintfMutex

Definition at line 51 of file ccutil.cpp.

Classes

Typedefs

Enumerations

Functions

Variables

Detailed Description

Typedef Documentation

Enumeration Type Documentation

Function Documentation

Variable Documentation