Tesseract  3.02
tesseract-ocr/ccmain/equationdetect.h
Go to the documentation of this file.
00001 
00002 // File:        equationdetect.h
00003 // Description: The equation detection class that inherits equationdetectbase.
00004 // Author:      Zongyi (Joe) Liu (joeliu@google.com)
00005 // Created:     Fri Aug 31 11:13:01 PST 2011
00006 //
00007 // (C) Copyright 2011, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H__
00021 #define TESSERACT_CCMAIN_EQUATIONDETECT_H__
00022 
00023 #include "blobbox.h"
00024 #include "equationdetectbase.h"
00025 #include "genericvector.h"
00026 #include "unichar.h"
00027 
00028 class BLOBNBOX;
00029 class BLOB_CHOICE;
00030 class BLOB_CHOICE_LIST;
00031 class TO_BLOCK_LIST;
00032 class TBOX;
00033 class UNICHARSET;
00034 
00035 namespace tesseract {
00036 
00037 class Tesseract;
00038 class ColPartition;
00039 class ColPartitionGrid;
00040 class ColPartitionSet;
00041 
00042 class EquationDetect : public EquationDetectBase {
00043  public:
00044   EquationDetect(const char* equ_datapath,
00045                  const char* equ_language);
00046   ~EquationDetect();
00047 
00048   enum IndentType {
00049     NO_INDENT,
00050     LEFT_INDENT,
00051     RIGHT_INDENT,
00052     BOTH_INDENT,
00053     INDENT_TYPE_COUNT
00054   };
00055 
00056   // Reset the lang_tesseract_ pointer. This function should be called before
00057   // we do any detector work.
00058   void SetLangTesseract(Tesseract* lang_tesseract);
00059 
00060   // Iterate over the blobs inside to_block, and set the blobs that we want to
00061   // process to BSTT_NONE. (By default, they should be BSTT_SKIP.) The function
00062   // returns 0 upon success.
00063   int LabelSpecialText(TO_BLOCK* to_block);
00064 
00065   // Find possible equation partitions from part_grid. Should be called
00066   // after the special_text_type of the blobs has been set.
00067   // It returns 0 upon success.
00068   int FindEquationParts(ColPartitionGrid* part_grid,
00069                         ColPartitionSet** best_columns);
00070 
00071   // Reset the resolution of the processing image. TEST only function.
00072   void SetResolution(const int resolution);
00073 
00074  protected:
00075   // Identify the special text type for one blob, and update its field. When
00076   // height_th is set (> 0), we will label the blob as BSTT_NONE if its height
00077   // is less than height_th.
00078   void IdentifySpecialText(BLOBNBOX *blob, const int height_th);
00079 
00080   // Estimate the type for one unichar.
00081   BlobSpecialTextType EstimateTypeForUnichar(
00082       const UNICHARSET& unicharset, const UNICHAR_ID id) const;
00083 
00084   // Compute the special text type for each blob in part_grid_.
00085   void IdentifySpecialText();
00086 
00087   // Identify blobs that we want to skip during special blob type
00088   // classification.
00089   void IdentifyBlobsToSkip(ColPartition* part);
00090 
00091   // The ColPartitions in part_grid_ may be over-segmented, particularly in the
00092   // block equation regions. So we would like to identify these partitions and
00093   // merge them before we do the searching.
00094   void MergePartsByLocation();
00095 
00096   // Starting from the seed center, we do a radius search. Partitions that
00097   // have large overlaps with seed are removed from part_grid_ and added into
00098   // parts_overlap. Note: this function may update the part_grid_, so if the
00099   // caller is also running ColPartitionGridSearch, use the RepositionIterator
00100   // to continue.
00101   void SearchByOverlap(ColPartition* seed,
00102                        GenericVector<ColPartition*>* parts_overlap);
00103 
00104   // Insert part back into part_grid_, after it absorbs some other parts.
00105   void InsertPartAfterAbsorb(ColPartition* part);
00106 
00107   // Identify the ColPartitions in part_grid_, label them as PT_EQUATION, and
00108   // save them into cp_seeds_.
00109   void IdentifySeedParts();
00110 
00111   // Check the blobs count for a seed region candidate.
00112   bool CheckSeedBlobsCount(ColPartition* part);
00113 
00114   // Compute the foreground pixel density for a tbox area.
00115   float ComputeForegroundDensity(const TBOX& tbox);
00116 
00117   // Check if part is from the seed2 label: with low math density and left
00118   // indented. We are using two checks:
00119   // 1. If its left is aligned with any coordinates in indented_texts_left,
00120   // which we assume have been sorted.
00121   // 2. If its foreground density is over foreground_density_th.
00122   bool CheckForSeed2(
00123       const GenericVector<int>& indented_texts_left,
00124       const float foreground_density_th,
00125       ColPartition* part);
00126 
00127   // Count the number of values in sorted_vec that are close to val, used to
00128   // check if a partition is aligned with text partitions.
00129   int CountAlignment(
00130       const GenericVector<int>& sorted_vec, const int val) const;
00131 
00132   // Check a seed candidate using the foreground pixel density. We return
00133   // true if the density is below a certain threshold, because characters
00134   // in equation regions usually are further apart, with more white space.
00135   bool CheckSeedFgDensity(const float density_th, ColPartition* part);
00136 
00137   // A light version of SplitCPHor: instead of really doing the part split, we
00138   // simply compute the union bounding box of each split part.
00139   void SplitCPHorLite(ColPartition* part, GenericVector<TBOX>* splitted_boxes);
00140 
00141   // Split the part (horizontally), and save the split result into
00142   // parts_splitted. Note that it is the caller's responsibility to release the
00143   // memory owned by parts_splitted. On the other hand, the part is unchanged
00144   // during this process and still owns the blobs, so do NOT call DeleteBoxes
00145   // when freeing the ColPartitions in parts_splitted.
00146   void SplitCPHor(ColPartition* part,
00147                   GenericVector<ColPartition*>* parts_splitted);
00148 
00149   // Check the density for a seed candidate (part) using its math density and
00150   // italic density. Returns true if the check passed.
00151   bool CheckSeedDensity(const float math_density_high,
00152                         const float math_density_low,
00153                         const ColPartition* part) const;
00154 
00155   // Check if part is indented.
00156   IndentType IsIndented(ColPartition* part);
00157 
00158   // Identify inline partitions from cp_seeds_, and re-label them.
00159   void IdentifyInlineParts();
00160 
00161   // Compute the super bounding box for all ColPartitions inside part_grid_.
00162   void ComputeCPsSuperBBox();
00163 
00164   // Identify inline partitions from cp_seeds_ using the horizontal search.
00165   void IdentifyInlinePartsHorizontal();
00166 
00167   // Estimate the line spacing between two text partitions. Returns -1 if
00168   // there is not enough data.
00169   int EstimateTextPartLineSpacing();
00170 
00171   // Identify inline partitions from cp_seeds_ using the vertical search.
00172   void IdentifyInlinePartsVertical(const bool top_to_bottom,
00173                                    const int textPartsLineSpacing);
00174 
00175   // Check if part is an inline equation zone. This should be called after we
00176   // have identified the seed regions.
00177   bool IsInline(const bool search_bottom,
00178                 const int textPartsLineSpacing,
00179                 ColPartition* part);
00180 
00181   // For a given seed partition, we search the part_grid_ and see if there is
00182   // any partition that can be merged with it. It returns true if the seed has
00183   // been expanded.
00184   bool ExpandSeed(ColPartition* seed);
00185 
00186   // Starting from the seed position, we search the part_grid_
00187   // horizontally/vertically, find all partitions that can be
00188   // merged with seed, remove them from part_grid_, and put them into
00189   // parts_to_merge.
00190   void ExpandSeedHorizontal(const bool search_left,
00191                             ColPartition* seed,
00192                             GenericVector<ColPartition*>* parts_to_merge);
00193   void ExpandSeedVertical(const bool search_bottom,
00194                           ColPartition* seed,
00195                           GenericVector<ColPartition*>* parts_to_merge);
00196 
00197   // Check if a part_box is the small neighbor of seed_box.
00198   bool IsNearSmallNeighbor(const TBOX& seed_box,
00199                            const TBOX& part_box) const;
00200 
00201   // Perform the density check for part, which we assume is near a seed
00202   // partition. It returns true if the check passed.
00203   bool CheckSeedNeighborDensity(const ColPartition* part) const;
00204 
00205   // After identifying the math blocks, we do one more scan over all text
00206   // partitions, and check if any of them is the satellite of one or more
00207   // math blocks. Here p is the satellite of q if:
00208   // 1. q is the nearest vertical neighbor of p, and
00209   // 2. y_gap(p, q) is less than a threshold, and
00210   // 3. x_overlap(p, q) is over a threshold.
00211   // Note that p can be the satellite of two blocks: its top neighbor and
00212   // bottom neighbor.
00213   void ProcessMathBlockSatelliteParts();
00214 
00215   // Check if part is the satellite of one/two math blocks. If it is, we return
00216   // true, and save the blocks into math_blocks.
00217   bool IsMathBlockSatellite(
00218       ColPartition* part, GenericVector<ColPartition*>* math_blocks);
00219 
00220   // Search the nearest neighbor of part in one vertical direction as defined by
00221   // search_bottom. It returns the neighbor found that has major x overlap with
00222   // it, or NULL when not found.
00223   ColPartition* SearchNNVertical(const bool search_bottom,
00224                                  const ColPartition* part);
00225 
00226   // Check if the neighbor with vertical distance of y_gap is a near and math
00227   // block partition.
00228   bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
00229 
00230   // Generate the tiff file name for output/debug file.
00231   void GetOutputTiffName(const char* name, STRING* image_name) const;
00232 
00233   // Debugger function that renders ColPartitions on the input image, where:
00234   // parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
00235   // will be painted in green, and other parts will be painted in blue.
00236   void PaintColParts(const STRING& outfile) const;
00237 
00238   // Debugger function that renders the blobs in part_grid_ over the input
00239   // image.
00240   void PaintSpecialTexts(const STRING& outfile) const;
00241 
00242   // Debugger function that prints the math blobs density values for a
00243   // ColPartition object.
00244   void PrintSpecialBlobsDensity(const ColPartition* part) const;
00245 
00246   // The tesseract engine initialized from equation training data.
00247   Tesseract* equ_tesseract_;
00248 
00249   // The tesseract engine used for OCR. This pointer is passed in by the caller,
00250   // so do NOT destroy it in this class.
00251   Tesseract* lang_tesseract_;
00252 
00253   // The ColPartitionGrid that we are processing. This pointer is passed in from
00254   // the caller, so do NOT destroy it in this class.
00255   ColPartitionGrid* part_grid_;
00256 
00257   // A simple array of pointers to the best assigned column division at
00258   // each grid y coordinate. This pointer is passed in from the caller, so do
00259   // NOT destroy it in this class.
00260   ColPartitionSet** best_columns_;
00261 
00262   // The super bounding box of all ColPartitions in the part_grid_.
00263   TBOX* cps_super_bbox_;
00264 
00265   // The seed ColPartitions for equation regions.
00266   GenericVector<ColPartition*> cp_seeds_;
00267 
00268   // The resolution (dpi) of the processing image.
00269   int resolution_;
00270 
00271   // The number of pages we have processed.
00272   int page_count_;
00273 };
00274 
00275 }  // namespace tesseract
00276 
00277 #endif  // TESSERACT_CCMAIN_EQUATIONDETECT_H__