Tesseract
3.02
|
00001 00002 // File: cjkpitch.h 00003 // Description: Code to determine fixed pitchness and the pitch if fixed, 00004 // for CJK text. 00005 // Copyright 2011 Google Inc. All Rights Reserved. 00006 // Author: takenaka@google.com (Hiroshi Takenaka) 00007 // Created: Mon Jun 27 12:48:35 JST 2011 00008 // 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 #ifndef CJKPITCH_H_ 00021 #define CJKPITCH_H_ 00022 00023 #include "blobbox.h" 00024 #include "notdll.h" 00025 00026 // Function to test "fixed-pitchness" of the input text and estimating 00027 // character pitch parameters for it, based on CJK fixed-pitch layout 00028 // model. 00029 // 00030 // This function assumes that a fixed-pitch CJK text has following 00031 // characteristics: 00032 // 00033 // - Most glyphs are designed to fit within the same sized square 00034 // (imaginary body). Also they are aligned to the center of their 00035 // imaginary bodies. 00036 // - The imaginary body is always a regular rectangle. 00037 // - There may be some extra space between character bodies 00038 // (tracking). 00039 // - There may be some extra space after punctuations. 00040 // - The text is *not* space-delimited. Thus spaces are rare. 00041 // - Character may consists of multiple unconnected blobs. 00042 // 00043 // And the function works in two passes. On pass 1, it looks for such 00044 // "good" blobs that has the pitch same pitch on the both side and 00045 // looks like a complete CJK character. Then estimates the character 00046 // pitch for every row, based on those good blobs. If we couldn't find 00047 // enough good blobs for a row, then the pitch is estimated from other 00048 // rows with similar character height instead. 00049 // 00050 // Pass 2 is an iterative process to fit the blobs into fixed-pitch 00051 // character cells. Once we have estimated the character pitch, blobs 00052 // that are almost as large as the pitch can be considered to be 00053 // complete characters. And once we know that some characters are 00054 // complete characters, we can estimate the region occupied by its 00055 // neighbors. And so on. 00056 // 00057 // We repeat the process until all ambiguities are resolved. Then make 00058 // the final decision about fixed-pitchness of each row and compute 00059 // pitch and spacing parameters. 00060 // 00061 // (If a row is considered to be propotional, pitch_decision for the 00062 // row is set to PITCH_CORR_PROP and the later phase 00063 // (i.e. Textord::to_spacing()) should determine its spacing 00064 // parameters) 00065 // 00066 // This function doesn't provide all information required by 00067 // fixed_pitch_words() and the rows need to be processed with 00068 // make_prop_words() even if they are fixed pitched. 00069 void compute_fixed_pitch_cjk(ICOORD page_tr, // top right 00070 TO_BLOCK_LIST *port_blocks); // input list 00071 00072 #endif // CJKPITCH_H_