Tesseract
3.02
|
00001 /********************************************************************** 00002 * File: paragraphs.cpp 00003 * Description: Paragraph detection for tesseract. 00004 * Author: David Eger 00005 * Created: 25 February 2011 00006 * 00007 * (C) Copyright 2011, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #ifdef _MSC_VER 00020 #define __func__ __FUNCTION__ 00021 #endif 00022 00023 #include <ctype.h> 00024 00025 #include "genericvector.h" 00026 #include "helpers.h" 00027 #include "mutableiterator.h" 00028 #include "ocrpara.h" 00029 #include "pageres.h" 00030 #include "paragraphs.h" 00031 #include "paragraphs_internal.h" 00032 #include "publictypes.h" 00033 #include "ratngs.h" 00034 #include "rect.h" 00035 #include "statistc.h" 00036 #include "strngs.h" 00037 #include "tprintf.h" 00038 #include "unicharset.h" 00039 #include "unicodes.h" 00040 00041 namespace tesseract { 00042 00043 // The tab vectors for a given line should be ignored if both its tab vectors 00044 // are infrequent, specifically, if both tab vectors appear at most once per 00045 // kStrayLinePer lines in a block. 00046 const int kStrayLinePer = 6; 00047 00048 // Special "weak" ParagraphModels. 00049 const ParagraphModel *kCrownLeft 00050 = reinterpret_cast<ParagraphModel *>(0xDEAD111F); 00051 const ParagraphModel *kCrownRight 00052 = reinterpret_cast<ParagraphModel *>(0xDEAD888F); 00053 00054 // Given the width of a typical space between words, what is the threshold 00055 // by which by which we think left and right alignments for paragraphs 00056 // can vary and still be aligned. 00057 static int Epsilon(int space_pix) { 00058 return space_pix * 4 / 5; 00059 } 00060 00061 template<typename T> 00062 void SimpleSwap(T &a, T &b) { 00063 T c = a; 00064 a = b; 00065 b = c; 00066 } 00067 00068 static bool AcceptableRowArgs( 00069 int debug_level, int min_num_rows, const char *function_name, 00070 const GenericVector<RowScratchRegisters> *rows, 00071 int row_start, int row_end) { 00072 if (row_start < 0 || row_end > rows->size() || row_start > row_end) { 00073 tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", 00074 row_start, row_end, rows->size()); 00075 return false; 00076 } 00077 if (row_end - row_start < min_num_rows) { 00078 if (debug_level > 1) { 00079 tprintf("# Too few rows[%d, %d) for %s.\n", 00080 row_start, row_end, function_name); 00081 } 00082 return false; 00083 } 00084 return true; 00085 } 00086 00087 // =============================== Debug Code ================================ 00088 00089 // Convert an integer to a decimal string. 00090 static STRING StrOf(int num) { 00091 char buffer[30]; 00092 snprintf(buffer, sizeof(buffer), "%d", num); 00093 return STRING(buffer); 00094 } 00095 00096 // Given a row-major matrix of unicode text and a column separator, print 00097 // a formatted table. For ASCII, we get good column alignment. 00098 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows, 00099 const STRING &colsep) { 00100 GenericVector<int> max_col_widths; 00101 for (int r = 0; r < rows.size(); r++) { 00102 int num_columns = rows[r].size(); 00103 for (int c = 0; c < num_columns; c++) { 00104 int num_unicodes = 0; 00105 for (int i = 0; i < rows[r][c].size(); i++) { 00106 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++; 00107 } 00108 if (c >= max_col_widths.size()) { 00109 max_col_widths.push_back(num_unicodes); 00110 } else { 00111 if (num_unicodes > max_col_widths[c]) 00112 max_col_widths[c] = num_unicodes; 00113 } 00114 } 00115 } 00116 00117 GenericVector<STRING> col_width_patterns; 00118 for (int c = 0; c < max_col_widths.size(); c++) { 00119 col_width_patterns.push_back( 00120 STRING("%-") + StrOf(max_col_widths[c]) + "s"); 00121 } 00122 00123 for (int r = 0; r < rows.size(); r++) { 00124 for (int c = 0; c < rows[r].size(); c++) { 00125 if (c > 0) 00126 tprintf("%s", colsep.string()); 00127 tprintf(col_width_patterns[c].string(), rows[r][c].string()); 00128 } 00129 tprintf("\n"); 00130 } 00131 } 00132 00133 STRING RtlEmbed(const STRING &word, bool rtlify) { 00134 if (rtlify) 00135 return STRING(kRLE) + word + STRING(kPDF); 00136 return word; 00137 } 00138 00139 // Print the current thoughts of the paragraph detector. 00140 static void PrintDetectorState(const ParagraphTheory &theory, 00141 const GenericVector<RowScratchRegisters> &rows) { 00142 GenericVector<GenericVector<STRING> > output; 00143 output.push_back(GenericVector<STRING>()); 00144 output.back().push_back("#row"); 00145 output.back().push_back("space"); 00146 output.back().push_back(".."); 00147 output.back().push_back("lword[widthSEL]"); 00148 output.back().push_back("rword[widthSEL]"); 00149 RowScratchRegisters::AppendDebugHeaderFields(&output.back()); 00150 output.back().push_back("text"); 00151 00152 for (int i = 0; i < rows.size(); i++) { 00153 output.push_back(GenericVector<STRING>()); 00154 GenericVector<STRING> &row = output.back(); 00155 const RowInfo& ri = *rows[i].ri_; 00156 row.push_back(StrOf(i)); 00157 row.push_back(StrOf(ri.average_interword_space)); 00158 row.push_back(ri.has_leaders ? ".." : " "); 00159 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + 00160 "[" + StrOf(ri.lword_box.width()) + 00161 (ri.lword_likely_starts_idea ? "S" : "s") + 00162 (ri.lword_likely_ends_idea ? "E" : "e") + 00163 (ri.lword_indicates_list_item ? "L" : "l") + 00164 "]"); 00165 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + 00166 "[" + StrOf(ri.rword_box.width()) + 00167 (ri.rword_likely_starts_idea ? "S" : "s") + 00168 (ri.rword_likely_ends_idea ? "E" : "e") + 00169 (ri.rword_indicates_list_item ? "L" : "l") + 00170 "]"); 00171 rows[i].AppendDebugInfo(theory, &row); 00172 row.push_back(RtlEmbed(ri.text, !ri.ltr)); 00173 } 00174 PrintTable(output, " "); 00175 00176 tprintf("Active Paragraph Models:\n"); 00177 for (int m = 0; m < theory.models().size(); m++) { 00178 tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string()); 00179 } 00180 } 00181 00182 static void DebugDump( 00183 bool should_print, 00184 const STRING &phase, 00185 const ParagraphTheory &theory, 00186 const GenericVector<RowScratchRegisters> &rows) { 00187 if (!should_print) 00188 return; 00189 tprintf("# %s\n", phase.string()); 00190 PrintDetectorState(theory, rows); 00191 } 00192 00193 // Print out the text for rows[row_start, row_end) 00194 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, 00195 int row_start, int row_end) { 00196 tprintf("======================================\n"); 00197 for (int row = row_start; row < row_end; row++) { 00198 tprintf("%s\n", rows[row].ri_->text.string()); 00199 } 00200 tprintf("======================================\n"); 00201 } 00202 00203 // ============= Brain Dead Language Model (ASCII Version) =================== 00204 00205 bool IsLatinLetter(int ch) { 00206 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 00207 } 00208 00209 bool IsDigitLike(int ch) { 00210 return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I'; 00211 } 00212 00213 bool IsOpeningPunct(int ch) { 00214 return strchr("'\"({[", ch) != NULL; 00215 } 00216 00217 bool IsTerminalPunct(int ch) { 00218 return strchr(":'\".?!]})", ch) != NULL; 00219 } 00220 00221 // Return a pointer after consuming as much text as qualifies as roman numeral. 00222 const char *SkipChars(const char *str, const char *toskip) { 00223 while (*str != '\0' && strchr(toskip, *str)) { str++; } 00224 return str; 00225 } 00226 00227 const char *SkipChars(const char *str, bool (*skip)(int)) { 00228 while (*str != '\0' && skip(*str)) { str++; } 00229 return str; 00230 } 00231 00232 const char *SkipOne(const char *str, const char *toskip) { 00233 if (*str != '\0' && strchr(toskip, *str)) return str + 1; 00234 return str; 00235 } 00236 00237 // Return whether it is very likely that this is a numeral marker that could 00238 // start a list item. Some examples include: 00239 // A I iii. VI (2) 3.5. [C-4] 00240 bool LikelyListNumeral(const STRING &word) { 00241 const char *kRomans = "ivxlmdIVXLMD"; 00242 const char *kDigits = "012345789"; 00243 const char *kOpen = "[{("; 00244 const char *kSep = ":;-.,"; 00245 const char *kClose = "]})"; 00246 00247 int num_segments = 0; 00248 const char *pos = word.string(); 00249 while (*pos != '\0' && num_segments < 3) { 00250 // skip up to two open parens. 00251 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen); 00252 const char *numeral_end = SkipChars(numeral_start, kRomans); 00253 if (numeral_end != numeral_start) { 00254 // Got Roman Numeral. Great. 00255 } else { 00256 numeral_end = SkipChars(numeral_start, kDigits); 00257 if (numeral_end == numeral_start) { 00258 // If there's a single latin letter, we can use that. 00259 numeral_end = SkipChars(numeral_start, IsLatinLetter); 00260 if (numeral_end - numeral_start != 1) 00261 break; 00262 } 00263 } 00264 // We got some sort of numeral. 00265 num_segments++; 00266 // Skip any trailing parens or punctuation. 00267 pos = SkipChars(SkipChars(numeral_end, kClose), kSep); 00268 if (pos == numeral_end) 00269 break; 00270 } 00271 return *pos == '\0'; 00272 } 00273 00274 bool LikelyListMark(const STRING &word) { 00275 const char *kListMarks = "0Oo*.,+."; 00276 return word.size() == 1 && strchr(kListMarks, word[0]) != NULL; 00277 } 00278 00279 bool AsciiLikelyListItem(const STRING &word) { 00280 return LikelyListMark(word) || LikelyListNumeral(word); 00281 } 00282 00283 // ========== Brain Dead Language Model (Tesseract Version) ================ 00284 00285 // Return the first Unicode Codepoint from werd[pos]. 00286 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { 00287 if (!u || !werd || pos > werd->length()) 00288 return 0; 00289 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); 00290 } 00291 00292 // A useful helper class for finding the first j >= i so that word[j] 00293 // does not have given character type. 00294 class UnicodeSpanSkipper { 00295 public: 00296 UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) 00297 : u_(unicharset), word_(word) { wordlen_ = word->length(); } 00298 00299 // Given an input position, return the first position >= pos not punc. 00300 int SkipPunc(int pos); 00301 // Given an input position, return the first position >= pos not digit. 00302 int SkipDigits(int pos); 00303 // Given an input position, return the first position >= pos not roman. 00304 int SkipRomans(int pos); 00305 // Given an input position, return the first position >= pos not alpha. 00306 int SkipAlpha(int pos); 00307 00308 private: 00309 const UNICHARSET *u_; 00310 const WERD_CHOICE *word_; 00311 int wordlen_; 00312 }; 00313 00314 int UnicodeSpanSkipper::SkipPunc(int pos) { 00315 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; 00316 return pos; 00317 } 00318 00319 int UnicodeSpanSkipper::SkipDigits(int pos) { 00320 while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || 00321 IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; 00322 return pos; 00323 } 00324 00325 int UnicodeSpanSkipper::SkipRomans(int pos) { 00326 const char *kRomans = "ivxlmdIVXLMD"; 00327 while (pos < wordlen_) { 00328 int ch = UnicodeFor(u_, word_, pos); 00329 if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break; 00330 pos++; 00331 } 00332 return pos; 00333 } 00334 00335 int UnicodeSpanSkipper::SkipAlpha(int pos) { 00336 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; 00337 return pos; 00338 } 00339 00340 bool LikelyListMarkUnicode(int ch) { 00341 if (ch < 0x80) { 00342 STRING single_ch; 00343 single_ch += ch; 00344 return LikelyListMark(single_ch); 00345 } 00346 switch (ch) { 00347 // TODO(eger) expand this list of unicodes as needed. 00348 case 0x00B0: // degree sign 00349 case 0x2022: // bullet 00350 case 0x25E6: // white bullet 00351 case 0x00B7: // middle dot 00352 case 0x25A1: // white square 00353 case 0x25A0: // black square 00354 case 0x25AA: // black small square 00355 case 0x2B1D: // black very small square 00356 case 0x25BA: // black right-pointing pointer 00357 case 0x25CF: // black circle 00358 case 0x25CB: // white circle 00359 return true; 00360 default: 00361 break; // fall through 00362 } 00363 return false; 00364 } 00365 00366 // Return whether it is very likely that this is a numeral marker that could 00367 // start a list item. Some examples include: 00368 // A I iii. VI (2) 3.5. [C-4] 00369 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { 00370 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) 00371 return true; 00372 00373 UnicodeSpanSkipper m(u, werd); 00374 int num_segments = 0; 00375 int pos = 0; 00376 while (pos < werd->length() && num_segments < 3) { 00377 int numeral_start = m.SkipPunc(pos); 00378 if (numeral_start > pos + 1) break; 00379 int numeral_end = m.SkipRomans(numeral_start); 00380 if (numeral_end == numeral_start) { 00381 numeral_end = m.SkipDigits(numeral_start); 00382 if (numeral_end == numeral_start) { 00383 // If there's a single latin letter, we can use that. 00384 numeral_end = m.SkipAlpha(numeral_start); 00385 if (numeral_end - numeral_start != 1) 00386 break; 00387 } 00388 } 00389 // We got some sort of numeral. 00390 num_segments++; 00391 // Skip any trailing punctuation. 00392 pos = m.SkipPunc(numeral_end); 00393 if (pos == numeral_end) 00394 break; 00395 } 00396 return pos == werd->length(); 00397 } 00398 00399 // ========= Brain Dead Language Model (combined entry points) ================ 00400 00401 // Given the leftmost word of a line either as a Tesseract unicharset + werd 00402 // or a utf8 string, set the following attributes for it: 00403 // is_list - this word might be a list number or bullet. 00404 // starts_idea - this word is likely to start a sentence. 00405 // ends_idea - this word is likely to end a sentence. 00406 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00407 const STRING &utf8, 00408 bool *is_list, bool *starts_idea, bool *ends_idea) { 00409 *is_list = false; 00410 *starts_idea = false; 00411 *ends_idea = false; 00412 if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty 00413 *ends_idea = true; 00414 return; 00415 } 00416 00417 if (unicharset && werd) { // We have a proper werd and unicharset so use it. 00418 if (UniLikelyListItem(unicharset, werd)) { 00419 *is_list = true; 00420 *starts_idea = true; 00421 *ends_idea = true; 00422 } 00423 if (unicharset->get_isupper(werd->unichar_id(0))) { 00424 *starts_idea = true; 00425 } 00426 if (unicharset->get_ispunctuation(werd->unichar_id(0))) { 00427 *starts_idea = true; 00428 *ends_idea = true; 00429 } 00430 } else { // Assume utf8 is mostly ASCII 00431 if (AsciiLikelyListItem(utf8)) { 00432 *is_list = true; 00433 *starts_idea = true; 00434 } 00435 int start_letter = utf8[0]; 00436 if (IsOpeningPunct(start_letter)) { 00437 *starts_idea = true; 00438 } 00439 if (IsTerminalPunct(start_letter)) { 00440 *ends_idea = true; 00441 } 00442 if (start_letter >= 'A' && start_letter <= 'Z') { 00443 *starts_idea = true; 00444 } 00445 } 00446 } 00447 00448 // Given the rightmost word of a line either as a Tesseract unicharset + werd 00449 // or a utf8 string, set the following attributes for it: 00450 // is_list - this word might be a list number or bullet. 00451 // starts_idea - this word is likely to start a sentence. 00452 // ends_idea - this word is likely to end a sentence. 00453 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00454 const STRING &utf8, 00455 bool *is_list, bool *starts_idea, bool *ends_idea) { 00456 *is_list = false; 00457 *starts_idea = false; 00458 *ends_idea = false; 00459 if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty 00460 *ends_idea = true; 00461 return; 00462 } 00463 00464 if (unicharset && werd) { // We have a proper werd and unicharset so use it. 00465 if (UniLikelyListItem(unicharset, werd)) { 00466 *is_list = true; 00467 *starts_idea = true; 00468 } 00469 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1); 00470 if (unicharset->get_ispunctuation(last_letter)) { 00471 *ends_idea = true; 00472 } 00473 } else { // Assume utf8 is mostly ASCII 00474 if (AsciiLikelyListItem(utf8)) { 00475 *is_list = true; 00476 *starts_idea = true; 00477 } 00478 int last_letter = utf8[utf8.size() - 1]; 00479 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { 00480 *ends_idea = true; 00481 } 00482 } 00483 } 00484 00485 // =============== Implementation of RowScratchRegisters ===================== 00486 /* static */ 00487 void RowScratchRegisters::AppendDebugHeaderFields( 00488 GenericVector<STRING> *header) { 00489 header->push_back("[lmarg,lind;rind,rmarg]"); 00490 header->push_back("model"); 00491 } 00492 00493 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory, 00494 GenericVector<STRING> *dbg) const { 00495 char s[30]; 00496 snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", 00497 lmargin_, lindent_, rindent_, rmargin_); 00498 dbg->push_back(s); 00499 STRING model_string; 00500 model_string += static_cast<char>(GetLineType()); 00501 model_string += ":"; 00502 00503 int model_numbers = 0; 00504 for (int h = 0; h < hypotheses_.size(); h++) { 00505 if (hypotheses_[h].model == NULL) 00506 continue; 00507 if (model_numbers > 0) 00508 model_string += ","; 00509 if (StrongModel(hypotheses_[h].model)) { 00510 model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model)); 00511 } else if (hypotheses_[h].model == kCrownLeft) { 00512 model_string += "CrL"; 00513 } else if (hypotheses_[h].model == kCrownRight) { 00514 model_string += "CrR"; 00515 } 00516 model_numbers++; 00517 } 00518 if (model_numbers == 0) 00519 model_string += "0"; 00520 00521 dbg->push_back(model_string); 00522 } 00523 00524 void RowScratchRegisters::Init(const RowInfo &row) { 00525 ri_ = &row; 00526 lmargin_ = 0; 00527 lindent_ = row.pix_ldistance; 00528 rmargin_ = 0; 00529 rindent_ = row.pix_rdistance; 00530 } 00531 00532 LineType RowScratchRegisters::GetLineType() const { 00533 if (hypotheses_.empty()) 00534 return LT_UNKNOWN; 00535 bool has_start = false; 00536 bool has_body = false; 00537 for (int i = 0; i < hypotheses_.size(); i++) { 00538 switch (hypotheses_[i].ty) { 00539 case LT_START: has_start = true; break; 00540 case LT_BODY: has_body = true; break; 00541 default: 00542 tprintf("Encountered bad value in hypothesis list: %c\n", 00543 hypotheses_[i].ty); 00544 break; 00545 } 00546 } 00547 if (has_start && has_body) 00548 return LT_MULTIPLE; 00549 return has_start ? LT_START : LT_BODY; 00550 } 00551 00552 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { 00553 if (hypotheses_.empty()) 00554 return LT_UNKNOWN; 00555 bool has_start = false; 00556 bool has_body = false; 00557 for (int i = 0; i < hypotheses_.size(); i++) { 00558 if (hypotheses_[i].model != model) 00559 continue; 00560 switch (hypotheses_[i].ty) { 00561 case LT_START: has_start = true; break; 00562 case LT_BODY: has_body = true; break; 00563 default: 00564 tprintf("Encountered bad value in hypothesis list: %c\n", 00565 hypotheses_[i].ty); 00566 break; 00567 } 00568 } 00569 if (has_start && has_body) 00570 return LT_MULTIPLE; 00571 return has_start ? LT_START : LT_BODY; 00572 } 00573 00574 void RowScratchRegisters::SetStartLine() { 00575 LineType current_lt = GetLineType(); 00576 if (current_lt != LT_UNKNOWN && current_lt != LT_START) { 00577 tprintf("Trying to set a line to be START when it's already BODY.\n"); 00578 } 00579 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { 00580 hypotheses_.push_back_new(LineHypothesis(LT_START, NULL)); 00581 } 00582 } 00583 00584 void RowScratchRegisters::SetBodyLine() { 00585 LineType current_lt = GetLineType(); 00586 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { 00587 tprintf("Trying to set a line to be BODY when it's already START.\n"); 00588 } 00589 if (current_lt == LT_UNKNOWN || current_lt == LT_START) { 00590 hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL)); 00591 } 00592 } 00593 00594 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { 00595 hypotheses_.push_back_new(LineHypothesis(LT_START, model)); 00596 int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL)); 00597 if (old_idx >= 0) 00598 hypotheses_.remove(old_idx); 00599 } 00600 00601 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { 00602 hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); 00603 int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL)); 00604 if (old_idx >= 0) 00605 hypotheses_.remove(old_idx); 00606 } 00607 00608 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { 00609 for (int h = 0; h < hypotheses_.size(); h++) { 00610 if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) 00611 models->push_back_new(hypotheses_[h].model); 00612 } 00613 } 00614 00615 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { 00616 for (int h = 0; h < hypotheses_.size(); h++) { 00617 if (StrongModel(hypotheses_[h].model)) 00618 models->push_back_new(hypotheses_[h].model); 00619 } 00620 } 00621 00622 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { 00623 for (int h = 0; h < hypotheses_.size(); h++) { 00624 if (hypotheses_[h].model != NULL) 00625 models->push_back_new(hypotheses_[h].model); 00626 } 00627 } 00628 00629 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { 00630 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) 00631 return NULL; 00632 return hypotheses_[0].model; 00633 } 00634 00635 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { 00636 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) 00637 return NULL; 00638 return hypotheses_[0].model; 00639 } 00640 00641 // Discard any hypotheses whose model is not in the given list. 00642 void RowScratchRegisters::DiscardNonMatchingHypotheses( 00643 const SetOfModels &models) { 00644 if (models.empty()) 00645 return; 00646 for (int h = hypotheses_.size() - 1; h >= 0; h--) { 00647 if (!models.contains(hypotheses_[h].model)) { 00648 hypotheses_.remove(h); 00649 } 00650 } 00651 } 00652 00653 // ============ Geometry based Paragraph Detection Algorithm ================= 00654 00655 struct Cluster { 00656 Cluster() : center(0), count(0) {} 00657 Cluster(int cen, int num) : center(cen), count(num) {} 00658 00659 int center; // The center of the cluster. 00660 int count; // The number of entries within the cluster. 00661 }; 00662 00663 class SimpleClusterer { 00664 public: 00665 explicit SimpleClusterer(int max_cluster_width) 00666 : max_cluster_width_(max_cluster_width) {} 00667 void Add(int value) { values_.push_back(value); } 00668 int size() const { return values_.size(); } 00669 void GetClusters(GenericVector<Cluster> *clusters); 00670 00671 private: 00672 int max_cluster_width_; 00673 GenericVectorEqEq<int> values_; 00674 }; 00675 00676 // Return the index of the cluster closest to value. 00677 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) { 00678 int best_index = 0; 00679 for (int i = 0; i < clusters.size(); i++) { 00680 if (abs(value - clusters[i].center) < 00681 abs(value - clusters[best_index].center)) 00682 best_index = i; 00683 } 00684 return best_index; 00685 } 00686 00687 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) { 00688 clusters->clear(); 00689 values_.sort(); 00690 for (int i = 0; i < values_.size();) { 00691 int orig_i = i; 00692 int lo = values_[i]; 00693 int hi = lo; 00694 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { 00695 hi = values_[i]; 00696 } 00697 clusters->push_back(Cluster((hi + lo) / 2, i - orig_i)); 00698 } 00699 } 00700 00701 // Calculate left- and right-indent tab stop values seen in 00702 // rows[row_start, row_end) given a tolerance of tolerance. 00703 void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, 00704 int row_start, int row_end, 00705 int tolerance, 00706 GenericVector<Cluster> *left_tabs, 00707 GenericVector<Cluster> *right_tabs) { 00708 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) 00709 return; 00710 // First pass: toss all left and right indents into clusterers. 00711 SimpleClusterer initial_lefts(tolerance); 00712 SimpleClusterer initial_rights(tolerance); 00713 GenericVector<Cluster> initial_left_tabs; 00714 GenericVector<Cluster> initial_right_tabs; 00715 for (int i = row_start; i < row_end; i++) { 00716 initial_lefts.Add((*rows)[i].lindent_); 00717 initial_rights.Add((*rows)[i].rindent_); 00718 } 00719 initial_lefts.GetClusters(&initial_left_tabs); 00720 initial_rights.GetClusters(&initial_right_tabs); 00721 00722 // Second pass: cluster only lines that are not "stray" 00723 // An example of a stray line is a page number -- a line whose start 00724 // and end tab-stops are far outside the typical start and end tab-stops 00725 // for the block. 00726 // Put another way, we only cluster data from lines whose start or end 00727 // tab stop is frequent. 00728 SimpleClusterer lefts(tolerance); 00729 SimpleClusterer rights(tolerance); 00730 int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer; 00731 for (int i = row_start; i < row_end; i++) { 00732 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); 00733 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); 00734 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore || 00735 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) { 00736 lefts.Add((*rows)[i].lindent_); 00737 rights.Add((*rows)[i].rindent_); 00738 } 00739 } 00740 lefts.GetClusters(left_tabs); 00741 rights.GetClusters(right_tabs); 00742 } 00743 00744 // Given a paragraph model mark rows[row_start, row_end) as said model 00745 // start or body lines. 00746 // 00747 // Case 1: model->first_indent_ != model->body_indent_ 00748 // Differentiating the paragraph start lines from the paragraph body lines in 00749 // this case is easy, we just see how far each line is indented. 00750 // 00751 // Case 2: model->first_indent_ == model->body_indent_ 00752 // Here, we find end-of-paragraph lines by looking for "short lines." 00753 // What constitutes a "short line" changes depending on whether the text 00754 // ragged-right[left] or fully justified (aligned left and right). 00755 // 00756 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0) 00757 // We have a new paragraph it the first word would have at the end 00758 // of the previous line. 00759 // 00760 // Case 2b: Fully Justified. (eop_threshold > 0) 00761 // We mark a line as short (end of paragraph) if the offside indent 00762 // is greater than eop_threshold. 00763 void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, 00764 int row_start, int row_end, 00765 const ParagraphModel *model, 00766 bool ltr, 00767 int eop_threshold) { 00768 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) 00769 return; 00770 for (int row = row_start; row < row_end; row++) { 00771 bool valid_first = ValidFirstLine(rows, row, model); 00772 bool valid_body = ValidBodyLine(rows, row, model); 00773 if (valid_first && !valid_body) { 00774 (*rows)[row].AddStartLine(model); 00775 } else if (valid_body && !valid_first) { 00776 (*rows)[row].AddBodyLine(model); 00777 } else if (valid_body && valid_first) { 00778 bool after_eop = (row == row_start); 00779 if (row > row_start) { 00780 if (eop_threshold > 0) { 00781 if (model->justification() == JUSTIFICATION_LEFT) { 00782 after_eop = (*rows)[row - 1].rindent_ > eop_threshold; 00783 } else { 00784 after_eop = (*rows)[row - 1].lindent_ > eop_threshold; 00785 } 00786 } else { 00787 after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], 00788 model->justification()); 00789 } 00790 } 00791 if (after_eop) { 00792 (*rows)[row].AddStartLine(model); 00793 } else { 00794 (*rows)[row].AddBodyLine(model); 00795 } 00796 } else { 00797 // Do nothing. Stray row. 00798 } 00799 } 00800 } 00801 00802 // GeometricClassifierState holds all of the information we'll use while 00803 // trying to determine a paragraph model for the text lines in a block of 00804 // text: 00805 // + the rows under consideration [row_start, row_end) 00806 // + the common left- and right-indent tab stops 00807 // + does the block start out left-to-right or right-to-left 00808 // Further, this struct holds the data we amass for the (single) ParagraphModel 00809 // we'll assign to the text lines (assuming we get that far). 00810 struct GeometricClassifierState { 00811 GeometricClassifierState(int dbg_level, 00812 GenericVector<RowScratchRegisters> *r, 00813 int r_start, int r_end) 00814 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end), 00815 margin(0) { 00816 tolerance = InterwordSpace(*r, r_start, r_end); 00817 CalculateTabStops(r, r_start, r_end, tolerance, 00818 &left_tabs, &right_tabs); 00819 ltr = (*r)[r_start].ri_->ltr; 00820 } 00821 00822 void AssumeLeftJustification() { 00823 just = tesseract::JUSTIFICATION_LEFT; 00824 margin = (*rows)[row_start].lmargin_; 00825 } 00826 00827 void AssumeRightJustification() { 00828 just = tesseract::JUSTIFICATION_RIGHT; 00829 margin = (*rows)[row_start].rmargin_; 00830 } 00831 00832 // Align tabs are the tab stops the text is aligned to. 00833 const GenericVector<Cluster> &AlignTabs() const { 00834 if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; 00835 return left_tabs; 00836 } 00837 00838 // Offside tabs are the tab stops opposite the tabs used to align the text. 00839 // 00840 // Note that for a left-to-right text which is aligned to the right such as 00841 // this function comment, the offside tabs are the horizontal tab stops 00842 // marking the beginning of ("Note", "this" and "marking"). 00843 const GenericVector<Cluster> &OffsideTabs() const { 00844 if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; 00845 return right_tabs; 00846 } 00847 00848 // Return whether the i'th row extends from the leftmost left tab stop 00849 // to the right most right tab stop. 00850 bool IsFullRow(int i) const { 00851 return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && 00852 ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; 00853 } 00854 00855 int AlignsideTabIndex(int row_idx) const { 00856 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); 00857 } 00858 00859 // Given what we know about the paragraph justification (just), would the 00860 // first word of row_b have fit at the end of row_a? 00861 bool FirstWordWouldHaveFit(int row_a, int row_b) { 00862 return ::tesseract::FirstWordWouldHaveFit( 00863 (*rows)[row_a], (*rows)[row_b], just); 00864 } 00865 00866 void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } 00867 00868 void Fail(int min_debug_level, const char *why) const { 00869 if (debug_level < min_debug_level) return; 00870 tprintf("# %s\n", why); 00871 PrintRows(); 00872 } 00873 00874 ParagraphModel Model() const { 00875 return ParagraphModel(just, margin, first_indent, body_indent, tolerance); 00876 } 00877 00878 // We print out messages with a debug level at least as great as debug_level. 00879 int debug_level; 00880 00881 // The Geometric Classifier was asked to find a single paragraph model 00882 // to fit the text rows (*rows)[row_start, row_end) 00883 GenericVector<RowScratchRegisters> *rows; 00884 int row_start; 00885 int row_end; 00886 00887 // The amount by which we expect the text edge can vary and still be aligned. 00888 int tolerance; 00889 00890 // Is the script in this text block left-to-right? 00891 // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve 00892 bool ltr; 00893 00894 // These left and right tab stops were determined to be the common tab 00895 // stops for the given text. 00896 GenericVector<Cluster> left_tabs; 00897 GenericVector<Cluster> right_tabs; 00898 00899 // These are parameters we must determine to create a ParagraphModel. 00900 tesseract::ParagraphJustification just; 00901 int margin; 00902 int first_indent; 00903 int body_indent; 00904 00905 // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() 00906 int eop_threshold; 00907 }; 00908 00909 // Given a section of text where strong textual clues did not help identifying 00910 // paragraph breaks, and for which the left and right indents have exactly 00911 // three tab stops between them, attempt to find the paragraph breaks based 00912 // solely on the outline of the text and whether the script is left-to-right. 00913 // 00914 // Algorithm Detail: 00915 // The selected rows are in the form of a rectangle except 00916 // for some number of "short lines" of the same length: 00917 // 00918 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx 00919 // xxxxxxxxxxx xxxxxxxxxx # A "short" line. 00920 // xxxxxxxxxxxxx xxxxxxxxxxxx 00921 // xxxxxxxxxxxxx xxxxxxxxxxxx 00922 // 00923 // We have a slightly different situation if the only short 00924 // line is at the end of the excerpt. 00925 // 00926 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx 00927 // xxxxxxxxxxxxx xxxxxxxxxxxx 00928 // xxxxxxxxxxxxx xxxxxxxxxxxx 00929 // xxxxxxxxxxx xxxxxxxxxx # A "short" line. 00930 // 00931 // We'll interpret these as follows based on the reasoning in the comment for 00932 // GeometricClassify(): 00933 // [script direction: first indent, body indent] 00934 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 00935 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 00936 void GeometricClassifyThreeTabStopTextBlock( 00937 int debug_level, 00938 GeometricClassifierState &s, 00939 ParagraphTheory *theory) { 00940 int num_rows = s.row_end - s.row_start; 00941 int num_full_rows = 0; 00942 int last_row_full = 0; 00943 for (int i = s.row_start; i < s.row_end; i++) { 00944 if (s.IsFullRow(i)) { 00945 num_full_rows++; 00946 if (i == s.row_end - 1) last_row_full++; 00947 } 00948 } 00949 00950 if (num_full_rows < 0.7 * num_rows) { 00951 s.Fail(1, "Not enough full lines to know which lines start paras."); 00952 return; 00953 } 00954 00955 // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() 00956 s.eop_threshold = 0; 00957 00958 if (s.ltr) { 00959 s.AssumeLeftJustification(); 00960 } else { 00961 s.AssumeRightJustification(); 00962 } 00963 00964 if (debug_level > 0) { 00965 tprintf("# Not enough variety for clear outline classification. " 00966 "Guessing these are %s aligned based on script.\n", 00967 s.ltr ? "left" : "right"); 00968 s.PrintRows(); 00969 } 00970 00971 if (s.AlignTabs().size() == 2) { // case A1 or A2 00972 s.first_indent = s.AlignTabs()[1].center; 00973 s.body_indent = s.AlignTabs()[0].center; 00974 } else { // case B1 or B2 00975 if (num_rows - 1 == num_full_rows - last_row_full) { 00976 // case B2 00977 const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight; 00978 (*s.rows)[s.row_start].AddStartLine(model); 00979 for (int i = s.row_start + 1; i < s.row_end; i++) { 00980 (*s.rows)[i].AddBodyLine(model); 00981 } 00982 return; 00983 } else { 00984 // case B1 00985 s.first_indent = s.body_indent = s.AlignTabs()[0].center; 00986 s.eop_threshold = (s.OffsideTabs()[0].center + 00987 s.OffsideTabs()[1].center) / 2; 00988 } 00989 } 00990 const ParagraphModel *model = theory->AddModel(s.Model()); 00991 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, 00992 s.ltr, s.eop_threshold); 00993 return; 00994 } 00995 00996 // This function is called if strong textual clues were not available, but 00997 // the caller hopes that the paragraph breaks will be super obvious just 00998 // by the outline of the text. 00999 // 01000 // The particularly difficult case is figuring out what's going on if you 01001 // don't have enough short paragraph end lines to tell us what's going on. 01002 // 01003 // For instance, let's say you have the following outline: 01004 // 01005 // (A1) xxxxxxxxxxxxxxxxxxxxxx 01006 // xxxxxxxxxxxxxxxxxxxx 01007 // xxxxxxxxxxxxxxxxxxxxxx 01008 // xxxxxxxxxxxxxxxxxxxxxx 01009 // 01010 // Even if we know that the text is left-to-right and so will probably be 01011 // left-aligned, both of the following are possible texts: 01012 // 01013 // (A1a) 1. Here our list item 01014 // with two full lines. 01015 // 2. Here a second item. 01016 // 3. Here our third one. 01017 // 01018 // (A1b) so ends paragraph one. 01019 // Here starts another 01020 // paragraph we want to 01021 // read. This continues 01022 // 01023 // These examples are obvious from the text and should have been caught 01024 // by the StrongEvidenceClassify pass. However, for languages where we don't 01025 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), 01026 // it's worth guessing that (A1b) is the correct interpretation if there are 01027 // far more "full" lines than "short" lines. 01028 void GeometricClassify(int debug_level, 01029 GenericVector<RowScratchRegisters> *rows, 01030 int row_start, int row_end, 01031 ParagraphTheory *theory) { 01032 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) 01033 return; 01034 if (debug_level > 1) { 01035 tprintf("###############################################\n"); 01036 tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", 01037 row_start, row_end); 01038 tprintf("###############################################\n"); 01039 } 01040 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); 01041 01042 GeometricClassifierState s(debug_level, rows, row_start, row_end); 01043 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { 01044 s.Fail(2, "Too much variety for simple outline classification."); 01045 return; 01046 } 01047 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { 01048 s.Fail(1, "Not enough variety for simple outline classification."); 01049 return; 01050 } 01051 if (s.left_tabs.size() + s.right_tabs.size() == 3) { 01052 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); 01053 return; 01054 } 01055 01056 // At this point, we know that one side has at least two tab stops, and the 01057 // other side has one or two tab stops. 01058 // Left to determine: 01059 // (1) Which is the body indent and which is the first line indent? 01060 // (2) Is the text fully justified? 01061 01062 // If one side happens to have three or more tab stops, assume that side 01063 // is opposite of the aligned side. 01064 if (s.right_tabs.size() > 2) { 01065 s.AssumeLeftJustification(); 01066 } else if (s.left_tabs.size() > 2) { 01067 s.AssumeRightJustification(); 01068 } else if (s.ltr) { // guess based on script direction 01069 s.AssumeLeftJustification(); 01070 } else { 01071 s.AssumeRightJustification(); 01072 } 01073 01074 if (s.AlignTabs().size() == 2) { 01075 // For each tab stop on the aligned side, how many of them appear 01076 // to be paragraph start lines? [first lines] 01077 int firsts[2] = {0, 0}; 01078 // Count the first line as a likely paragraph start line. 01079 firsts[s.AlignsideTabIndex(s.row_start)]++; 01080 // For each line, if the first word would have fit on the previous 01081 // line count it as a likely paragraph start line. 01082 for (int i = s.row_start + 1; i < s.row_end; i++) { 01083 if (s.FirstWordWouldHaveFit(i - 1, i)) { 01084 firsts[s.AlignsideTabIndex(i)]++; 01085 } 01086 } 01087 // Make an extra accounting for the last line of the paragraph just 01088 // in case it's the only short line in the block. That is, take its 01089 // first word as typical and see if this looks like the *last* line 01090 // of a paragraph. If so, mark the *other* indent as probably a first. 01091 if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { 01092 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; 01093 } 01094 01095 int percent0firsts, percent1firsts; 01096 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; 01097 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; 01098 01099 // TODO(eger): Tune these constants if necessary. 01100 if ((percent0firsts < 20 && 30 < percent1firsts) || 01101 percent0firsts + 30 < percent1firsts) { 01102 s.first_indent = s.AlignTabs()[1].center; 01103 s.body_indent = s.AlignTabs()[0].center; 01104 } else if ((percent1firsts < 20 && 30 < percent0firsts) || 01105 percent1firsts + 30 < percent0firsts) { 01106 s.first_indent = s.AlignTabs()[0].center; 01107 s.body_indent = s.AlignTabs()[1].center; 01108 } else { 01109 // Ambiguous! Probably lineated (poetry) 01110 if (debug_level > 1) { 01111 tprintf("# Cannot determine %s indent likely to start paragraphs.\n", 01112 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); 01113 tprintf("# Indent of %d looks like a first line %d%% of the time.\n", 01114 s.AlignTabs()[0].center, percent0firsts); 01115 tprintf("# Indent of %d looks like a first line %d%% of the time.\n", 01116 s.AlignTabs()[1].center, percent1firsts); 01117 s.PrintRows(); 01118 } 01119 return; 01120 } 01121 } else { 01122 // There's only one tab stop for the "aligned to" side. 01123 s.first_indent = s.body_indent = s.AlignTabs()[0].center; 01124 } 01125 01126 // At this point, we have our model. 01127 const ParagraphModel *model = theory->AddModel(s.Model()); 01128 01129 // Now all we have to do is figure out if the text is fully justified or not. 01130 // eop_threshold: default to fully justified unless we see evidence below. 01131 // See description on MarkRowsWithModel() 01132 s.eop_threshold = 01133 (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; 01134 // If the text is not fully justified, re-set the eop_threshold to 0. 01135 if (s.AlignTabs().size() == 2) { 01136 // Paragraphs with a paragraph-start indent. 01137 for (int i = s.row_start; i < s.row_end - 1; i++) { 01138 if (ValidFirstLine(s.rows, i + 1, model) && 01139 !NearlyEqual(s.OffsideTabs()[0].center, 01140 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { 01141 // We found a non-end-of-paragraph short line: not fully justified. 01142 s.eop_threshold = 0; 01143 break; 01144 } 01145 } 01146 } else { 01147 // Paragraphs with no paragraph-start indent. 01148 for (int i = s.row_start; i < s.row_end - 1; i++) { 01149 if (!s.FirstWordWouldHaveFit(i, i + 1) && 01150 !NearlyEqual(s.OffsideTabs()[0].center, 01151 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { 01152 // We found a non-end-of-paragraph short line: not fully justified. 01153 s.eop_threshold = 0; 01154 break; 01155 } 01156 } 01157 } 01158 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); 01159 } 01160 01161 // =============== Implementation of ParagraphTheory ===================== 01162 01163 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { 01164 for (int i = 0; i < models_->size(); i++) { 01165 if ((*models_)[i]->Comparable(model)) 01166 return (*models_)[i]; 01167 } 01168 ParagraphModel *m = new ParagraphModel(model); 01169 models_->push_back(m); 01170 models_we_added_.push_back_new(m); 01171 return m; 01172 } 01173 01174 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { 01175 for (int i = models_->size() - 1; i >= 0; i--) { 01176 ParagraphModel *m = (*models_)[i]; 01177 if (!used_models.contains(m) && models_we_added_.contains(m)) { 01178 delete m; 01179 models_->remove(i); 01180 models_we_added_.remove(models_we_added_.get_index(m)); 01181 } 01182 } 01183 } 01184 01185 // Examine rows[start, end) and try to determine if an existing non-centered 01186 // paragraph model would fit them perfectly. If so, return a pointer to it. 01187 // If not, return NULL. 01188 const ParagraphModel *ParagraphTheory::Fits( 01189 const GenericVector<RowScratchRegisters> *rows, int start, int end) const { 01190 for (int m = 0; m < models_->size(); m++) { 01191 const ParagraphModel *model = (*models_)[m]; 01192 if (model->justification() != JUSTIFICATION_CENTER && 01193 RowsFitModel(rows, start, end, model)) 01194 return model; 01195 } 01196 return NULL; 01197 } 01198 01199 void ParagraphTheory::NonCenteredModels(SetOfModels *models) { 01200 for (int m = 0; m < models_->size(); m++) { 01201 const ParagraphModel *model = (*models_)[m]; 01202 if (model->justification() != JUSTIFICATION_CENTER) 01203 models->push_back_new(model); 01204 } 01205 } 01206 01207 int ParagraphTheory::IndexOf(const ParagraphModel *model) const { 01208 for (int i = 0; i < models_->size(); i++) { 01209 if ((*models_)[i] == model) 01210 return i; 01211 } 01212 return -1; 01213 } 01214 01215 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, 01216 int row, const ParagraphModel *model) { 01217 if (!StrongModel(model)) { 01218 tprintf("ValidFirstLine() should only be called with strong models!\n"); 01219 } 01220 return StrongModel(model) && 01221 model->ValidFirstLine( 01222 (*rows)[row].lmargin_, (*rows)[row].lindent_, 01223 (*rows)[row].rindent_, (*rows)[row].rmargin_); 01224 } 01225 01226 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, 01227 int row, const ParagraphModel *model) { 01228 if (!StrongModel(model)) { 01229 tprintf("ValidBodyLine() should only be called with strong models!\n"); 01230 } 01231 return StrongModel(model) && 01232 model->ValidBodyLine( 01233 (*rows)[row].lmargin_, (*rows)[row].lindent_, 01234 (*rows)[row].rindent_, (*rows)[row].rmargin_); 01235 } 01236 01237 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, 01238 int a, int b, const ParagraphModel *model) { 01239 if (model != kCrownRight && model != kCrownLeft) { 01240 tprintf("CrownCompatible() should only be called with crown models!\n"); 01241 return false; 01242 } 01243 RowScratchRegisters &row_a = (*rows)[a]; 01244 RowScratchRegisters &row_b = (*rows)[b]; 01245 if (model == kCrownRight) { 01246 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, 01247 row_b.rindent_ + row_b.rmargin_, 01248 Epsilon(row_a.ri_->average_interword_space)); 01249 } 01250 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, 01251 row_b.lindent_ + row_b.lmargin_, 01252 Epsilon(row_a.ri_->average_interword_space)); 01253 } 01254 01255 01256 // =============== Implementation of ParagraphModelSmearer ==================== 01257 01258 ParagraphModelSmearer::ParagraphModelSmearer( 01259 GenericVector<RowScratchRegisters> *rows, 01260 int row_start, int row_end, ParagraphTheory *theory) 01261 : theory_(theory), rows_(rows), row_start_(row_start), 01262 row_end_(row_end) { 01263 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { 01264 row_start_ = 0; 01265 row_end_ = 0; 01266 return; 01267 } 01268 SetOfModels no_models; 01269 for (int row = row_start - 1; row <= row_end; row++) { 01270 open_models_.push_back(no_models); 01271 } 01272 } 01273 01274 // see paragraphs_internal.h 01275 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { 01276 SetOfModels no_models; 01277 if (row_start < row_start_) row_start = row_start_; 01278 if (row_end > row_end_) row_end = row_end_; 01279 01280 for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; 01281 row++) { 01282 if ((*rows_)[row].ri_->num_words == 0) { 01283 OpenModels(row + 1) = no_models; 01284 } else { 01285 SetOfModels &opened = OpenModels(row); 01286 (*rows_)[row].StartHypotheses(&opened); 01287 01288 // Which models survive the transition from row to row + 1? 01289 SetOfModels still_open; 01290 for (int m = 0; m < opened.size(); m++) { 01291 if (ValidFirstLine(rows_, row, opened[m]) || 01292 ValidBodyLine(rows_, row, opened[m])) { 01293 // This is basic filtering; we check likely paragraph starty-ness down 01294 // below in Smear() -- you know, whether the first word would have fit 01295 // and such. 01296 still_open.push_back_new(opened[m]); 01297 } 01298 } 01299 OpenModels(row + 1) = still_open; 01300 } 01301 } 01302 } 01303 01304 // see paragraphs_internal.h 01305 void ParagraphModelSmearer::Smear() { 01306 CalculateOpenModels(row_start_, row_end_); 01307 01308 // For each row which we're unsure about (that is, it is LT_UNKNOWN or 01309 // we have multiple LT_START hypotheses), see if there's a model that 01310 // was recently used (an "open" model) which might model it well. 01311 for (int i = row_start_; i < row_end_; i++) { 01312 RowScratchRegisters &row = (*rows_)[i]; 01313 if (row.ri_->num_words == 0) 01314 continue; 01315 01316 // Step One: 01317 // Figure out if there are "open" models which are left-alined or 01318 // right-aligned. This is important for determining whether the 01319 // "first" word in a row would fit at the "end" of the previous row. 01320 bool left_align_open = false; 01321 bool right_align_open = false; 01322 for (int m = 0; m < OpenModels(i).size(); m++) { 01323 switch (OpenModels(i)[m]->justification()) { 01324 case JUSTIFICATION_LEFT: left_align_open = true; break; 01325 case JUSTIFICATION_RIGHT: right_align_open = true; break; 01326 default: left_align_open = right_align_open = true; 01327 } 01328 } 01329 // Step Two: 01330 // Use that knowledge to figure out if this row is likely to 01331 // start a paragraph. 01332 bool likely_start; 01333 if (i == 0) { 01334 likely_start = true; 01335 } else { 01336 if ((left_align_open && right_align_open) || 01337 (!left_align_open && !right_align_open)) { 01338 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01339 JUSTIFICATION_LEFT) || 01340 LikelyParagraphStart((*rows_)[i - 1], row, 01341 JUSTIFICATION_RIGHT); 01342 } else if (left_align_open) { 01343 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01344 JUSTIFICATION_LEFT); 01345 } else { 01346 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01347 JUSTIFICATION_RIGHT); 01348 } 01349 } 01350 01351 // Step Three: 01352 // If this text line seems like an obvious first line of an 01353 // open model, or an obvious continuation of an existing 01354 // modelled paragraph, mark it up. 01355 if (likely_start) { 01356 // Add Start Hypotheses for all Open models that fit. 01357 for (int m = 0; m < OpenModels(i).size(); m++) { 01358 if (ValidFirstLine(rows_, i, OpenModels(i)[m])) { 01359 row.AddStartLine(OpenModels(i)[m]); 01360 } 01361 } 01362 } else { 01363 // Add relevant body line hypotheses. 01364 SetOfModels last_line_models; 01365 if (i > 0) { 01366 (*rows_)[i - 1].StrongHypotheses(&last_line_models); 01367 } else { 01368 theory_->NonCenteredModels(&last_line_models); 01369 } 01370 for (int m = 0; m < last_line_models.size(); m++) { 01371 const ParagraphModel *model = last_line_models[m]; 01372 if (ValidBodyLine(rows_, i, model)) 01373 row.AddBodyLine(model); 01374 } 01375 } 01376 01377 // Step Four: 01378 // If we're still quite unsure about this line, go through all 01379 // models in our theory and see if this row could be the start 01380 // of any of our models. 01381 if (row.GetLineType() == LT_UNKNOWN || 01382 (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { 01383 SetOfModels all_models; 01384 theory_->NonCenteredModels(&all_models); 01385 for (int m = 0; m < all_models.size(); m++) { 01386 if (ValidFirstLine(rows_, i, all_models[m])) { 01387 row.AddStartLine(all_models[m]); 01388 } 01389 } 01390 } 01391 // Step Five: 01392 // Since we may have updated the hypotheses about this row, we need 01393 // to recalculate the Open models for the rest of rows[i + 1, row_end) 01394 if (row.GetLineType() != LT_UNKNOWN) { 01395 CalculateOpenModels(i + 1, row_end_); 01396 } 01397 } 01398 } 01399 01400 // ================ Main Paragraph Detection Algorithm ======================= 01401 01402 // Find out what ParagraphModels are actually used, and discard any 01403 // that are not. 01404 void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows, 01405 ParagraphTheory *theory) { 01406 SetOfModels used_models; 01407 for (int i = 0; i < rows.size(); i++) { 01408 rows[i].StrongHypotheses(&used_models); 01409 } 01410 theory->DiscardUnusedModels(used_models); 01411 } 01412 01413 // DowngradeWeakestToCrowns: 01414 // Forget any flush-{left, right} models unless we see two or more 01415 // of them in sequence. 01416 // 01417 // In pass 3, we start to classify even flush-left paragraphs (paragraphs 01418 // where the first line and body indent are the same) as having proper Models. 01419 // This is generally dangerous, since if you start imagining that flush-left 01420 // is a typical paragraph model when it is not, it will lead you to chop normal 01421 // indented paragraphs in the middle whenever a sentence happens to start on a 01422 // new line (see "This" above). What to do? 01423 // What we do is to take any paragraph which is flush left and is not 01424 // preceded by another paragraph of the same model and convert it to a "Crown" 01425 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder 01426 // for later. It means that the paragraph is flush, but it would be desirable 01427 // to mark it as the same model as following text if it fits. This downgrade 01428 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we 01429 // avoid making flush left Paragraph Models whenever we see a top-of-the-page 01430 // half-of-a-paragraph. and instead we mark it the same as normal body text. 01431 // 01432 // Implementation: 01433 // 01434 // Comb backwards through the row scratch registers, and turn any 01435 // sequences of body lines of equivalent type abutted against the beginning 01436 // or a body or start line of a different type into a crown paragraph. 01437 void DowngradeWeakestToCrowns(int debug_level, 01438 ParagraphTheory *theory, 01439 GenericVector<RowScratchRegisters> *rows) { 01440 int start; 01441 for (int end = rows->size(); end > 0; end = start) { 01442 // Search back for a body line of a unique type. 01443 const ParagraphModel *model = NULL; 01444 while (end > 0 && 01445 (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) { 01446 end--; 01447 } 01448 if (end == 0) break; 01449 start = end - 1; 01450 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { 01451 start--; // walk back to the first line that is not the same body type. 01452 } 01453 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && 01454 StrongModel(model) && 01455 NearlyEqual(model->first_indent(), model->body_indent(), 01456 model->tolerance())) { 01457 start--; 01458 } 01459 start++; 01460 // Now rows[start, end) is a sequence of unique body hypotheses of model. 01461 if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) 01462 continue; 01463 if (!StrongModel(model)) { 01464 while (start > 0 && 01465 CrownCompatible(rows, start - 1, start, model)) 01466 start--; 01467 } 01468 if (start == 0 || 01469 (!StrongModel(model)) || 01470 (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { 01471 // crownify rows[start, end) 01472 const ParagraphModel *crown_model = model; 01473 if (StrongModel(model)) { 01474 if (model->justification() == JUSTIFICATION_LEFT) 01475 crown_model = kCrownLeft; 01476 else 01477 crown_model = kCrownRight; 01478 } 01479 (*rows)[start].SetUnknown(); 01480 (*rows)[start].AddStartLine(crown_model); 01481 for (int row = start + 1; row < end; row++) { 01482 (*rows)[row].SetUnknown(); 01483 (*rows)[row].AddBodyLine(crown_model); 01484 } 01485 } 01486 } 01487 DiscardUnusedModels(*rows, theory); 01488 } 01489 01490 01491 // Clear all hypotheses about lines [start, end) and reset margins. 01492 // 01493 // The empty space between the left of a row and the block boundary (and 01494 // similarly for the right) is split into two pieces: margin and indent. 01495 // In initial processing, we assume the block is tight and the margin for 01496 // all lines is set to zero. However, if our first pass does not yield 01497 // models for everything, it may be due to an inset paragraph like a 01498 // block-quote. In that case, we make a second pass over that unmarked 01499 // section of the page and reset the "margin" portion of the empty space 01500 // to the common amount of space at the ends of the lines under consid- 01501 // eration. This would be equivalent to percentile set to 0. However, 01502 // sometimes we have a single character sticking out in the right margin 01503 // of a text block (like the 'r' in 'for' on line 3 above), and we can 01504 // really just ignore it as an outlier. To express this, we allow the 01505 // user to specify the percentile (0..100) of indent values to use as 01506 // the common margin for each row in the run of rows[start, end). 01507 void RecomputeMarginsAndClearHypotheses( 01508 GenericVector<RowScratchRegisters> *rows, int start, int end, 01509 int percentile) { 01510 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) 01511 return; 01512 01513 int lmin, lmax, rmin, rmax; 01514 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; 01515 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; 01516 for (int i = start; i < end; i++) { 01517 RowScratchRegisters &sr = (*rows)[i]; 01518 sr.SetUnknown(); 01519 if (sr.ri_->num_words == 0) 01520 continue; 01521 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); 01522 UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); 01523 } 01524 STATS lefts(lmin, lmax + 1); 01525 STATS rights(rmin, rmax + 1); 01526 for (int i = start; i < end; i++) { 01527 RowScratchRegisters &sr = (*rows)[i]; 01528 if (sr.ri_->num_words == 0) 01529 continue; 01530 lefts.add(sr.lmargin_ + sr.lindent_, 1); 01531 rights.add(sr.rmargin_ + sr.rindent_, 1); 01532 } 01533 int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); 01534 int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); 01535 for (int i = start; i < end; i++) { 01536 RowScratchRegisters &sr = (*rows)[i]; 01537 int ldelta = ignorable_left - sr.lmargin_; 01538 sr.lmargin_ += ldelta; 01539 sr.lindent_ -= ldelta; 01540 int rdelta = ignorable_right - sr.rmargin_; 01541 sr.rmargin_ += rdelta; 01542 sr.rindent_ -= rdelta; 01543 } 01544 } 01545 01546 // Return the minimum inter-word space in rows[row_start, row_end). 01547 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, 01548 int row_start, int row_end) { 01549 if (row_end < row_start + 1) return 1; 01550 bool legit = false; 01551 int natural_space = rows[row_start].ri_->average_interword_space; 01552 for (int i = row_start; i < row_end; i++) { 01553 if (rows[i].ri_->num_words > 1) { 01554 if (!legit) { 01555 natural_space = rows[i].ri_->average_interword_space; 01556 legit = true; 01557 } else { 01558 if (rows[i].ri_->average_interword_space < natural_space) 01559 natural_space = rows[i].ri_->average_interword_space; 01560 } 01561 } 01562 } 01563 return natural_space; 01564 } 01565 01566 // Return whether the first word on the after line can fit in the space at 01567 // the end of the before line (knowing which way the text is aligned and read). 01568 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 01569 const RowScratchRegisters &after, 01570 tesseract::ParagraphJustification justification) { 01571 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) 01572 return true; 01573 01574 if (justification == JUSTIFICATION_UNKNOWN) { 01575 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); 01576 } 01577 int available_space; 01578 if (justification == JUSTIFICATION_CENTER) { 01579 available_space = before.lindent_ + before.rindent_; 01580 } else { 01581 available_space = before.OffsideIndent(justification); 01582 } 01583 available_space -= before.ri_->average_interword_space; 01584 01585 if (before.ri_->ltr) 01586 return after.ri_->lword_box.width() < available_space; 01587 return after.ri_->rword_box.width() < available_space; 01588 } 01589 01590 // Return whether the first word on the after line can fit in the space at 01591 // the end of the before line (not knowing which way the text goes) in a left 01592 // or right alignemnt. 01593 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 01594 const RowScratchRegisters &after) { 01595 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) 01596 return true; 01597 01598 int available_space = before.lindent_; 01599 if (before.rindent_ > available_space) 01600 available_space = before.rindent_; 01601 available_space -= before.ri_->average_interword_space; 01602 01603 if (before.ri_->ltr) 01604 return after.ri_->lword_box.width() < available_space; 01605 return after.ri_->rword_box.width() < available_space; 01606 } 01607 01608 bool TextSupportsBreak(const RowScratchRegisters &before, 01609 const RowScratchRegisters &after) { 01610 if (before.ri_->ltr) { 01611 return before.ri_->rword_likely_ends_idea && 01612 after.ri_->lword_likely_starts_idea; 01613 } else { 01614 return before.ri_->lword_likely_ends_idea && 01615 after.ri_->rword_likely_starts_idea; 01616 } 01617 } 01618 01619 bool LikelyParagraphStart(const RowScratchRegisters &before, 01620 const RowScratchRegisters &after) { 01621 return before.ri_->num_words == 0 || 01622 (FirstWordWouldHaveFit(before, after) && 01623 TextSupportsBreak(before, after)); 01624 } 01625 01626 bool LikelyParagraphStart(const RowScratchRegisters &before, 01627 const RowScratchRegisters &after, 01628 tesseract::ParagraphJustification j) { 01629 return before.ri_->num_words == 0 || 01630 (FirstWordWouldHaveFit(before, after, j) && 01631 TextSupportsBreak(before, after)); 01632 } 01633 01634 // Examine rows[start, end) and try to determine what sort of ParagraphModel 01635 // would fit them as a single paragraph. 01636 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. 01637 // If the rows given could be a consistent start to a paragraph, set *consistent 01638 // true. 01639 ParagraphModel InternalParagraphModelByOutline( 01640 const GenericVector<RowScratchRegisters> *rows, 01641 int start, int end, int tolerance, bool *consistent) { 01642 int ltr_line_count = 0; 01643 for (int i = start; i < end; i++) { 01644 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr); 01645 } 01646 bool ltr = (ltr_line_count >= (end - start) / 2); 01647 01648 *consistent = true; 01649 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) 01650 return ParagraphModel(); 01651 01652 // Ensure the caller only passed us a region with a common rmargin and 01653 // lmargin. 01654 int lmargin = (*rows)[start].lmargin_; 01655 int rmargin = (*rows)[start].rmargin_; 01656 int lmin, lmax, rmin, rmax, cmin, cmax; 01657 lmin = lmax = (*rows)[start + 1].lindent_; 01658 rmin = rmax = (*rows)[start + 1].rindent_; 01659 cmin = cmax = 0; 01660 for (int i = start + 1; i < end; i++) { 01661 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { 01662 tprintf("Margins don't match! Software error.\n"); 01663 *consistent = false; 01664 return ParagraphModel(); 01665 } 01666 UpdateRange((*rows)[i].lindent_, &lmin, &lmax); 01667 UpdateRange((*rows)[i].rindent_, &rmin, &rmax); 01668 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); 01669 } 01670 int ldiff = lmax - lmin; 01671 int rdiff = rmax - rmin; 01672 int cdiff = cmax - cmin; 01673 if (rdiff > tolerance && ldiff > tolerance) { 01674 if (cdiff < tolerance * 2) { 01675 if (end - start < 3) 01676 return ParagraphModel(); 01677 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); 01678 } 01679 *consistent = false; 01680 return ParagraphModel(); 01681 } 01682 if (end - start < 3) // Don't return a model for two line paras. 01683 return ParagraphModel(); 01684 01685 // These booleans keep us from saying something is aligned left when the body 01686 // left variance is too large. 01687 bool body_admits_left_alignment = ldiff < tolerance; 01688 bool body_admits_right_alignment = rdiff < tolerance; 01689 01690 ParagraphModel left_model = 01691 ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, 01692 (lmin + lmax) / 2, tolerance); 01693 ParagraphModel right_model = 01694 ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, 01695 (rmin + rmax) / 2, tolerance); 01696 01697 // These booleans keep us from having an indent on the "wrong side" for the 01698 // first line. 01699 bool text_admits_left_alignment = ltr || left_model.is_flush(); 01700 bool text_admits_right_alignment = !ltr || right_model.is_flush(); 01701 01702 // At least one of the edges is less than tolerance in variance. 01703 // If the other is obviously ragged, it can't be the one aligned to. 01704 // [Note the last line is included in this raggedness.] 01705 if (tolerance < rdiff) { 01706 if (body_admits_left_alignment && text_admits_left_alignment) 01707 return left_model; 01708 *consistent = false; 01709 return ParagraphModel(); 01710 } 01711 if (tolerance < ldiff) { 01712 if (body_admits_right_alignment && text_admits_right_alignment) 01713 return right_model; 01714 *consistent = false; 01715 return ParagraphModel(); 01716 } 01717 01718 // At this point, we know the body text doesn't vary much on either side. 01719 01720 // If the first line juts out oddly in one direction or the other, 01721 // that likely indicates the side aligned to. 01722 int first_left = (*rows)[start].lindent_; 01723 int first_right = (*rows)[start].rindent_; 01724 01725 if (ltr && body_admits_left_alignment && 01726 (first_left < lmin || first_left > lmax)) 01727 return left_model; 01728 if (!ltr && body_admits_right_alignment && 01729 (first_right < rmin || first_right > rmax)) 01730 return right_model; 01731 01732 *consistent = false; 01733 return ParagraphModel(); 01734 } 01735 01736 // Examine rows[start, end) and try to determine what sort of ParagraphModel 01737 // would fit them as a single paragraph. If nothing fits, 01738 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug 01739 // output if we're debugging. 01740 ParagraphModel ParagraphModelByOutline( 01741 int debug_level, 01742 const GenericVector<RowScratchRegisters> *rows, 01743 int start, int end, int tolerance) { 01744 bool unused_consistent; 01745 ParagraphModel retval = InternalParagraphModelByOutline( 01746 rows, start, end, tolerance, &unused_consistent); 01747 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { 01748 tprintf("Could not determine a model for this paragraph:\n"); 01749 PrintRowRange(*rows, start, end); 01750 } 01751 return retval; 01752 } 01753 01754 // Do rows[start, end) form a single instance of the given paragraph model? 01755 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, 01756 int start, int end, const ParagraphModel *model) { 01757 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) 01758 return false; 01759 if (!ValidFirstLine(rows, start, model)) return false; 01760 for (int i = start + 1 ; i < end; i++) { 01761 if (!ValidBodyLine(rows, i, model)) return false; 01762 } 01763 return true; 01764 } 01765 01766 // Examine rows[row_start, row_end) as an independent section of text, 01767 // and mark rows that are exceptionally clear as start-of-paragraph 01768 // and paragraph-body lines. 01769 // 01770 // We presume that any lines surrounding rows[row_start, row_end) may 01771 // have wildly different paragraph models, so we don't key any data off 01772 // of those lines. 01773 // 01774 // We only take the very strongest signals, as we don't want to get 01775 // confused and marking up centered text, poetry, or source code as 01776 // clearly part of a typical paragraph. 01777 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, 01778 int row_start, int row_end) { 01779 // Record patently obvious body text. 01780 for (int i = row_start + 1; i < row_end; i++) { 01781 const RowScratchRegisters &prev = (*rows)[i - 1]; 01782 RowScratchRegisters &curr = (*rows)[i]; 01783 tesseract::ParagraphJustification typical_justification = 01784 prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01785 if (!curr.ri_->rword_likely_starts_idea && 01786 !curr.ri_->lword_likely_starts_idea && 01787 !FirstWordWouldHaveFit(prev, curr, typical_justification)) { 01788 curr.SetBodyLine(); 01789 } 01790 } 01791 01792 // Record patently obvious start paragraph lines. 01793 // 01794 // It's an extremely good signal of the start of a paragraph that 01795 // the first word would have fit on the end of the previous line. 01796 // However, applying just that signal would have us mark random 01797 // start lines of lineated text (poetry and source code) and some 01798 // centered headings as paragraph start lines. Therefore, we use 01799 // a second qualification for a paragraph start: Not only should 01800 // the first word of this line have fit on the previous line, 01801 // but also, this line should go full to the right of the block, 01802 // disallowing a subsequent word from having fit on this line. 01803 01804 // First row: 01805 { 01806 RowScratchRegisters &curr = (*rows)[row_start]; 01807 RowScratchRegisters &next = (*rows)[row_start + 1]; 01808 tesseract::ParagraphJustification j = 01809 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01810 if (curr.GetLineType() == LT_UNKNOWN && 01811 !FirstWordWouldHaveFit(curr, next, j) && 01812 (curr.ri_->lword_likely_starts_idea || 01813 curr.ri_->rword_likely_starts_idea)) { 01814 curr.SetStartLine(); 01815 } 01816 } 01817 // Middle rows 01818 for (int i = row_start + 1; i < row_end - 1; i++) { 01819 RowScratchRegisters &prev = (*rows)[i - 1]; 01820 RowScratchRegisters &curr = (*rows)[i]; 01821 RowScratchRegisters &next = (*rows)[i + 1]; 01822 tesseract::ParagraphJustification j = 01823 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01824 if (curr.GetLineType() == LT_UNKNOWN && 01825 !FirstWordWouldHaveFit(curr, next, j) && 01826 LikelyParagraphStart(prev, curr, j)) { 01827 curr.SetStartLine(); 01828 } 01829 } 01830 // Last row 01831 { // the short circuit at the top means we have at least two lines. 01832 RowScratchRegisters &prev = (*rows)[row_end - 2]; 01833 RowScratchRegisters &curr = (*rows)[row_end - 1]; 01834 tesseract::ParagraphJustification j = 01835 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01836 if (curr.GetLineType() == LT_UNKNOWN && 01837 !FirstWordWouldHaveFit(curr, curr, j) && 01838 LikelyParagraphStart(prev, curr, j)) { 01839 curr.SetStartLine(); 01840 } 01841 } 01842 } 01843 01844 // Look for sequences of a start line followed by some body lines in 01845 // rows[row_start, row_end) and create ParagraphModels for them if 01846 // they seem coherent. 01847 void ModelStrongEvidence(int debug_level, 01848 GenericVector<RowScratchRegisters> *rows, 01849 int row_start, int row_end, 01850 bool allow_flush_models, 01851 ParagraphTheory *theory) { 01852 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) 01853 return; 01854 01855 int start = row_start; 01856 while (start < row_end) { 01857 while (start < row_end && (*rows)[start].GetLineType() != LT_START) 01858 start++; 01859 if (start >= row_end - 1) 01860 break; 01861 01862 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); 01863 int end = start; 01864 ParagraphModel last_model; 01865 bool next_consistent; 01866 do { 01867 ++end; 01868 // rows[row, end) was consistent. 01869 // If rows[row, end + 1) is not consistent, 01870 // just model rows[row, end) 01871 if (end < row_end - 1) { 01872 RowScratchRegisters &next = (*rows)[end]; 01873 LineType lt = next.GetLineType(); 01874 next_consistent = lt == LT_BODY || 01875 (lt == LT_UNKNOWN && 01876 !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); 01877 } else { 01878 next_consistent = false; 01879 } 01880 if (next_consistent) { 01881 ParagraphModel next_model = InternalParagraphModelByOutline( 01882 rows, start, end + 1, tolerance, &next_consistent); 01883 if (((*rows)[start].ri_->ltr && 01884 last_model.justification() == JUSTIFICATION_LEFT && 01885 next_model.justification() != JUSTIFICATION_LEFT) || 01886 (!(*rows)[start].ri_->ltr && 01887 last_model.justification() == JUSTIFICATION_RIGHT && 01888 next_model.justification() != JUSTIFICATION_RIGHT)) { 01889 next_consistent = false; 01890 } 01891 last_model = next_model; 01892 } else { 01893 next_consistent = false; 01894 } 01895 } while (next_consistent && end < row_end); 01896 // At this point, rows[start, end) looked like it could have been a 01897 // single paragraph. If we can make a good ParagraphModel for it, 01898 // do so and mark this sequence with that model. 01899 if (end > start + 1) { 01900 // emit a new paragraph if we have more than one line. 01901 const ParagraphModel *model = NULL; 01902 ParagraphModel new_model = ParagraphModelByOutline( 01903 debug_level, rows, start, end, 01904 Epsilon(InterwordSpace(*rows, start, end))); 01905 if (new_model.justification() == JUSTIFICATION_UNKNOWN) { 01906 // couldn't create a good model, oh well. 01907 } else if (new_model.is_flush()) { 01908 if (end == start + 2) { 01909 // It's very likely we just got two paragraph starts in a row. 01910 end = start + 1; 01911 } else if (start == row_start) { 01912 // Mark this as a Crown. 01913 if (new_model.justification() == JUSTIFICATION_LEFT) { 01914 model = kCrownLeft; 01915 } else { 01916 model = kCrownRight; 01917 } 01918 } else if (allow_flush_models) { 01919 model = theory->AddModel(new_model); 01920 } 01921 } else { 01922 model = theory->AddModel(new_model); 01923 } 01924 if (model) { 01925 (*rows)[start].AddStartLine(model); 01926 for (int i = start + 1; i < end; i++) { 01927 (*rows)[i].AddBodyLine(model); 01928 } 01929 } 01930 } 01931 start = end; 01932 } 01933 } 01934 01935 // We examine rows[row_start, row_end) and do the following: 01936 // (1) Clear all existing hypotheses for the rows being considered. 01937 // (2) Mark up any rows as exceptionally likely to be paragraph starts 01938 // or paragraph body lines as such using both geometric and textual 01939 // clues. 01940 // (3) Form models for any sequence of start + continuation lines. 01941 // (4) Smear the paragraph models to cover surrounding text. 01942 void StrongEvidenceClassify(int debug_level, 01943 GenericVector<RowScratchRegisters> *rows, 01944 int row_start, int row_end, 01945 ParagraphTheory *theory) { 01946 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) 01947 return; 01948 01949 if (debug_level > 1) { 01950 tprintf("#############################################\n"); 01951 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); 01952 tprintf("#############################################\n"); 01953 } 01954 01955 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); 01956 MarkStrongEvidence(rows, row_start, row_end); 01957 01958 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); 01959 01960 // Create paragraph models. 01961 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); 01962 01963 DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); 01964 01965 // At this point, some rows are marked up as paragraphs with model numbers, 01966 // and some rows are marked up as either LT_START or LT_BODY. Now let's 01967 // smear any good paragraph hypotheses forward and backward. 01968 ParagraphModelSmearer smearer(rows, row_start, row_end, theory); 01969 smearer.Smear(); 01970 } 01971 01972 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, 01973 int row_start, int row_end, 01974 ParagraphTheory *theory) { 01975 for (int i = row_start + 1; i < row_end - 1; i++) { 01976 if ((*rows)[i - 1].ri_->has_leaders && 01977 (*rows)[i].ri_->has_leaders && 01978 (*rows)[i + 1].ri_->has_leaders) { 01979 const ParagraphModel *model = theory->AddModel( 01980 ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); 01981 (*rows)[i].AddStartLine(model); 01982 } 01983 } 01984 } 01985 01986 // Collect sequences of unique hypotheses in row registers and create proper 01987 // paragraphs for them, referencing the paragraphs in row_owners. 01988 void ConvertHypothesizedModelRunsToParagraphs( 01989 int debug_level, 01990 const GenericVector<RowScratchRegisters> &rows, 01991 GenericVector<PARA *> *row_owners, 01992 ParagraphTheory *theory) { 01993 int end = rows.size(); 01994 int start; 01995 for (; end > 0; end = start) { 01996 start = end - 1; 01997 const ParagraphModel *model = NULL; 01998 // TODO(eger): Be smarter about dealing with multiple hypotheses. 01999 bool single_line_paragraph = false; 02000 SetOfModels models; 02001 rows[start].NonNullHypotheses(&models); 02002 if (models.size() > 0) { 02003 model = models[0]; 02004 if (rows[start].GetLineType(model) != LT_BODY) 02005 single_line_paragraph = true; 02006 } 02007 if (model && !single_line_paragraph) { 02008 // walk back looking for more body lines and then a start line. 02009 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { 02010 // do nothing 02011 } 02012 if (start < 0 || rows[start].GetLineType(model) != LT_START) { 02013 model = NULL; 02014 } 02015 } 02016 if (model == NULL) { 02017 continue; 02018 } 02019 // rows[start, end) should be a paragraph. 02020 PARA *p = new PARA(); 02021 if (model == kCrownLeft || model == kCrownRight) { 02022 p->is_very_first_or_continuation = true; 02023 // Crown paragraph. 02024 // If we can find an existing ParagraphModel that fits, use it, 02025 // else create a new one. 02026 for (int row = end; row < rows.size(); row++) { 02027 if ((*row_owners)[row] && 02028 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && 02029 (start == 0 || 02030 ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { 02031 model = (*row_owners)[row]->model; 02032 break; 02033 } 02034 } 02035 if (model == kCrownLeft) { 02036 // No subsequent model fits, so cons one up. 02037 model = theory->AddModel(ParagraphModel( 02038 JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_, 02039 0, 0, Epsilon(rows[start].ri_->average_interword_space))); 02040 } else if (model == kCrownRight) { 02041 // No subsequent model fits, so cons one up. 02042 model = theory->AddModel(ParagraphModel( 02043 JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_, 02044 0, 0, Epsilon(rows[start].ri_->average_interword_space))); 02045 } 02046 } 02047 rows[start].SetUnknown(); 02048 rows[start].AddStartLine(model); 02049 for (int i = start + 1; i < end; i++) { 02050 rows[i].SetUnknown(); 02051 rows[i].AddBodyLine(model); 02052 } 02053 p->model = model; 02054 p->has_drop_cap = rows[start].ri_->has_drop_cap; 02055 p->is_list_item = 02056 model->justification() == JUSTIFICATION_RIGHT 02057 ? rows[start].ri_->rword_indicates_list_item 02058 : rows[start].ri_->lword_indicates_list_item; 02059 for (int row = start; row < end; row++) { 02060 if ((*row_owners)[row] != NULL) { 02061 tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " 02062 "more than once!\n"); 02063 } 02064 (*row_owners)[row] = p; 02065 } 02066 } 02067 } 02068 02069 struct Interval { 02070 Interval() : begin(0), end(0) {} 02071 Interval(int b, int e) : begin(b), end(e) {} 02072 02073 int begin; 02074 int end; 02075 }; 02076 02077 // Return whether rows[row] appears to be stranded, meaning that the evidence 02078 // for this row is very weak due to context. For instance, two lines of source 02079 // code may happen to be indented at the same tab vector as body text starts, 02080 // leading us to think they are two start-of-paragraph lines. This is not 02081 // optimal. However, we also don't want to mark a sequence of short dialog 02082 // as "weak," so our heuristic is: 02083 // (1) If a line is surrounded by lines of unknown type, it's weak. 02084 // (2) If two lines in a row are start lines for a given paragraph type, but 02085 // after that the same paragraph type does not continue, they're weak. 02086 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) { 02087 SetOfModels row_models; 02088 rows[row].StrongHypotheses(&row_models); 02089 02090 for (int m = 0; m < row_models.size(); m++) { 02091 bool all_starts = rows[row].GetLineType(); 02092 int run_length = 1; 02093 bool continues = true; 02094 for (int i = row - 1; i >= 0 && continues; i--) { 02095 SetOfModels models; 02096 rows[i].NonNullHypotheses(&models); 02097 switch (rows[i].GetLineType(row_models[m])) { 02098 case LT_START: run_length++; break; 02099 case LT_MULTIPLE: // explicit fall-through 02100 case LT_BODY: run_length++; all_starts = false; break; 02101 case LT_UNKNOWN: // explicit fall-through 02102 default: continues = false; 02103 } 02104 } 02105 continues = true; 02106 for (int i = row + 1; i < rows.size() && continues; i++) { 02107 SetOfModels models; 02108 rows[i].NonNullHypotheses(&models); 02109 switch (rows[i].GetLineType(row_models[m])) { 02110 case LT_START: run_length++; break; 02111 case LT_MULTIPLE: // explicit fall-through 02112 case LT_BODY: run_length++; all_starts = false; break; 02113 case LT_UNKNOWN: // explicit fall-through 02114 default: continues = false; 02115 } 02116 } 02117 if (run_length > 2 || (!all_starts && run_length > 1)) return false; 02118 } 02119 return true; 02120 } 02121 02122 // Go through rows[row_start, row_end) and gather up sequences that need better 02123 // classification. 02124 // + Sequences of non-empty rows without hypotheses. 02125 // + Crown paragraphs not immediately followed by a strongly modeled line. 02126 // + Single line paragraphs surrounded by text that doesn't match the 02127 // model. 02128 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows, 02129 GenericVector<Interval> *to_fix, 02130 int row_start, int row_end) { 02131 to_fix->clear(); 02132 for (int i = row_start; i < row_end; i++) { 02133 bool needs_fixing = false; 02134 02135 SetOfModels models; 02136 SetOfModels models_w_crowns; 02137 rows[i].StrongHypotheses(&models); 02138 rows[i].NonNullHypotheses(&models_w_crowns); 02139 if (models.empty() && models_w_crowns.size() > 0) { 02140 // Crown paragraph. Is it followed by a modeled line? 02141 for (int end = i + 1; end < rows.size(); end++) { 02142 SetOfModels end_models; 02143 SetOfModels strong_end_models; 02144 rows[end].NonNullHypotheses(&end_models); 02145 rows[end].StrongHypotheses(&strong_end_models); 02146 if (end_models.size() == 0) { 02147 needs_fixing = true; 02148 break; 02149 } else if (strong_end_models.size() > 0) { 02150 needs_fixing = false; 02151 break; 02152 } 02153 } 02154 } else if (models.empty() && rows[i].ri_->num_words > 0) { 02155 // No models at all. 02156 needs_fixing = true; 02157 } 02158 02159 if (!needs_fixing && !models.empty()) { 02160 needs_fixing = RowIsStranded(rows, i); 02161 } 02162 02163 if (needs_fixing) { 02164 if (!to_fix->empty() && to_fix->back().end == i - 1) 02165 to_fix->back().end = i; 02166 else 02167 to_fix->push_back(Interval(i, i)); 02168 } 02169 } 02170 // Convert inclusive intervals to half-open intervals. 02171 for (int i = 0; i < to_fix->size(); i++) { 02172 (*to_fix)[i].end = (*to_fix)[i].end + 1; 02173 } 02174 } 02175 02176 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known), 02177 // normalize each row_owner to point to an actual PARA, and output the 02178 // paragraphs in order onto paragraphs. 02179 void CanonicalizeDetectionResults( 02180 GenericVector<PARA *> *row_owners, 02181 PARA_LIST *paragraphs) { 02182 GenericVector<PARA *> &rows = *row_owners; 02183 paragraphs->clear(); 02184 PARA_IT out(paragraphs); 02185 PARA *formerly_null = NULL; 02186 for (int i = 0; i < rows.size(); i++) { 02187 if (rows[i] == NULL) { 02188 if (i == 0 || rows[i - 1] != formerly_null) { 02189 rows[i] = formerly_null = new PARA(); 02190 } else { 02191 rows[i] = formerly_null; 02192 continue; 02193 } 02194 } else if (i > 0 && rows[i - 1] == rows[i]) { 02195 continue; 02196 } 02197 out.add_after_then_move(rows[i]); 02198 } 02199 } 02200 02201 // Main entry point for Paragraph Detection Algorithm. 02202 // 02203 // Given a set of equally spaced textlines (described by row_infos), 02204 // Split them into paragraphs. 02205 // 02206 // Output: 02207 // row_owners - one pointer for each row, to the paragraph it belongs to. 02208 // paragraphs - this is the actual list of PARA objects. 02209 // models - the list of paragraph models referenced by the PARA objects. 02210 // caller is responsible for deleting the models. 02211 void DetectParagraphs(int debug_level, 02212 GenericVector<RowInfo> *row_infos, 02213 GenericVector<PARA *> *row_owners, 02214 PARA_LIST *paragraphs, 02215 GenericVector<ParagraphModel *> *models) { 02216 GenericVector<RowScratchRegisters> rows; 02217 ParagraphTheory theory(models); 02218 02219 // Initialize row_owners to be a bunch of NULL pointers. 02220 row_owners->init_to_size(row_infos->size(), NULL); 02221 02222 // Set up row scratch registers for the main algorithm. 02223 rows.init_to_size(row_infos->size(), RowScratchRegisters()); 02224 for (int i = 0; i < row_infos->size(); i++) { 02225 rows[i].Init((*row_infos)[i]); 02226 } 02227 02228 // Pass 1: 02229 // Detect sequences of lines that all contain leader dots (.....) 02230 // These are likely Tables of Contents. If there are three text lines in 02231 // a row with leader dots, it's pretty safe to say the middle one should 02232 // be a paragraph of its own. 02233 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory); 02234 02235 DebugDump(debug_level > 1, "End of Pass 1", theory, rows); 02236 02237 GenericVector<Interval> leftovers; 02238 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02239 for (int i = 0; i < leftovers.size(); i++) { 02240 // Pass 2a: 02241 // Find any strongly evidenced start-of-paragraph lines. If they're 02242 // followed by two lines that look like body lines, make a paragraph 02243 // model for that and see if that model applies throughout the text 02244 // (that is, "smear" it). 02245 StrongEvidenceClassify(debug_level, &rows, 02246 leftovers[i].begin, leftovers[i].end, &theory); 02247 02248 // Pass 2b: 02249 // If we had any luck in pass 2a, we got part of the page and didn't 02250 // know how to classify a few runs of rows. Take the segments that 02251 // didn't find a model and reprocess them individually. 02252 GenericVector<Interval> leftovers2; 02253 LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); 02254 bool pass2a_was_useful = leftovers2.size() > 1 || 02255 (leftovers2.size() == 1 && 02256 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size())); 02257 if (pass2a_was_useful) { 02258 for (int j = 0; j < leftovers2.size(); j++) { 02259 StrongEvidenceClassify(debug_level, &rows, 02260 leftovers2[j].begin, leftovers2[j].end, 02261 &theory); 02262 } 02263 } 02264 } 02265 02266 DebugDump(debug_level > 1, "End of Pass 2", theory, rows); 02267 02268 // Pass 3: 02269 // These are the dregs for which we didn't have enough strong textual 02270 // and geometric clues to form matching models for. Let's see if 02271 // the geometric clues are simple enough that we could just use those. 02272 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02273 for (int i = 0; i < leftovers.size(); i++) { 02274 GeometricClassify(debug_level, &rows, 02275 leftovers[i].begin, leftovers[i].end, &theory); 02276 } 02277 // Undo any flush models for which there's little evidence. 02278 DowngradeWeakestToCrowns(debug_level, &theory, &rows); 02279 02280 DebugDump(debug_level > 1, "End of Pass 3", theory, rows); 02281 02282 // Pass 4: 02283 // Take everything that's still not marked up well and clear all markings. 02284 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02285 for (int i = 0; i < leftovers.size(); i++) { 02286 for (int j = leftovers[i].begin; j < leftovers[i].end; j++) { 02287 rows[j].SetUnknown(); 02288 } 02289 } 02290 02291 DebugDump(debug_level > 1, "End of Pass 4", theory, rows); 02292 02293 // Convert all of the unique hypothesis runs to PARAs. 02294 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, 02295 &theory); 02296 02297 DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows); 02298 02299 // Finally, clean up any dangling NULL row paragraph parents. 02300 CanonicalizeDetectionResults(row_owners, paragraphs); 02301 } 02302 02303 // ============ Code interfacing with the rest of Tesseract ================== 02304 02305 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph 02306 // detector RowInfo with all relevant information from the row. 02307 void InitializeRowInfo(const MutableIterator &it, RowInfo *info) { 02308 if (it.PageResIt()->row() != NULL) { 02309 ROW *row = it.PageResIt()->row()->row; 02310 info->pix_ldistance = row->lmargin(); 02311 info->pix_rdistance = row->rmargin(); 02312 info->average_interword_space = 02313 row->space() > 0 ? row->space() : MAX(row->x_height(), 1); 02314 info->pix_xheight = row->x_height(); 02315 info->has_leaders = false; 02316 info->has_drop_cap = row->has_drop_cap(); 02317 info->ltr = true; // set below depending on word scripts 02318 } else { 02319 info->pix_ldistance = info->pix_rdistance = 0; 02320 info->average_interword_space = 1; 02321 info->pix_xheight = 1.0; 02322 info->has_leaders = false; 02323 info->has_drop_cap = false; 02324 info->ltr = true; 02325 } 02326 02327 info->text = ""; 02328 char *text = it.GetUTF8Text(RIL_TEXTLINE); 02329 int trailing_ws_idx = strlen(text); // strip trailing space 02330 while (trailing_ws_idx > 0 && 02331 // isspace() only takes ASCII 02332 ((text[trailing_ws_idx - 1] & 0x80) == 0) && 02333 isspace(text[trailing_ws_idx - 1])) 02334 trailing_ws_idx--; 02335 if (trailing_ws_idx > 0) { 02336 int lspaces = info->pix_ldistance / info->average_interword_space; 02337 for (int i = 0; i < lspaces; i++) 02338 info->text += ' '; 02339 for (int i = 0; i < trailing_ws_idx; i++) 02340 info->text += text[i]; 02341 } 02342 delete []text; 02343 02344 info->num_words = 0; 02345 info->lword_indicates_list_item = false; 02346 info->lword_likely_starts_idea = false; 02347 info->lword_likely_ends_idea = false; 02348 info->rword_indicates_list_item = false; 02349 info->rword_likely_starts_idea = false; 02350 info->rword_likely_ends_idea = false; 02351 02352 if (info->text.size() == 0) { 02353 info->rword_likely_ends_idea = false; 02354 info->rword_likely_ends_idea = false; 02355 return; 02356 } 02357 02358 int ltr = 0; 02359 int rtl = 0; 02360 02361 PAGE_RES_IT page_res_it = *it.PageResIt(); 02362 GenericVector<WERD_RES *> werds; 02363 WERD_RES *word_res = page_res_it.restart_row(); 02364 ROW_RES *this_row = page_res_it.row(); 02365 int num_leaders = 0; 02366 do { 02367 if (word_res && word_res->best_choice->unichar_string().length() > 0) { 02368 werds.push_back(word_res); 02369 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0; 02370 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0; 02371 if (word_res->word->flag(W_REP_CHAR)) num_leaders++; 02372 } 02373 word_res = page_res_it.forward(); 02374 } while (page_res_it.row() == this_row); 02375 02376 info->has_leaders = num_leaders > 3; 02377 info->num_words = werds.size(); 02378 if (werds.size() > 0) { 02379 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1]; 02380 info->lword_text = lword->best_choice->unichar_string().string(); 02381 info->rword_text = rword->best_choice->unichar_string().string(); 02382 info->lword_box = lword->word->bounding_box(); 02383 info->rword_box = rword->word->bounding_box(); 02384 LeftWordAttributes(lword->uch_set, lword->best_choice, 02385 info->lword_text, 02386 &info->lword_indicates_list_item, 02387 &info->lword_likely_starts_idea, 02388 &info->lword_likely_ends_idea); 02389 RightWordAttributes(rword->uch_set, rword->best_choice, 02390 info->rword_text, 02391 &info->rword_indicates_list_item, 02392 &info->rword_likely_starts_idea, 02393 &info->rword_likely_ends_idea); 02394 } 02395 info->ltr = ltr >= rtl; 02396 } 02397 02398 // This is called after rows have been identified and words are recognized. 02399 // Much of this could be implemented before word recognition, but text helps 02400 // to identify bulleted lists and gives good signals for sentence boundaries. 02401 void DetectParagraphs(int debug_level, 02402 const MutableIterator *block_start, 02403 GenericVector<ParagraphModel *> *models) { 02404 // Clear out any preconceived notions. 02405 if (block_start->Empty(RIL_TEXTLINE)) { 02406 return; 02407 } 02408 BLOCK *block = block_start->PageResIt()->block()->block; 02409 block->para_list()->clear(); 02410 bool is_image_block = block->poly_block() && !block->poly_block()->IsText(); 02411 02412 // Convert the Tesseract structures to RowInfos 02413 // for the paragraph detection algorithm. 02414 MutableIterator row(*block_start); 02415 if (row.Empty(RIL_TEXTLINE)) 02416 return; // end of input already. 02417 02418 GenericVector<RowInfo> row_infos; 02419 do { 02420 if (!row.PageResIt()->row()) 02421 continue; // empty row. 02422 row.PageResIt()->row()->row->set_para(NULL); 02423 row_infos.push_back(RowInfo()); 02424 RowInfo &ri = row_infos.back(); 02425 InitializeRowInfo(row, &ri); 02426 } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && 02427 row.Next(RIL_TEXTLINE)); 02428 02429 // Run the paragraph detection algorithm. 02430 GenericVector<PARA *> row_owners; 02431 GenericVector<PARA *> the_paragraphs; 02432 if (!is_image_block) { 02433 DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), 02434 models); 02435 } else { 02436 row_owners.init_to_size(row_infos.size(), NULL); 02437 CanonicalizeDetectionResults(&row_owners, block->para_list()); 02438 } 02439 02440 // Now stitch in the row_owners into the rows. 02441 row = *block_start; 02442 for (int i = 0; i < row_owners.size(); i++) { 02443 while (!row.PageResIt()->row()) 02444 row.Next(RIL_TEXTLINE); 02445 row.PageResIt()->row()->row->set_para(row_owners[i]); 02446 row.Next(RIL_TEXTLINE); 02447 } 02448 } 02449 02450 } // namespace