tesseract-doc/paragraphs_8cpp_source.html

00001 /**********************************************************************
00002  * File:        paragraphs.cpp
00003  * Description: Paragraph detection for tesseract.
00004  * Author:      David Eger
00005  * Created:     25 February 2011
00006  *
00007  * (C) Copyright 2011, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #ifdef _MSC_VER
00020 #define __func__ __FUNCTION__
00021 #endif
00022
00023 #include <ctype.h>
00024
00025 #include "genericvector.h"
00026 #include "helpers.h"
00027 #include "mutableiterator.h"
00028 #include "ocrpara.h"
00029 #include "pageres.h"
00030 #include "paragraphs.h"
00031 #include "paragraphs_internal.h"
00032 #include "publictypes.h"
00033 #include "ratngs.h"
00034 #include "rect.h"
00035 #include "statistc.h"
00036 #include "strngs.h"
00037 #include "tprintf.h"
00038 #include "unicharset.h"
00039 #include "unicodes.h"
00040
00041 namespace tesseract {
00042
00043 // The tab vectors for a given line should be ignored if both its tab vectors
00044 // are infrequent, specifically, if both tab vectors appear at most once per
00045 // kStrayLinePer lines in a block.
00046 const int kStrayLinePer = 6;
00047
00048 // Special "weak" ParagraphModels.
00049 const ParagraphModel *kCrownLeft
00050     = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
00051 const ParagraphModel *kCrownRight
00052     = reinterpret_cast<ParagraphModel *>(0xDEAD888F);
00053
00054 // Given the width of a typical space between words, what is the threshold
00055 // by which by which we think left and right alignments for paragraphs
00056 // can vary and still be aligned.
00057 static int Epsilon(int space_pix) {
00058   return space_pix * 4 / 5;
00059 }
00060
00061 template<typename T>
00062 void SimpleSwap(T &a, T &b) {
00063   T c = a;
00064   a = b;
00065   b = c;
00066 }
00067
00068 static bool AcceptableRowArgs(
00069     int debug_level, int min_num_rows, const char *function_name,
00070     const GenericVector<RowScratchRegisters> *rows,
00071     int row_start, int row_end) {
00072   if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
00073     tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
00074             row_start, row_end, rows->size());
00075     return false;
00076   }
00077   if (row_end - row_start < min_num_rows) {
00078     if (debug_level > 1) {
00079       tprintf("# Too few rows[%d, %d) for %s.\n",
00080               row_start, row_end, function_name);
00081     }
00082     return false;
00083   }
00084   return true;
00085 }
00086
00087 // =============================== Debug Code ================================
00088
00089 // Convert an integer to a decimal string.
00090 static STRING StrOf(int num) {
00091   char buffer[30];
00092   snprintf(buffer, sizeof(buffer), "%d", num);
00093   return STRING(buffer);
00094 }
00095
00096 // Given a row-major matrix of unicode text and a column separator, print
00097 // a formatted table.  For ASCII, we get good column alignment.
00098 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
00099                        const STRING &colsep) {
00100   GenericVector<int> max_col_widths;
00101   for (int r = 0; r < rows.size(); r++) {
00102     int num_columns = rows[r].size();
00103     for (int c = 0; c < num_columns; c++) {
00104       int num_unicodes = 0;
00105       for (int i = 0; i < rows[r][c].size(); i++) {
00106         if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
00107       }
00108       if (c >= max_col_widths.size()) {
00109         max_col_widths.push_back(num_unicodes);
00110       } else {
00111         if (num_unicodes > max_col_widths[c])
00112           max_col_widths[c] = num_unicodes;
00113       }
00114     }
00115   }
00116
00117   GenericVector<STRING> col_width_patterns;
00118   for (int c = 0; c < max_col_widths.size(); c++) {
00119     col_width_patterns.push_back(
00120         STRING("%-") + StrOf(max_col_widths[c]) + "s");
00121   }
00122
00123   for (int r = 0; r < rows.size(); r++) {
00124     for (int c = 0; c < rows[r].size(); c++) {
00125       if (c > 0)
00126         tprintf("%s", colsep.string());
00127       tprintf(col_width_patterns[c].string(), rows[r][c].string());
00128     }
00129     tprintf("\n");
00130   }
00131 }
00132
00133 STRING RtlEmbed(const STRING &word, bool rtlify) {
00134   if (rtlify)
00135     return STRING(kRLE) + word + STRING(kPDF);
00136   return word;
00137 }
00138
00139 // Print the current thoughts of the paragraph detector.
00140 static void PrintDetectorState(const ParagraphTheory &theory,
00141                                const GenericVector<RowScratchRegisters> &rows) {
00142   GenericVector<GenericVector<STRING> > output;
00143   output.push_back(GenericVector<STRING>());
00144   output.back().push_back("#row");
00145   output.back().push_back("space");
00146   output.back().push_back("..");
00147   output.back().push_back("lword[widthSEL]");
00148   output.back().push_back("rword[widthSEL]");
00149   RowScratchRegisters::AppendDebugHeaderFields(&output.back());
00150   output.back().push_back("text");
00151
00152   for (int i = 0; i < rows.size(); i++) {
00153     output.push_back(GenericVector<STRING>());
00154     GenericVector<STRING> &row = output.back();
00155     const RowInfo& ri = *rows[i].ri_;
00156     row.push_back(StrOf(i));
00157     row.push_back(StrOf(ri.average_interword_space));
00158     row.push_back(ri.has_leaders ? ".." : " ");
00159     row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
00160                   "[" + StrOf(ri.lword_box.width()) +
00161                   (ri.lword_likely_starts_idea ? "S" : "s") +
00162                   (ri.lword_likely_ends_idea ? "E" : "e") +
00163                   (ri.lword_indicates_list_item ? "L" : "l") +
00164                   "]");
00165     row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
00166                   "[" + StrOf(ri.rword_box.width()) +
00167                   (ri.rword_likely_starts_idea ? "S" : "s") +
00168                   (ri.rword_likely_ends_idea ? "E" : "e") +
00169                   (ri.rword_indicates_list_item ? "L" : "l") +
00170                   "]");
00171     rows[i].AppendDebugInfo(theory, &row);
00172     row.push_back(RtlEmbed(ri.text, !ri.ltr));
00173   }
00174   PrintTable(output, " ");
00175
00176   tprintf("Active Paragraph Models:\n");
00177   for (int m = 0; m < theory.models().size(); m++) {
00178     tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
00179   }
00180 }
00181
00182 static void DebugDump(
00183     bool should_print,
00184     const STRING &phase,
00185     const ParagraphTheory &theory,
00186     const GenericVector<RowScratchRegisters> &rows) {
00187   if (!should_print)
00188     return;
00189   tprintf("# %s\n", phase.string());
00190   PrintDetectorState(theory, rows);
00191 }
00192
00193 // Print out the text for rows[row_start, row_end)
00194 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
00195                           int row_start, int row_end) {
00196   tprintf("======================================\n");
00197   for (int row = row_start; row < row_end; row++) {
00198     tprintf("%s\n", rows[row].ri_->text.string());
00199   }
00200   tprintf("======================================\n");
00201 }
00202
00203 // ============= Brain Dead Language Model (ASCII Version) ===================
00204
00205 bool IsLatinLetter(int ch) {
00206   return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
00207 }
00208
00209 bool IsDigitLike(int ch) {
00210   return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
00211 }
00212
00213 bool IsOpeningPunct(int ch) {
00214   return strchr("'\"({[", ch) != NULL;
00215 }
00216
00217 bool IsTerminalPunct(int ch) {
00218   return strchr(":'\".?!]})", ch) != NULL;
00219 }
00220
00221 // Return a pointer after consuming as much text as qualifies as roman numeral.
00222 const char *SkipChars(const char *str, const char *toskip) {
00223   while (*str != '\0' && strchr(toskip, *str)) { str++; }
00224   return str;
00225 }
00226
00227 const char *SkipChars(const char *str, bool (*skip)(int)) {
00228   while (*str != '\0' && skip(*str)) { str++; }
00229   return str;
00230 }
00231
00232 const char *SkipOne(const char *str, const char *toskip) {
00233   if (*str != '\0' && strchr(toskip, *str)) return str + 1;
00234   return str;
00235 }
00236
00237 // Return whether it is very likely that this is a numeral marker that could
00238 // start a list item.  Some examples include:
00239 //   A   I   iii.   VI   (2)   3.5.   [C-4]
00240 bool LikelyListNumeral(const STRING &word) {
00241   const char *kRomans = "ivxlmdIVXLMD";
00242   const char *kDigits = "012345789";
00243   const char *kOpen = "[{(";
00244   const char *kSep = ":;-.,";
00245   const char *kClose = "]})";
00246
00247   int num_segments = 0;
00248   const char *pos = word.string();
00249   while (*pos != '\0' && num_segments < 3) {
00250     // skip up to two open parens.
00251     const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
00252     const char *numeral_end = SkipChars(numeral_start, kRomans);
00253     if (numeral_end != numeral_start) {
00254       // Got Roman Numeral. Great.
00255     } else {
00256       numeral_end = SkipChars(numeral_start, kDigits);
00257       if (numeral_end == numeral_start) {
00258         // If there's a single latin letter, we can use that.
00259         numeral_end = SkipChars(numeral_start, IsLatinLetter);
00260         if (numeral_end - numeral_start != 1)
00261           break;
00262       }
00263     }
00264     // We got some sort of numeral.
00265     num_segments++;
00266     // Skip any trailing parens or punctuation.
00267     pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
00268     if (pos == numeral_end)
00269       break;
00270   }
00271   return *pos == '\0';
00272 }
00273
00274 bool LikelyListMark(const STRING &word) {
00275   const char *kListMarks = "0Oo*.,+.";
00276   return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
00277 }
00278
00279 bool AsciiLikelyListItem(const STRING &word) {
00280   return LikelyListMark(word) || LikelyListNumeral(word);
00281 }
00282
00283 // ========== Brain Dead Language Model (Tesseract Version) ================
00284
00285 // Return the first Unicode Codepoint from werd[pos].
00286 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
00287   if (!u || !werd || pos > werd->length())
00288     return 0;
00289   return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
00290 }
00291
00292 // A useful helper class for finding the first j >= i so that word[j]
00293 // does not have given character type.
00294 class UnicodeSpanSkipper {
00295  public:
00296   UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
00297       : u_(unicharset), word_(word) { wordlen_ = word->length(); }
00298
00299   // Given an input position, return the first position >= pos not punc.
00300   int SkipPunc(int pos);
00301   // Given an input position, return the first position >= pos not digit.
00302   int SkipDigits(int pos);
00303   // Given an input position, return the first position >= pos not roman.
00304   int SkipRomans(int pos);
00305   // Given an input position, return the first position >= pos not alpha.
00306   int SkipAlpha(int pos);
00307
00308  private:
00309   const UNICHARSET *u_;
00310   const WERD_CHOICE *word_;
00311   int wordlen_;
00312 };
00313
00314 int UnicodeSpanSkipper::SkipPunc(int pos) {
00315   while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
00316   return pos;
00317 }
00318
00319 int UnicodeSpanSkipper::SkipDigits(int pos) {
00320   while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
00321                             IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
00322   return pos;
00323 }
00324
00325 int UnicodeSpanSkipper::SkipRomans(int pos) {
00326   const char *kRomans = "ivxlmdIVXLMD";
00327   while (pos < wordlen_) {
00328     int ch = UnicodeFor(u_, word_, pos);
00329     if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break;
00330     pos++;
00331   }
00332   return pos;
00333 }
00334
00335 int UnicodeSpanSkipper::SkipAlpha(int pos) {
00336   while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
00337   return pos;
00338 }
00339
00340 bool LikelyListMarkUnicode(int ch) {
00341   if (ch < 0x80) {
00342     STRING single_ch;
00343     single_ch += ch;
00344     return LikelyListMark(single_ch);
00345   }
00346   switch (ch) {
00347     // TODO(eger) expand this list of unicodes as needed.
00348     case 0x00B0:  // degree sign
00349     case 0x2022:  // bullet
00350     case 0x25E6:  // white bullet
00351     case 0x00B7:  // middle dot
00352     case 0x25A1:  // white square
00353     case 0x25A0:  // black square
00354     case 0x25AA:  // black small square
00355     case 0x2B1D:  // black very small square
00356     case 0x25BA:  // black right-pointing pointer
00357     case 0x25CF:  // black circle
00358     case 0x25CB:  // white circle
00359       return true;
00360     default:
00361       break;  // fall through
00362   }
00363   return false;
00364 }
00365
00366 // Return whether it is very likely that this is a numeral marker that could
00367 // start a list item.  Some examples include:
00368 //   A   I   iii.   VI   (2)   3.5.   [C-4]
00369 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
00370   if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
00371     return true;
00372
00373   UnicodeSpanSkipper m(u, werd);
00374   int num_segments = 0;
00375   int pos = 0;
00376   while (pos < werd->length() && num_segments < 3) {
00377     int numeral_start = m.SkipPunc(pos);
00378     if (numeral_start > pos + 1) break;
00379     int numeral_end = m.SkipRomans(numeral_start);
00380     if (numeral_end == numeral_start) {
00381       numeral_end = m.SkipDigits(numeral_start);
00382       if (numeral_end == numeral_start) {
00383         // If there's a single latin letter, we can use that.
00384         numeral_end = m.SkipAlpha(numeral_start);
00385         if (numeral_end - numeral_start != 1)
00386           break;
00387       }
00388     }
00389     // We got some sort of numeral.
00390     num_segments++;
00391     // Skip any trailing punctuation.
00392     pos = m.SkipPunc(numeral_end);
00393     if (pos == numeral_end)
00394       break;
00395   }
00396   return pos == werd->length();
00397 }
00398
00399 // ========= Brain Dead Language Model (combined entry points) ================
00400
00401 // Given the leftmost word of a line either as a Tesseract unicharset + werd
00402 // or a utf8 string, set the following attributes for it:
00403 //   is_list -      this word might be a list number or bullet.
00404 //   starts_idea -  this word is likely to start a sentence.
00405 //   ends_idea -    this word is likely to end a sentence.
00406 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00407                         const STRING &utf8,
00408                         bool *is_list, bool *starts_idea, bool *ends_idea) {
00409   *is_list = false;
00410   *starts_idea = false;
00411   *ends_idea = false;
00412   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
00413     *ends_idea = true;
00414     return;
00415   }
00416
00417   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.
00418     if (UniLikelyListItem(unicharset, werd)) {
00419       *is_list = true;
00420       *starts_idea = true;
00421       *ends_idea = true;
00422     }
00423     if (unicharset->get_isupper(werd->unichar_id(0))) {
00424       *starts_idea = true;
00425     }
00426     if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
00427       *starts_idea = true;
00428       *ends_idea = true;
00429     }
00430   } else {  // Assume utf8 is mostly ASCII
00431     if (AsciiLikelyListItem(utf8)) {
00432       *is_list = true;
00433       *starts_idea = true;
00434     }
00435     int start_letter = utf8[0];
00436     if (IsOpeningPunct(start_letter)) {
00437       *starts_idea = true;
00438     }
00439     if (IsTerminalPunct(start_letter)) {
00440       *ends_idea = true;
00441     }
00442     if (start_letter >= 'A' && start_letter <= 'Z') {
00443       *starts_idea = true;
00444     }
00445   }
00446 }
00447
00448 // Given the rightmost word of a line either as a Tesseract unicharset + werd
00449 // or a utf8 string, set the following attributes for it:
00450 //   is_list -      this word might be a list number or bullet.
00451 //   starts_idea -  this word is likely to start a sentence.
00452 //   ends_idea -    this word is likely to end a sentence.
00453 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00454                          const STRING &utf8,
00455                          bool *is_list, bool *starts_idea, bool *ends_idea) {
00456   *is_list = false;
00457   *starts_idea = false;
00458   *ends_idea = false;
00459   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
00460     *ends_idea = true;
00461     return;
00462   }
00463
00464   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.
00465     if (UniLikelyListItem(unicharset, werd)) {
00466       *is_list = true;
00467       *starts_idea = true;
00468     }
00469     UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
00470     if (unicharset->get_ispunctuation(last_letter)) {
00471       *ends_idea = true;
00472     }
00473   } else {  // Assume utf8 is mostly ASCII
00474     if (AsciiLikelyListItem(utf8)) {
00475       *is_list = true;
00476       *starts_idea = true;
00477     }
00478     int last_letter = utf8[utf8.size() - 1];
00479     if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
00480       *ends_idea = true;
00481     }
00482   }
00483 }
00484
00485 // =============== Implementation of RowScratchRegisters =====================
00486 /* static */
00487 void RowScratchRegisters::AppendDebugHeaderFields(
00488     GenericVector<STRING> *header) {
00489   header->push_back("[lmarg,lind;rind,rmarg]");
00490   header->push_back("model");
00491 }
00492
00493 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
00494                                           GenericVector<STRING> *dbg) const {
00495   char s[30];
00496   snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
00497            lmargin_, lindent_, rindent_, rmargin_);
00498   dbg->push_back(s);
00499   STRING model_string;
00500   model_string += static_cast<char>(GetLineType());
00501   model_string += ":";
00502
00503   int model_numbers = 0;
00504   for (int h = 0; h < hypotheses_.size(); h++) {
00505     if (hypotheses_[h].model == NULL)
00506       continue;
00507     if (model_numbers > 0)
00508       model_string += ",";
00509     if (StrongModel(hypotheses_[h].model)) {
00510       model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
00511     } else if (hypotheses_[h].model == kCrownLeft) {
00512       model_string += "CrL";
00513     } else if (hypotheses_[h].model == kCrownRight) {
00514       model_string += "CrR";
00515     }
00516     model_numbers++;
00517   }
00518   if (model_numbers == 0)
00519     model_string += "0";
00520
00521   dbg->push_back(model_string);
00522 }
00523
00524 void RowScratchRegisters::Init(const RowInfo &row) {
00525   ri_ = &row;
00526   lmargin_ = 0;
00527   lindent_ = row.pix_ldistance;
00528   rmargin_ = 0;
00529   rindent_ = row.pix_rdistance;
00530 }
00531
00532 LineType RowScratchRegisters::GetLineType() const {
00533   if (hypotheses_.empty())
00534     return LT_UNKNOWN;
00535   bool has_start = false;
00536   bool has_body = false;
00537   for (int i = 0; i < hypotheses_.size(); i++) {
00538     switch (hypotheses_[i].ty) {
00539       case LT_START: has_start = true; break;
00540       case LT_BODY: has_body = true; break;
00541       default:
00542         tprintf("Encountered bad value in hypothesis list: %c\n",
00543                 hypotheses_[i].ty);
00544         break;
00545     }
00546   }
00547   if (has_start && has_body)
00548     return LT_MULTIPLE;
00549   return has_start ? LT_START : LT_BODY;
00550 }
00551
00552 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
00553   if (hypotheses_.empty())
00554     return LT_UNKNOWN;
00555   bool has_start = false;
00556   bool has_body = false;
00557   for (int i = 0; i < hypotheses_.size(); i++) {
00558     if (hypotheses_[i].model != model)
00559       continue;
00560     switch (hypotheses_[i].ty) {
00561       case LT_START: has_start = true; break;
00562       case LT_BODY: has_body = true; break;
00563       default:
00564         tprintf("Encountered bad value in hypothesis list: %c\n",
00565                 hypotheses_[i].ty);
00566         break;
00567     }
00568   }
00569   if (has_start && has_body)
00570     return LT_MULTIPLE;
00571   return has_start ? LT_START : LT_BODY;
00572 }
00573
00574 void RowScratchRegisters::SetStartLine() {
00575   LineType current_lt = GetLineType();
00576   if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
00577     tprintf("Trying to set a line to be START when it's already BODY.\n");
00578   }
00579   if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
00580     hypotheses_.push_back_new(LineHypothesis(LT_START, NULL));
00581   }
00582 }
00583
00584 void RowScratchRegisters::SetBodyLine() {
00585   LineType current_lt = GetLineType();
00586   if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
00587     tprintf("Trying to set a line to be BODY when it's already START.\n");
00588   }
00589   if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
00590     hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL));
00591   }
00592 }
00593
00594 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
00595   hypotheses_.push_back_new(LineHypothesis(LT_START, model));
00596   int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL));
00597   if (old_idx >= 0)
00598     hypotheses_.remove(old_idx);
00599 }
00600
00601 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
00602   hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
00603   int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL));
00604   if (old_idx >= 0)
00605     hypotheses_.remove(old_idx);
00606 }
00607
00608 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
00609   for (int h = 0; h < hypotheses_.size(); h++) {
00610     if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
00611       models->push_back_new(hypotheses_[h].model);
00612   }
00613 }
00614
00615 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
00616   for (int h = 0; h < hypotheses_.size(); h++) {
00617     if (StrongModel(hypotheses_[h].model))
00618       models->push_back_new(hypotheses_[h].model);
00619   }
00620 }
00621
00622 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
00623   for (int h = 0; h < hypotheses_.size(); h++) {
00624     if (hypotheses_[h].model != NULL)
00625       models->push_back_new(hypotheses_[h].model);
00626   }
00627 }
00628
00629 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
00630   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
00631     return NULL;
00632   return hypotheses_[0].model;
00633 }
00634
00635 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
00636   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
00637     return NULL;
00638   return hypotheses_[0].model;
00639 }
00640
00641 // Discard any hypotheses whose model is not in the given list.
00642 void RowScratchRegisters::DiscardNonMatchingHypotheses(
00643     const SetOfModels &models) {
00644   if (models.empty())
00645     return;
00646   for (int h = hypotheses_.size() - 1; h >= 0; h--) {
00647     if (!models.contains(hypotheses_[h].model)) {
00648       hypotheses_.remove(h);
00649     }
00650   }
00651 }
00652
00653 // ============ Geometry based Paragraph Detection Algorithm =================
00654
00655 struct Cluster {
00656   Cluster() : center(0), count(0) {}
00657   Cluster(int cen, int num) : center(cen), count(num) {}
00658
00659   int center;  // The center of the cluster.
00660   int count;   // The number of entries within the cluster.
00661 };
00662
00663 class SimpleClusterer {
00664  public:
00665   explicit SimpleClusterer(int max_cluster_width)
00666       : max_cluster_width_(max_cluster_width) {}
00667   void Add(int value) { values_.push_back(value); }
00668   int size() const { return values_.size(); }
00669   void GetClusters(GenericVector<Cluster> *clusters);
00670
00671  private:
00672   int max_cluster_width_;
00673   GenericVectorEqEq<int> values_;
00674 };
00675
00676 // Return the index of the cluster closest to value.
00677 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
00678   int best_index = 0;
00679   for (int i = 0; i < clusters.size(); i++) {
00680     if (abs(value - clusters[i].center) <
00681         abs(value - clusters[best_index].center))
00682         best_index = i;
00683   }
00684   return best_index;
00685 }
00686
00687 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
00688   clusters->clear();
00689   values_.sort();
00690   for (int i = 0; i < values_.size();) {
00691     int orig_i = i;
00692     int lo = values_[i];
00693     int hi = lo;
00694     while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
00695       hi = values_[i];
00696     }
00697     clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
00698   }
00699 }
00700
00701 // Calculate left- and right-indent tab stop values seen in
00702 // rows[row_start, row_end) given a tolerance of tolerance.
00703 void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
00704                        int row_start, int row_end,
00705                        int tolerance,
00706                        GenericVector<Cluster> *left_tabs,
00707                        GenericVector<Cluster> *right_tabs) {
00708   if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
00709     return;
00710   // First pass: toss all left and right indents into clusterers.
00711   SimpleClusterer initial_lefts(tolerance);
00712   SimpleClusterer initial_rights(tolerance);
00713   GenericVector<Cluster> initial_left_tabs;
00714   GenericVector<Cluster> initial_right_tabs;
00715   for (int i = row_start; i < row_end; i++) {
00716     initial_lefts.Add((*rows)[i].lindent_);
00717     initial_rights.Add((*rows)[i].rindent_);
00718   }
00719   initial_lefts.GetClusters(&initial_left_tabs);
00720   initial_rights.GetClusters(&initial_right_tabs);
00721
00722   // Second pass: cluster only lines that are not "stray"
00723   //   An example of a stray line is a page number -- a line whose start
00724   //   and end tab-stops are far outside the typical start and end tab-stops
00725   //   for the block.
00726   //   Put another way, we only cluster data from lines whose start or end
00727   //   tab stop is frequent.
00728   SimpleClusterer lefts(tolerance);
00729   SimpleClusterer rights(tolerance);
00730   int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
00731   for (int i = row_start; i < row_end; i++) {
00732     int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
00733     int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
00734     if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
00735         initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
00736       lefts.Add((*rows)[i].lindent_);
00737       rights.Add((*rows)[i].rindent_);
00738     }
00739   }
00740   lefts.GetClusters(left_tabs);
00741   rights.GetClusters(right_tabs);
00742 }
00743
00744 // Given a paragraph model mark rows[row_start, row_end) as said model
00745 // start or body lines.
00746 //
00747 // Case 1: model->first_indent_ != model->body_indent_
00748 //   Differentiating the paragraph start lines from the paragraph body lines in
00749 //   this case is easy, we just see how far each line is indented.
00750 //
00751 // Case 2: model->first_indent_ == model->body_indent_
00752 //   Here, we find end-of-paragraph lines by looking for "short lines."
00753 //   What constitutes a "short line" changes depending on whether the text
00754 //   ragged-right[left] or fully justified (aligned left and right).
00755 //
00756 //   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
00757 //     We have a new paragraph it the first word would have at the end
00758 //     of the previous line.
00759 //
00760 //   Case 2b: Fully Justified.  (eop_threshold > 0)
00761 //     We mark a line as short (end of paragraph) if the offside indent
00762 //     is greater than eop_threshold.
00763 void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
00764                        int row_start, int row_end,
00765                        const ParagraphModel *model,
00766                        bool ltr,
00767                        int eop_threshold) {
00768   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
00769     return;
00770   for (int row = row_start; row < row_end; row++) {
00771     bool valid_first = ValidFirstLine(rows, row, model);
00772     bool valid_body = ValidBodyLine(rows, row, model);
00773     if (valid_first && !valid_body) {
00774       (*rows)[row].AddStartLine(model);
00775     } else if (valid_body && !valid_first) {
00776       (*rows)[row].AddBodyLine(model);
00777     } else if (valid_body && valid_first) {
00778       bool after_eop = (row == row_start);
00779       if (row > row_start) {
00780         if (eop_threshold > 0) {
00781           if (model->justification() == JUSTIFICATION_LEFT) {
00782             after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
00783           } else {
00784             after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
00785           }
00786         } else {
00787          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
00788                                            model->justification());
00789         }
00790       }
00791       if (after_eop) {
00792         (*rows)[row].AddStartLine(model);
00793       } else {
00794         (*rows)[row].AddBodyLine(model);
00795       }
00796     } else {
00797       // Do nothing. Stray row.
00798     }
00799   }
00800 }
00801
00802 // GeometricClassifierState holds all of the information we'll use while
00803 // trying to determine a paragraph model for the text lines in a block of
00804 // text:
00805 //   + the rows under consideration [row_start, row_end)
00806 //   + the common left- and right-indent tab stops
00807 //   + does the block start out left-to-right or right-to-left
00808 // Further, this struct holds the data we amass for the (single) ParagraphModel
00809 // we'll assign to the text lines (assuming we get that far).
00810 struct GeometricClassifierState {
00811   GeometricClassifierState(int dbg_level,
00812                            GenericVector<RowScratchRegisters> *r,
00813                            int r_start, int r_end)
00814       : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
00815         margin(0) {
00816     tolerance = InterwordSpace(*r, r_start, r_end);
00817     CalculateTabStops(r, r_start, r_end, tolerance,
00818                       &left_tabs, &right_tabs);
00819     ltr = (*r)[r_start].ri_->ltr;
00820   }
00821
00822   void AssumeLeftJustification() {
00823     just = tesseract::JUSTIFICATION_LEFT;
00824     margin = (*rows)[row_start].lmargin_;
00825   }
00826
00827   void AssumeRightJustification() {
00828     just = tesseract::JUSTIFICATION_RIGHT;
00829     margin = (*rows)[row_start].rmargin_;
00830   }
00831
00832   // Align tabs are the tab stops the text is aligned to.
00833   const GenericVector<Cluster> &AlignTabs() const {
00834     if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
00835     return left_tabs;
00836   }
00837
00838   // Offside tabs are the tab stops opposite the tabs used to align the text.
00839   //
00840   // Note that for a left-to-right text which is aligned to the right such as
00841   //     this function comment, the offside tabs are the horizontal tab stops
00842   //                 marking the beginning of ("Note", "this" and "marking").
00843   const GenericVector<Cluster> &OffsideTabs() const {
00844     if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
00845     return right_tabs;
00846   }
00847
00848   // Return whether the i'th row extends from the leftmost left tab stop
00849   // to the right most right tab stop.
00850   bool IsFullRow(int i) const {
00851     return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
00852         ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
00853   }
00854
00855   int AlignsideTabIndex(int row_idx) const {
00856     return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
00857   }
00858
00859   // Given what we know about the paragraph justification (just), would the
00860   // first word of row_b have fit at the end of row_a?
00861   bool FirstWordWouldHaveFit(int row_a, int row_b) {
00862     return ::tesseract::FirstWordWouldHaveFit(
00863         (*rows)[row_a], (*rows)[row_b], just);
00864   }
00865
00866   void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
00867
00868   void Fail(int min_debug_level, const char *why) const {
00869     if (debug_level < min_debug_level) return;
00870     tprintf("# %s\n", why);
00871     PrintRows();
00872   }
00873
00874   ParagraphModel Model() const {
00875     return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
00876   }
00877
00878   // We print out messages with a debug level at least as great as debug_level.
00879   int debug_level;
00880
00881   // The Geometric Classifier was asked to find a single paragraph model
00882   // to fit the text rows (*rows)[row_start, row_end)
00883   GenericVector<RowScratchRegisters> *rows;
00884   int row_start;
00885   int row_end;
00886
00887   // The amount by which we expect the text edge can vary and still be aligned.
00888   int tolerance;
00889
00890   // Is the script in this text block left-to-right?
00891   // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve
00892   bool ltr;
00893
00894   // These left and right tab stops were determined to be the common tab
00895   // stops for the given text.
00896   GenericVector<Cluster> left_tabs;
00897   GenericVector<Cluster> right_tabs;
00898
00899   // These are parameters we must determine to create a ParagraphModel.
00900   tesseract::ParagraphJustification just;
00901   int margin;
00902   int first_indent;
00903   int body_indent;
00904
00905   // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()
00906   int eop_threshold;
00907 };
00908
00909 // Given a section of text where strong textual clues did not help identifying
00910 // paragraph breaks, and for which the left and right indents have exactly
00911 // three tab stops between them, attempt to find the paragraph breaks based
00912 // solely on the outline of the text and whether the script is left-to-right.
00913 //
00914 // Algorithm Detail:
00915 //   The selected rows are in the form of a rectangle except
00916 //   for some number of "short lines" of the same length:
00917 //
00918 //   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx
00919 //           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.
00920 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
00921 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
00922 //
00923 //   We have a slightly different situation if the only short
00924 //   line is at the end of the excerpt.
00925 //
00926 //   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx
00927 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
00928 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
00929 //          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.
00930 //
00931 //   We'll interpret these as follows based on the reasoning in the comment for
00932 //   GeometricClassify():
00933 //       [script direction: first indent, body indent]
00934 //   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
00935 //   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
00936 void GeometricClassifyThreeTabStopTextBlock(
00937     int debug_level,
00938     GeometricClassifierState &s,
00939     ParagraphTheory *theory) {
00940   int num_rows = s.row_end - s.row_start;
00941   int num_full_rows = 0;
00942   int last_row_full = 0;
00943   for (int i = s.row_start; i < s.row_end; i++) {
00944     if (s.IsFullRow(i)) {
00945       num_full_rows++;
00946       if (i == s.row_end - 1) last_row_full++;
00947     }
00948   }
00949
00950   if (num_full_rows < 0.7 * num_rows) {
00951     s.Fail(1, "Not enough full lines to know which lines start paras.");
00952     return;
00953   }
00954
00955   // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
00956   s.eop_threshold = 0;
00957
00958   if (s.ltr) {
00959     s.AssumeLeftJustification();
00960   } else {
00961     s.AssumeRightJustification();
00962   }
00963
00964   if (debug_level > 0) {
00965     tprintf("# Not enough variety for clear outline classification. "
00966             "Guessing these are %s aligned based on script.\n",
00967             s.ltr ? "left" : "right");
00968     s.PrintRows();
00969   }
00970
00971   if (s.AlignTabs().size() == 2) {  // case A1 or A2
00972     s.first_indent = s.AlignTabs()[1].center;
00973     s.body_indent = s.AlignTabs()[0].center;
00974   } else {                      // case B1 or B2
00975     if (num_rows - 1 == num_full_rows - last_row_full) {
00976       // case B2
00977       const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
00978       (*s.rows)[s.row_start].AddStartLine(model);
00979       for (int i = s.row_start + 1; i < s.row_end; i++) {
00980         (*s.rows)[i].AddBodyLine(model);
00981       }
00982       return;
00983     } else {
00984       // case B1
00985       s.first_indent = s.body_indent = s.AlignTabs()[0].center;
00986       s.eop_threshold = (s.OffsideTabs()[0].center +
00987                          s.OffsideTabs()[1].center) / 2;
00988     }
00989   }
00990   const ParagraphModel *model = theory->AddModel(s.Model());
00991   MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
00992                     s.ltr, s.eop_threshold);
00993   return;
00994 }
00995
00996 // This function is called if strong textual clues were not available, but
00997 // the caller hopes that the paragraph breaks will be super obvious just
00998 // by the outline of the text.
00999 //
01000 // The particularly difficult case is figuring out what's going on if you
01001 // don't have enough short paragraph end lines to tell us what's going on.
01002 //
01003 // For instance, let's say you have the following outline:
01004 //
01005 //   (A1)  xxxxxxxxxxxxxxxxxxxxxx
01006 //           xxxxxxxxxxxxxxxxxxxx
01007 //         xxxxxxxxxxxxxxxxxxxxxx
01008 //         xxxxxxxxxxxxxxxxxxxxxx
01009 //
01010 // Even if we know that the text is left-to-right and so will probably be
01011 // left-aligned, both of the following are possible texts:
01012 //
01013 //  (A1a)  1. Here our list item
01014 //           with two full lines.
01015 //         2. Here a second item.
01016 //         3. Here our third one.
01017 //
01018 //  (A1b)  so ends paragraph one.
01019 //           Here  starts another
01020 //         paragraph  we want  to
01021 //         read.  This  continues
01022 //
01023 // These examples are obvious from the text and should have been caught
01024 // by the StrongEvidenceClassify pass.  However, for languages where we don't
01025 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
01026 // it's worth guessing that (A1b) is the correct interpretation if there are
01027 // far more "full" lines than "short" lines.
01028 void GeometricClassify(int debug_level,
01029                        GenericVector<RowScratchRegisters> *rows,
01030                        int row_start, int row_end,
01031                        ParagraphTheory *theory) {
01032   if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
01033     return;
01034   if (debug_level > 1) {
01035     tprintf("###############################################\n");
01036     tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n",
01037             row_start, row_end);
01038     tprintf("###############################################\n");
01039   }
01040   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
01041
01042   GeometricClassifierState s(debug_level, rows, row_start, row_end);
01043   if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
01044     s.Fail(2, "Too much variety for simple outline classification.");
01045     return;
01046   }
01047   if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
01048     s.Fail(1, "Not enough variety for simple outline classification.");
01049     return;
01050   }
01051   if (s.left_tabs.size() + s.right_tabs.size() == 3) {
01052     GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
01053     return;
01054   }
01055
01056   // At this point, we know that one side has at least two tab stops, and the
01057   // other side has one or two tab stops.
01058   // Left to determine:
01059   //   (1) Which is the body indent and which is the first line indent?
01060   //   (2) Is the text fully justified?
01061
01062   // If one side happens to have three or more tab stops, assume that side
01063   // is opposite of the aligned side.
01064   if (s.right_tabs.size() > 2) {
01065     s.AssumeLeftJustification();
01066   } else if (s.left_tabs.size() > 2) {
01067     s.AssumeRightJustification();
01068   } else if (s.ltr) {  // guess based on script direction
01069     s.AssumeLeftJustification();
01070   } else {
01071     s.AssumeRightJustification();
01072   }
01073
01074   if (s.AlignTabs().size() == 2) {
01075     // For each tab stop on the aligned side, how many of them appear
01076     // to be paragraph start lines?  [first lines]
01077     int firsts[2] = {0, 0};
01078     // Count the first line as a likely paragraph start line.
01079     firsts[s.AlignsideTabIndex(s.row_start)]++;
01080     // For each line, if the first word would have fit on the previous
01081     // line count it as a likely paragraph start line.
01082     for (int i = s.row_start + 1; i < s.row_end; i++) {
01083       if (s.FirstWordWouldHaveFit(i - 1, i)) {
01084         firsts[s.AlignsideTabIndex(i)]++;
01085       }
01086     }
01087     // Make an extra accounting for the last line of the paragraph just
01088     // in case it's the only short line in the block.  That is, take its
01089     // first word as typical and see if this looks like the *last* line
01090     // of a paragraph.  If so, mark the *other* indent as probably a first.
01091     if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
01092       firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
01093     }
01094
01095     int percent0firsts, percent1firsts;
01096     percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
01097     percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
01098
01099     // TODO(eger): Tune these constants if necessary.
01100     if ((percent0firsts < 20 && 30 < percent1firsts) ||
01101         percent0firsts + 30 < percent1firsts) {
01102       s.first_indent = s.AlignTabs()[1].center;
01103       s.body_indent = s.AlignTabs()[0].center;
01104     } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
01105                percent1firsts + 30 < percent0firsts) {
01106       s.first_indent = s.AlignTabs()[0].center;
01107       s.body_indent = s.AlignTabs()[1].center;
01108     } else {
01109       // Ambiguous! Probably lineated (poetry)
01110       if (debug_level > 1) {
01111         tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
01112                 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
01113         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
01114                 s.AlignTabs()[0].center, percent0firsts);
01115         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
01116                 s.AlignTabs()[1].center, percent1firsts);
01117         s.PrintRows();
01118       }
01119       return;
01120     }
01121   } else {
01122     // There's only one tab stop for the "aligned to" side.
01123     s.first_indent = s.body_indent = s.AlignTabs()[0].center;
01124   }
01125
01126   // At this point, we have our model.
01127   const ParagraphModel *model = theory->AddModel(s.Model());
01128
01129   // Now all we have to do is figure out if the text is fully justified or not.
01130   // eop_threshold: default to fully justified unless we see evidence below.
01131   //    See description on MarkRowsWithModel()
01132   s.eop_threshold =
01133       (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
01134   // If the text is not fully justified, re-set the eop_threshold to 0.
01135   if (s.AlignTabs().size() == 2) {
01136     // Paragraphs with a paragraph-start indent.
01137     for (int i = s.row_start; i < s.row_end - 1; i++) {
01138       if (ValidFirstLine(s.rows, i + 1, model) &&
01139           !NearlyEqual(s.OffsideTabs()[0].center,
01140                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
01141         // We found a non-end-of-paragraph short line: not fully justified.
01142         s.eop_threshold = 0;
01143         break;
01144       }
01145     }
01146   } else {
01147     // Paragraphs with no paragraph-start indent.
01148     for (int i = s.row_start; i < s.row_end - 1; i++) {
01149       if (!s.FirstWordWouldHaveFit(i, i + 1) &&
01150           !NearlyEqual(s.OffsideTabs()[0].center,
01151                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
01152         // We found a non-end-of-paragraph short line: not fully justified.
01153         s.eop_threshold = 0;
01154         break;
01155       }
01156     }
01157   }
01158   MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
01159 }
01160
01161 // =============== Implementation of ParagraphTheory =====================
01162
01163 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
01164   for (int i = 0; i < models_->size(); i++) {
01165     if ((*models_)[i]->Comparable(model))
01166       return (*models_)[i];
01167   }
01168   ParagraphModel *m = new ParagraphModel(model);
01169   models_->push_back(m);
01170   models_we_added_.push_back_new(m);
01171   return m;
01172 }
01173
01174 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
01175   for (int i = models_->size() - 1; i >= 0; i--) {
01176     ParagraphModel *m = (*models_)[i];
01177     if (!used_models.contains(m) && models_we_added_.contains(m)) {
01178       delete m;
01179       models_->remove(i);
01180       models_we_added_.remove(models_we_added_.get_index(m));
01181     }
01182   }
01183 }
01184
01185 // Examine rows[start, end) and try to determine if an existing non-centered
01186 // paragraph model would fit them perfectly.  If so, return a pointer to it.
01187 // If not, return NULL.
01188 const ParagraphModel *ParagraphTheory::Fits(
01189     const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
01190   for (int m = 0; m < models_->size(); m++) {
01191     const ParagraphModel *model = (*models_)[m];
01192     if (model->justification() != JUSTIFICATION_CENTER &&
01193         RowsFitModel(rows, start, end, model))
01194       return model;
01195   }
01196   return NULL;
01197 }
01198
01199 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
01200   for (int m = 0; m < models_->size(); m++) {
01201     const ParagraphModel *model = (*models_)[m];
01202     if (model->justification() != JUSTIFICATION_CENTER)
01203       models->push_back_new(model);
01204   }
01205 }
01206
01207 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
01208   for (int i = 0; i < models_->size(); i++) {
01209     if ((*models_)[i] == model)
01210       return i;
01211   }
01212   return -1;
01213 }
01214
01215 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
01216                     int row, const ParagraphModel *model) {
01217   if (!StrongModel(model)) {
01218     tprintf("ValidFirstLine() should only be called with strong models!\n");
01219   }
01220   return StrongModel(model) &&
01221       model->ValidFirstLine(
01222           (*rows)[row].lmargin_, (*rows)[row].lindent_,
01223           (*rows)[row].rindent_, (*rows)[row].rmargin_);
01224 }
01225
01226 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
01227                    int row, const ParagraphModel *model) {
01228   if (!StrongModel(model)) {
01229     tprintf("ValidBodyLine() should only be called with strong models!\n");
01230   }
01231   return StrongModel(model) &&
01232       model->ValidBodyLine(
01233           (*rows)[row].lmargin_, (*rows)[row].lindent_,
01234           (*rows)[row].rindent_, (*rows)[row].rmargin_);
01235 }
01236
01237 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
01238                      int a, int b, const ParagraphModel *model) {
01239   if (model != kCrownRight && model != kCrownLeft) {
01240     tprintf("CrownCompatible() should only be called with crown models!\n");
01241     return false;
01242   }
01243   RowScratchRegisters &row_a = (*rows)[a];
01244   RowScratchRegisters &row_b = (*rows)[b];
01245   if (model == kCrownRight) {
01246     return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
01247                        row_b.rindent_ + row_b.rmargin_,
01248                        Epsilon(row_a.ri_->average_interword_space));
01249   }
01250   return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
01251                      row_b.lindent_ + row_b.lmargin_,
01252                      Epsilon(row_a.ri_->average_interword_space));
01253 }
01254
01255
01256 // =============== Implementation of ParagraphModelSmearer ====================
01257
01258 ParagraphModelSmearer::ParagraphModelSmearer(
01259     GenericVector<RowScratchRegisters> *rows,
01260     int row_start, int row_end, ParagraphTheory *theory)
01261         : theory_(theory), rows_(rows), row_start_(row_start),
01262           row_end_(row_end) {
01263   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
01264     row_start_ = 0;
01265     row_end_ = 0;
01266     return;
01267   }
01268   SetOfModels no_models;
01269   for (int row = row_start - 1; row <= row_end; row++) {
01270     open_models_.push_back(no_models);
01271   }
01272 }
01273
01274 // see paragraphs_internal.h
01275 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
01276   SetOfModels no_models;
01277   if (row_start < row_start_) row_start = row_start_;
01278   if (row_end > row_end_) row_end = row_end_;
01279
01280   for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
01281        row++) {
01282     if ((*rows_)[row].ri_->num_words == 0) {
01283       OpenModels(row + 1) = no_models;
01284     } else {
01285       SetOfModels &opened = OpenModels(row);
01286       (*rows_)[row].StartHypotheses(&opened);
01287
01288       // Which models survive the transition from row to row + 1?
01289       SetOfModels still_open;
01290       for (int m = 0; m < opened.size(); m++) {
01291         if (ValidFirstLine(rows_, row, opened[m]) ||
01292             ValidBodyLine(rows_, row, opened[m])) {
01293           // This is basic filtering; we check likely paragraph starty-ness down
01294           // below in Smear() -- you know, whether the first word would have fit
01295           // and such.
01296           still_open.push_back_new(opened[m]);
01297         }
01298       }
01299       OpenModels(row + 1) = still_open;
01300     }
01301   }
01302 }
01303
01304 // see paragraphs_internal.h
01305 void ParagraphModelSmearer::Smear() {
01306   CalculateOpenModels(row_start_, row_end_);
01307
01308   // For each row which we're unsure about (that is, it is LT_UNKNOWN or
01309   // we have multiple LT_START hypotheses), see if there's a model that
01310   // was recently used (an "open" model) which might model it well.
01311   for (int i = row_start_; i < row_end_; i++) {
01312     RowScratchRegisters &row = (*rows_)[i];
01313     if (row.ri_->num_words == 0)
01314       continue;
01315
01316     // Step One:
01317     //   Figure out if there are "open" models which are left-alined or
01318     //   right-aligned.  This is important for determining whether the
01319     //   "first" word in a row would fit at the "end" of the previous row.
01320     bool left_align_open = false;
01321     bool right_align_open = false;
01322     for (int m = 0; m < OpenModels(i).size(); m++) {
01323       switch (OpenModels(i)[m]->justification()) {
01324         case JUSTIFICATION_LEFT: left_align_open = true; break;
01325         case JUSTIFICATION_RIGHT: right_align_open = true; break;
01326         default: left_align_open = right_align_open = true;
01327       }
01328     }
01329     // Step Two:
01330     //   Use that knowledge to figure out if this row is likely to
01331     //   start a paragraph.
01332     bool likely_start;
01333     if (i == 0) {
01334       likely_start = true;
01335     } else {
01336       if ((left_align_open && right_align_open) ||
01337           (!left_align_open && !right_align_open)) {
01338         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01339                                             JUSTIFICATION_LEFT) ||
01340                        LikelyParagraphStart((*rows_)[i - 1], row,
01341                                             JUSTIFICATION_RIGHT);
01342       } else if (left_align_open) {
01343         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01344                                             JUSTIFICATION_LEFT);
01345       } else {
01346         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01347                                             JUSTIFICATION_RIGHT);
01348       }
01349     }
01350
01351     // Step Three:
01352     //   If this text line seems like an obvious first line of an
01353     //   open model, or an obvious continuation of an existing
01354     //   modelled paragraph, mark it up.
01355     if (likely_start) {
01356       // Add Start Hypotheses for all Open models that fit.
01357       for (int m = 0; m < OpenModels(i).size(); m++) {
01358         if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
01359           row.AddStartLine(OpenModels(i)[m]);
01360         }
01361       }
01362     } else {
01363       // Add relevant body line hypotheses.
01364       SetOfModels last_line_models;
01365       if (i > 0) {
01366         (*rows_)[i - 1].StrongHypotheses(&last_line_models);
01367       } else {
01368         theory_->NonCenteredModels(&last_line_models);
01369       }
01370       for (int m = 0; m < last_line_models.size(); m++) {
01371         const ParagraphModel *model = last_line_models[m];
01372         if (ValidBodyLine(rows_, i, model))
01373           row.AddBodyLine(model);
01374       }
01375     }
01376
01377     // Step Four:
01378     //   If we're still quite unsure about this line, go through all
01379     //   models in our theory and see if this row could be the start
01380     //   of any of our  models.
01381     if (row.GetLineType() == LT_UNKNOWN ||
01382         (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
01383       SetOfModels all_models;
01384       theory_->NonCenteredModels(&all_models);
01385       for (int m = 0; m < all_models.size(); m++) {
01386         if (ValidFirstLine(rows_, i, all_models[m])) {
01387           row.AddStartLine(all_models[m]);
01388         }
01389       }
01390     }
01391     // Step Five:
01392     //   Since we may have updated the hypotheses about this row, we need
01393     //   to recalculate the Open models for the rest of rows[i + 1, row_end)
01394     if (row.GetLineType() != LT_UNKNOWN) {
01395       CalculateOpenModels(i + 1, row_end_);
01396     }
01397   }
01398 }
01399
01400 // ================ Main Paragraph Detection Algorithm =======================
01401
01402 // Find out what ParagraphModels are actually used, and discard any
01403 // that are not.
01404 void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
01405                          ParagraphTheory *theory) {
01406   SetOfModels used_models;
01407   for (int i = 0; i < rows.size(); i++) {
01408     rows[i].StrongHypotheses(&used_models);
01409   }
01410   theory->DiscardUnusedModels(used_models);
01411 }
01412
01413 // DowngradeWeakestToCrowns:
01414 //   Forget any flush-{left, right} models unless we see two or more
01415 //   of them in sequence.
01416 //
01417 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
01418 // where the first line and body indent are the same) as having proper Models.
01419 // This is generally dangerous, since if you start imagining that flush-left
01420 // is a typical paragraph model when it is not, it will lead you to chop normal
01421 // indented paragraphs in the middle whenever a sentence happens to start on a
01422 // new line (see "This" above).  What to do?
01423 //   What we do is to take any paragraph which is flush left and is not
01424 // preceded by another paragraph of the same model and convert it to a "Crown"
01425 // paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder
01426 // for later.  It means that the paragraph is flush, but it would be desirable
01427 // to mark it as the same model as following text if it fits.  This downgrade
01428 // FlushLeft -> CrownLeft -> Model of following paragraph.  Means that we
01429 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
01430 // half-of-a-paragraph. and instead we mark it the same as normal body text.
01431 //
01432 // Implementation:
01433 //
01434 //   Comb backwards through the row scratch registers, and turn any
01435 //   sequences of body lines of equivalent type abutted against the beginning
01436 //   or a body or start line of a different type into a crown paragraph.
01437 void DowngradeWeakestToCrowns(int debug_level,
01438                               ParagraphTheory *theory,
01439                               GenericVector<RowScratchRegisters> *rows) {
01440   int start;
01441   for (int end = rows->size(); end > 0; end = start) {
01442     // Search back for a body line of a unique type.
01443     const ParagraphModel *model = NULL;
01444     while (end > 0 &&
01445            (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
01446       end--;
01447     }
01448     if (end == 0) break;
01449     start = end - 1;
01450     while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
01451       start--;  // walk back to the first line that is not the same body type.
01452     }
01453     if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
01454         StrongModel(model) &&
01455         NearlyEqual(model->first_indent(), model->body_indent(),
01456                     model->tolerance())) {
01457         start--;
01458     }
01459     start++;
01460     // Now rows[start, end) is a sequence of unique body hypotheses of model.
01461     if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
01462       continue;
01463     if (!StrongModel(model)) {
01464       while (start > 0 &&
01465              CrownCompatible(rows, start - 1, start, model))
01466         start--;
01467     }
01468     if (start == 0 ||
01469         (!StrongModel(model)) ||
01470         (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
01471       // crownify rows[start, end)
01472       const ParagraphModel *crown_model = model;
01473       if (StrongModel(model)) {
01474           if (model->justification() == JUSTIFICATION_LEFT)
01475             crown_model = kCrownLeft;
01476           else
01477             crown_model = kCrownRight;
01478       }
01479       (*rows)[start].SetUnknown();
01480       (*rows)[start].AddStartLine(crown_model);
01481       for (int row = start + 1; row < end; row++) {
01482         (*rows)[row].SetUnknown();
01483         (*rows)[row].AddBodyLine(crown_model);
01484       }
01485     }
01486   }
01487   DiscardUnusedModels(*rows, theory);
01488 }
01489
01490
01491 // Clear all hypotheses about lines [start, end) and reset margins.
01492 //
01493 // The empty space between the left of a row and the block boundary (and
01494 // similarly for the right) is split into two pieces: margin and indent.
01495 // In initial processing, we assume the block is tight and the margin for
01496 // all lines is set to zero.   However, if our first pass does not yield
01497 // models for  everything,  it may be  due to an  inset paragraph like a
01498 // block-quote.   In that case, we make a second pass over that unmarked
01499 // section of the page and reset the "margin" portion of the empty space
01500 // to the common amount of space at  the ends of the lines under consid-
01501 // eration.    This would be equivalent to percentile set to 0. However,
01502 // sometimes we have a single character sticking out in the right margin
01503 // of a text block  (like the 'r' in 'for' on line 3 above),  and we can
01504 // really  just ignore it as an outlier.   To express this, we allow the
01505 // user to specify  the percentile (0..100)  of indent values  to use as
01506 // the common margin for each row in the run of rows[start, end).
01507 void RecomputeMarginsAndClearHypotheses(
01508     GenericVector<RowScratchRegisters> *rows, int start, int end,
01509     int percentile) {
01510   if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
01511     return;
01512
01513   int lmin, lmax, rmin, rmax;
01514   lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
01515   rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
01516   for (int i = start; i < end; i++) {
01517     RowScratchRegisters &sr = (*rows)[i];
01518     sr.SetUnknown();
01519     if (sr.ri_->num_words == 0)
01520       continue;
01521     UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
01522     UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
01523   }
01524   STATS lefts(lmin, lmax + 1);
01525   STATS rights(rmin, rmax + 1);
01526   for (int i = start; i < end; i++) {
01527     RowScratchRegisters &sr = (*rows)[i];
01528     if (sr.ri_->num_words == 0)
01529       continue;
01530     lefts.add(sr.lmargin_ + sr.lindent_, 1);
01531     rights.add(sr.rmargin_ + sr.rindent_, 1);
01532   }
01533   int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
01534   int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
01535   for (int i = start; i < end; i++) {
01536     RowScratchRegisters &sr = (*rows)[i];
01537     int ldelta = ignorable_left - sr.lmargin_;
01538     sr.lmargin_ += ldelta;
01539     sr.lindent_ -= ldelta;
01540     int rdelta = ignorable_right - sr.rmargin_;
01541     sr.rmargin_ += rdelta;
01542     sr.rindent_ -= rdelta;
01543   }
01544 }
01545
01546 // Return the minimum inter-word space in rows[row_start, row_end).
01547 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
01548                    int row_start, int row_end) {
01549   if (row_end < row_start + 1) return 1;
01550   bool legit = false;
01551   int natural_space = rows[row_start].ri_->average_interword_space;
01552   for (int i = row_start; i < row_end; i++) {
01553     if (rows[i].ri_->num_words > 1) {
01554       if (!legit) {
01555         natural_space = rows[i].ri_->average_interword_space;
01556         legit = true;
01557       } else {
01558         if (rows[i].ri_->average_interword_space < natural_space)
01559           natural_space = rows[i].ri_->average_interword_space;
01560       }
01561     }
01562   }
01563   return natural_space;
01564 }
01565
01566 // Return whether the first word on the after line can fit in the space at
01567 // the end of the before line (knowing which way the text is aligned and read).
01568 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
01569                            const RowScratchRegisters &after,
01570                            tesseract::ParagraphJustification justification) {
01571   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
01572     return true;
01573
01574   if (justification == JUSTIFICATION_UNKNOWN) {
01575     tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
01576   }
01577   int available_space;
01578   if (justification == JUSTIFICATION_CENTER) {
01579     available_space = before.lindent_ + before.rindent_;
01580   } else {
01581     available_space = before.OffsideIndent(justification);
01582   }
01583   available_space -= before.ri_->average_interword_space;
01584
01585   if (before.ri_->ltr)
01586     return after.ri_->lword_box.width() < available_space;
01587   return after.ri_->rword_box.width() < available_space;
01588 }
01589
01590 // Return whether the first word on the after line can fit in the space at
01591 // the end of the before line (not knowing which way the text goes) in a left
01592 // or right alignemnt.
01593 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
01594                            const RowScratchRegisters &after) {
01595   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
01596     return true;
01597
01598   int available_space = before.lindent_;
01599   if (before.rindent_ > available_space)
01600     available_space = before.rindent_;
01601   available_space -= before.ri_->average_interword_space;
01602
01603   if (before.ri_->ltr)
01604     return after.ri_->lword_box.width() < available_space;
01605   return after.ri_->rword_box.width() < available_space;
01606 }
01607
01608 bool TextSupportsBreak(const RowScratchRegisters &before,
01609                        const RowScratchRegisters &after) {
01610   if (before.ri_->ltr) {
01611     return before.ri_->rword_likely_ends_idea &&
01612            after.ri_->lword_likely_starts_idea;
01613   } else {
01614     return before.ri_->lword_likely_ends_idea &&
01615            after.ri_->rword_likely_starts_idea;
01616   }
01617 }
01618
01619 bool LikelyParagraphStart(const RowScratchRegisters &before,
01620                           const RowScratchRegisters &after) {
01621   return before.ri_->num_words == 0 ||
01622       (FirstWordWouldHaveFit(before, after) &&
01623        TextSupportsBreak(before, after));
01624 }
01625
01626 bool LikelyParagraphStart(const RowScratchRegisters &before,
01627                           const RowScratchRegisters &after,
01628                           tesseract::ParagraphJustification j) {
01629   return before.ri_->num_words == 0 ||
01630       (FirstWordWouldHaveFit(before, after, j) &&
01631        TextSupportsBreak(before, after));
01632 }
01633
01634 // Examine rows[start, end) and try to determine what sort of ParagraphModel
01635 // would fit them as a single paragraph.
01636 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
01637 // If the rows given could be a consistent start to a paragraph, set *consistent
01638 // true.
01639 ParagraphModel InternalParagraphModelByOutline(
01640     const GenericVector<RowScratchRegisters> *rows,
01641     int start, int end, int tolerance, bool *consistent) {
01642   int ltr_line_count = 0;
01643   for (int i = start; i < end; i++) {
01644     ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
01645   }
01646   bool ltr = (ltr_line_count >= (end - start) / 2);
01647
01648   *consistent = true;
01649   if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
01650     return ParagraphModel();
01651
01652   // Ensure the caller only passed us a region with a common rmargin and
01653   // lmargin.
01654   int lmargin = (*rows)[start].lmargin_;
01655   int rmargin = (*rows)[start].rmargin_;
01656   int lmin, lmax, rmin, rmax, cmin, cmax;
01657   lmin = lmax = (*rows)[start + 1].lindent_;
01658   rmin = rmax = (*rows)[start + 1].rindent_;
01659   cmin = cmax = 0;
01660   for (int i = start + 1; i < end; i++) {
01661     if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
01662       tprintf("Margins don't match! Software error.\n");
01663       *consistent = false;
01664       return ParagraphModel();
01665     }
01666     UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
01667     UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
01668     UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
01669   }
01670   int ldiff = lmax - lmin;
01671   int rdiff = rmax - rmin;
01672   int cdiff = cmax - cmin;
01673   if (rdiff > tolerance && ldiff > tolerance) {
01674     if (cdiff < tolerance * 2) {
01675       if (end - start < 3)
01676         return ParagraphModel();
01677       return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
01678     }
01679     *consistent = false;
01680     return ParagraphModel();
01681   }
01682   if (end - start < 3)  // Don't return a model for two line paras.
01683     return ParagraphModel();
01684
01685   // These booleans keep us from saying something is aligned left when the body
01686   // left variance is too large.
01687   bool body_admits_left_alignment = ldiff < tolerance;
01688   bool body_admits_right_alignment = rdiff < tolerance;
01689
01690   ParagraphModel left_model =
01691       ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
01692                      (lmin + lmax) / 2, tolerance);
01693   ParagraphModel right_model =
01694       ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
01695                      (rmin + rmax) / 2, tolerance);
01696
01697   // These booleans keep us from having an indent on the "wrong side" for the
01698   // first line.
01699   bool text_admits_left_alignment = ltr || left_model.is_flush();
01700   bool text_admits_right_alignment = !ltr || right_model.is_flush();
01701
01702   // At least one of the edges is less than tolerance in variance.
01703   // If the other is obviously ragged, it can't be the one aligned to.
01704   // [Note the last line is included in this raggedness.]
01705   if (tolerance < rdiff) {
01706     if (body_admits_left_alignment && text_admits_left_alignment)
01707       return left_model;
01708     *consistent = false;
01709     return ParagraphModel();
01710   }
01711   if (tolerance < ldiff) {
01712     if (body_admits_right_alignment && text_admits_right_alignment)
01713       return right_model;
01714     *consistent = false;
01715     return ParagraphModel();
01716   }
01717
01718   // At this point, we know the body text doesn't vary much on either side.
01719
01720   // If the first line juts out oddly in one direction or the other,
01721   // that likely indicates the side aligned to.
01722   int first_left = (*rows)[start].lindent_;
01723   int first_right = (*rows)[start].rindent_;
01724
01725   if (ltr && body_admits_left_alignment &&
01726       (first_left < lmin || first_left > lmax))
01727     return left_model;
01728   if (!ltr && body_admits_right_alignment &&
01729       (first_right < rmin || first_right > rmax))
01730     return right_model;
01731
01732   *consistent = false;
01733   return ParagraphModel();
01734 }
01735
01736 // Examine rows[start, end) and try to determine what sort of ParagraphModel
01737 // would fit them as a single paragraph.   If nothing fits,
01738 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
01739 // output if we're debugging.
01740 ParagraphModel ParagraphModelByOutline(
01741     int debug_level,
01742     const GenericVector<RowScratchRegisters> *rows,
01743     int start, int end, int tolerance) {
01744   bool unused_consistent;
01745   ParagraphModel retval = InternalParagraphModelByOutline(
01746       rows, start, end, tolerance, &unused_consistent);
01747   if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
01748     tprintf("Could not determine a model for this paragraph:\n");
01749     PrintRowRange(*rows, start, end);
01750   }
01751   return retval;
01752 }
01753
01754 // Do rows[start, end) form a single instance of the given paragraph model?
01755 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
01756                   int start, int end, const ParagraphModel *model) {
01757   if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
01758     return false;
01759   if (!ValidFirstLine(rows, start, model)) return false;
01760   for (int i = start + 1 ; i < end; i++) {
01761     if (!ValidBodyLine(rows, i, model)) return false;
01762   }
01763   return true;
01764 }
01765
01766 // Examine rows[row_start, row_end) as an independent section of text,
01767 // and mark rows that are exceptionally clear as start-of-paragraph
01768 // and paragraph-body lines.
01769 //
01770 // We presume that any lines surrounding rows[row_start, row_end) may
01771 // have wildly different paragraph models, so we don't key any data off
01772 // of those lines.
01773 //
01774 // We only take the very strongest signals, as we don't want to get
01775 // confused and marking up centered text, poetry, or source code as
01776 // clearly part of a typical paragraph.
01777 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
01778                         int row_start, int row_end) {
01779   // Record patently obvious body text.
01780   for (int i = row_start + 1; i < row_end; i++) {
01781     const RowScratchRegisters &prev = (*rows)[i - 1];
01782     RowScratchRegisters &curr = (*rows)[i];
01783     tesseract::ParagraphJustification typical_justification =
01784         prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01785     if (!curr.ri_->rword_likely_starts_idea &&
01786         !curr.ri_->lword_likely_starts_idea &&
01787         !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
01788       curr.SetBodyLine();
01789     }
01790   }
01791
01792   // Record patently obvious start paragraph lines.
01793   //
01794   // It's an extremely good signal of the start of a paragraph that
01795   // the first word would have fit on the end of the previous line.
01796   // However, applying just that signal would have us mark random
01797   // start lines of lineated text (poetry and source code) and some
01798   // centered headings as paragraph start lines.  Therefore, we use
01799   // a second qualification for a paragraph start: Not only should
01800   // the first word of this line have fit on the previous line,
01801   // but also, this line should go full to the right of the block,
01802   // disallowing a subsequent word from having fit on this line.
01803
01804   // First row:
01805   {
01806     RowScratchRegisters &curr = (*rows)[row_start];
01807     RowScratchRegisters &next = (*rows)[row_start + 1];
01808     tesseract::ParagraphJustification j =
01809         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01810     if (curr.GetLineType() == LT_UNKNOWN &&
01811         !FirstWordWouldHaveFit(curr, next, j) &&
01812         (curr.ri_->lword_likely_starts_idea ||
01813          curr.ri_->rword_likely_starts_idea)) {
01814       curr.SetStartLine();
01815     }
01816   }
01817   // Middle rows
01818   for (int i = row_start + 1; i < row_end - 1; i++) {
01819     RowScratchRegisters &prev = (*rows)[i - 1];
01820     RowScratchRegisters &curr = (*rows)[i];
01821     RowScratchRegisters &next = (*rows)[i + 1];
01822     tesseract::ParagraphJustification j =
01823         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01824     if (curr.GetLineType() == LT_UNKNOWN &&
01825         !FirstWordWouldHaveFit(curr, next, j) &&
01826         LikelyParagraphStart(prev, curr, j)) {
01827       curr.SetStartLine();
01828     }
01829   }
01830   // Last row
01831   {  // the short circuit at the top means we have at least two lines.
01832     RowScratchRegisters &prev = (*rows)[row_end - 2];
01833     RowScratchRegisters &curr = (*rows)[row_end - 1];
01834     tesseract::ParagraphJustification j =
01835         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01836     if (curr.GetLineType() == LT_UNKNOWN &&
01837         !FirstWordWouldHaveFit(curr, curr, j) &&
01838         LikelyParagraphStart(prev, curr, j)) {
01839       curr.SetStartLine();
01840     }
01841   }
01842 }
01843
01844 // Look for sequences of a start line followed by some body lines in
01845 // rows[row_start, row_end) and create ParagraphModels for them if
01846 // they seem coherent.
01847 void ModelStrongEvidence(int debug_level,
01848                          GenericVector<RowScratchRegisters> *rows,
01849                          int row_start, int row_end,
01850                          bool allow_flush_models,
01851                          ParagraphTheory *theory) {
01852   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
01853     return;
01854
01855   int start = row_start;
01856   while (start < row_end) {
01857     while (start < row_end && (*rows)[start].GetLineType() != LT_START)
01858       start++;
01859     if (start >= row_end - 1)
01860       break;
01861
01862     int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
01863     int end = start;
01864     ParagraphModel last_model;
01865     bool next_consistent;
01866     do {
01867       ++end;
01868       // rows[row, end) was consistent.
01869       // If rows[row, end + 1) is not consistent,
01870       //   just model rows[row, end)
01871       if (end < row_end - 1) {
01872         RowScratchRegisters &next = (*rows)[end];
01873         LineType lt = next.GetLineType();
01874         next_consistent = lt == LT_BODY ||
01875             (lt == LT_UNKNOWN &&
01876              !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
01877       } else {
01878         next_consistent = false;
01879       }
01880       if (next_consistent) {
01881         ParagraphModel next_model = InternalParagraphModelByOutline(
01882             rows, start, end + 1, tolerance, &next_consistent);
01883         if (((*rows)[start].ri_->ltr &&
01884              last_model.justification() == JUSTIFICATION_LEFT &&
01885              next_model.justification() != JUSTIFICATION_LEFT) ||
01886             (!(*rows)[start].ri_->ltr &&
01887              last_model.justification() == JUSTIFICATION_RIGHT &&
01888              next_model.justification() != JUSTIFICATION_RIGHT)) {
01889           next_consistent = false;
01890         }
01891         last_model = next_model;
01892       } else {
01893         next_consistent = false;
01894       }
01895     } while (next_consistent && end < row_end);
01896     // At this point, rows[start, end) looked like it could have been a
01897     // single paragraph.  If we can make a good ParagraphModel for it,
01898     // do so and mark this sequence with that model.
01899     if (end > start + 1) {
01900       // emit a new paragraph if we have more than one line.
01901       const ParagraphModel *model = NULL;
01902       ParagraphModel new_model = ParagraphModelByOutline(
01903           debug_level, rows, start, end,
01904           Epsilon(InterwordSpace(*rows, start, end)));
01905       if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
01906         // couldn't create a good model, oh well.
01907       } else if (new_model.is_flush()) {
01908         if (end == start + 2) {
01909           // It's very likely we just got two paragraph starts in a row.
01910           end = start + 1;
01911         } else if (start == row_start) {
01912           // Mark this as a Crown.
01913           if (new_model.justification() == JUSTIFICATION_LEFT) {
01914             model = kCrownLeft;
01915           } else {
01916             model = kCrownRight;
01917           }
01918         } else if (allow_flush_models) {
01919           model = theory->AddModel(new_model);
01920         }
01921       } else {
01922         model = theory->AddModel(new_model);
01923       }
01924       if (model) {
01925         (*rows)[start].AddStartLine(model);
01926         for (int i = start + 1; i < end; i++) {
01927           (*rows)[i].AddBodyLine(model);
01928         }
01929       }
01930     }
01931     start = end;
01932   }
01933 }
01934
01935 // We examine rows[row_start, row_end) and do the following:
01936 //   (1) Clear all existing hypotheses for the rows being considered.
01937 //   (2) Mark up any rows as exceptionally likely to be paragraph starts
01938 //       or paragraph body lines as such using both geometric and textual
01939 //       clues.
01940 //   (3) Form models for any sequence of start + continuation lines.
01941 //   (4) Smear the paragraph models to cover surrounding text.
01942 void StrongEvidenceClassify(int debug_level,
01943                             GenericVector<RowScratchRegisters> *rows,
01944                             int row_start, int row_end,
01945                             ParagraphTheory *theory) {
01946   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
01947     return;
01948
01949   if (debug_level > 1) {
01950     tprintf("#############################################\n");
01951     tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
01952     tprintf("#############################################\n");
01953   }
01954
01955   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
01956   MarkStrongEvidence(rows, row_start, row_end);
01957
01958   DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
01959
01960   // Create paragraph models.
01961   ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
01962
01963   DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
01964
01965   // At this point, some rows are marked up as paragraphs with model numbers,
01966   // and some rows are marked up as either LT_START or LT_BODY.  Now let's
01967   // smear any good paragraph hypotheses forward and backward.
01968   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
01969   smearer.Smear();
01970 }
01971
01972 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
01973                                int row_start, int row_end,
01974                                ParagraphTheory *theory) {
01975   for (int i = row_start + 1; i < row_end - 1; i++) {
01976     if ((*rows)[i - 1].ri_->has_leaders &&
01977         (*rows)[i].ri_->has_leaders &&
01978         (*rows)[i + 1].ri_->has_leaders) {
01979       const ParagraphModel *model = theory->AddModel(
01980           ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
01981       (*rows)[i].AddStartLine(model);
01982     }
01983   }
01984 }
01985
01986 // Collect sequences of unique hypotheses in row registers and create proper
01987 // paragraphs for them, referencing the paragraphs in row_owners.
01988 void ConvertHypothesizedModelRunsToParagraphs(
01989     int debug_level,
01990     const GenericVector<RowScratchRegisters> &rows,
01991     GenericVector<PARA *> *row_owners,
01992     ParagraphTheory *theory) {
01993   int end = rows.size();
01994   int start;
01995   for (; end > 0; end = start) {
01996     start = end - 1;
01997     const ParagraphModel *model = NULL;
01998     // TODO(eger): Be smarter about dealing with multiple hypotheses.
01999     bool single_line_paragraph = false;
02000     SetOfModels models;
02001     rows[start].NonNullHypotheses(&models);
02002     if (models.size() > 0) {
02003       model = models[0];
02004       if (rows[start].GetLineType(model) != LT_BODY)
02005         single_line_paragraph = true;
02006     }
02007     if (model && !single_line_paragraph) {
02008       // walk back looking for more body lines and then a start line.
02009       while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
02010         // do nothing
02011       }
02012       if (start < 0 || rows[start].GetLineType(model) != LT_START) {
02013         model = NULL;
02014       }
02015     }
02016     if (model == NULL) {
02017       continue;
02018     }
02019     // rows[start, end) should be a paragraph.
02020     PARA *p = new PARA();
02021     if (model == kCrownLeft || model == kCrownRight) {
02022       p->is_very_first_or_continuation = true;
02023       // Crown paragraph.
02024       //   If we can find an existing ParagraphModel that fits, use it,
02025       //   else create a new one.
02026       for (int row = end; row < rows.size(); row++) {
02027         if ((*row_owners)[row] &&
02028             (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
02029             (start == 0 ||
02030              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
02031           model = (*row_owners)[row]->model;
02032           break;
02033         }
02034       }
02035       if (model == kCrownLeft) {
02036         // No subsequent model fits, so cons one up.
02037         model = theory->AddModel(ParagraphModel(
02038             JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
02039             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
02040       } else if (model == kCrownRight) {
02041         // No subsequent model fits, so cons one up.
02042         model = theory->AddModel(ParagraphModel(
02043             JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
02044             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
02045       }
02046     }
02047     rows[start].SetUnknown();
02048     rows[start].AddStartLine(model);
02049     for (int i = start + 1; i < end; i++) {
02050       rows[i].SetUnknown();
02051       rows[i].AddBodyLine(model);
02052     }
02053     p->model = model;
02054     p->has_drop_cap = rows[start].ri_->has_drop_cap;
02055     p->is_list_item =
02056         model->justification() == JUSTIFICATION_RIGHT
02057             ? rows[start].ri_->rword_indicates_list_item
02058             : rows[start].ri_->lword_indicates_list_item;
02059     for (int row = start; row < end; row++) {
02060       if ((*row_owners)[row] != NULL) {
02061         tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
02062                 "more than once!\n");
02063       }
02064       (*row_owners)[row] = p;
02065     }
02066   }
02067 }
02068
02069 struct Interval {
02070   Interval() : begin(0), end(0) {}
02071   Interval(int b, int e) : begin(b), end(e) {}
02072
02073   int begin;
02074   int end;
02075 };
02076
02077 // Return whether rows[row] appears to be stranded, meaning that the evidence
02078 // for this row is very weak due to context.  For instance, two lines of source
02079 // code may happen to be indented at the same tab vector as body text starts,
02080 // leading us to think they are two start-of-paragraph lines.  This is not
02081 // optimal.  However, we also don't want to mark a sequence of short dialog
02082 // as "weak," so our heuristic is:
02083 //   (1) If a line is surrounded by lines of unknown type, it's weak.
02084 //   (2) If two lines in a row are start lines for a given paragraph type, but
02085 //       after that the same paragraph type does not continue, they're weak.
02086 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
02087   SetOfModels row_models;
02088   rows[row].StrongHypotheses(&row_models);
02089
02090   for (int m = 0; m < row_models.size(); m++) {
02091     bool all_starts = rows[row].GetLineType();
02092     int run_length = 1;
02093     bool continues = true;
02094     for (int i = row - 1; i >= 0 && continues; i--) {
02095       SetOfModels models;
02096       rows[i].NonNullHypotheses(&models);
02097       switch (rows[i].GetLineType(row_models[m])) {
02098         case LT_START: run_length++; break;
02099         case LT_MULTIPLE:  // explicit fall-through
02100         case LT_BODY: run_length++; all_starts = false; break;
02101         case LT_UNKNOWN:  // explicit fall-through
02102         default: continues = false;
02103       }
02104     }
02105     continues = true;
02106     for (int i = row + 1; i < rows.size() && continues; i++) {
02107       SetOfModels models;
02108       rows[i].NonNullHypotheses(&models);
02109       switch (rows[i].GetLineType(row_models[m])) {
02110         case LT_START: run_length++; break;
02111         case LT_MULTIPLE:  // explicit fall-through
02112         case LT_BODY: run_length++; all_starts = false; break;
02113         case LT_UNKNOWN:  // explicit fall-through
02114         default: continues = false;
02115       }
02116     }
02117     if (run_length > 2 || (!all_starts && run_length > 1)) return false;
02118   }
02119   return true;
02120 }
02121
02122 // Go through rows[row_start, row_end) and gather up sequences that need better
02123 // classification.
02124 // + Sequences of non-empty rows without hypotheses.
02125 // + Crown paragraphs not immediately followed by a strongly modeled line.
02126 // + Single line paragraphs surrounded by text that doesn't match the
02127 //   model.
02128 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
02129                       GenericVector<Interval> *to_fix,
02130                       int row_start, int row_end) {
02131   to_fix->clear();
02132   for (int i = row_start; i < row_end; i++) {
02133     bool needs_fixing = false;
02134
02135     SetOfModels models;
02136     SetOfModels models_w_crowns;
02137     rows[i].StrongHypotheses(&models);
02138     rows[i].NonNullHypotheses(&models_w_crowns);
02139     if (models.empty() && models_w_crowns.size() > 0) {
02140       // Crown paragraph.  Is it followed by a modeled line?
02141       for (int end = i + 1; end < rows.size(); end++) {
02142         SetOfModels end_models;
02143         SetOfModels strong_end_models;
02144         rows[end].NonNullHypotheses(&end_models);
02145         rows[end].StrongHypotheses(&strong_end_models);
02146         if (end_models.size() == 0) {
02147           needs_fixing = true;
02148           break;
02149         } else if (strong_end_models.size() > 0) {
02150           needs_fixing = false;
02151           break;
02152         }
02153       }
02154     } else if (models.empty() && rows[i].ri_->num_words > 0) {
02155       // No models at all.
02156       needs_fixing = true;
02157     }
02158
02159     if (!needs_fixing && !models.empty()) {
02160       needs_fixing = RowIsStranded(rows, i);
02161     }
02162
02163     if (needs_fixing) {
02164       if (!to_fix->empty() && to_fix->back().end == i - 1)
02165         to_fix->back().end = i;
02166       else
02167         to_fix->push_back(Interval(i, i));
02168     }
02169   }
02170   // Convert inclusive intervals to half-open intervals.
02171   for (int i = 0; i < to_fix->size(); i++) {
02172     (*to_fix)[i].end = (*to_fix)[i].end + 1;
02173   }
02174 }
02175
02176 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
02177 // normalize each row_owner to point to an actual PARA, and output the
02178 // paragraphs in order onto paragraphs.
02179 void CanonicalizeDetectionResults(
02180     GenericVector<PARA *> *row_owners,
02181     PARA_LIST *paragraphs) {
02182   GenericVector<PARA *> &rows = *row_owners;
02183   paragraphs->clear();
02184   PARA_IT out(paragraphs);
02185   PARA *formerly_null = NULL;
02186   for (int i = 0; i < rows.size(); i++) {
02187     if (rows[i] == NULL) {
02188       if (i == 0 || rows[i - 1] != formerly_null) {
02189         rows[i] = formerly_null = new PARA();
02190       } else {
02191         rows[i] = formerly_null;
02192         continue;
02193       }
02194     } else if (i > 0 && rows[i - 1] == rows[i]) {
02195       continue;
02196     }
02197     out.add_after_then_move(rows[i]);
02198   }
02199 }
02200
02201 // Main entry point for Paragraph Detection Algorithm.
02202 //
02203 // Given a set of equally spaced textlines (described by row_infos),
02204 // Split them into paragraphs.
02205 //
02206 // Output:
02207 //   row_owners - one pointer for each row, to the paragraph it belongs to.
02208 //   paragraphs - this is the actual list of PARA objects.
02209 //   models - the list of paragraph models referenced by the PARA objects.
02210 //            caller is responsible for deleting the models.
02211 void DetectParagraphs(int debug_level,
02212                       GenericVector<RowInfo> *row_infos,
02213                       GenericVector<PARA *> *row_owners,
02214                       PARA_LIST *paragraphs,
02215                       GenericVector<ParagraphModel *> *models) {
02216   GenericVector<RowScratchRegisters> rows;
02217   ParagraphTheory theory(models);
02218
02219   // Initialize row_owners to be a bunch of NULL pointers.
02220   row_owners->init_to_size(row_infos->size(), NULL);
02221
02222   // Set up row scratch registers for the main algorithm.
02223   rows.init_to_size(row_infos->size(), RowScratchRegisters());
02224   for (int i = 0; i < row_infos->size(); i++) {
02225     rows[i].Init((*row_infos)[i]);
02226   }
02227
02228   // Pass 1:
02229   //   Detect sequences of lines that all contain leader dots (.....)
02230   //   These are likely Tables of Contents.  If there are three text lines in
02231   //   a row with leader dots, it's pretty safe to say the middle one should
02232   //   be a paragraph of its own.
02233   SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
02234
02235   DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
02236
02237   GenericVector<Interval> leftovers;
02238   LeftoverSegments(rows, &leftovers, 0, rows.size());
02239   for (int i = 0; i < leftovers.size(); i++) {
02240     // Pass 2a:
02241     //   Find any strongly evidenced start-of-paragraph lines.  If they're
02242     //   followed by two lines that look like body lines, make a paragraph
02243     //   model for that and see if that model applies throughout the text
02244     //   (that is, "smear" it).
02245     StrongEvidenceClassify(debug_level, &rows,
02246                            leftovers[i].begin, leftovers[i].end, &theory);
02247
02248     // Pass 2b:
02249     //   If we had any luck in pass 2a, we got part of the page and didn't
02250     //   know how to classify a few runs of rows. Take the segments that
02251     //   didn't find a model and reprocess them individually.
02252     GenericVector<Interval> leftovers2;
02253     LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
02254     bool pass2a_was_useful = leftovers2.size() > 1 ||
02255         (leftovers2.size() == 1 &&
02256          (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
02257     if (pass2a_was_useful) {
02258       for (int j = 0; j < leftovers2.size(); j++) {
02259         StrongEvidenceClassify(debug_level, &rows,
02260                                leftovers2[j].begin, leftovers2[j].end,
02261                                &theory);
02262       }
02263     }
02264   }
02265
02266   DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
02267
02268   // Pass 3:
02269   //   These are the dregs for which we didn't have enough strong textual
02270   //   and geometric clues to form matching models for.  Let's see if
02271   //   the geometric clues are simple enough that we could just use those.
02272   LeftoverSegments(rows, &leftovers, 0, rows.size());
02273   for (int i = 0; i < leftovers.size(); i++) {
02274     GeometricClassify(debug_level, &rows,
02275                       leftovers[i].begin, leftovers[i].end, &theory);
02276   }
02277   // Undo any flush models for which there's little evidence.
02278   DowngradeWeakestToCrowns(debug_level, &theory, &rows);
02279
02280   DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
02281
02282   // Pass 4:
02283   //   Take everything that's still not marked up well and clear all markings.
02284   LeftoverSegments(rows, &leftovers, 0, rows.size());
02285   for (int i = 0; i < leftovers.size(); i++) {
02286     for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
02287       rows[j].SetUnknown();
02288     }
02289   }
02290
02291   DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
02292
02293   // Convert all of the unique hypothesis runs to PARAs.
02294   ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
02295                                            &theory);
02296
02297   DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
02298
02299   // Finally, clean up any dangling NULL row paragraph parents.
02300   CanonicalizeDetectionResults(row_owners, paragraphs);
02301 }
02302
02303 // ============ Code interfacing with the rest of Tesseract ==================
02304
02305 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
02306 // detector RowInfo with all relevant information from the row.
02307 void InitializeRowInfo(const MutableIterator &it, RowInfo *info) {
02308   if (it.PageResIt()->row() != NULL) {
02309     ROW *row = it.PageResIt()->row()->row;
02310     info->pix_ldistance = row->lmargin();
02311     info->pix_rdistance = row->rmargin();
02312     info->average_interword_space =
02313         row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
02314     info->pix_xheight = row->x_height();
02315     info->has_leaders = false;
02316     info->has_drop_cap = row->has_drop_cap();
02317     info->ltr = true;  // set below depending on word scripts
02318   } else {
02319     info->pix_ldistance = info->pix_rdistance = 0;
02320     info->average_interword_space = 1;
02321     info->pix_xheight = 1.0;
02322     info->has_leaders = false;
02323     info->has_drop_cap = false;
02324     info->ltr = true;
02325   }
02326
02327   info->text = "";
02328   char *text = it.GetUTF8Text(RIL_TEXTLINE);
02329   int trailing_ws_idx = strlen(text);  // strip trailing space
02330   while (trailing_ws_idx > 0 &&
02331          // isspace() only takes ASCII
02332          ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
02333          isspace(text[trailing_ws_idx - 1]))
02334     trailing_ws_idx--;
02335   if (trailing_ws_idx > 0) {
02336     int lspaces = info->pix_ldistance / info->average_interword_space;
02337     for (int i = 0; i < lspaces; i++)
02338       info->text += ' ';
02339     for (int i = 0; i < trailing_ws_idx; i++)
02340       info->text += text[i];
02341   }
02342   delete []text;
02343
02344   info->num_words = 0;
02345   info->lword_indicates_list_item = false;
02346   info->lword_likely_starts_idea = false;
02347   info->lword_likely_ends_idea = false;
02348   info->rword_indicates_list_item = false;
02349   info->rword_likely_starts_idea = false;
02350   info->rword_likely_ends_idea = false;
02351
02352   if (info->text.size() == 0) {
02353     info->rword_likely_ends_idea = false;
02354     info->rword_likely_ends_idea = false;
02355     return;
02356   }
02357
02358   int ltr = 0;
02359   int rtl = 0;
02360
02361   PAGE_RES_IT page_res_it = *it.PageResIt();
02362   GenericVector<WERD_RES *> werds;
02363   WERD_RES *word_res = page_res_it.restart_row();
02364   ROW_RES *this_row = page_res_it.row();
02365   int num_leaders = 0;
02366   do {
02367     if (word_res && word_res->best_choice->unichar_string().length() > 0) {
02368       werds.push_back(word_res);
02369       ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
02370       rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
02371       if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
02372     }
02373     word_res = page_res_it.forward();
02374   } while (page_res_it.row() == this_row);
02375
02376   info->has_leaders = num_leaders > 3;
02377   info->num_words = werds.size();
02378   if (werds.size() > 0) {
02379     WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
02380     info->lword_text = lword->best_choice->unichar_string().string();
02381     info->rword_text = rword->best_choice->unichar_string().string();
02382     info->lword_box = lword->word->bounding_box();
02383     info->rword_box = rword->word->bounding_box();
02384     LeftWordAttributes(lword->uch_set, lword->best_choice,
02385                        info->lword_text,
02386                        &info->lword_indicates_list_item,
02387                        &info->lword_likely_starts_idea,
02388                        &info->lword_likely_ends_idea);
02389     RightWordAttributes(rword->uch_set, rword->best_choice,
02390                         info->rword_text,
02391                         &info->rword_indicates_list_item,
02392                         &info->rword_likely_starts_idea,
02393                         &info->rword_likely_ends_idea);
02394   }
02395   info->ltr = ltr >= rtl;
02396 }
02397
02398 // This is called after rows have been identified and words are recognized.
02399 // Much of this could be implemented before word recognition, but text helps
02400 // to identify bulleted lists and gives good signals for sentence boundaries.
02401 void DetectParagraphs(int debug_level,
02402                       const MutableIterator *block_start,
02403                       GenericVector<ParagraphModel *> *models) {
02404   // Clear out any preconceived notions.
02405   if (block_start->Empty(RIL_TEXTLINE)) {
02406     return;
02407   }
02408   BLOCK *block = block_start->PageResIt()->block()->block;
02409   block->para_list()->clear();
02410   bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
02411
02412   // Convert the Tesseract structures to RowInfos
02413   // for the paragraph detection algorithm.
02414   MutableIterator row(*block_start);
02415   if (row.Empty(RIL_TEXTLINE))
02416     return;  // end of input already.
02417
02418   GenericVector<RowInfo> row_infos;
02419   do {
02420     if (!row.PageResIt()->row())
02421       continue;  // empty row.
02422     row.PageResIt()->row()->row->set_para(NULL);
02423     row_infos.push_back(RowInfo());
02424     RowInfo &ri = row_infos.back();
02425     InitializeRowInfo(row, &ri);
02426   } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
02427            row.Next(RIL_TEXTLINE));
02428
02429   // Run the paragraph detection algorithm.
02430   GenericVector<PARA *> row_owners;
02431   GenericVector<PARA *> the_paragraphs;
02432   if (!is_image_block) {
02433     DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
02434                      models);
02435   } else {
02436     row_owners.init_to_size(row_infos.size(), NULL);
02437     CanonicalizeDetectionResults(&row_owners, block->para_list());
02438   }
02439
02440   // Now stitch in the row_owners into the rows.
02441   row = *block_start;
02442   for (int i = 0; i < row_owners.size(); i++) {
02443     while (!row.PageResIt()->row())
02444       row.Next(RIL_TEXTLINE);
02445     row.PageResIt()->row()->row->set_para(row_owners[i]);
02446     row.Next(RIL_TEXTLINE);
02447   }
02448 }
02449
02450 }  // namespace