Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20
[email protected] committed Sep 23, 2013
1 parent 2c90970 commit 4d514d5
Showing 187 changed files with 41,117 additions and 14,408 deletions.
12 changes: 12 additions & 0 deletions ChangeLog
@@ -1,3 +1,15 @@
2013-09-20 v3.03
* Added Renderer to API to allow document-level processing and output
of document formats, like hOCR, PDF.
* Major refactor of word-level recognition, beam search, eliminating dead code.
* Refactored classifier to make it easier to add new ones.
* Generalized feature extractor to allow feature extraction from greyscale.
* Improved sub/superscript treatment.
* Improved baseline fit.
* Added set_unicharset_properties to training tools.
* Many bug fixes.


2012-02-01 - v3.02
* Moved ResultIterator/PageIterator to ccmain.
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
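The first 3.03 ChangeLog entry above covers the new document-level Renderer API, whose header and source (renderer.h, renderer.cpp) are added to the api/ build in the hunks that follow. A rough usage sketch for orientation only: TessHOcrRenderer and a renderer-accepting ProcessPages() overload come from the new API, but the constructor argument and exact overload shown here are assumptions based on later 3.0x releases and may not match this revision exactly.

// Sketch only; the renderer constructor argument and the ProcessPages() overload are assumed.
#include "baseapi.h"
#include "renderer.h"

int main() {
  tesseract::TessBaseAPI api;
  if (api.Init(NULL, "eng")) return 1;          // load language data from the default tessdata path

  tesseract::TessHOcrRenderer renderer("out");  // document-level hOCR output (assumed ctor argument)
  bool ok = api.ProcessPages("input.tif", NULL, 0, &renderer);

  api.End();
  return ok ? 0 : 1;
}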
4 changes: 2 additions & 2 deletions api/Makefile.am
@@ -9,7 +9,7 @@ if VISIBILITY
AM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
endif

include_HEADERS = apitypes.h baseapi.h capi.h
include_HEADERS = apitypes.h baseapi.h capi.h renderer.h
lib_LTLIBRARIES =

if !USING_MULTIPLELIBS
@@ -35,7 +35,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp

lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS =
14 changes: 8 additions & 6 deletions api/capi.cpp
@@ -2,6 +2,8 @@
# define TESS_CAPI_INCLUDE_BASEAPI
#endif
#include "capi.h"
#include "genericvector.h"
#include "strngs.h"

TESS_API const char* TESS_CALL TessVersion()
{
@@ -382,21 +384,21 @@ TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* resu
return handle->DetectOS(results) ? TRUE : FALSE;
}

TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex)
{
handle->GetFeaturesForBlob(blob, *denorm, int_features, num_features, FeatureOutlineIndex);
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
}

TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
{
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
}

TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned)
{
handle->RunAdaptiveClassifier(blob, *denorm, num_max_matches, unichar_ids, ratings, num_matches_returned);
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
}

TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
@@ -424,9 +426,9 @@ TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix *pix)
return TessBaseAPI::MakeTBLOB(pix);
}

TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm)
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode)
{
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE, denorm);
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
}

TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)
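For C API callers, the practical effect of the capi.cpp and capi.h hunks above is that the DENORM argument disappears from the blob-level entry points. A minimal sketch of an updated call site, using only signatures shown in this diff; the surrounding setup (TessBaseAPICreate, TessMakeTBLOB, TessMakeTessOCRRow) is elided and the variable names are illustrative:

// capi.cpp itself defines TESS_CAPI_INCLUDE_BASEAPI before including capi.h;
// mirroring that here is assumed to expose the blob-level declarations used below.
#define TESS_CAPI_INCLUDE_BASEAPI
#include "capi.h"

void classify_blob_sketch(TessBaseAPI* handle, TBLOB* blob, ROW* row) {
  int unichar_ids[4];
  float ratings[4];
  int num_returned = 0;

  // Before this commit both calls took a DENORM*, e.g.
  //   TessNormalizeTBLOB(blob, row, FALSE, &denorm);
  //   TessBaseAPIRunAdaptiveClassifier(handle, blob, &denorm, 4, ...);
  // After it, the DENORM parameter is gone from both signatures:
  TessNormalizeTBLOB(blob, row, FALSE);
  TessBaseAPIRunAdaptiveClassifier(handle, blob, 4, unichar_ids, ratings,
                                   &num_returned);
}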
6 changes: 3 additions & 3 deletions api/capi.h
@@ -205,11 +205,11 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* han
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);

TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex);

TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned);
#endif

@@ -226,7 +226,7 @@ TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
TESS_API TBLOB*
TESS_CALL TessMakeTBLOB(Pix *pix);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode);

TESS_API TessOcrEngineMode
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);
4 changes: 2 additions & 2 deletions ccmain/Makefile.am
@@ -19,7 +19,7 @@ noinst_HEADERS = \
equationdetect.h fixspace.h imgscale.h mutableiterator.h osdetect.h \
output.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \
reject.h scaleimg.h tessbox.h tessedit.h tesseractclass.h \
tesseract_cube_combiner.h tessvars.h tfacep.h tfacepp.h werdit.h
tesseract_cube_combiner.h tessvars.h werdit.h

if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_main.la
@@ -46,7 +46,7 @@ libtesseract_main_la_SOURCES = \
imgscale.cpp ltrresultiterator.cpp \
osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \
pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp superscript.cpp \
tesseract_cube_combiner.cpp \
tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
tfacepp.cpp thresholder.cpp \
17 changes: 1 addition & 16 deletions ccmain/adaptions.cpp
@@ -114,27 +114,12 @@ BOOL8 Tesseract::word_adaptable( //should we adapt?
return FALSE;
}

// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
if (flags.bit (CHECK_AMBIG_WERD) &&
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
word->best_choice->dangerous_ambig_found()) {
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}

// Do not adapt to words that are composed from fragments if
// tessedit_adapt_to_char_fragments is false.
if (!tessedit_adapt_to_char_fragments) {
const char *fragment_lengths = word->best_choice->fragment_lengths();
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
for (int i = 0; i < word->best_choice->length(); ++i) {
if (fragment_lengths[i] > 1) {
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
return false; // found a character composed from fragments
}
}
}
}

if (tessedit_adaption_debug) {
tprintf("returning status %d\n", status);
}
102 changes: 44 additions & 58 deletions ccmain/applybox.cpp
@@ -235,29 +235,15 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
return page_res;
}

// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
// the top choices. Avoids problems with very long words.
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
const UNICHARSET& unicharset,
WERD_CHOICE* word_choice) {
*word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
word_choice->make_bad();
for (int i = 0; i < char_choices.size(); ++i) {
BLOB_CHOICE_IT it(char_choices[i]);
BLOB_CHOICE* bc = it.data();
word_choice->append_unichar_id(bc->unichar_id(), 1,
bc->rating(), bc->certainty());
}
}

// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
this->textord_use_cjk_fp_model,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block)) {
word_res->CloneChoppedToRebuild();
return;
@@ -266,13 +252,10 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
blob_match_table.init_match_table();
BLOB_CHOICE_LIST *match_result;
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
GenericVector<BLOB_CHOICE*> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
float rating = static_cast<float>(MAX_INT8);
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
blob = blob->next) {
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
@@ -281,32 +264,33 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
match_result = fake_classify_blob(0, rating, -rating);
modify_blob_choice(match_result, 0);
ASSERT_HOST(!match_result->empty());
*char_choices += match_result;
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
inT32 blob_number;
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
if (prioritize_division) {
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
} else {
while (chop_one_blob(word_res->chopped_word, char_choices,
&blob_number, &word_res->seam_array,
&right_chop_index));
SEAM* seam = NULL;
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
&blob_number)) != NULL) {
word_res->InsertSeam(blob_number, seam);
BLOB_CHOICE* left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating,
-1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
word_res->CloneChoppedToRebuild();
blob_match_table.end_match_table();
if (char_choices != NULL) {
char_choices->delete_data_pointers();
delete char_choices;
}
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}

// Helper to compute the dispute resolution metric.
Expand Down Expand Up @@ -558,16 +542,15 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
blob_match_table.init_match_table();
// Classify all required combinations of blobs and save results in choices.
int word_length = word_res->box_word->length();
GenericVector<BLOB_CHOICE_LIST*>* choices =
new GenericVector<BLOB_CHOICE_LIST*>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST* match_result = classify_piece(
word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
i, i + j - 1, word_res->blamer_bundle);
word_res->seam_array, i, i + j - 1, "Applybox",
word_res->chopped_word, word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
@@ -583,17 +566,15 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
&search_segmentation, &best_rating, &word_res->best_state);
blob_match_table.end_match_table();
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
delete [] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
int blob_count = 1;
for (int s = 0; s < array_count(word_res->seam_array); ++s) {
SEAM* seam =
reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (seam->split1 == NULL) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
@@ -707,21 +688,25 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
int ok_in_word = 0;
BLOB_CHOICE_LIST_VECTOR char_choices;
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
if (word_res->correct_text[i].length() > 0) {
int blob_count = word_res->correct_text.size();
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
word_choice->set_permuter(TOP_CHOICE_PERM);
for (int c = 0; c < blob_count; ++c) {
if (word_res->correct_text[c].length() > 0) {
++ok_in_word;
}
// Since we only need a fake word_res->best_choice, the actual
// unichar_ids do not matter. Which is fortunate, since TidyUp()
// can be called while training Tesseract, at the stage where
// unicharset is not meaningful yet.
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
word_choice->append_unichar_id_space_allocated(
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
}
if (ok_in_word > 0) {
ok_blob_count += ok_in_word;
bad_blob_count += word_res->correct_text.size() - ok_in_word;
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
word_res->LogNewRawChoice(word_choice);
word_res->LogNewCookedChoice(1, false, word_choice);
} else {
++unlabelled_words;
if (applybox_debug > 0) {
Expand All @@ -730,7 +715,6 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
}
pr_it.DeleteCurrentWord();
}
char_choices.delete_data_pointers();
}
pr_it.restart_page();
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
@@ -772,11 +756,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
GenericVector<STRING> tokens;
word_res->correct_text[i].split(' ', &tokens);
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
choice->append_unichar_id_space_allocated(char_id,
word_res->best_state[i],
0.0f, 0.0f);
}
if (word_res->best_choice != NULL)
delete word_res->best_choice;
word_res->best_choice = choice;
word_res->ClearWordChoices();
word_res->LogNewRawChoice(choice);
word_res->LogNewCookedChoice(1, false, choice);
}
}

@@ -787,7 +773,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(filename.string(), NULL, word_res);
LearnWord(filename.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);
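The pattern that repeats through the applybox.cpp hunks above (and gives this commit much of its size) is the move away from BLOB_CHOICE_LIST_VECTOR plus the now-deleted local MakeWordChoice() helper, toward letting WERD_RES own the word-level bookkeeping. A condensed illustration of the new idiom, using only calls that appear in this diff; it is a sketch, not a complete translation unit, and word_res/rating stand in for the real locals of MaximallyChopWord():

// Condensed sketch of the new choice bookkeeping (not a full translation unit).
static void FakeClassifySingleBlob(WERD_RES* word_res, float rating) {
  GenericVector<BLOB_CHOICE*> blob_choices;
  // BCC_FAKE marks the choice as synthetic, exactly as in the hunks above.
  blob_choices.push_back(
      new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE));
  // WERD_RES now builds best_choice/raw_choice itself from the flat array,
  // replacing the deleted MakeWordChoice() helper and the old match table.
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}

Where a WERD_CHOICE is built directly, as in TidyUp() and CorrectClassifyWords() above, it is now registered through ClearWordChoices()/LogNewRawChoice()/LogNewCookedChoice() rather than assigned to word_res->best_choice by hand.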
