Skip to content

Commit

Permalink
Added new LSTM-based neural network line recognizer
Browse files Browse the repository at this point in the history
  • Loading branch information
theraysmith committed Nov 7, 2016
1 parent 5d21ecf commit c1c1e42
Show file tree
Hide file tree
Showing 107 changed files with 15,410 additions and 354 deletions.
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ endif

.PHONY: install-langs ScrollView.jar install-jars training

SUBDIRS = ccutil viewer cutil opencl ccstruct dict classify wordrec textord
SUBDIRS = arch ccutil viewer cutil opencl ccstruct dict classify wordrec textord lstm
if !NO_CUBE_BUILD
SUBDIRS += neural_networks/runtime cube
endif
Expand Down
7 changes: 7 additions & 0 deletions api/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
AM_CPPFLAGS += -DLOCALEDIR=\"$(localedir)\"\
-DUSE_STD_NAMESPACE \
-I$(top_srcdir)/arch -I$(top_srcdir)/lstm \
-I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct -I$(top_srcdir)/cube \
-I$(top_srcdir)/viewer \
-I$(top_srcdir)/textord -I$(top_srcdir)/dict \
Expand Down Expand Up @@ -27,6 +28,9 @@ libtesseract_api_la_LIBADD = \
../wordrec/libtesseract_wordrec.la \
../classify/libtesseract_classify.la \
../dict/libtesseract_dict.la \
../arch/libtesseract_avx.la \
../arch/libtesseract_sse.la \
../lstm/libtesseract_lstm.la \
../ccstruct/libtesseract_ccstruct.la \
../cutil/libtesseract_cutil.la \
../viewer/libtesseract_viewer.la \
Expand Down Expand Up @@ -57,6 +61,9 @@ libtesseract_la_LIBADD = \
../wordrec/libtesseract_wordrec.la \
../classify/libtesseract_classify.la \
../dict/libtesseract_dict.la \
../arch/libtesseract_avx.la \
../arch/libtesseract_sse.la \
../lstm/libtesseract_lstm.la \
../ccstruct/libtesseract_ccstruct.la \
../cutil/libtesseract_cutil.la \
../viewer/libtesseract_viewer.la \
Expand Down
63 changes: 24 additions & 39 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ TessBaseAPI::TessBaseAPI()
block_list_(NULL),
page_res_(NULL),
input_file_(NULL),
input_image_(NULL),
output_file_(NULL),
datapath_(NULL),
language_(NULL),
Expand Down Expand Up @@ -515,19 +514,19 @@ void TessBaseAPI::ClearAdaptiveClassifier() {

/**
* Provide an image for Tesseract to recognize. Format is as
* TesseractRect above. Does not copy the image buffer, or take
* ownership. The source image may be destroyed after Recognize is called,
* either explicitly or implicitly via one of the Get*Text functions.
* TesseractRect above. Copies the image buffer and converts to Pix.
* SetImage clears all recognition results, and sets the rectangle to the
* full image, so it may be followed immediately by a GetUTF8Text, and it
* will automatically perform recognition.
*/
void TessBaseAPI::SetImage(const unsigned char* imagedata,
int width, int height,
int bytes_per_pixel, int bytes_per_line) {
if (InternalSetImage())
if (InternalSetImage()) {
thresholder_->SetImage(imagedata, width, height,
bytes_per_pixel, bytes_per_line);
SetInputImage(thresholder_->GetPixRect());
}
}

void TessBaseAPI::SetSourceResolution(int ppi) {
Expand All @@ -539,18 +538,17 @@ void TessBaseAPI::SetSourceResolution(int ppi) {

/**
* Provide an image for Tesseract to recognize. As with SetImage above,
* Tesseract doesn't take a copy or ownership or pixDestroy the image, so
* it must persist until after Recognize.
* Tesseract takes its own copy of the image, so it need not persist until
* after Recognize.
* Pix vs raw, which to use?
* Use Pix where possible. A future version of Tesseract may choose to use Pix
* as its internal representation and discard IMAGE altogether.
* Because of that, an implementation that sources and targets Pix may end up
* with less copies than an implementation that does not.
* Use Pix where possible. Tesseract uses Pix as its internal representation
* and it is therefore more efficient to provide a Pix directly.
*/
void TessBaseAPI::SetImage(Pix* pix) {
if (InternalSetImage())
if (InternalSetImage()) {
thresholder_->SetImage(pix);
SetInputImage(pix);
SetInputImage(thresholder_->GetPixRect());
}
}

/**
Expand Down Expand Up @@ -693,8 +691,8 @@ Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
if (pixa != NULL) {
Pix* pix = NULL;
if (raw_image) {
pix = page_it->GetImage(level, raw_padding, input_image_,
&left, &top);
pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
&top);
} else {
pix = page_it->GetBinaryImage(level);
}
Expand Down Expand Up @@ -849,13 +847,17 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
} else if (tesseract_->tessedit_resegment_from_boxes) {
page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
} else {
// TODO(rays) LSTM here.
page_res_ = new PAGE_RES(false,
page_res_ = new PAGE_RES(tesseract_->AnyLSTMLang(),
block_list_, &tesseract_->prev_word_best_choice_);
}
if (page_res_ == NULL) {
return -1;
}
if (tesseract_->tessedit_train_line_recognizer) {
tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_);
tesseract_->CorrectClassifyWords(page_res_);
return 0;
}
if (tesseract_->tessedit_make_boxes_from_boxes) {
tesseract_->CorrectClassifyWords(page_res_);
return 0;
Expand Down Expand Up @@ -938,17 +940,10 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
return 0;
}

void TessBaseAPI::SetInputImage(Pix *pix) {
if (input_image_)
pixDestroy(&input_image_);
input_image_ = NULL;
if (pix)
input_image_ = pixCopy(NULL, pix);
}
// Takes ownership of the input pix.
void TessBaseAPI::SetInputImage(Pix* pix) { tesseract_->set_pix_original(pix); }

Pix* TessBaseAPI::GetInputImage() {
return input_image_;
}
Pix* TessBaseAPI::GetInputImage() { return tesseract_->pix_original(); }

const char * TessBaseAPI::GetInputName() {
if (input_file_)
Expand Down Expand Up @@ -992,8 +987,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
}

// Begin producing output
const char* kUnknownTitle = "";
if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
if (renderer && !renderer->BeginDocument(unknown_title_)) {
return false;
}

Expand Down Expand Up @@ -1105,7 +1099,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer) {
#ifndef ANDROID_BUILD
PERF_COUNT_START("ProcessPages")
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
if (stdInput) {
Expand Down Expand Up @@ -1162,8 +1155,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
}

// Begin the output
const char* kUnknownTitle = "";
if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
if (renderer && !renderer->BeginDocument(unknown_title_)) {
pixDestroy(&pix);
return false;
}
Expand All @@ -1185,9 +1177,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
}
PERF_COUNT_END
return true;
#else
return false;
#endif
}

bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
Expand Down Expand Up @@ -2107,10 +2096,6 @@ void TessBaseAPI::End() {
delete input_file_;
input_file_ = NULL;
}
if (input_image_ != NULL) {
pixDestroy(&input_image_);
input_image_ = NULL;
}
if (output_file_ != NULL) {
delete output_file_;
output_file_ = NULL;
Expand Down
26 changes: 14 additions & 12 deletions api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#ifndef TESSERACT_API_BASEAPI_H__
#define TESSERACT_API_BASEAPI_H__

#define TESSERACT_VERSION_STR "3.05.00dev"
#define TESSERACT_VERSION 0x030500
#define TESSERACT_VERSION_STR "4.00.00alpha"
#define TESSERACT_VERSION 0x040000
#define MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | \
(patch))

Expand Down Expand Up @@ -142,6 +142,7 @@ class TESS_API TessBaseAPI {
* is stored in the PDF so we need that as well.
*/
const char* GetInputName();
// Takes ownership of the input pix.
void SetInputImage(Pix *pix);
Pix* GetInputImage();
int GetSourceYResolution();
Expand Down Expand Up @@ -333,9 +334,7 @@ class TESS_API TessBaseAPI {

/**
* Provide an image for Tesseract to recognize. Format is as
* TesseractRect above. Does not copy the image buffer, or take
* ownership. The source image may be destroyed after Recognize is called,
* either explicitly or implicitly via one of the Get*Text functions.
* TesseractRect above. Copies the image buffer and converts to Pix.
* SetImage clears all recognition results, and sets the rectangle to the
* full image, so it may be followed immediately by a GetUTF8Text, and it
* will automatically perform recognition.
Expand All @@ -345,13 +344,11 @@ class TESS_API TessBaseAPI {

/**
* Provide an image for Tesseract to recognize. As with SetImage above,
* Tesseract doesn't take a copy or ownership or pixDestroy the image, so
* it must persist until after Recognize.
* Tesseract takes its own copy of the image, so it need not persist until
* after Recognize.
* Pix vs raw, which to use?
* Use Pix where possible. A future version of Tesseract may choose to use Pix
* as its internal representation and discard IMAGE altogether.
* Because of that, an implementation that sources and targets Pix may end up
* with less copies than an implementation that does not.
* Use Pix where possible. Tesseract uses Pix as its internal representation
* and it is therefore more efficient to provide a Pix directly.
*/
void SetImage(Pix* pix);

Expand Down Expand Up @@ -866,7 +863,6 @@ class TESS_API TessBaseAPI {
BLOCK_LIST* block_list_; ///< The page layout.
PAGE_RES* page_res_; ///< The page-level data.
STRING* input_file_; ///< Name used by training code.
Pix* input_image_; ///< Image used for searchable PDF
STRING* output_file_; ///< Name used by debug code.
STRING* datapath_; ///< Current location of tessdata.
STRING* language_; ///< Last initialized language.
Expand Down Expand Up @@ -902,6 +898,12 @@ class TESS_API TessBaseAPI {
int timeout_millisec,
TessResultRenderer* renderer,
int tessedit_page_number);
// There's currently no way to pass a document title from the
// Tesseract command line, and we have multiple places that choose
// to set the title to an empty string. Using a single named
// variable will hopefully reduce confusion if the situation changes
// in the future.
const char *unknown_title_ = "";
}; // class TessBaseAPI.

/** Escape a char string - remove &<>"' with HTML codes. */
Expand Down
7 changes: 3 additions & 4 deletions api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,6 @@ bool TessPDFRenderer::BeginDocumentHandler() {
AppendPDFObject(buf);

// FONT DESCRIPTOR
const int kCharHeight = 2; // Effect: highlights are half height
n = snprintf(buf, sizeof(buf),
"7 0 obj\n"
"<<\n"
Expand All @@ -636,10 +635,10 @@ bool TessPDFRenderer::BeginDocumentHandler() {
" /Type /FontDescriptor\n"
">>\n"
"endobj\n",
1000 / kCharHeight,
1000 / kCharHeight,
1000,
1000,
1000 / kCharWidth,
1000 / kCharHeight,
1000,
8L // Font data
);
if (n >= sizeof(buf)) return false;
Expand Down
4 changes: 2 additions & 2 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class TESS_API TessResultRenderer {
bool EndDocument();

const char* file_extension() const { return file_extension_; }
const char* title() const { return title_; }
const char* title() const { return title_.c_str(); }

/**
* Returns the index of the last image given to AddImage
Expand Down Expand Up @@ -126,7 +126,7 @@ class TESS_API TessResultRenderer {

private:
const char* file_extension_; // standard extension for generated output
const char* title_; // title of document being rendered
STRING title_; // title of document being rendered
int imagenum_; // index of last image added

FILE* fout_; // output file pointer
Expand Down
29 changes: 29 additions & 0 deletions arch/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
AM_CPPFLAGS += -I$(top_srcdir)/ccutil
AUTOMAKE_OPTIONS = subdir-objects
SUBDIRS =
AM_CXXFLAGS =

if VISIBILITY
AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
AM_CPPFLAGS += -DTESS_EXPORTS
endif

include_HEADERS = \
dotproductavx.h dotproductsse.h

noinst_HEADERS =

if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
else
lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la
libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
libtesseract_sse_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
endif
libtesseract_avx_la_CXXFLAGS = -mavx
libtesseract_sse_la_CXXFLAGS = -msse4.1

libtesseract_avx_la_SOURCES = dotproductavx.cpp

libtesseract_sse_la_SOURCES = dotproductsse.cpp

Loading

0 comments on commit c1c1e42

Please sign in to comment.