Skip to content

Commit

Permalink
Add hocr option for Tesseract-based OCR
Browse files Browse the repository at this point in the history
This change adds a new setting `fs.ocr.output_type` to change the OCR default behavior.

Sets the output type of the OCR process. The `fs.ocr.output_type` property can be set to `txt` or `hocr` in your `~/.fscrawler/test/_settings.json` file:

```json
{
 "name" : "test",
 "fs" : {
   "url" : "/path/to/data/dir",
   "ocr" : {
     "output_type": "hocr"
   }
 }
}
```

When omitted, the `txt` value is used.

Closes dadoonet#507.
  • Loading branch information
dadoonet committed Jul 30, 2018
1 parent 1c6c4ae commit 500e5f0
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 12 deletions.
42 changes: 33 additions & 9 deletions docs/source/user/tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,17 @@ OCR settings

Here is a list of OCR settings (under the ``fs.ocr`` prefix):

+----------------------+---------------+------------------------------------+
| Name | Default value | Documentation |
+======================+===============+====================================+
| ``fs.ocr.language`` | ``"eng"`` | `OCR Language`_ |
+----------------------+---------------+------------------------------------+
| ``fs.ocr.path`` | ``null`` | `OCR Path`_ |
+----------------------+---------------+------------------------------------+
| ``fs.ocr.data_path`` | ``null`` | `OCR Data Path`_ |
+----------------------+---------------+------------------------------------+
+------------------------+---------------+------------------------------------+
| Name | Default value | Documentation |
+========================+===============+====================================+
| ``fs.ocr.language`` | ``"eng"`` | `OCR Language`_ |
+------------------------+---------------+------------------------------------+
| ``fs.ocr.path`` | ``null`` | `OCR Path`_ |
+------------------------+---------------+------------------------------------+
| ``fs.ocr.data_path`` | ``null`` | `OCR Data Path`_ |
+------------------------+---------------+------------------------------------+
| ``fs.ocr.output_type`` | ``txt`` | `OCR Output Type`_ |
+------------------------+---------------+------------------------------------+

OCR Language
^^^^^^^^^^^^
Expand Down Expand Up @@ -137,6 +139,28 @@ define the path to use by setting ``fs.ocr.data_path`` property in your
}
}
OCR Output Type
^^^^^^^^^^^^^^^

.. versionadded:: 2.5

Sets the output type of the OCR process. The ``fs.ocr.output_type`` property can be set to
``txt`` or ``hocr`` in your ``~/.fscrawler/test/_settings.json`` file:

.. code:: json

   {
     "name" : "test",
     "fs" : {
       "url" : "/path/to/data/dir",
       "ocr" : {
         "output_type": "hocr"
       }
     }
   }

.. note:: When omitted, the ``txt`` value is used.

Using docker
------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public class Ocr {
private String path = null;
// Path to tesseract data
private String dataPath = null;
// Output Type. Can be txt (default) or hocr. null means the default value.
private String outputType = null;

public static Builder builder() {
return new Builder();
Expand All @@ -36,6 +38,7 @@ public static class Builder {
private String language = "eng";
private String path = null;
private String dataPath = null;
private String outputType = null;

public Builder setLanguage(String language) {
this.language = language;
Expand All @@ -52,8 +55,13 @@ public Builder setDataPath(String dataPath) {
return this;
}

/**
 * Sets the OCR output type that {@code build()} passes to the {@link Ocr} constructor.
 * The value is stored as given; no validation is performed here.
 *
 * @param outputType output type, {@code txt} or {@code hocr}; {@code null}
 *                   (the builder's initial value) means the default
 * @return this builder, so calls can be chained
 */
public Builder setOutputType(String outputType) {
this.outputType = outputType;
return this;
}

public Ocr build() {
return new Ocr(language, path, dataPath);
return new Ocr(language, path, dataPath, outputType);
}

}
Expand All @@ -62,10 +70,11 @@ public Ocr( ) {

}

private Ocr(String language, String path, String dataPath) {
private Ocr(String language, String path, String dataPath, String outputType) {
this.language = language;
this.path = path;
this.dataPath = dataPath;
this.outputType = outputType;
}

public String getLanguage() {
Expand All @@ -91,4 +100,12 @@ public String getDataPath() {
/**
 * Sets the path to the tesseract data.
 *
 * @param dataPath path to the tesseract data; the field defaults to {@code null}
 */
public void setDataPath(String dataPath) {
this.dataPath = dataPath;
}

/**
 * Returns the configured OCR output type.
 *
 * @return {@code txt} or {@code hocr}, or {@code null} when no output type was set,
 *         which means the default value applies
 */
public String getOutputType() {
return outputType;
}

/**
 * Sets the OCR output type. The value is stored as given; no validation is performed here.
 *
 * @param outputType output type, {@code txt} or {@code hocr}; {@code null} means the default value
 */
public void setOutputType(String outputType) {
this.outputType = outputType;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

public class FsSettingsParserTest extends AbstractFSCrawlerTestCase {

private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").build();
private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").setOutputType("txt").build();

private static final Fs FS_EMPTY = Fs.builder().build();
private static final Fs FS_FULL = Fs.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ private static void initContext(Fs fs) {
config.setTessdataPath(fs.getOcr().getDataPath());
}
config.setLanguage(fs.getOcr().getLanguage());
if (fs.getOcr().getOutputType() != null) {
config.setOutputType(fs.getOcr().getOutputType());
}
context.set(TesseractOCRConfig.class, config);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,15 @@ public void testOcr() throws IOException {
assertThat(doc.getContent(), isEmptyString());
doc = extractFromFile("test-ocr.pdf", fsSettings);
assertThat(doc.getContent(), nullValue());

// Test with OCR On with hocr output type
fsSettings = FsSettings.builder(getCurrentTestName())
.setFs(Fs.builder().setOcr(Ocr.builder().setOutputType("hocr").build()).build())
.build();
doc = extractFromFile("test-ocr.png", fsSettings);
assertThat(doc.getContent(), containsString("This file contains some words."));
doc = extractFromFile("test-ocr.pdf", fsSettings);
assertThat(doc.getContent(), containsString("This file contains some words."));
}

@Test
Expand Down

0 comments on commit 500e5f0

Please sign in to comment.