Add hocr option for Tesseract-based OCR

This change adds a new setting, `fs.ocr.output_type`, to change the default OCR behavior. It sets the output type of the OCR process. The `fs.ocr.output_type` property can be set to `txt` or `hocr` in your `~/.fscrawler/test/_settings.json` file: ```json { "name" : "test", "fs" : { "url" : "/path/to/data/dir", "ocr" : { "output_type": "hocr" } } } ``` When omitted, the `txt` value is used. Closes dadoonet#507.
catchsiva · Jul 30, 2018 · 500e5f0 · 500e5f0
1 parent 1c6c4ae
commit 500e5f0
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 12 deletions.
diff --git a/docs/source/user/tips.rst b/docs/source/user/tips.rst
@@ -65,15 +65,17 @@ OCR settings
 
 Here is a list of OCR settings (under ``fs.ocr`` prefix):
 
-+----------------------+---------------+------------------------------------+
-| Name                 | Default value | Documentation                      |
-+======================+===============+====================================+
-| ``fs.ocr.language``  | ``"eng"``     | `OCR Language`_                    |
-+----------------------+---------------+------------------------------------+
-| ``fs.ocr.path``      | ``null``      | `OCR Path`_                        |
-+----------------------+---------------+------------------------------------+
-| ``fs.ocr.data_path`` | ``null``      | `OCR Data Path`_                   |
-+----------------------+---------------+------------------------------------+
++------------------------+---------------+------------------------------------+
+| Name                   | Default value | Documentation                      |
++========================+===============+====================================+
+| ``fs.ocr.language``    | ``"eng"``     | `OCR Language`_                    |
++------------------------+---------------+------------------------------------+
+| ``fs.ocr.path``        | ``null``      | `OCR Path`_                        |
++------------------------+---------------+------------------------------------+
+| ``fs.ocr.data_path``   | ``null``      | `OCR Data Path`_                   |
++------------------------+---------------+------------------------------------+
+| ``fs.ocr.output_type`` | ``txt``       | `OCR Output Type`_                 |
++------------------------+---------------+------------------------------------+
 
 OCR Language
 ^^^^^^^^^^^^
@@ -137,6 +139,28 @@ define the path to use by setting ``fs.ocr.data_path`` property in your
      }
    }
 
+OCR Output Type
+^^^^^^^^^^^^^^^
+
+.. versionadded:: 2.5
+
+Set the output type of the OCR process. The ``fs.ocr.output_type`` property can be set to
+``txt`` or ``hocr`` in your ``~/.fscrawler/test/_settings.json`` file:
+
+.. code:: json
+
+   {
+     "name" : "test",
+     "fs" : {
+       "url" : "/path/to/data/dir",
+       "ocr" : {
+         "output_type": "hocr"
+       }
+     }
+   }
+
+.. note:: When omitted, the ``txt`` value is used.
+
 Using docker
 ------------
 

diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Ocr.java
@@ -26,6 +26,8 @@ public class Ocr {
     private String path = null;
     // Path to tesseract data
     private String dataPath = null;
+    // Output Type. Can be txt (default) or hocr. null means the default value.
+    private String outputType = null;
 
     public static Builder builder() {
         return new Builder();
@@ -36,6 +38,7 @@ public static class Builder {
         private String language = "eng";
         private String path = null;
         private String dataPath = null;
+        private String outputType = null;
 
         public Builder setLanguage(String language) {
             this.language = language;
@@ -52,8 +55,13 @@ public Builder setDataPath(String dataPath) {
             return this;
         }
 
+        public Builder setOutputType(String outputType) {
+            this.outputType = outputType;
+            return this;
+        }
+
         public Ocr build() {
-            return new Ocr(language, path, dataPath);
+            return new Ocr(language, path, dataPath, outputType);
         }
 
     }
@@ -62,10 +70,11 @@ public Ocr( ) {
 
     }
 
-    private Ocr(String language, String path, String dataPath) {
+    private Ocr(String language, String path, String dataPath, String outputType) {
         this.language = language;
         this.path = path;
         this.dataPath = dataPath;
+        this.outputType = outputType;
     }
 
     public String getLanguage() {
@@ -91,4 +100,12 @@ public String getDataPath() {
     public void setDataPath(String dataPath) {
         this.dataPath = dataPath;
     }
+
+    public String getOutputType() {
+        return outputType;
+    }
+
+    public void setOutputType(String outputType) {
+        this.outputType = outputType;
+    }
 }
diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java
@@ -39,7 +39,7 @@
 
 public class FsSettingsParserTest extends AbstractFSCrawlerTestCase {
 
-    private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").build();
+    private static final Ocr OCR_FULL = Ocr.builder().setLanguage("eng").setOutputType("txt").build();
 
     private static final Fs FS_EMPTY = Fs.builder().build();
     private static final Fs FS_FULL = Fs.builder()

diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java
@@ -117,6 +117,9 @@ private static void initContext(Fs fs) {
                     config.setTessdataPath(fs.getOcr().getDataPath());
                 }
                 config.setLanguage(fs.getOcr().getLanguage());
+                if (fs.getOcr().getOutputType() != null) {
+                    config.setOutputType(fs.getOcr().getOutputType());
+                }
                 context.set(TesseractOCRConfig.class, config);
             }
         }

diff --git a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java
@@ -648,6 +648,15 @@ public void testOcr() throws IOException {
         assertThat(doc.getContent(), isEmptyString());
         doc = extractFromFile("test-ocr.pdf", fsSettings);
         assertThat(doc.getContent(), nullValue());
+
+        // Test with OCR On with hocr output type
+        fsSettings = FsSettings.builder(getCurrentTestName())
+                .setFs(Fs.builder().setOcr(Ocr.builder().setOutputType("hocr").build()).build())
+                .build();
+        doc = extractFromFile("test-ocr.png", fsSettings);
+        assertThat(doc.getContent(), containsString("This file contains some words."));
+        doc = extractFromFile("test-ocr.pdf", fsSettings);
+        assertThat(doc.getContent(), containsString("This file contains some words."));
     }
 
     @Test