PDF-specific functionality moved to a different package.

dimi2 · Feb 18, 2017 · 1f5329d · 1f5329d
1 parent 8e26fc0
commit 1f5329d
Show file tree

Hide file tree

Showing 16 changed files with 407 additions and 262 deletions.
diff --git a/sources/main/java/dsk/anotex/AnnotationExtractor.java b/sources/main/java/dsk/anotex/AnnotationExtractor.java
@@ -1,201 +1,77 @@
 package dsk.anotex;
 
-import com.itextpdf.kernel.geom.Rectangle;
-import com.itextpdf.kernel.pdf.PdfArray;
-import com.itextpdf.kernel.pdf.PdfDocument;
-import com.itextpdf.kernel.pdf.PdfDocumentInfo;
-import com.itextpdf.kernel.pdf.PdfName;
-import com.itextpdf.kernel.pdf.PdfNumber;
-import com.itextpdf.kernel.pdf.PdfPage;
-import com.itextpdf.kernel.pdf.PdfReader;
-import com.itextpdf.kernel.pdf.PdfString;
-import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
-import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
-import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
-import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener;
-import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
-import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
 import dsk.anotex.core.AnnotatedDocument;
-import dsk.anotex.core.Annotation;
+import dsk.anotex.core.FileFormat;
+import dsk.anotex.importer.AnnotationImporter;
+import dsk.anotex.importer.ImporterFactory;
 
-import java.io.File;
-import java.util.Arrays;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
 
 /**
  * Document annotation extractor.
  */
 public class AnnotationExtractor {
+    protected Map<String, FileFormat> formats;
 
-    /**
-     * Read annotations from given e-document.
-     * @param fileName Document file name.
-     * @return Document annotations.
-     */
-    public AnnotatedDocument readAnnotations(String fileName) {
-        // Check the file existence.
-        File file = new File(fileName).getAbsoluteFile();
-        if (!file.isFile()) {
-            String message = String.format("File '%s' does not exist", file.getName());
-            throw new IllegalArgumentException(message);
-        }
-
-        // Extract the annotations.
-        PdfDocument pdfDocument = readDocument(file);
-        return extractAnnotations(pdfDocument);
-    }
-
-    /**
-     * Read PDF document from file.
-     * @param file File name.
-     * @return PDF document.
-     */
-    protected PdfDocument readDocument(File file) {
-        PdfDocument document;
-        try {
-            document = new PdfDocument(new PdfReader(file.getAbsolutePath()));
-        }
-        catch (Exception e) {
-            throw new IllegalArgumentException(e);
-        }
-        return document;
+    public AnnotationExtractor() {
+        super();
+        formats = getKnownFileFormats();
     }
 
     /**
-     * Extract annotations from given PDF document.
-     * @param pdfDocument PDF document.
-     * @return Extracted annotations.
+     * Extract annotations from given document file.
+     * @param fileName Document file name.
+     * @return Document annotations.
      */
-    protected AnnotatedDocument extractAnnotations(PdfDocument pdfDocument) {
-        AnnotatedDocument document = new AnnotatedDocument();
-        PdfDocumentInfo pdfInfo = pdfDocument.getDocumentInfo();
-        document.setTitle(pdfInfo.getTitle());
-        document.setSubject(pdfInfo.getSubject());
-        document.setAuthor(pdfInfo.getAuthor());
-        List<String> keywords = convertToKeywords(pdfInfo.getKeywords());
-        document.setKeywords(keywords);
-
-        List<Annotation> annotations = new LinkedList<>();
-        for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
-            PdfPage page = pdfDocument.getPage(i);
-            for (PdfAnnotation pdfAnnotation : page.getAnnotations()) {
-                Annotation annotation = convertAnnotation(pdfAnnotation);
-                if (annotation != null) {
-                    annotations.add(annotation);
-                }
-            } //
-        } //
-        document.setAnnotations(annotations);
-
+    public AnnotatedDocument extractAnnotations(String fileName) {
+        FileFormat format = detectFileFormat(fileName);
+        AnnotationImporter importer = ImporterFactory.createImporter(format);
+        AnnotatedDocument document = importer.readAnnotations(fileName);
         postProcess(document);
-
         return document;
     }
 
     /**
-     * Convert document annotation to independent format.
-     * @param pdfAnnotation Annotation to be converted.
-     * @return Converted annotation.
-     */
-    protected Annotation convertAnnotation(PdfAnnotation pdfAnnotation) {
-        String text = null;
-        PdfArray textCoordinates = pdfAnnotation.getRectangle();
-        PdfString pdfText = pdfAnnotation.getContents();
-        if (pdfText == null) {
-            // The text is not included in the annotation content - extract from highlighted text.
-            if (PdfName.Highlight.equals(pdfAnnotation.getSubtype())) {
-                // We should use 'getQuadPoints()' here, but 'getRectangle()' gives better result
-                // (because of variances in PDF standard implementations of QuadPoints).
-                Rectangle highlightedArea = toRectangle(textCoordinates);
-                ITextExtractionStrategy textFilter = new FilteredTextEventListener(
-                    new LocationTextExtractionStrategy(), new TextRegionEventFilter(highlightedArea));
-                String highlightedText = PdfTextExtractor.getTextFromPage(pdfAnnotation.getPage(),
-                    textFilter);
-                text = normalizeHighlightedText(highlightedText);
-            }
-        }
-        else {
-            // The text is included in the annotation content - use it directly.
-            if (pdfText.getEncoding() == null) {
-                text = pdfText.toUnicodeString();
-            }
-            else {
-                text = pdfText.getValue();
-            }
-        }
-
-        Annotation annotation = null;
-        if (text != null) {
-            annotation = new Annotation();
-            text = stripUnwantedChunks(text);
-            annotation.setText(text);
-        }
-        return annotation;
-    }
-
-    /**
-     * Convert comma separated string to list of keywords.
-     * @param sKeywords String to be converted.
-     * @return List of keywords.
+     * Detect the file format.
+     * @param fileName Document file name.
+     * @return Detected format, null for unknown formats.
      */
-    protected List<String> convertToKeywords(String sKeywords) {
-        List<String> keywords;
-        if ((sKeywords != null) && !sKeywords.isEmpty()) {
-            // The string can be surrounded with double quotes.
-            sKeywords = stripDoubleQuotes(sKeywords);
-            // Split on comma (and trim around it).
-            String[] words = sKeywords.split(" ?, ?");
-            keywords = Arrays.asList(words);
-        }
-        else {
-            keywords = new LinkedList<>();
-        }
-        return keywords;
+    protected FileFormat detectFileFormat(String fileName) {
+        // We use the file name extension to determine the format.
+        // Reading the first few bytes (signature) from the file would provide more reliable detection.
+        // But this one is good enough, since the associated importer will parse the file anyway and will
+        // detect if the file format is wrong (for example, PNG file, renamed with PDF extension).
+        String extension = getFileExtension(fileName);
+        FileFormat format = formats.get(extension);
+        return format;
     }
 
     /**
-     * Strip double quotes enclosing string. For example:
-     * <pre>
-     *     "Flower" becomes Flower
-     * </pre>
-     * @param text Text to be stripped.
-     * @return Stripped text.
+     * Get the file name extension (including the leading dot) of the specified file. Example: for 'file1.ext' it returns '.ext'.
+     * @param fileName The file name.
+     * @return File extension (in lowercase), or empty string if there is none (null name, no dot, only a leading dot, or a trailing dot).
      */
-    protected String stripDoubleQuotes(String text) {
-        String st = text;
-        char dQuota = '"';
-        int endPos = text.length() - 1;
-        if ((text.charAt(0) == dQuota)
-            && (text.charAt(endPos) == dQuota)) {
-            // Remove the surrounding chars.
-            st = text.substring(1, endPos).trim();
+    protected String getFileExtension(String fileName) {
+        String ret = "";
+        if (fileName != null) {
+            int idx = fileName.lastIndexOf('.');
+            if (idx > 0 && (idx < fileName.length() - 1)) {
+                ret = fileName.substring(idx).toLowerCase();
+            }
         }
-        return st;
-    }
-
-    /**
-     * Convert array of points, describing to rectangular shape.
-     * The points describe two diagonally opposite points: lower-left / upper-right.
-     * @param rectPoints Quad points.
-     * @return Corresponding rectangle.
-     */
-    protected Rectangle toRectangle(PdfArray rectPoints) {
-        float x1 = ((PdfNumber)rectPoints.get(0)).floatValue();
-        float y1 = ((PdfNumber)rectPoints.get(1)).floatValue();
-        float x3 = ((PdfNumber)rectPoints.get(2)).floatValue();
-        float y3 = ((PdfNumber)rectPoints.get(3)).floatValue();
-        return new Rectangle(x1, y1, (x3 - x1), (y3 - y1));
+        return ret;
     }
 
     /**
-     * Normalize highlighted text - when retrieved from PDF renderer, it contains defects (like
-     * additional spaces, inappropriate characters).
-     * @param highlightedText Highlighted text.
-     * @return Normalized text.
+     * Create mapping between file extensions and the known file formats.
+     * @return The mapping.
      */
-    protected String normalizeHighlightedText(String highlightedText) {
-        return highlightedText.replaceAll("\\s+", " ").replaceAll("[“”]", "\"");
+    protected Map<String, FileFormat> getKnownFileFormats() {
+        TreeMap<String, FileFormat> mapping = new TreeMap<>();
+        mapping.put(FileFormat.PDF.getExtension(), FileFormat.PDF);
+        mapping.put(FileFormat.MARKDOWN.getExtension(), FileFormat.MARKDOWN);
+        return mapping;
     }
 
     /**
@@ -205,16 +81,5 @@ protected String normalizeHighlightedText(String highlightedText) {
     protected void postProcess(AnnotatedDocument document) {
     }
 
-    /**
-     * Strip unwanted character before or after the annotation (these chunks are PDF library issue).
-     * @param text The text to strip.
-     * @return Stripped text.
-     */
-    protected String stripUnwantedChunks(String text) {
-        text = text.replaceFirst("^\\p{javaLowerCase}?[.?!]? ", "")
-            .replaceFirst(" \\p{IsAlphabetic}?$", "");
-        text = stripDoubleQuotes(text);
-        return text;
-    }
 }
 
diff --git a/sources/main/java/dsk/anotex/ConsoleRunner.java b/sources/main/java/dsk/anotex/ConsoleRunner.java
@@ -1,9 +1,9 @@
 package dsk.anotex;
 
 import dsk.anotex.core.AnnotatedDocument;
-import dsk.anotex.export.AnnotationExporter;
-import dsk.anotex.export.ExportFormat;
-import dsk.anotex.export.ExporterFactory;
+import dsk.anotex.core.FileFormat;
+import dsk.anotex.exporter.AnnotationExporter;
+import dsk.anotex.exporter.ExporterFactory;
 import dsk.anotex.util.CommandLineParser;
 
 import java.io.BufferedWriter;
@@ -38,11 +38,11 @@ public ConsoleRunner() {
     public void extract(String inputFile, Map<String, Object> settings, String outputFile) {
         // Read the annotations.
         AnnotationExtractor extractor = new AnnotationExtractor();
-        AnnotatedDocument document = extractor.readAnnotations(inputFile);
+        AnnotatedDocument document = extractor.extractAnnotations(inputFile);
 
         // Get appropriate exporter.
         String sFormat = (String) settings.get(Constants.EXPORT_FORMAT);
-        ExportFormat exportFormat = ExportFormat.getByName(sFormat);
+        FileFormat exportFormat = FileFormat.getByName(sFormat);
         AnnotationExporter exporter = ExporterFactory.createExporter(exportFormat);
 
         // Write the output.
@@ -59,8 +59,8 @@ public void extract(String inputFile, Map<String, Object> settings, String outpu
      * Get the default export format.
      * @return Export format.
      */
-    protected ExportFormat getDefaultExportFormat() {
-        return ExportFormat.MARKDOWN;
+    protected FileFormat getDefaultExportFormat() {
+        return FileFormat.MARKDOWN;
     }
 
     /**
@@ -147,8 +147,8 @@ public static void main(String[] args) {
         String inputFile = parser.getArgumentValue(ARG_INPUT);
         if ((inputFile != null)) {
             HashMap<String, Object> settings = new HashMap<>();
-            // Currently we support only one export format.
-            ExportFormat exportFormat = runner.getDefaultExportFormat();
+            // Currently we support only one export format, so we do not read this from the command line.
+            FileFormat exportFormat = runner.getDefaultExportFormat();
             settings.put(Constants.EXPORT_FORMAT, exportFormat.getName());
             // Retrieve the output file name.
             String outputFile = parser.getArgumentValue(ARG_OUTPUT);

diff --git a/.../java/dsk/anotex/export/ExportFormat.java → ...main/java/dsk/anotex/core/FileFormat.java b/.../java/dsk/anotex/export/ExportFormat.java → ...main/java/dsk/anotex/core/FileFormat.java
@@ -1,9 +1,10 @@
-package dsk.anotex.export;
+package dsk.anotex.core;
 
 /**
- * Export formats enumeration.
+ * File format enumeration.
  */
-public enum ExportFormat {
+public enum FileFormat {
+    PDF("Pdf", ".pdf"),
     MARKDOWN("Markdown", ".md");
 
     String name;
@@ -17,14 +18,14 @@ public String getExtension() {
         return extension;
     }
 
-    ExportFormat(String name, String fileExtension) {
+    FileFormat(String name, String fileExtension) {
         this.name = name;
         this.extension = fileExtension;
     }
 
-    public static ExportFormat getByName(String name) {
-        ExportFormat match = null;
-        for (ExportFormat v : values()) {
+    public static FileFormat getByName(String name) {
+        FileFormat match = null;
+        for (FileFormat v : values()) {
             if (v.getName().equals(name)) {
                 match = v;
                 break;

diff --git a/sources/main/java/dsk/anotex/export/package-info.java b/sources/main/java/dsk/anotex/export/package-info.java
diff --git a/...dsk/anotex/export/AnnotationExporter.java → ...k/anotex/exporter/AnnotationExporter.java b/...dsk/anotex/export/AnnotationExporter.java → ...k/anotex/exporter/AnnotationExporter.java
@@ -1,4 +1,4 @@
-package dsk.anotex.export;
+package dsk.anotex.exporter;
 
 import dsk.anotex.core.AnnotatedDocument;
 

diff --git a/...va/dsk/anotex/export/ExporterFactory.java → .../dsk/anotex/exporter/ExporterFactory.java b/...va/dsk/anotex/export/ExporterFactory.java → .../dsk/anotex/exporter/ExporterFactory.java
@@ -1,4 +1,6 @@
-package dsk.anotex.export;
+package dsk.anotex.exporter;
+
+import dsk.anotex.core.FileFormat;
 
 /**
  * Annotation exporter factory.
@@ -16,9 +18,9 @@ private ExporterFactory() {
      * @param format Desired file format.
      * @return Exporter instance for this format.
      */
-    public static AnnotationExporter createExporter(ExportFormat format) {
+    public static AnnotationExporter createExporter(FileFormat format) {
         AnnotationExporter exporter;
-        if (ExportFormat.MARKDOWN == format) {
+        if (FileFormat.MARKDOWN == format) {
             exporter = new MarkdownExporter();
         }
         else {

diff --git a/...a/dsk/anotex/export/MarkdownExporter.java → ...dsk/anotex/exporter/MarkdownExporter.java b/...a/dsk/anotex/export/MarkdownExporter.java → ...dsk/anotex/exporter/MarkdownExporter.java
@@ -1,4 +1,4 @@
-package dsk.anotex.export;
+package dsk.anotex.exporter;
 
 import dsk.anotex.core.AnnotatedDocument;
 import dsk.anotex.core.Annotation;

diff --git a/sources/main/java/dsk/anotex/exporter/package-info.java b/sources/main/java/dsk/anotex/exporter/package-info.java
@@ -0,0 +1,5 @@
+/**
+ * Document annotation exporting.
+ * Use the {@link dsk.anotex.exporter.ExporterFactory} to get appropriate exporter for given file format.
+ */
+package dsk.anotex.exporter;