Skip to content

Commit

Permalink
Pdf specific functionality moved in different package.
Browse files Browse the repository at this point in the history
  • Loading branch information
dimi2 committed Feb 18, 2017
1 parent 8e26fc0 commit 1f5329d
Show file tree
Hide file tree
Showing 16 changed files with 407 additions and 262 deletions.
225 changes: 45 additions & 180 deletions sources/main/java/dsk/anotex/AnnotationExtractor.java
Original file line number Diff line number Diff line change
@@ -1,201 +1,77 @@
package dsk.anotex;

import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfDocumentInfo;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import com.itextpdf.kernel.pdf.canvas.parser.filter.TextRegionEventFilter;
import com.itextpdf.kernel.pdf.canvas.parser.listener.FilteredTextEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import dsk.anotex.core.AnnotatedDocument;
import dsk.anotex.core.Annotation;
import dsk.anotex.core.FileFormat;
import dsk.anotex.importer.AnnotationImporter;
import dsk.anotex.importer.ImporterFactory;

import java.io.File;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
* Document annotation extractor.
*/
public class AnnotationExtractor {
protected Map<String, FileFormat> formats;

/**
* Read annotations from given e-document.
* @param fileName Document file name.
* @return Document annotations.
*/
public AnnotatedDocument readAnnotations(String fileName) {
    // Resolve to an absolute path first, so the existence check is unambiguous.
    File source = new File(fileName).getAbsoluteFile();
    if (!source.isFile()) {
        // Only the simple file name (not the full path) is reported to the caller.
        throw new IllegalArgumentException(
            String.format("File '%s' does not exist", source.getName()));
    }

    // Load the PDF and hand it over to the annotation extraction.
    return extractAnnotations(readDocument(source));
}

/**
* Read PDF document from file.
* @param file File name.
* @return PDF document.
*/
protected PdfDocument readDocument(File file) {
PdfDocument document;
try {
document = new PdfDocument(new PdfReader(file.getAbsolutePath()));
}
catch (Exception e) {
throw new IllegalArgumentException(e);
}
return document;
public AnnotationExtractor() {
    // The extension-to-format mapping is built once, at construction time.
    // Note: getKnownFileFormats() is protected, so a subclass override is invoked from here,
    // before the subclass constructor body has run.
    this.formats = getKnownFileFormats();
}

/**
* Extract annotations from given PDF document.
* @param pdfDocument PDF document.
* @return Extracted annotations.
* Extract annotations from given document file.
* @param fileName Document file name.
* @return Document annotations.
*/
protected AnnotatedDocument extractAnnotations(PdfDocument pdfDocument) {
AnnotatedDocument document = new AnnotatedDocument();
PdfDocumentInfo pdfInfo = pdfDocument.getDocumentInfo();
document.setTitle(pdfInfo.getTitle());
document.setSubject(pdfInfo.getSubject());
document.setAuthor(pdfInfo.getAuthor());
List<String> keywords = convertToKeywords(pdfInfo.getKeywords());
document.setKeywords(keywords);

List<Annotation> annotations = new LinkedList<>();
for (int i = 1; i <= pdfDocument.getNumberOfPages(); i++) {
PdfPage page = pdfDocument.getPage(i);
for (PdfAnnotation pdfAnnotation : page.getAnnotations()) {
Annotation annotation = convertAnnotation(pdfAnnotation);
if (annotation != null) {
annotations.add(annotation);
}
} //
} //
document.setAnnotations(annotations);

public AnnotatedDocument extractAnnotations(String fileName) {
    // Pick the importer matching the detected file format and let it read the document.
    AnnotationImporter reader = ImporterFactory.createImporter(detectFileFormat(fileName));
    AnnotatedDocument result = reader.readAnnotations(fileName);

    // Give the post-processing step a chance to adjust the imported document.
    postProcess(result);
    return result;
}

/**
* Convert document annotation to independent format.
* @param pdfAnnotation Annotation to be converted.
* @return Converted annotation.
*/
protected Annotation convertAnnotation(PdfAnnotation pdfAnnotation) {
    PdfArray bounds = pdfAnnotation.getRectangle();
    PdfString contents = pdfAnnotation.getContents();

    String text;
    if (contents != null) {
        // The annotation carries its own text - use it directly.
        text = (contents.getEncoding() == null)
            ? contents.toUnicodeString()
            : contents.getValue();
    }
    else if (PdfName.Highlight.equals(pdfAnnotation.getSubtype())) {
        // No embedded text: recover it from the page area covered by the highlight.
        // 'getQuadPoints()' would be the natural source here, but 'getRectangle()' gives better
        // results (because of variances in PDF standard implementations of QuadPoints).
        Rectangle area = toRectangle(bounds);
        ITextExtractionStrategy regionStrategy = new FilteredTextEventListener(
            new LocationTextExtractionStrategy(), new TextRegionEventFilter(area));
        String rawText = PdfTextExtractor.getTextFromPage(pdfAnnotation.getPage(),
            regionStrategy);
        text = normalizeHighlightedText(rawText);
    }
    else {
        // Neither embedded text nor a highlight - nothing to convert.
        text = null;
    }

    if (text == null) {
        return null;
    }
    Annotation converted = new Annotation();
    converted.setText(stripUnwantedChunks(text));
    return converted;
}

/**
* Convert comma separated string to list of keywords.
* @param sKeywords String to be converted.
* @return List of keywords.
* Detect the file format.
* @param fileName Document file name.
* @return Detected format, null for unknown formats.
*/
protected List<String> convertToKeywords(String sKeywords) {
List<String> keywords;
if ((sKeywords != null) && !sKeywords.isEmpty()) {
// The string can be surrounded with double quotes.
sKeywords = stripDoubleQuotes(sKeywords);
// Split on comma (and trim around it).
String[] words = sKeywords.split(" ?, ?");
keywords = Arrays.asList(words);
}
else {
keywords = new LinkedList<>();
}
return keywords;
protected FileFormat detectFileFormat(String fileName) {
    // Detection relies on the file name extension only. Checking the file signature (its first
    // few bytes) would be more reliable, but the extension is good enough: the associated
    // importer parses the file anyway and will notice a wrong format (for example, a PNG file
    // renamed with a PDF extension).
    // Extensions without a registered format yield null.
    return formats.get(getFileExtension(fileName));
}

/**
* Strip double quotes enclosing string. For example:
* <pre>
* "Flower" becomes Flower
* </pre>
* @param text Text to be stripped.
* @return Stripped text.
* Get file name extension of specified file. Example: for 'file1.ext' it will return '.ext'
* @param fileName The file name.
* @return File extension (in lowercase) or empty string if there is no extension.
*/
protected String stripDoubleQuotes(String text) {
String st = text;
char dQuota = '"';
int endPos = text.length() - 1;
if ((text.charAt(0) == dQuota)
&& (text.charAt(endPos) == dQuota)) {
// Remove the surrounding chars.
st = text.substring(1, endPos).trim();
protected String getFileExtension(String fileName) {
String ret = "";
if (fileName != null) {
int idx = fileName.lastIndexOf('.');
if (idx > 0 && (idx < fileName.length() - 1)) {
ret = fileName.substring(idx).toLowerCase();
}
}
return st;
}

/**
* Convert an array of points describing a rectangular shape.
* The points describe two diagonally opposite points: lower-left / upper-right.
* @param rectPoints Quad points.
* @return Corresponding rectangle.
*/
protected Rectangle toRectangle(PdfArray rectPoints) {
float x1 = ((PdfNumber)rectPoints.get(0)).floatValue();
float y1 = ((PdfNumber)rectPoints.get(1)).floatValue();
float x3 = ((PdfNumber)rectPoints.get(2)).floatValue();
float y3 = ((PdfNumber)rectPoints.get(3)).floatValue();
return new Rectangle(x1, y1, (x3 - x1), (y3 - y1));
return ret;
}

/**
* Normalize highlighted text - when retrieved from PDF renderer, it contains defects (like
* additional spaces, inappropriate characters).
* @param highlightedText Highlighted text.
* @return Normalized text.
* Create mapping between file extensions and the known file formats.
* @return The mapping.
*/
protected String normalizeHighlightedText(String highlightedText) {
return highlightedText.replaceAll("\\s+", " ").replaceAll("[“”]", "\"");
protected Map<String, FileFormat> getKnownFileFormats() {
    // Formats are keyed by their file extension, in a sorted (TreeMap) mapping.
    Map<String, FileFormat> byExtension = new TreeMap<>();
    FileFormat[] known = {FileFormat.PDF, FileFormat.MARKDOWN};
    for (FileFormat format : known) {
        byExtension.put(format.getExtension(), format);
    }
    return byExtension;
}

/**
Expand All @@ -205,16 +81,5 @@ protected String normalizeHighlightedText(String highlightedText) {
protected void postProcess(AnnotatedDocument document) {
    // Intentionally empty: this implementation leaves the document unchanged.
    // NOTE(review): presumably an extension point for subclasses - confirm.
}

/**
* Strip unwanted characters before or after the annotation (these chunks are a PDF library issue).
* @param text The text to strip.
* @return Stripped text.
*/
protected String stripUnwantedChunks(String text) {
    // Leading chunk: an optional lowercase letter, an optional sentence terminator, then a space.
    String withoutHead = text.replaceFirst("^\\p{javaLowerCase}?[.?!]? ", "");
    // Trailing chunk: a space followed by at most one alphabetic character at the very end.
    String withoutTail = withoutHead.replaceFirst(" \\p{IsAlphabetic}?$", "");
    // Finally drop double quotes enclosing the remaining text.
    return stripDoubleQuotes(withoutTail);
}
}

18 changes: 9 additions & 9 deletions sources/main/java/dsk/anotex/ConsoleRunner.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package dsk.anotex;

import dsk.anotex.core.AnnotatedDocument;
import dsk.anotex.export.AnnotationExporter;
import dsk.anotex.export.ExportFormat;
import dsk.anotex.export.ExporterFactory;
import dsk.anotex.core.FileFormat;
import dsk.anotex.exporter.AnnotationExporter;
import dsk.anotex.exporter.ExporterFactory;
import dsk.anotex.util.CommandLineParser;

import java.io.BufferedWriter;
Expand Down Expand Up @@ -38,11 +38,11 @@ public ConsoleRunner() {
public void extract(String inputFile, Map<String, Object> settings, String outputFile) {
// Read the annotations.
AnnotationExtractor extractor = new AnnotationExtractor();
AnnotatedDocument document = extractor.readAnnotations(inputFile);
AnnotatedDocument document = extractor.extractAnnotations(inputFile);

// Get appropriate exporter.
String sFormat = (String) settings.get(Constants.EXPORT_FORMAT);
ExportFormat exportFormat = ExportFormat.getByName(sFormat);
FileFormat exportFormat = FileFormat.getByName(sFormat);
AnnotationExporter exporter = ExporterFactory.createExporter(exportFormat);

// Write the output.
Expand All @@ -59,8 +59,8 @@ public void extract(String inputFile, Map<String, Object> settings, String outpu
* Get the default export format.
* @return Export format.
*/
protected ExportFormat getDefaultExportFormat() {
return ExportFormat.MARKDOWN;
protected FileFormat getDefaultExportFormat() {
return FileFormat.MARKDOWN;
}

/**
Expand Down Expand Up @@ -147,8 +147,8 @@ public static void main(String[] args) {
String inputFile = parser.getArgumentValue(ARG_INPUT);
if ((inputFile != null)) {
HashMap<String, Object> settings = new HashMap<>();
// Currently we support only one export format.
ExportFormat exportFormat = runner.getDefaultExportFormat();
// Currently we support only one export format, so we do not read this from the command line.
FileFormat exportFormat = runner.getDefaultExportFormat();
settings.put(Constants.EXPORT_FORMAT, exportFormat.getName());
// Retrieve the output file name.
String outputFile = parser.getArgumentValue(ARG_OUTPUT);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package dsk.anotex.export;
package dsk.anotex.core;

/**
* Export formats enumeration.
* File format enumeration.
*/
public enum ExportFormat {
public enum FileFormat {
PDF("Pdf", ".pdf"),
MARKDOWN("Markdown", ".md");

String name;
Expand All @@ -17,14 +18,14 @@ public String getExtension() {
return extension;
}

ExportFormat(String name, String fileExtension) {
FileFormat(String name, String fileExtension) {
this.name = name;
this.extension = fileExtension;
}

public static ExportFormat getByName(String name) {
ExportFormat match = null;
for (ExportFormat v : values()) {
public static FileFormat getByName(String name) {
FileFormat match = null;
for (FileFormat v : values()) {
if (v.getName().equals(name)) {
match = v;
break;
Expand Down
4 changes: 0 additions & 4 deletions sources/main/java/dsk/anotex/export/package-info.java

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package dsk.anotex.export;
package dsk.anotex.exporter;

import dsk.anotex.core.AnnotatedDocument;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
package dsk.anotex.export;
package dsk.anotex.exporter;

import dsk.anotex.core.FileFormat;

/**
* Annotation exporter factory.
Expand All @@ -16,9 +18,9 @@ private ExporterFactory() {
* @param format Desired file format.
* @return Exporter instance for this format.
*/
public static AnnotationExporter createExporter(ExportFormat format) {
public static AnnotationExporter createExporter(FileFormat format) {
AnnotationExporter exporter;
if (ExportFormat.MARKDOWN == format) {
if (FileFormat.MARKDOWN == format) {
exporter = new MarkdownExporter();
}
else {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package dsk.anotex.export;
package dsk.anotex.exporter;

import dsk.anotex.core.AnnotatedDocument;
import dsk.anotex.core.Annotation;
Expand Down
5 changes: 5 additions & 0 deletions sources/main/java/dsk/anotex/exporter/package-info.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
* Document annotation exporting.
* Use the {@link dsk.anotex.exporter.ExporterFactory} to get appropriate exporter for given file format.
*/
package dsk.anotex.exporter;
Loading

0 comments on commit 1f5329d

Please sign in to comment.