Skip to content

Commit

Permalink
Make PDF Reader classes more customizable for assigning custom metadata
Browse files: browse the repository at this point in the history
  • Loading branch information
markpollack committed Aug 23, 2024
1 parent 4fac212 commit a89b938
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,11 @@ public class PagePdfDocumentReader implements DocumentReader {

public static final String METADATA_FILE_NAME = "file_name";

private final PDDocument document;
protected final PDDocument document;

private PdfDocumentReaderConfig config;

private String resourceFileName;
protected String resourceFileName;

public PagePdfDocumentReader(String resourceUrl) {
this(new DefaultResourceLoader().getResource(resourceUrl));
Expand All @@ -75,9 +75,7 @@ public PagePdfDocumentReader(String resourceUrl, PdfDocumentReaderConfig config)
}

public PagePdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig config) {

try {

PDFParser pdfParser = new PDFParser(
new org.apache.pdfbox.io.RandomAccessReadBuffer(pdfResource.getInputStream()));
this.document = pdfParser.parse();
Expand Down Expand Up @@ -109,7 +107,9 @@ public List<Document> get() {
// each iteration
int counter = 0;

PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next();
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
lastPage = page;
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
logger.info("Processing PDF page: {}", (counter + 1));
}
Expand All @@ -123,7 +123,7 @@ public List<Document> get() {

var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
if (StringUtils.hasText(aggregatedPageTextGroup)) {
readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber));
readDocuments.add(toDocument(page, aggregatedPageTextGroup, startPageNumber, pageNumber));
}
pageTextGroupList.clear();

Expand All @@ -150,8 +150,8 @@ public List<Document> get() {
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
}
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber,
pageNumber));
readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
startPageNumber, pageNumber));
}
logger.info("Processing {} pages", totalPages);
return readDocuments;
Expand All @@ -162,15 +162,13 @@ public List<Document> get() {
}
}

private Document toDocument(String docText, int startPageNumber, int endPageNumber) {

/**
 * Wraps the extracted text in a {@link Document} and attaches page-range and
 * file-name metadata.
 * <p>
 * The {@code page} argument is not read by this implementation; presumably it is
 * passed so that subclasses overriding this method can derive custom metadata
 * from the page (TODO confirm against the intended extension contract).
 * @param page the PDF page associated with the text group; unused here
 * @param docText the text content of the new document
 * @param startPageNumber stored under {@code METADATA_START_PAGE_NUMBER}
 * @param endPageNumber stored under {@code METADATA_END_PAGE_NUMBER}, but only
 * when it differs from {@code startPageNumber}
 * @return the new document carrying the metadata described above plus
 * {@code METADATA_FILE_NAME} set to {@code resourceFileName}
 */
protected Document toDocument(PDPage page, String docText, int startPageNumber, int endPageNumber) {
Document doc = new Document(docText);
doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber);
// The end page is recorded only when the text spans more than one page number.
if (startPageNumber != endPageNumber) {
doc.getMetadata().put(METADATA_END_PAGE_NUMBER, endPageNumber);
}
doc.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);

return doc;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ public class ParagraphPdfDocumentReader implements DocumentReader {

private final ParagraphManager paragraphTextExtractor;

private final PDDocument document;
protected final PDDocument document;

private PdfDocumentReaderConfig config;

private String resourceFileName;
protected String resourceFileName;

/**
* Constructs a ParagraphPdfDocumentReader using a resource URL.
Expand Down Expand Up @@ -155,7 +155,7 @@ public List<Document> get() {
return documents;
}

private Document toDocument(Paragraph from, Paragraph to) {
protected Document toDocument(Paragraph from, Paragraph to) {

String docText = this.getTextBetweenParagraphs(from, to);

Expand All @@ -164,13 +164,17 @@ private Document toDocument(Paragraph from, Paragraph to) {
}

Document document = new Document(docText);
addMetadata(from, to, document);

return document;
}

/**
 * Populates the metadata map of {@code document} for the text between two
 * paragraphs.
 * <p>
 * Title, start page and level are taken from {@code from}; the end page is taken
 * from {@code to.startPageNumber()}; the file name comes from
 * {@code resourceFileName}.
 * @param from the paragraph supplying title, start page and level
 * @param to the paragraph whose start page is stored as the end page
 * @param document the document whose metadata map is written to
 */
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
document.getMetadata().put(METADATA_TITLE, from.title());
document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
// The end page is the page on which the closing paragraph starts.
document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
document.getMetadata().put(METADATA_LEVEL, from.level());
document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);

return document;
}

public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {
Expand Down

0 comments on commit a89b938

Please sign in to comment.