Make PDF Reader classes more customizable for assigning custom metadata

yaswanthramanam · Aug 23, 2024 · a89b938
1 parent 4fac212
commit a89b938
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 15 deletions.
diff --git a/...ers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java b/...ers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java
@@ -56,11 +56,11 @@ public class PagePdfDocumentReader implements DocumentReader {
 
 	public static final String METADATA_FILE_NAME = "file_name";
 
-	private final PDDocument document;
+	protected final PDDocument document;
 
 	private PdfDocumentReaderConfig config;
 
-	private String resourceFileName;
+	protected String resourceFileName;
 
 	public PagePdfDocumentReader(String resourceUrl) {
 		this(new DefaultResourceLoader().getResource(resourceUrl));
@@ -75,9 +75,7 @@ public PagePdfDocumentReader(String resourceUrl, PdfDocumentReaderConfig config)
 	}
 
 	public PagePdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig config) {
-
 		try {
-
 			PDFParser pdfParser = new PDFParser(
 					new org.apache.pdfbox.io.RandomAccessReadBuffer(pdfResource.getInputStream()));
 			this.document = pdfParser.parse();
@@ -109,7 +107,9 @@ public List<Document> get() {
 																		// each iteration
 			int counter = 0;
 
+			PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next();
 			for (PDPage page : this.document.getDocumentCatalog().getPages()) {
+				lastPage = page;
 				if (counter % logFrequency == 0 && counter / logFrequency < 10) {
 					logger.info("Processing PDF page: {}", (counter + 1));
 				}
@@ -123,7 +123,7 @@ public List<Document> get() {
 
 					var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
 					if (StringUtils.hasText(aggregatedPageTextGroup)) {
-						readDocuments.add(toDocument(aggregatedPageTextGroup, startPageNumber, pageNumber));
+						readDocuments.add(toDocument(page, aggregatedPageTextGroup, startPageNumber, pageNumber));
 					}
 					pageTextGroupList.clear();
 
@@ -150,8 +150,8 @@ public List<Document> get() {
 				pdfTextStripper.removeRegion(PDF_PAGE_REGION);
 			}
 			if (!CollectionUtils.isEmpty(pageTextGroupList)) {
-				readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), startPageNumber,
-						pageNumber));
+				readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
+						startPageNumber, pageNumber));
 			}
 			logger.info("Processing {} pages", totalPages);
 			return readDocuments;
@@ -162,15 +162,13 @@ public List<Document> get() {
 		}
 	}
 
-	private Document toDocument(String docText, int startPageNumber, int endPageNumber) {
-
+	protected Document toDocument(PDPage page, String docText, int startPageNumber, int endPageNumber) {
 		Document doc = new Document(docText);
 		doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber);
 		if (startPageNumber != endPageNumber) {
 			doc.getMetadata().put(METADATA_END_PAGE_NUMBER, endPageNumber);
 		}
 		doc.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
-
 		return doc;
 	}
 

diff --git a/...df-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/...df-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java
@@ -63,11 +63,11 @@ public class ParagraphPdfDocumentReader implements DocumentReader {
 
 	private final ParagraphManager paragraphTextExtractor;
 
-	private final PDDocument document;
+	protected final PDDocument document;
 
 	private PdfDocumentReaderConfig config;
 
-	private String resourceFileName;
+	protected String resourceFileName;
 
 	/**
 	 * Constructs a ParagraphPdfDocumentReader using a resource URL.
@@ -155,7 +155,7 @@ public List<Document> get() {
 		return documents;
 	}
 
-	private Document toDocument(Paragraph from, Paragraph to) {
+	protected Document toDocument(Paragraph from, Paragraph to) {
 
 		String docText = this.getTextBetweenParagraphs(from, to);
 
@@ -164,13 +164,17 @@ private Document toDocument(Paragraph from, Paragraph to) {
 		}
 
 		Document document = new Document(docText);
+		addMetadata(from, to, document);
+
+		return document;
+	}
+
+	protected void addMetadata(Paragraph from, Paragraph to, Document document) {
 		document.getMetadata().put(METADATA_TITLE, from.title());
 		document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
 		document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
 		document.getMetadata().put(METADATA_LEVEL, from.level());
 		document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
-
-		return document;
 	}
 
 	public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {