diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java index a16c27c87..9335c7735 100644 --- a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java +++ b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java @@ -79,6 +79,11 @@ public class CrawlConfig { * Should we fetch binary content such as images, audio, ...? */ private boolean includeBinaryContentInCrawling = false; + + /** + * Should we process binary content such as images, audio, ... using Tika? + */ + private boolean processBinaryContentInCrawling = false; /** * Maximum Connections per host @@ -306,6 +311,17 @@ public boolean isIncludeBinaryContentInCrawling() { public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) { this.includeBinaryContentInCrawling = includeBinaryContentInCrawling; } + + public boolean isProcessBinaryContentInCrawling() { + return processBinaryContentInCrawling; + } + + /** + * Should we process binary content such as images, audio, ... using Tika? 
+ */ + public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling) { + this.processBinaryContentInCrawling = processBinaryContentInCrawling; + } public int getMaxConnectionsPerHost() { return maxConnectionsPerHost; diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java index 396d7a5e7..aca2778bc 100644 --- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java +++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java @@ -60,7 +60,11 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio if (Util.hasBinaryContent(page.getContentType())) { // BINARY BinaryParseData parseData = new BinaryParseData(); if (config.isIncludeBinaryContentInCrawling()) { - parseData.setBinaryContent(page.getContentData()); + if (config.isProcessBinaryContentInCrawling()) { + parseData.setBinaryContent(page.getContentData()); + } else { + parseData.setHtml(""); + } page.setParseData(parseData); if (parseData.getHtml() == null) { throw new ParseException();