Skip to content

Commit

Permalink
Merge pull request yasserg#54 from MadEgg/binary-content
Browse files Browse the repository at this point in the history
Added configuration parameter processBinaryContentInCrawling
  • Loading branch information
yasserg committed Jul 19, 2015
2 parents 373f09d + 705cac0 commit bdbdc3e
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
16 changes: 16 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public class CrawlConfig {
* Should we fetch binary content such as images, audio, ...?
*/
private boolean includeBinaryContentInCrawling = false;

/**
 * Should we process binary content such as images, audio, ... using TIKA?
 * Defaults to {@code false}.
 *
 * <p>The parser consults this flag only when
 * {@link #isIncludeBinaryContentInCrawling()} is enabled. When binary content
 * is included but this flag is off, the parser stores an empty HTML document
 * ({@code <html></html>}) for the page instead of handing the raw bytes to the
 * binary parse data.
 */
private boolean processBinaryContentInCrawling = false;

/**
* Maximum Connections per host
Expand Down Expand Up @@ -306,6 +311,17 @@ public boolean isIncludeBinaryContentInCrawling() {
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
}

/**
 * Tells whether binary content such as images, audio, ... is processed
 * using TIKA.
 *
 * @return the current value of the {@code processBinaryContentInCrawling} setting
 */
public boolean isProcessBinaryContentInCrawling() {
return this.processBinaryContentInCrawling;
}

/**
 * Sets whether binary content such as images, audio, ... should be processed
 * using TIKA.
 *
 * @param processBinaryContentInCrawling {@code true} to process binary content,
 *        {@code false} to leave it unprocessed
 */
public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling) {
this.processBinaryContentInCrawling = processBinaryContentInCrawling;
}

public int getMaxConnectionsPerHost() {
return maxConnectionsPerHost;
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
if (Util.hasBinaryContent(page.getContentType())) { // BINARY
BinaryParseData parseData = new BinaryParseData();
if (config.isIncludeBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
if (config.isProcessBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
} else {
parseData.setHtml("<html></html>");
}
page.setParseData(parseData);
if (parseData.getHtml() == null) {
throw new ParseException();
Expand Down

0 comments on commit bdbdc3e

Please sign in to comment.