Skip to content

Commit 1e3995c

Browse files
committed
Fixes yasserg#28 - Added binary content parsing
Now the crawler can really parse binary content. Based on Tero Jankilla's code
1 parent 9efaeef commit 1e3995c

File tree

4 files changed

+110
-66
lines changed

4 files changed

+110
-66
lines changed

pom.xml

+6-46
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,12 @@
100100
<scope>compile</scope>
101101
</dependency>
102102

103+
<dependency>
104+
<groupId>ch.qos.logback</groupId>
105+
<artifactId>logback-classic</artifactId>
106+
<version>1.1.2</version>
107+
</dependency>
108+
103109
<dependency>
104110
<groupId>org.apache.httpcomponents</groupId>
105111
<artifactId>httpclient</artifactId>
@@ -117,52 +123,6 @@
117123
<groupId>org.apache.tika</groupId>
118124
<artifactId>tika-parsers</artifactId>
119125
<version>1.5</version>
120-
<exclusions>
121-
<exclusion>
122-
<artifactId>poi-ooxml-schemas</artifactId>
123-
<groupId>org.apache.poi</groupId>
124-
</exclusion>
125-
<exclusion>
126-
<artifactId>poi-ooxml</artifactId>
127-
<groupId>org.apache.poi</groupId>
128-
</exclusion>
129-
<exclusion>
130-
<artifactId>poi-scratchpad</artifactId>
131-
<groupId>org.apache.poi</groupId>
132-
</exclusion>
133-
<exclusion>
134-
<artifactId>fontbox</artifactId>
135-
<groupId>org.apache.pdfbox</groupId>
136-
</exclusion>
137-
<exclusion>
138-
<artifactId>poi</artifactId>
139-
<groupId>org.apache.poi</groupId>
140-
</exclusion>
141-
<exclusion>
142-
<artifactId>pdfbox</artifactId>
143-
<groupId>org.apache.pdfbox</groupId>
144-
</exclusion>
145-
<exclusion>
146-
<artifactId>netcdf</artifactId>
147-
<groupId>edu.ucar</groupId>
148-
</exclusion>
149-
<exclusion>
150-
<artifactId>jdom</artifactId>
151-
<groupId>jdom</groupId>
152-
</exclusion>
153-
<exclusion>
154-
<artifactId>rome</artifactId>
155-
<groupId>rome</groupId>
156-
</exclusion>
157-
<exclusion>
158-
<artifactId>bcmail-jdk15</artifactId>
159-
<groupId>org.bouncycastle</groupId>
160-
</exclusion>
161-
<exclusion>
162-
<artifactId>bcprov-jdk15</artifactId>
163-
<groupId>org.bouncycastle</groupId>
164-
</exclusion>
165-
</exclusions>
166126
</dependency>
167127

168128
<!-- Test Dependencies -->

src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,9 @@ public void close() {
194194
sync();
195195
workQueues.close();
196196
counters.close();
197-
inProcessPages.close();
197+
if (inProcessPages != null) {
198+
inProcessPages.close();
199+
}
198200
}
199201

200202
public void finish() {

src/main/java/edu/uci/ics/crawler4j/parser/BinaryParseData.java

+94-11
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,99 @@
1717

1818
package edu.uci.ics.crawler4j.parser;
1919

20+
import java.io.ByteArrayInputStream;
21+
import java.io.ByteArrayOutputStream;
22+
import java.io.InputStream;
23+
import java.io.OutputStream;
24+
import java.io.PrintStream;
25+
import java.io.UnsupportedEncodingException;
26+
27+
import javax.xml.transform.OutputKeys;
28+
import javax.xml.transform.Transformer;
29+
import javax.xml.transform.TransformerConfigurationException;
30+
import javax.xml.transform.sax.SAXTransformerFactory;
31+
import javax.xml.transform.sax.TransformerHandler;
32+
import javax.xml.transform.stream.StreamResult;
33+
34+
import org.apache.tika.metadata.Metadata;
35+
import org.apache.tika.parser.AutoDetectParser;
36+
import org.apache.tika.parser.ParseContext;
37+
import org.apache.tika.parser.Parser;
38+
import org.slf4j.Logger;
39+
import org.slf4j.LoggerFactory;
40+
2041
public class BinaryParseData implements ParseData {
2142

22-
private static BinaryParseData instance = new BinaryParseData();
23-
24-
public static BinaryParseData getInstance() {
25-
return instance;
26-
}
27-
28-
@Override
29-
public String toString() {
30-
return "[Binary parse data can not be dumped as string]";
31-
}
32-
}
43+
private static final Logger logger = LoggerFactory.getLogger(BinaryParseData.class);
44+
private static final String DEFAULT_ENCODING = "UTF-8";
45+
private static final String DEFAULT_OUTPUT_FORMAT = "html";
46+
47+
private static final Metadata METADATA = new Metadata();
48+
private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser();
49+
private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
50+
51+
private final ParseContext context = new ParseContext();
52+
private String html = null;
53+
54+
public BinaryParseData() {
55+
context.set(Parser.class, AUTO_DETECT_PARSER);
56+
}
57+
58+
public void setBinaryContent(byte[] data) {
59+
InputStream inputStream = new ByteArrayInputStream(data);
60+
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
61+
62+
try {
63+
TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
64+
AUTO_DETECT_PARSER.parse(inputStream, handler, METADATA, context);
65+
66+
setHtml(new String(outputStream.toByteArray(), DEFAULT_ENCODING));
67+
} catch (TransformerConfigurationException e) {
68+
logger.error("Error configuring handler", e);
69+
} catch (UnsupportedEncodingException e) {
70+
logger.error("Encoding for content not supported", e);
71+
} catch (Exception e) {
72+
logger.error("Error parsing file", e);
73+
}
74+
}
75+
76+
/**
77+
* Returns a transformer handler that serializes incoming SAX events to
78+
* XHTML or HTML (depending on the given method) using the given output encoding.
79+
*
80+
* @param encoding output encoding, or <code>null</code> for the platform default
81+
*/
82+
private static TransformerHandler getTransformerHandler(OutputStream out, String method, String encoding)
83+
throws TransformerConfigurationException {
84+
85+
TransformerHandler transformerHandler = SAX_TRANSFORMER_FACTORY.newTransformerHandler();
86+
Transformer transformer = transformerHandler.getTransformer();
87+
transformer.setOutputProperty(OutputKeys.METHOD, method);
88+
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
89+
90+
if (encoding != null) {
91+
transformer.setOutputProperty(OutputKeys.ENCODING, encoding);
92+
}
93+
94+
transformerHandler.setResult(new StreamResult(new PrintStream(out)));
95+
return transformerHandler;
96+
}
97+
98+
/** @return Parsed binary content or null */
99+
public String getHtml() {
100+
return html;
101+
}
102+
103+
public void setHtml(String html) {
104+
this.html = html;
105+
}
106+
107+
@Override
108+
public String toString() {
109+
if (html == null || html.isEmpty()) {
110+
return "No data parsed yet";
111+
} else {
112+
return getHtml();
113+
}
114+
}
115+
}

src/main/java/edu/uci/ics/crawler4j/parser/Parser.java

+7-8
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,13 @@ public Parser(CrawlConfig config) {
5757
public boolean parse(Page page, String contextURL) {
5858

5959
if (Util.hasBinaryContent(page.getContentType())) {
60-
if (!config.isIncludeBinaryContentInCrawling()) {
61-
return false;
60+
BinaryParseData parseData = new BinaryParseData();
61+
if (config.isIncludeBinaryContentInCrawling()) {
62+
parseData.setBinaryContent(page.getContentData());
63+
page.setParseData(parseData);
6264
}
6365

64-
page.setParseData(BinaryParseData.getInstance());
65-
return true;
66-
66+
return parseData.getHtml() != null;
6767
} else if (Util.hasPlainTextContent(page.getContentType())) {
6868
try {
6969
TextParseData parseData = new TextParseData();
@@ -77,6 +77,7 @@ public boolean parse(Page page, String contextURL) {
7777
} catch (Exception e) {
7878
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
7979
}
80+
8081
return false;
8182
}
8283

@@ -155,7 +156,5 @@ public boolean parse(Page page, String contextURL) {
155156

156157
page.setParseData(parseData);
157158
return true;
158-
159159
}
160-
161-
}
160+
}

0 commit comments

Comments
 (0)