Skip to content

Commit

Permalink
fix for yasserg#346
Browse files Browse the repository at this point in the history
  • Loading branch information
pgalbraith committed Oct 17, 2018
1 parent 4683cc7 commit 6a8d22b
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
4 changes: 0 additions & 4 deletions crawler4j/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,6 @@
<artifactId>tika-parsers</artifactId>
<version>${apache.tika.version}</version>
<exclusions>
- <exclusion>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- </exclusion>
<exclusion>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ public HtmlParseData parse(Page page, String contextURL) throws ParseException {
HtmlContentHandler contentHandler = new HtmlContentHandler();
Metadata metadata = new Metadata();

+ if (page.getContentType() != null) {
+ metadata.add(Metadata.CONTENT_TYPE, page.getContentType());
+ }
+
try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import org.apache.http.entity.*
class HtmlParserTest extends Specification {

def "can parse html page"() {
- def parser = new TikaHtmlParser(new CrawlConfig())
+ def parser = new TikaHtmlParser(new CrawlConfig(), null)
def url = new WebURL(url: "http://wiki.c2.com/")
def file = new File("src/test/resources/html/wiki.c2.com.html")
def contentType = new ContentType("text/html", Charset.forName("UTF-8"))
Expand Down

0 comments on commit 6a8d22b

Please sign in to comment.