forked from yasserg/crawler4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Append spaces to textual content parsing of HTML
- Loading branch information
Craig Macdonald
authored and
Yasser Ganjisaffar
committed
Sep 28, 2016
1 parent
096819e
commit 58ccfe0
Showing
2 changed files
with
62 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
55 changes: 55 additions & 0 deletions
55
src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
package edu.uci.ics.crawler4j.tests; | ||
|
||
import edu.uci.ics.crawler4j.parser.HtmlContentHandler; | ||
import org.apache.tika.metadata.Metadata; | ||
import org.apache.tika.parser.ParseContext; | ||
import org.apache.tika.parser.html.HtmlParser; | ||
import org.junit.Test; | ||
|
||
import java.io.ByteArrayInputStream; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
public class HtmlContentHandlerTest { | ||
|
||
private HtmlParser parser = new HtmlParser(); | ||
private ParseContext parseContext = new ParseContext(); | ||
|
||
private HtmlContentHandler parseHtml(String html) throws Exception { | ||
ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes()); | ||
Metadata metadata = new Metadata(); | ||
HtmlContentHandler contentHandler = new HtmlContentHandler(); | ||
parser.parse(bais, contentHandler, metadata, parseContext); | ||
return contentHandler; | ||
} | ||
|
||
@Test | ||
public void testEmpty() throws Exception | ||
{ | ||
HtmlContentHandler parse = parseHtml("<html></html>"); | ||
assertEquals("",parse.getBodyText()); | ||
} | ||
|
||
@Test | ||
public void testParaInBody() throws Exception | ||
{ | ||
HtmlContentHandler parse = parseHtml("<html><body><p>Hello there</p></html>"); | ||
assertEquals("Hello there",parse.getBodyText()); | ||
} | ||
|
||
@Test | ||
public void test2ParaInBody() throws Exception | ||
{ | ||
HtmlContentHandler parse = parseHtml("<html><body><p>Hello there</p><p>mr</p></html>"); | ||
assertEquals("Hello there mr",parse.getBodyText()); | ||
} | ||
|
||
@Test | ||
public void testTableInBody() throws Exception | ||
{ | ||
HtmlContentHandler parse = parseHtml("<html><body><table><tr><th>Hello</th><th>there</th></tr>" | ||
+"<tr><td>mr</td><td>bear</td></tr></html>"); | ||
assertEquals("Hello there mr bear",parse.getBodyText()); | ||
} | ||
|
||
} |