Skip to content

Commit

Permalink
Append spaces to textual content parsing of HTML
Browse files Browse the repository at this point in the history
  • Loading branch information
Craig Macdonald authored and Yasser Ganjisaffar committed Sep 28, 2016
1 parent 096819e commit 58ccfe0
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@

package edu.uci.ics.crawler4j.parser;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class HtmlContentHandler extends DefaultHandler {

private static final int MAX_ANCHOR_LENGTH = 100;
Expand Down Expand Up @@ -171,8 +171,10 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (isWithinBodyElement) {
if (bodyText.length() > 0) {
bodyText.append(' ');
}
bodyText.append(ch, start, length);

if (anchorFlag) {
anchorText.append(new String(ch, start, length));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package edu.uci.ics.crawler4j.tests;

import edu.uci.ics.crawler4j.parser.HtmlContentHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import static org.junit.Assert.assertEquals;

/**
 * Checks the body text collected by {@link HtmlContentHandler} when Tika's
 * {@link HtmlParser} feeds it small HTML documents.
 */
public class HtmlContentHandlerTest {

    private final HtmlParser parser = new HtmlParser();
    private final ParseContext parseContext = new ParseContext();

    /**
     * Parses the given HTML with a fresh {@link HtmlContentHandler} and returns that handler.
     *
     * @param html the HTML document to parse
     * @return the handler that received the parser's callbacks
     * @throws Exception if the parser fails
     */
    private HtmlContentHandler parseHtml(String html) throws Exception {
        // Encode with an explicit charset so the bytes handed to the parser do not
        // depend on the platform default of the JVM running the tests.
        ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        parser.parse(bais, contentHandler, metadata, parseContext);
        return contentHandler;
    }

    /** A document without any content is expected to yield empty body text. */
    @Test
    public void testEmpty() throws Exception {
        HtmlContentHandler parse = parseHtml("<html></html>");
        assertEquals("", parse.getBodyText());
    }

    /** The text of a single paragraph is expected to come back unchanged. */
    @Test
    public void testParaInBody() throws Exception {
        HtmlContentHandler parse = parseHtml("<html><body><p>Hello there</p></html>");
        assertEquals("Hello there", parse.getBodyText());
    }

    /** Text from two adjacent paragraphs is expected to be separated by one space. */
    @Test
    public void test2ParaInBody() throws Exception {
        HtmlContentHandler parse = parseHtml("<html><body><p>Hello there</p><p>mr</p></html>");
        assertEquals("Hello there mr", parse.getBodyText());
    }

    /** Text from table header and data cells is expected to be separated by single spaces. */
    @Test
    public void testTableInBody() throws Exception {
        HtmlContentHandler parse = parseHtml("<html><body><table><tr><th>Hello</th><th>there</th></tr>"
            + "<tr><td>mr</td><td>bear</td></tr></html>");
        assertEquals("Hello there mr bear", parse.getBodyText());
    }

}

0 comments on commit 58ccfe0

Please sign in to comment.