
Commit

Allow localhost URL
pgalbraith committed Sep 29, 2018
1 parent d20fc31 commit 000fb31
Showing 5 changed files with 103 additions and 54 deletions.
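In short, URL extraction is now driven by a CrawlConfig flag, so single level hosts such as localhost can be opted into. A minimal Java sketch of the before/after behaviour, grounded in the new NetTest cases further down (the wrapper class and main method are illustrative only):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;

public class LocalhostExtractionSketch {
    public static void main(String[] args) {
        // Default behaviour: a single level host is not reported as a URL
        // (matches the "no url found" Spock case below).
        Net standard = new Net(new CrawlConfig());
        System.out.println(standard.extractUrls("http://localhost").size()); // 0

        // Opt in to single level domains, e.g. when crawling a local test server.
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true);
        Net localhostAware = new Net(config);
        for (WebURL url : localhostAware.extractUrls("http://localhost/page/1")) {
            System.out.println(url.getURL()); // http://localhost/page/1
        }
    }
}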
6 changes: 6 additions & 0 deletions crawler4j/pom.xml
@@ -17,6 +17,7 @@
<apache.http.components.version>4.5.5</apache.http.components.version>
<je.version>5.0.84</je.version>
<apache.tika.version>1.17</apache.tika.version>
<url-detector.version>0.1.20</url-detector.version>
<!--test dependency versions -->
<junit.version>4.12</junit.version>
<wiremock.version>2.14.0</wiremock.version>
@@ -369,6 +370,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.github.pgalbraith</groupId>
<artifactId>url-detector</artifactId>
<version>${url-detector.version}</version>
</dependency>

<!-- Test Dependencies -->
<dependency>
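The new io.github.pgalbraith:url-detector artifact (presumably a published fork of LinkedIn's URL-Detector, judging by the com.linkedin.urls package names imported in Net.java below) supplies the detection engine that replaces the old hand-rolled regular expression. A minimal sketch of the library API exactly as this commit uses it; the expected output comes from the "no scheme specified" test case at the bottom of the diff:

import java.util.List;

import com.linkedin.urls.Url;
import com.linkedin.urls.detection.UrlDetector;
import com.linkedin.urls.detection.UrlDetectorOptions;

public class UrlDetectorSketch {
    public static void main(String[] args) {
        // Scan free text for URLs using the library's default options.
        UrlDetector detector = new UrlDetector("www.wikipedia.com", UrlDetectorOptions.Default);
        List<Url> urls = detector.detect();
        for (Url url : urls) {
            System.out.println(url.getFullUrl()); // http://www.wikipedia.com/
        }
    }
}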
23 changes: 23 additions & 0 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -221,6 +221,8 @@ public DnsResolver getDnsResolver() {

private DnsResolver dnsResolver = new SystemDefaultDnsResolver();

private boolean allowSingleLevelDomain = false;

/**
* Validates the configs specified by this instance.
*
@@ -642,6 +644,26 @@ public void setRespectNoIndex(boolean respectNoIndex) {
this.respectNoIndex = respectNoIndex;
}

/**
* Are single level domains (e.g. http://localhost) considered valid?
*
* @return {@code true} if single level domains are considered valid
*/
public boolean isAllowSingleLevelDomain() {
return allowSingleLevelDomain;
}

/**
* Allow single level domains (e.g. http://localhost). This is useful for
* testing, especially when crawling against localhost.
*
* @param allowSingleLevelDomain
*            {@code true} if single level domains should be considered valid
*/
public void setAllowSingleLevelDomain(boolean allowSingleLevelDomain) {
this.allowSingleLevelDomain = allowSingleLevelDomain;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@@ -668,6 +690,7 @@ public String toString() {
sb.append("Cookie policy: " + getCookiePolicy() + "\n");
sb.append("Respect nofollow: " + isRespectNoFollow() + "\n");
sb.append("Respect noindex: " + isRespectNoIndex() + "\n");
sb.append("Allow single level domain:" + isAllowSingleLevelDomain() + "\n");
return sb.toString();
}
}
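As the Javadoc above notes, the flag is mainly a convenience for tests that crawl a local server. A minimal sketch of the setting in isolation (nothing beyond a plain CrawlConfig is assumed):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class AllowSingleLevelDomainSketch {
    public static void main(String[] args) {
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true); // accept hosts like "localhost"

        System.out.println(config.isAllowSingleLevelDomain()); // true
        System.out.println(config); // toString() now reports the flag as well
    }
}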
10 changes: 6 additions & 4 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -37,14 +37,16 @@ public class Parser {

private final HtmlParser htmlContentParser;

private final Net net;

public Parser(CrawlConfig config) throws IllegalAccessException, InstantiationException {
this.config = config;
this.htmlContentParser = new TikaHtmlParser(config);
this(config, new TikaHtmlParser(config));
}

public Parser(CrawlConfig config, HtmlParser htmlParser) {
this.config = config;
this.htmlContentParser = htmlParser;
this.net = new Net(config);
}

public void parse(Page page, String contextURL)
@@ -61,7 +63,7 @@ public void parse(Page page, String contextURL)
if (parseData.getHtml() == null) {
throw new ParseException();
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
parseData.setOutgoingUrls(net.extractUrls(parseData.getHtml()));
} else {
throw new NotAllowedContentException();
}
@@ -74,7 +76,7 @@ public void parse(Page page, String contextURL)
parseData.setTextContent(
new String(page.getContentData(), page.getContentCharset()));
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
parseData.setOutgoingUrls(net.extractUrls(parseData.getTextContent()));
page.setParseData(parseData);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
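Because both Parser constructors now funnel through Parser(CrawlConfig, HtmlParser) and build their Net from the same config, outgoing-link extraction picks up the new flag without any extra wiring. A minimal sketch (the single-argument constructor still declares the checked exceptions shown in the diff):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.parser.Parser;

public class ParserWiringSketch {
    public static void main(String[] args) throws IllegalAccessException, InstantiationException {
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true);

        // Uses the default TikaHtmlParser internally; the Net created for link
        // extraction sees the same config and therefore honours the flag.
        Parser parser = new Parser(config);
    }
}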
72 changes: 38 additions & 34 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Net.java
@@ -1,50 +1,54 @@
package edu.uci.ics.crawler4j.util;

import java.util.HashSet;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.linkedin.urls.Url;
import com.linkedin.urls.detection.UrlDetector;
import com.linkedin.urls.detection.UrlDetectorOptions;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;

/**
* Created by Avi Hayun on 9/22/2014.
* Net related Utils
*
* @author Paul Galbraith <[email protected]>
*/
public class Net {
private static final Pattern pattern = initializePattern();

public static Set<WebURL> extractUrls(String input) {
Set<WebURL> extractedUrls = new HashSet<>();

if (input != null) {
Matcher matcher = pattern.matcher(input);
while (matcher.find()) {
WebURL webURL = new WebURL();
String urlStr = matcher.group();
if (!urlStr.startsWith("http")) {
urlStr = "http://" + urlStr;
}

webURL.setURL(urlStr);
extractedUrls.add(webURL);
}
}

return extractedUrls;
private static final Function<Url, WebURL> urlMapper = url -> {
WebURL webUrl = new WebURL();
webUrl.setURL(url.getFullUrl());
return webUrl;
};

private CrawlConfig config;

public Net(CrawlConfig config) {
this.config = config;
}

/** Singleton like one time call to initialize the Pattern */
private static Pattern initializePattern() {
return Pattern.compile("\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www.)" +
"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
"|mil|biz|info|mobi|name|aero|jobs|museum" +
"|travel|[a-z]{2}))(:[\\d]{1,5})?" +
"(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?" +
"((\\?([-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)" +
"(&(?:[-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*" +
"(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b");
public Set<WebURL> extractUrls(String input) {
if (input == null) {
return Collections.emptySet();
} else {
UrlDetector detector = new UrlDetector(input, getOptions());
List<Url> urls = detector.detect();
return urls.stream().map(urlMapper).collect(Collectors.toSet());
}
}

private UrlDetectorOptions getOptions() {
if (config.isAllowSingleLevelDomain()) {
return UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN;
} else {
return UrlDetectorOptions.Default;
}
}

}
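One API consequence worth noting: extractUrls used to be a static method and is now an instance method on a Net constructed with a CrawlConfig, so any external callers of Net.extractUrls(...) need a small update. A minimal sketch, with the expected results taken from the "multiple urls" test case below:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;

public class NetExtractionSketch {
    public static void main(String[] args) {
        Net net = new Net(new CrawlConfig()); // default detector options
        String text = " hey com check out host.com/toodles and http://例子.测试 real soon ";
        for (WebURL url : net.extractUrls(text)) {
            // Expected per the test: http://host.com/toodles and http://例子.测试/
            System.out.println(url.getURL());
        }
    }
}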
46 changes: 30 additions & 16 deletions crawler4j/src/test/groovy/edu/uci/ics/crawler4j/util/NetTest.groovy
@@ -16,29 +16,43 @@

package edu.uci.ics.crawler4j.util

import spock.lang.Specification
import spock.lang.*
import edu.uci.ics.crawler4j.crawler.*

/**
* Test the Net utility class.
*
* @author Paul Galbraith <[email protected]>
*/
class NetTest extends Specification {
def "FEATURE: correctly identify URLs in a text document" () {
given: "a test document with embedded URLs"
def testDocument = '''
www.wikipedia.com
https://en.wikipedia.org/wiki/Main_Page
http://somesite.com:8080/page/1
http://localhost/page/1
http://localhost:8080/page/1
'''

when: "identify and extract URLs"
def urls = Net.extractUrls(testDocument)

then: "should have found 5 URLs"
urls.size() == 5

@Shared standard = new Net(new CrawlConfig())
@Shared allowSingleLevelDomain = new Net(new CrawlConfig(allowSingleLevelDomain: true))

def "no scheme specified" () {
when: def extracted = standard.extractUrls "www.wikipedia.com"
then: expectMatch extracted, "http://www.wikipedia.com/"
}

def "localhost" () {
when: def extracted = allowSingleLevelDomain.extractUrls "http://localhost/page/1"
then: expectMatch extracted, "http://localhost/page/1"
}

def "no url found" () {
when: def extracted = standard.extractUrls "http://localhost"
then: expectMatch extracted // no expected URL
}

def "multiple urls" () {
when: def extracted = standard.extractUrls " hey com check out host.com/toodles and http://例子.测试 real soon "
then: expectMatch extracted, "http://host.com/toodles", "http://例子.测试/"
}

void expectMatch(def extractedUrls, String... expectedUrls) {
def extracted = extractedUrls.collect { it.URL } as Set
def expected = expectedUrls as Set
assert extracted == expected
}

}
