
Commit

Allow localhost URL
pgalbraith committed Sep 29, 2018
1 parent d20fc31 commit 000fb31
Showing 5 changed files with 103 additions and 54 deletions.
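In short, URL extraction is now driven by a CrawlConfig flag, so single level hosts such as localhost can be opted into. A minimal Java sketch of the before/after behaviour, grounded in the new NetTest cases further down (the wrapper class and main method are illustrative only):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;

public class LocalhostExtractionSketch {
    public static void main(String[] args) {
        // Default behaviour: a single level host is not reported as a URL
        // (matches the "no url found" Spock case below).
        Net standard = new Net(new CrawlConfig());
        System.out.println(standard.extractUrls("http://localhost").size()); // 0

        // Opt in to single level domains, e.g. when crawling a local test server.
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true);
        Net localhostAware = new Net(config);
        for (WebURL url : localhostAware.extractUrls("http://localhost/page/1")) {
            System.out.println(url.getURL()); // http://localhost/page/1
        }
    }
}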
6 changes: 6 additions & 0 deletions crawler4j/pom.xml
@@ -17,6 +17,7 @@
<apache.http.components.version>4.5.5</apache.http.components.version>
<je.version>5.0.84</je.version>
<apache.tika.version>1.17</apache.tika.version>
<url-detector.version>0.1.20</url-detector.version>
<!--test dependency versions -->
<junit.version>4.12</junit.version>
<wiremock.version>2.14.0</wiremock.version>
@@ -369,6 +370,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.github.pgalbraith</groupId>
<artifactId>url-detector</artifactId>
<version>${url-detector.version}</version>
</dependency>

<!-- Test Dependencies -->
<dependency>
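The new io.github.pgalbraith:url-detector artifact (presumably a published fork of LinkedIn's URL-Detector, judging by the com.linkedin.urls package names imported in Net.java below) supplies the detection engine that replaces the old hand-rolled regular expression. A minimal sketch of the library API exactly as this commit uses it; the expected output comes from the "no scheme specified" test case at the bottom of the diff:

import java.util.List;

import com.linkedin.urls.Url;
import com.linkedin.urls.detection.UrlDetector;
import com.linkedin.urls.detection.UrlDetectorOptions;

public class UrlDetectorSketch {
    public static void main(String[] args) {
        // Scan free text for URLs using the library's default options.
        UrlDetector detector = new UrlDetector("www.wikipedia.com", UrlDetectorOptions.Default);
        List<Url> urls = detector.detect();
        for (Url url : urls) {
            System.out.println(url.getFullUrl()); // http://www.wikipedia.com/
        }
    }
}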
23 changes: 23 additions & 0 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -221,6 +221,8 @@ public DnsResolver getDnsResolver() {

private DnsResolver dnsResolver = new SystemDefaultDnsResolver();

private boolean allowSingleLevelDomain = false;

/**
* Validates the configs specified by this instance.
*
@@ -642,6 +644,26 @@ public void setRespectNoIndex(boolean respectNoIndex) {
this.respectNoIndex = respectNoIndex;
}

/**
* Are single level domains (e.g. http://localhost) considered valid?
*
* @return {@code true} if single level domains are considered valid
*/
public boolean isAllowSingleLevelDomain() {
return allowSingleLevelDomain;
}

/**
* Allow single level domains (e.g. http://localhost). This is useful for
* testing, especially when crawling against localhost.
*
* @param allowSingleLevelDomain
*            {@code true} if single level domains should be considered valid
*/
public void setAllowSingleLevelDomain(boolean allowSingleLevelDomain) {
this.allowSingleLevelDomain = allowSingleLevelDomain;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@@ -668,6 +690,7 @@ public String toString() {
sb.append("Cookie policy: " + getCookiePolicy() + "\n");
sb.append("Respect nofollow: " + isRespectNoFollow() + "\n");
sb.append("Respect noindex: " + isRespectNoIndex() + "\n");
sb.append("Allow single level domain:" + isAllowSingleLevelDomain() + "\n");
return sb.toString();
}
}
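As the Javadoc above notes, the flag is mainly a convenience for tests that crawl a local server. A minimal sketch of the setting in isolation (nothing beyond a plain CrawlConfig is assumed):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;

public class AllowSingleLevelDomainSketch {
    public static void main(String[] args) {
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true); // accept hosts like "localhost"

        System.out.println(config.isAllowSingleLevelDomain()); // true
        System.out.println(config); // toString() now reports the flag as well
    }
}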
10 changes: 6 additions & 4 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -37,14 +37,16 @@ public class Parser {

private final HtmlParser htmlContentParser;

private final Net net;

public Parser(CrawlConfig config) throws IllegalAccessException, InstantiationException {
this.config = config;
this.htmlContentParser = new TikaHtmlParser(config);
this(config, new TikaHtmlParser(config));
}

public Parser(CrawlConfig config, HtmlParser htmlParser) {
this.config = config;
this.htmlContentParser = htmlParser;
this.net = new Net(config);
}

public void parse(Page page, String contextURL)
@@ -61,7 +63,7 @@ public void parse(Page page, String contextURL)
if (parseData.getHtml() == null) {
throw new ParseException();
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
parseData.setOutgoingUrls(net.extractUrls(parseData.getHtml()));
} else {
throw new NotAllowedContentException();
}
@@ -74,7 +76,7 @@ public void parse(Page page, String contextURL)
parseData.setTextContent(
new String(page.getContentData(), page.getContentCharset()));
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
parseData.setOutgoingUrls(net.extractUrls(parseData.getTextContent()));
page.setParseData(parseData);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
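Because both Parser constructors now funnel through Parser(CrawlConfig, HtmlParser) and build their Net from the same config, outgoing-link extraction picks up the new flag without any extra wiring. A minimal sketch (the single-argument constructor still declares the checked exceptions shown in the diff):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.parser.Parser;

public class ParserWiringSketch {
    public static void main(String[] args) throws IllegalAccessException, InstantiationException {
        CrawlConfig config = new CrawlConfig();
        config.setAllowSingleLevelDomain(true);

        // Uses the default TikaHtmlParser internally; the Net created for link
        // extraction sees the same config and therefore honours the flag.
        Parser parser = new Parser(config);
    }
}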
72 changes: 38 additions & 34 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Net.java
@@ -1,50 +1,54 @@
package edu.uci.ics.crawler4j.util;

import java.util.HashSet;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.linkedin.urls.Url;
import com.linkedin.urls.detection.UrlDetector;
import com.linkedin.urls.detection.UrlDetectorOptions;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;

/**
* Created by Avi Hayun on 9/22/2014.
* Net related Utils
*
* @author Paul Galbraith <[email protected]>
*/
public class Net {
private static final Pattern pattern = initializePattern();

public static Set<WebURL> extractUrls(String input) {
Set<WebURL> extractedUrls = new HashSet<>();

if (input != null) {
Matcher matcher = pattern.matcher(input);
while (matcher.find()) {
WebURL webURL = new WebURL();
String urlStr = matcher.group();
if (!urlStr.startsWith("http")) {
urlStr = "http://" + urlStr;
}

webURL.setURL(urlStr);
extractedUrls.add(webURL);
}
}

return extractedUrls;
private static final Function<Url, WebURL> urlMapper = url -> {
WebURL webUrl = new WebURL();
webUrl.setURL(url.getFullUrl());
return webUrl;
};

private CrawlConfig config;

public Net(CrawlConfig config) {
this.config = config;
}

/** Singleton like one time call to initialize the Pattern */
private static Pattern initializePattern() {
return Pattern.compile("\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www.)" +
"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
"|mil|biz|info|mobi|name|aero|jobs|museum" +
"|travel|[a-z]{2}))(:[\\d]{1,5})?" +
"(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?" +
"((\\?([-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)" +
"(&(?:[-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*" +
"(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b");
public Set<WebURL> extractUrls(String input) {
if (input == null) {
return Collections.emptySet();
} else {
UrlDetector detector = new UrlDetector(input, getOptions());
List<Url> urls = detector.detect();
return urls.stream().map(urlMapper).collect(Collectors.toSet());
}
}

private UrlDetectorOptions getOptions() {
if (config.isAllowSingleLevelDomain()) {
return UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN;
} else {
return UrlDetectorOptions.Default;
}
}

}
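One API consequence worth noting: extractUrls used to be a static method and is now an instance method on a Net constructed with a CrawlConfig, so any external callers of Net.extractUrls(...) need a small update. A minimal sketch, with the expected results taken from the "multiple urls" test case below:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;

public class NetExtractionSketch {
    public static void main(String[] args) {
        Net net = new Net(new CrawlConfig()); // default detector options
        String text = " hey com check out host.com/toodles and http://例子.测试 real soon ";
        for (WebURL url : net.extractUrls(text)) {
            // Expected per the test: http://host.com/toodles and http://例子.测试/
            System.out.println(url.getURL());
        }
    }
}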
46 changes: 30 additions & 16 deletions crawler4j/src/test/groovy/edu/uci/ics/crawler4j/util/NetTest.groovy
@@ -16,29 +16,43 @@

package edu.uci.ics.crawler4j.util

import spock.lang.Specification
import spock.lang.*
import edu.uci.ics.crawler4j.crawler.*

/**
* Test the Net utility class.
*
* @author Paul Galbraith <[email protected]>
*/
class NetTest extends Specification {
def "FEATURE: correctly identify URLs in a text document" () {
given: "a test document with embedded URLs"
def testDocument = '''
www.wikipedia.com
https://en.wikipedia.org/wiki/Main_Page
http://somesite.com:8080/page/1
http://localhost/page/1
http://localhost:8080/page/1
'''

when: "identify and extract URLs"
def urls = Net.extractUrls(testDocument)

then: "should have found 5 URLs"
urls.size() == 5

@Shared standard = new Net(new CrawlConfig())
@Shared allowSingleLevelDomain = new Net(new CrawlConfig(allowSingleLevelDomain: true))

def "no scheme specified" () {
when: def extracted = standard.extractUrls "www.wikipedia.com"
then: expectMatch extracted, "http://www.wikipedia.com/"
}

def "localhost" () {
when: def extracted = allowSingleLevelDomain.extractUrls "http://localhost/page/1"
then: expectMatch extracted, "http://localhost/page/1"
}

def "no url found" () {
when: def extracted = standard.extractUrls "http://localhost"
then: expectMatch extracted // no expected URL
}

def "multiple urls" () {
when: def extracted = standard.extractUrls " hey com check out host.com/toodles and http://例子.测试 real soon "
then: expectMatch extracted, "http://host.com/toodles", "http://例子.测试/"
}

void expectMatch(def extractedUrls, String... expectedUrls) {
def extracted = extractedUrls.collect { it.URL } as Set
def expected = expectedUrls as Set
assert extracted == expected
}

}
