forked from yasserg/crawler4j
Commit
Showing 5 changed files with 103 additions and 54 deletions.
crawler4j/src/main/java/edu/uci/ics/crawler4j/util/Net.java (72 changes: 38 additions & 34 deletions)
@@ -1,50 +1,54 @@
 package edu.uci.ics.crawler4j.util;
 
-import java.util.HashSet;
+import java.util.Collections;
+import java.util.List;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 
+import com.linkedin.urls.Url;
+import com.linkedin.urls.detection.UrlDetector;
+import com.linkedin.urls.detection.UrlDetectorOptions;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 import edu.uci.ics.crawler4j.url.WebURL;
 
 /**
  * Created by Avi Hayun on 9/22/2014.
  * Net related Utils
+ *
+ * @author Paul Galbraith <[email protected]>
  */
 public class Net {
-    private static final Pattern pattern = initializePattern();
-
-    public static Set<WebURL> extractUrls(String input) {
-        Set<WebURL> extractedUrls = new HashSet<>();
-
-        if (input != null) {
-            Matcher matcher = pattern.matcher(input);
-            while (matcher.find()) {
-                WebURL webURL = new WebURL();
-                String urlStr = matcher.group();
-                if (!urlStr.startsWith("http")) {
-                    urlStr = "http://" + urlStr;
-                }
-
-                webURL.setURL(urlStr);
-                extractedUrls.add(webURL);
-            }
-        }
-
-        return extractedUrls;
-    }
+    private static final Function<Url, WebURL> urlMapper = url -> {
+        WebURL webUrl = new WebURL();
+        webUrl.setURL(url.getFullUrl());
+        return webUrl;
+    };
+
+    private CrawlConfig config;
+
+    public Net(CrawlConfig config) {
+        this.config = config;
+    }
 
-    /** Singleton like one time call to initialize the Pattern */
-    private static Pattern initializePattern() {
-        return Pattern.compile("\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www.)" +
-                "(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
-                "|mil|biz|info|mobi|name|aero|jobs|museum" +
-                "|travel|[a-z]{2}))(:[\\d]{1,5})?" +
-                "(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?" +
-                "((\\?([-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
-                "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)" +
-                "(&(?:[-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" +
-                "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*" +
-                "(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b");
-    }
+    public Set<WebURL> extractUrls(String input) {
+        if (input == null) {
+            return Collections.emptySet();
+        } else {
+            UrlDetector detector = new UrlDetector(input, getOptions());
+            List<Url> urls = detector.detect();
+            return urls.stream().map(urlMapper).collect(Collectors.toSet());
+        }
+    }
+
+    private UrlDetectorOptions getOptions() {
+        if (config.isAllowSingleLevelDomain()) {
+            return UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN;
+        } else {
+            return UrlDetectorOptions.Default;
+        }
+    }
 
 }
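For context, the rewritten Net is no longer a static utility: it is constructed with a CrawlConfig and delegates URL detection to the linkedin url-detector library instead of the old regex. A minimal usage sketch, not part of this commit (the class name NetUsageSketch and the sample input string are illustrative; getURL() on WebURL is the accessor implied by the spec's use of it.URL):

    import java.util.Set;

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.url.WebURL;
    import edu.uci.ics.crawler4j.util.Net;

    public class NetUsageSketch {
        public static void main(String[] args) {
            // Net is now instantiated with a CrawlConfig rather than called statically.
            Net net = new Net(new CrawlConfig());

            // extractUrls() still returns a Set<WebURL>, but detection is handled by
            // the url-detector library via UrlDetector rather than a hand-rolled regex.
            Set<WebURL> urls = net.extractUrls(
                    "See www.wikipedia.com and https://en.wikipedia.org/wiki/Main_Page");
            for (WebURL url : urls) {
                System.out.println(url.getURL());
            }
        }
    }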
edu.uci.ics.crawler4j.util.NetTest (Spock specification for Net)

@@ -16,29 +16,43 @@
 
 package edu.uci.ics.crawler4j.util
 
-import spock.lang.Specification
+import spock.lang.*
+import edu.uci.ics.crawler4j.crawler.*
 
 /**
  * Test the Net utility class.
  *
  * @author Paul Galbraith <[email protected]>
  */
 class NetTest extends Specification {
-    def "FEATURE: correctly identify URLs in a text document" () {
-        given: "a test document with embedded URLs"
-        def testDocument = '''
-            www.wikipedia.com
-            https://en.wikipedia.org/wiki/Main_Page
-            http://somesite.com:8080/page/1
-            http://localhost/page/1
-            http://localhost:8080/page/1
-        '''
-
-        when: "identify and extract URLs"
-        def urls = Net.extractUrls(testDocument)
-
-        then: "should have found 5 URLs"
-        urls.size() == 5
+
+    @Shared standard = new Net(new CrawlConfig())
+    @Shared allowSingleLevelDomain = new Net(new CrawlConfig(allowSingleLevelDomain: true))
+
+    def "no scheme specified" () {
+        when: def extracted = standard.extractUrls "www.wikipedia.com"
+        then: expectMatch extracted, "http://www.wikipedia.com/"
+    }
+
+    def "localhost" () {
+        when: def extracted = allowSingleLevelDomain.extractUrls "http://localhost/page/1"
+        then: expectMatch extracted, "http://localhost/page/1"
+    }
+
+    def "no url found" () {
+        when: def extracted = standard.extractUrls "http://localhost"
+        then: expectMatch extracted // no expected URL
+    }
+
+    def "multiple urls" () {
+        when: def extracted = standard.extractUrls " hey com check out host.com/toodles and http://例子.测试 real soon "
+        then: expectMatch extracted, "http://host.com/toodles", "http://例子.测试/"
+    }
+
+    void expectMatch(def extractedUrls, String... expectedUrls) {
+        def extracted = extractedUrls.collect { it.URL } as Set
+        def expected = expectedUrls as Set
+        assert extracted == expected
+    }
 
 }
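The localhost specs hinge on the allowSingleLevelDomain flag: with a default CrawlConfig the detector ignores single-label hosts such as localhost (the "no url found" spec), while enabling the flag makes getOptions() return UrlDetectorOptions.ALLOW_SINGLE_LEVEL_DOMAIN (the "localhost" spec). A rough Java sketch of the same behavior, not part of the commit (the class name is illustrative, and setAllowSingleLevelDomain is the setter implied by the Groovy property constructor used in the spec):

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.util.Net;

    public class SingleLevelDomainSketch {
        public static void main(String[] args) {
            // Default config: "http://localhost" yields no URLs, as in the "no url found" spec.
            Net standard = new Net(new CrawlConfig());
            System.out.println(standard.extractUrls("http://localhost").size()); // expected: 0

            // With allowSingleLevelDomain enabled, localhost URLs are detected,
            // as in the "localhost" spec.
            CrawlConfig config = new CrawlConfig();
            config.setAllowSingleLevelDomain(true);
            Net permissive = new Net(config);
            permissive.extractUrls("http://localhost/page/1")
                      .forEach(url -> System.out.println(url.getURL())); // http://localhost/page/1
        }
    }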