Skip to content

Commit

Permalink
Parse CSS for links matching url(*) in @import and @font-face rules
Browse files Browse the repository at this point in the history
  • Loading branch information
Federico Tolomei committed Dec 15, 2018
1 parent d7366fa commit 8043a98
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,60 +17,62 @@

package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;

public class CssParseData extends TextParseData {

/**
 * Builds the set of outgoing links found in this object's text content.
 *
 * <p>URL strings are pulled out of {@code getTextContent()} by {@code extractUrlInCssText}; each one is
 * reduced to a path by {@code getLinkRelativeTo}, joined to the page URL by {@code getAbsoluteUrlFrom},
 * and wrapped in a new {@link WebURL}.
 *
 * @param referringPage page whose path and URL serve as the base for resolving the extracted links
 * @return a new set containing the resolved links
 */
// NOTE(review): this listing appears to show pre- and post-commit lines back to back (diff markers
// lost); where two near-identical lines follow each other, the later one looks like the committed version.
public Set<WebURL> parseOutgoingUrls(WebURL referingPage) {
public Set<WebURL> parseOutgoingUrls(WebURL referringPage) {

Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());

final String pagePath = referingPage.getPath();
final String pageUrl = referingPage.getURL();
final String pagePath = referringPage.getPath();
final String pageUrl = referringPage.getURL();

Set<WebURL> outgoingUrls = new HashSet<>();
for (String url : extractedUrls) {

// First turn the raw link into a path, then prefix it with the domain part of the page URL.
String relative = getLinkRelativeTo(pagePath, url);
String absolute = getAbsoluteUrlFrom(pageUrl, relative);
// The later variant canonicalizes the page URL before taking its domain part.
String absolute = getAbsoluteUrlFrom(URLCanonicalizer.getCanonicalURL(pageUrl), relative);

WebURL webURL = new WebURL();
webURL.setURL(absolute);
outgoingUrls.add(webURL);

// logger.info("CSS Link: {} -> {}", url, absolute);
}
return outgoingUrls;
}

/**
 * Parses the outgoing links of this object's text content relative to the given page and stores the
 * result on this object through the {@code Set<WebURL>} overload of {@code setOutgoingUrls}.
 *
 * @param referringPage base page passed through to {@code parseOutgoingUrls}
 */
// NOTE(review): the duplicated signature and first statement appear to be the pre- and post-commit
// lines of a diff shown back to back; they differ only in the spelling of the parameter name.
public void setOutgoingUrls(WebURL referingPage) {
public void setOutgoingUrls(WebURL referringPage) {

Set<WebURL> outgoingUrls = parseOutgoingUrls(referingPage);
Set<WebURL> outgoingUrls = parseOutgoingUrls(referringPage);
this.setOutgoingUrls(outgoingUrls);
}

private static Set<String> extractUrlInCssText(String input) {

Set<String> extractedUrls = new HashSet<>();
if (input == null)
if (input == null || input.isEmpty()) {
return extractedUrls;
}

Matcher matcher = pattern.matcher(input);
while (matcher.find()) {
String url = matcher.group(1);
if (url == null)
if (url == null) {
url = matcher.group(2);
if (url == null)
}
if (url == null) {
url = matcher.group(3);
if (url == null)
continue;
if (url.startsWith("data:"))
}
if (url == null || url.startsWith("data:")) {
continue;
}
extractedUrls.add(url);
}
return extractedUrls;
Expand All @@ -88,22 +90,25 @@ private static Pattern initializePattern() {
/**
 * Joins a link path onto the prefix that {@code getFullDomainFromUrl} derives from the page URL.
 *
 * @param pageUrl URL handed to {@code getFullDomainFromUrl} to obtain the prefix
 * @param linkPath path to append to that prefix
 * @return the prefix followed by {@code linkPath}, with a "/" inserted between them when
 *         {@code linkPath} does not already start with one
 */
private static String getAbsoluteUrlFrom(String pageUrl, String linkPath) {

String domainUrl = getFullDomainFromUrl(pageUrl);
// NOTE(review): the two "if" lines below appear to be the unbraced (old) and braced (new) form of
// the same check, shown back to back by the diff view.
if (linkPath.startsWith("/"))
if (linkPath.startsWith("/")) {
return domainUrl + linkPath;
}
// linkPath has no leading slash here, so add the separator explicitly.
return domainUrl + "/" + linkPath;
}

private static String getLinkRelativeTo(String pagePath, String linkUrl) {

if (linkUrl.startsWith("/"))
if (linkUrl.startsWith("/") && !linkUrl.startsWith("//")) {
return linkUrl;
}

if (linkUrl.startsWith("//"))
if (linkUrl.startsWith("//")) {
linkUrl = "http" + linkUrl;
}

if (linkUrl.startsWith("http")) {
String domainUrl = getPathFromUrl(linkUrl);
return "/" + domainUrl;
return domainUrl;
}

if (linkUrl.startsWith("../")) {
Expand All @@ -117,8 +122,9 @@ private static String getLinkRelativeTo(String pagePath, String linkUrl) {
String absolute = "";
for (int i = 0; i < diff; i++) {
String dir = parts[i];
if (!dir.isEmpty())
if (!dir.isEmpty()) {
absolute = absolute + "/" + dir;
}
}
return absolute + "/" + linkUrl.substring(pos);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package edu.uci.ics.crawler4j.parser

import edu.uci.ics.crawler4j.url.WebURL
import spock.lang.Specification

/**
 * Spock specification for {@link CssParseData}.
 *
 * Each feature loads a stylesheet fixture from the test classpath, parses it against a referring
 * page URL and checks the outgoing links stored on the parse data.
 *
 * @author Federico Tolomei
 */
class CssParseDataTest extends Specification {

    /** Creates parse data whose text content is read from the given classpath resource. */
    private CssParseData stylesheetFrom(String resourcePath) {
        CssParseData data = new CssParseData()
        data.setTextContent(this.getClass().getResource(resourcePath).text)
        return data
    }

    /** Creates a WebURL set to the given address. */
    private static WebURL pageAt(String address) {
        WebURL page = new WebURL()
        page.setURL(address)
        return page
    }

    /** Returns the string form of every link in the set. */
    private static List<String> addressesOf(Set<WebURL> links) {
        return links.collect { WebURL link -> link.getURL() }
    }

    def "CSS urls parsing quotes"() {
        given: "css parser"
        def css = stylesheetFrom('/css/quotes.css')

        and: "configure css parser"
        def referrer = pageAt("http://example.com/css.css")

        when: "parse css"
        css.setOutgoingUrls(referrer)
        def links = css.outgoingUrls

        then: "urls from css"
        links.size() == 3
    }

    def "CSS absolute urls paths"() {
        given: "css parser"
        def css = stylesheetFrom('/css/absolute.css')

        and: "configure css parser"
        def referrer = pageAt("http://example.com/css.css")

        when: "parse css"
        css.setOutgoingUrls(referrer)
        def links = css.outgoingUrls

        then: "urls from css"
        links.size() == 3

        and:
        def resolved = addressesOf(links)
        resolved.contains("http://example.com/css/absolute_no_proto.png")
        resolved.contains("http://example.com/css/absolute_path.png")
        resolved.contains("http://example.com/css/absolute_with_domain.png")
    }

    def "CSS relative urls paths"() {
        given: "css parser"
        def css = stylesheetFrom('/css/relative.css')

        and: "configure css parser"
        def referrer = pageAt("http://example.com/asset/css/css.css")

        when: "parse css"
        css.setOutgoingUrls(referrer)
        def links = css.outgoingUrls

        then: "urls from css"
        links.size() == 2

        and:
        def resolved = addressesOf(links)
        // File names are spelled exactly as in the relative.css fixture.
        resolved.contains("http://example.com/asset/images/backgound_one.jpg")
        resolved.contains("http://example.com/backgound_two.jpg")
    }
}
17 changes: 17 additions & 0 deletions crawler4j/src/test/resources/css/absolute.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/* Fixture for the "CSS absolute urls paths" spec, parsed with http://example.com/css.css as the
   referring page. The spec expects exactly three links, all under http://example.com/css/. */

/* Fully qualified link: scheme and host are written out. */
.absolute_with_domain {
background-image: url("http://example.com/css/absolute_with_domain.png");
}

/* Protocol-relative link: host is given, scheme is omitted. */
.absolute_no_proto {
background-image: url("//example.com/css/absolute_no_proto.png");
}


/* Root-relative link: only a path starting with a slash. */
.absolute_path {
background-image: url("/css/absolute_path.png");
}


/* Inline data URI: the parser drops values that start with the data scheme, so this rule must not
   add a fourth link. */
.data {
background: url(data:image/gif;base64,IGNORETHISURL) no-repeat left center;
}
}
11 changes: 11 additions & 0 deletions crawler4j/src/test/resources/css/quotes.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/* Fixture for the "CSS urls parsing quotes" spec: one link per quoting style. The spec only checks
   that three links are extracted. */
/* NOTE(review): every value starts with two slashes, i.e. the protocol-relative form - presumably
   intentional, since the spec does not inspect the resolved addresses. */

/* Value wrapped in double quotes. */
.double {
background-image: url("//css/quotes/double.jpg");
}

/* Value wrapped in single quotes. */
.single {
background-image: url('//css/quotes/single.jpg');
}

/* Value written without quotes. */
.noquote {
background-image: url(//pix/quotes/none.jpg);
}
8 changes: 8 additions & 0 deletions crawler4j/src/test/resources/css/relative.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/* Fixture for the "CSS relative urls paths" spec, parsed with
   http://example.com/asset/css/css.css as the referring page. The spec expects two links.
   The file names are matched literally by the spec, so keep their spelling in sync with it. */

/* One level up from /asset/css/ - expected at http://example.com/asset/images/backgound_one.jpg */
.relative_one {
background-image: url("../images/backgound_one.jpg");
}

/* Two levels up from /asset/css/ - expected at http://example.com/backgound_two.jpg */
.relative_two {
background-image: url("../../backgound_two.jpg");
}

0 comments on commit 8043a98

Please sign in to comment.