Commit 457d922
Add processing of meta robots flags and rel="nofollow"
1 parent 469a0aa

10 files changed (+323, −7 lines)

src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java (+28)

@@ -185,6 +185,16 @@ public class CrawlConfig {
      */
     private String cookiePolicy = CookieSpecs.STANDARD;
 
+    /**
+     * Whether to honor "nofollow" flag
+     */
+    private boolean respectNoFollow = true;
+
+    /**
+     * Whether to honor "noindex" flag
+     */
+    private boolean respectNoIndex = true;
+
     /**
      * Validates the configs specified by this instance.
      *
@@ -555,6 +565,22 @@ public void setCookiePolicy(String cookiePolicy) {
         this.cookiePolicy = cookiePolicy;
     }
 
+    public boolean isRespectNoFollow() {
+        return respectNoFollow;
+    }
+
+    public void setRespectNoFollow(boolean respectNoFollow) {
+        this.respectNoFollow = respectNoFollow;
+    }
+
+    public boolean isRespectNoIndex() {
+        return respectNoIndex;
+    }
+
+    public void setRespectNoIndex(boolean respectNoIndex) {
+        this.respectNoIndex = respectNoIndex;
+    }
+
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
@@ -580,6 +606,8 @@ public String toString() {
         sb.append("Thread shutdown delay: " + getThreadShutdownDelaySeconds() + "\n");
         sb.append("Cleanup delay: " + getCleanupDelaySeconds() + "\n");
         sb.append("Cookie policy: " + getCookiePolicy() + "\n");
+        sb.append("Respect nofollow: " + isRespectNoFollow() + "\n");
+        sb.append("Respect noindex: " + isRespectNoIndex() + "\n");
         return sb.toString();
     }
 }
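Both flags default to true, so the new behaviour is on out of the box. A minimal sketch of how a crawl might toggle them (the storage path below is a placeholder, not part of this commit):

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawl");   // placeholder path
    config.setRespectNoFollow(true);   // skip links flagged nofollow (rel attribute or meta robots)
    config.setRespectNoIndex(false);   // still call visit() on pages marked noindex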

src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java (+24, −3)

@@ -33,6 +33,7 @@
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 import edu.uci.ics.crawler4j.frontier.DocIDServer;
 import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
 import edu.uci.ics.crawler4j.parser.NotAllowedContentException;
 import edu.uci.ics.crawler4j.parser.ParseData;
 import edu.uci.ics.crawler4j.parser.Parser;
@@ -300,7 +301,8 @@ public void run() {
     /**
      * Classes that extends WebCrawler should overwrite this function to tell the
      * crawler whether the given url should be crawled or not. The following
-     * default implementation indicates that all urls should be included in the crawl.
+     * default implementation indicates that all urls should be included in the crawl
+     * except those with a nofollow flag.
      *
      * @param url
      *            the url which we are interested to know whether it should be
@@ -311,7 +313,16 @@
      *         otherwise false is returned.
      */
     public boolean shouldVisit(Page referringPage, WebURL url) {
-        // By default allow all urls to be crawled.
+        if (myController.getConfig().isRespectNoFollow()) {
+            return !((referringPage != null &&
+                      referringPage.getContentType() != null &&
+                      referringPage.getContentType().contains("html") &&
+                      ((HtmlParseData) referringPage.getParseData())
+                          .getMetaTagValue("robots")
+                          .contains("nofollow")) ||
+                     url.getAttribute("rel").contains("nofollow"));
+        }
+
         return true;
     }
 
@@ -487,7 +498,17 @@ private void processPage(WebURL curURL) {
                         + "as per your \"shouldFollowLinksInPage\" policy",
                         page.getWebURL().getURL());
             }
-            visit(page);
+
+            boolean noIndex = myController.getConfig().isRespectNoIndex() &&
+                page.getContentType() != null &&
+                page.getContentType().contains("html") &&
+                ((HtmlParseData) page.getParseData())
+                    .getMetaTagValue("robots")
+                    .contains("noindex");
+
+            if (!noIndex) {
+                visit(page);
+            }
         }
     } catch (PageBiggerThanMaxSizeException e) {
         onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
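Since the nofollow check now lives in the default shouldVisit, a subclass that adds its own rules can keep it by delegating to super. A hedged sketch (the class name and the same-host restriction are illustrative, not part of this commit):

    public class MyCrawler extends WebCrawler {
        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // Keep the default nofollow handling, then add a hypothetical host filter.
            return super.shouldVisit(referringPage, url)
                    && url.getURL().startsWith("http://www.example.com/");
        }
    }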

src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java (+20)

@@ -1,10 +1,14 @@
 package edu.uci.ics.crawler4j.parser;
 
+import java.util.HashMap;
+import java.util.Map;
+
 public class ExtractedUrlAnchorPair {
 
     private String href;
     private String anchor;
     private String tag;
+    private Map<String, String> attributes = new HashMap<String, String>();
 
     public String getHref() {
         return href;
@@ -29,4 +33,20 @@ public String getTag() {
     public void setTag(String tag) {
         this.tag = tag;
     }
+
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    public void setAttributes(Map<String, String> attributes) {
+        this.attributes = attributes;
+    }
+
+    public String getAttribute(String name) {
+        return attributes.get(name);
+    }
+
+    public void setAttribute(String name, String val) {
+        attributes.put(name, val);
+    }
 }

src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java (+13, −4)

@@ -88,21 +88,18 @@ public void startElement(String uri, String localName, String qName, Attributes
             String href = attributes.getValue("href");
             if (href != null) {
                 anchorFlag = true;
-                addToOutgoingUrls(href, localName);
-
+                addToOutgoingUrls(href, localName, attributes);
             }
         } else if (element == Element.IMG) {
             String imgSrc = attributes.getValue("src");
             if (imgSrc != null) {
                 addToOutgoingUrls(imgSrc, localName);
-
             }
         } else if ((element == Element.IFRAME) || (element == Element.FRAME) ||
                    (element == Element.EMBED) || (element == Element.SCRIPT)) {
             String src = attributes.getValue("src");
             if (src != null) {
                 addToOutgoingUrls(src, localName);
-
             }
         } else if (element == Element.BASE) {
             if (base != null) { // We only consider the first occurrence of the Base element.
@@ -149,6 +146,18 @@ private void addToOutgoingUrls(String href, String tag) {
         outgoingUrls.add(curUrl);
     }
 
+    private void addToOutgoingUrls(String href, String tag, Attributes attributes) {
+        curUrl = new ExtractedUrlAnchorPair();
+        curUrl.setHref(href);
+        curUrl.setTag(tag);
+        for (int x = 0; x < attributes.getLength(); x++) {
+            String attrName = attributes.getLocalName(x);
+            String attrVal = attributes.getValue(attrName);
+            curUrl.setAttribute(attrName, attrVal);
+        }
+        outgoingUrls.add(curUrl);
+    }
+
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
         Element element = HtmlFactory.getElement(localName);

src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java (+4)

@@ -63,6 +63,10 @@ public void setMetaTags(Map<String, String> metaTags) {
         this.metaTags = metaTags;
     }
 
+    public String getMetaTagValue(String metaTag) {
+        return metaTags.getOrDefault(metaTag, "");
+    }
+
     @Override
     public Set<WebURL> getOutgoingUrls() {
         return outgoingUrls;
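Returning an empty string for missing tags keeps the callers above free of null checks. An illustrative use inside a WebCrawler subclass's visit method (assuming the inherited logger field; not part of this commit):

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            // Empty string when no <meta name="robots"> tag is present.
            String robots = html.getMetaTagValue("robots");
            logger.info("robots directive for {}: {}", page.getWebURL().getURL(), robots);
        }
    }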

src/main/java/edu/uci/ics/crawler4j/parser/Parser.java (+1)

@@ -138,6 +138,7 @@ public void parse(Page page, String contextURL)
                 webURL.setURL(url);
                 webURL.setTag(urlAnchorPair.getTag());
                 webURL.setAnchor(urlAnchorPair.getAnchor());
+                webURL.setAttributes(urlAnchorPair.getAttributes());
                 outgoingUrls.add(webURL);
                 urlCount++;
                 if (urlCount > config.getMaxOutgoingLinksToFollow()) {

src/main/java/edu/uci/ics/crawler4j/url/WebURL.java (+18)

@@ -19,6 +19,8 @@
 
 import java.io.Serializable;
 
+import java.util.Map;
+
 import com.sleepycat.persist.model.Entity;
 import com.sleepycat.persist.model.PrimaryKey;
 
@@ -44,6 +46,7 @@ public class WebURL implements Serializable {
     private String anchor;
     private byte priority;
     private String tag;
+    private Map<String, String> attributes;
 
     /**
      * @return unique document id assigned to this Url.
@@ -192,6 +195,21 @@ public void setTag(String tag) {
         this.tag = tag;
     }
 
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    public void setAttributes(Map<String, String> attributes) {
+        this.attributes = attributes;
+    }
+
+    public String getAttribute(String name) {
+        if (attributes == null) {
+            return "";
+        }
+        return attributes.getOrDefault(name, "");
+    }
+
     @Override
     public int hashCode() {
         return url.hashCode();
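Because every anchor attribute is now carried on the outgoing WebURL, a crawler can filter on attributes beyond rel="nofollow". A hypothetical sketch that also skips rel="sponsored" links (the extra rule is illustrative only):

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // getAttribute returns "" when the link had no such attribute, so no null check is needed.
        if (url.getAttribute("rel").contains("sponsored")) {
            return false;
        }
        return super.shouldVisit(referringPage, url);
    }
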
New test: NoFollowTest (Spock, +93)

@@ -0,0 +1,93 @@
+package edu.uci.ics.crawler4j.crawler
+
+import com.github.tomakehurst.wiremock.junit.WireMockRule
+import edu.uci.ics.crawler4j.fetcher.PageFetcher
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer
+import edu.uci.ics.crawler4j.url.WebURL
+import org.junit.Rule
+import org.junit.rules.TemporaryFolder
+import spock.lang.Specification
+
+import static com.github.tomakehurst.wiremock.client.WireMock.*
+
+class NoFollowTest extends Specification {
+
+    @Rule
+    public TemporaryFolder temp = new TemporaryFolder()
+
+    @Rule
+    public WireMockRule wireMockRule = new WireMockRule()
+
+    def "ignore nofollow links"() {
+        given: "an index page with two links"
+        stubFor(get(urlEqualTo("/some/index.html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <body>
+                                <a href="/some/page1.html" rel="nofollow">should not visit this</a>
+                                <a href="/some/page2.html">link to a nofollow page</a>
+                            </body>
+                          </html>/$
+                        )))
+        stubFor(get(urlPathMatching("/some/page(1|3).html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <body>
+                                <h1>title</h1>
+                            </body>
+                          </html>/$)))
+        stubFor(get(urlPathMatching("/some/page2.html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <head>
+                                <meta name="robots" content="nofollow">
+                            </head>
+                            <body>
+                                <a href="/some/page3.html">should not visit this</a>
+                            </body>
+                          </html>/$)))
+
+        and: "an allow everything robots.txt"
+        stubFor(get(urlPathMatching("/robots.txt"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/plain")
+                        .withBody(
+                        $/User-agent: *
+                        Allow: /
+                        /$)))
+
+        when:
+        CrawlConfig config = new CrawlConfig(
+                crawlStorageFolder: temp.getRoot().getAbsolutePath()
+                , politenessDelay: 100
+                , maxConnectionsPerHost: 1
+                , threadShutdownDelaySeconds: 1
+                , threadMonitoringDelaySeconds: 1
+                , cleanupDelaySeconds: 1
+        )
+
+        PageFetcher pageFetcher = new PageFetcher(config)
+        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher)
+        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)
+        controller.addSeed "http://localhost:8080/some/index.html"
+
+        controller.start(WebCrawler.class, 1)
+
+        then: "nofollow links should not be visited"
+        verify(exactly(1), getRequestedFor(urlEqualTo("/robots.txt")))
+        verify(exactly(0), getRequestedFor(urlEqualTo("/some/page1.html")))
+        verify(exactly(1), getRequestedFor(urlEqualTo("/some/page2.html")))
+        verify(exactly(0), getRequestedFor(urlEqualTo("/some/page3.html")))
+    }
+}
