Commit 457d922
Add processing of meta robots flags and rel="nofollow"
1 parent 469a0aa

10 files changed (+323, −7 lines)

src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java (+28)

@@ -185,6 +185,16 @@ public class CrawlConfig {
      */
     private String cookiePolicy = CookieSpecs.STANDARD;
 
+    /**
+     * Whether to honor "nofollow" flag
+     */
+    private boolean respectNoFollow = true;
+
+    /**
+     * Whether to honor "noindex" flag
+     */
+    private boolean respectNoIndex = true;
+
     /**
      * Validates the configs specified by this instance.
      *
@@ -555,6 +565,22 @@ public void setCookiePolicy(String cookiePolicy) {
         this.cookiePolicy = cookiePolicy;
     }
 
+    public boolean isRespectNoFollow() {
+        return respectNoFollow;
+    }
+
+    public void setRespectNoFollow(boolean respectNoFollow) {
+        this.respectNoFollow = respectNoFollow;
+    }
+
+    public boolean isRespectNoIndex() {
+        return respectNoIndex;
+    }
+
+    public void setRespectNoIndex(boolean respectNoIndex) {
+        this.respectNoIndex = respectNoIndex;
+    }
+
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
@@ -580,6 +606,8 @@ public String toString() {
         sb.append("Thread shutdown delay: " + getThreadShutdownDelaySeconds() + "\n");
         sb.append("Cleanup delay: " + getCleanupDelaySeconds() + "\n");
         sb.append("Cookie policy: " + getCookiePolicy() + "\n");
+        sb.append("Respect nofollow: " + isRespectNoFollow() + "\n");
+        sb.append("Respect noindex: " + isRespectNoIndex() + "\n");
         return sb.toString();
     }
 }
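Both flags default to true, so the new behaviour is on out of the box. A minimal sketch of how a crawl might toggle them (the storage path below is a placeholder, not part of this commit):

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawl");   // placeholder path
    config.setRespectNoFollow(true);   // skip links flagged nofollow (rel attribute or meta robots)
    config.setRespectNoIndex(false);   // still call visit() on pages marked noindex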

src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java (+24, −3)

@@ -33,6 +33,7 @@
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 import edu.uci.ics.crawler4j.frontier.DocIDServer;
 import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
 import edu.uci.ics.crawler4j.parser.NotAllowedContentException;
 import edu.uci.ics.crawler4j.parser.ParseData;
 import edu.uci.ics.crawler4j.parser.Parser;
@@ -300,7 +301,8 @@ public void run() {
     /**
      * Classes that extends WebCrawler should overwrite this function to tell the
      * crawler whether the given url should be crawled or not. The following
-     * default implementation indicates that all urls should be included in the crawl.
+     * default implementation indicates that all urls should be included in the crawl
+     * except those with a nofollow flag.
      *
      * @param url
      *            the url which we are interested to know whether it should be
@@ -311,7 +313,16 @@
      *         otherwise false is returned.
      */
     public boolean shouldVisit(Page referringPage, WebURL url) {
-        // By default allow all urls to be crawled.
+        if (myController.getConfig().isRespectNoFollow()) {
+            return !((referringPage != null &&
+                      referringPage.getContentType() != null &&
+                      referringPage.getContentType().contains("html") &&
+                      ((HtmlParseData) referringPage.getParseData())
+                          .getMetaTagValue("robots")
+                          .contains("nofollow")) ||
+                     url.getAttribute("rel").contains("nofollow"));
+        }
+
         return true;
     }
 
@@ -487,7 +498,17 @@ private void processPage(WebURL curURL) {
                         + "as per your \"shouldFollowLinksInPage\" policy",
                         page.getWebURL().getURL());
             }
-            visit(page);
+
+            boolean noIndex = myController.getConfig().isRespectNoIndex() &&
+                page.getContentType() != null &&
+                page.getContentType().contains("html") &&
+                ((HtmlParseData) page.getParseData())
+                    .getMetaTagValue("robots")
+                    .contains("noindex");
+
+            if (!noIndex) {
+                visit(page);
+            }
         }
     } catch (PageBiggerThanMaxSizeException e) {
         onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
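Since the nofollow check now lives in the default shouldVisit, a subclass that adds its own rules can keep it by delegating to super. A hedged sketch (the class name and the same-host restriction are illustrative, not part of this commit):

    public class MyCrawler extends WebCrawler {
        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // Keep the default nofollow handling, then add a hypothetical host filter.
            return super.shouldVisit(referringPage, url)
                    && url.getURL().startsWith("http://www.example.com/");
        }
    }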

src/main/java/edu/uci/ics/crawler4j/parser/ExtractedUrlAnchorPair.java (+20)

@@ -1,10 +1,14 @@
 package edu.uci.ics.crawler4j.parser;
 
+import java.util.HashMap;
+import java.util.Map;
+
 public class ExtractedUrlAnchorPair {
 
     private String href;
     private String anchor;
     private String tag;
+    private Map<String, String> attributes = new HashMap<String, String>();
 
     public String getHref() {
         return href;
@@ -29,4 +33,20 @@ public String getTag() {
     public void setTag(String tag) {
         this.tag = tag;
     }
+
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    public void setAttributes(Map<String, String> attributes) {
+        this.attributes = attributes;
+    }
+
+    public String getAttribute(String name) {
+        return attributes.get(name);
+    }
+
+    public void setAttribute(String name, String val) {
+        attributes.put(name, val);
+    }
 }

src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java (+13, −4)

@@ -88,21 +88,18 @@ public void startElement(String uri, String localName, String qName, Attributes
             String href = attributes.getValue("href");
             if (href != null) {
                 anchorFlag = true;
-                addToOutgoingUrls(href, localName);
-
+                addToOutgoingUrls(href, localName, attributes);
             }
         } else if (element == Element.IMG) {
             String imgSrc = attributes.getValue("src");
             if (imgSrc != null) {
                 addToOutgoingUrls(imgSrc, localName);
-
             }
         } else if ((element == Element.IFRAME) || (element == Element.FRAME) ||
                    (element == Element.EMBED) || (element == Element.SCRIPT)) {
             String src = attributes.getValue("src");
             if (src != null) {
                 addToOutgoingUrls(src, localName);
-
             }
         } else if (element == Element.BASE) {
             if (base != null) { // We only consider the first occurrence of the Base element.
@@ -149,6 +146,18 @@ private void addToOutgoingUrls(String href, String tag) {
         outgoingUrls.add(curUrl);
     }
 
+    private void addToOutgoingUrls(String href, String tag, Attributes attributes) {
+        curUrl = new ExtractedUrlAnchorPair();
+        curUrl.setHref(href);
+        curUrl.setTag(tag);
+        for (int x = 0; x < attributes.getLength(); x++) {
+            String attrName = attributes.getLocalName(x);
+            String attrVal = attributes.getValue(attrName);
+            curUrl.setAttribute(attrName, attrVal);
+        }
+        outgoingUrls.add(curUrl);
+    }
+
     @Override
     public void endElement(String uri, String localName, String qName) throws SAXException {
         Element element = HtmlFactory.getElement(localName);

src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java (+4)

@@ -63,6 +63,10 @@ public void setMetaTags(Map<String, String> metaTags) {
         this.metaTags = metaTags;
     }
 
+    public String getMetaTagValue(String metaTag) {
+        return metaTags.getOrDefault(metaTag, "");
+    }
+
     @Override
     public Set<WebURL> getOutgoingUrls() {
         return outgoingUrls;
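Returning an empty string for missing tags keeps the callers above free of null checks. An illustrative use inside a WebCrawler subclass's visit method (assuming the inherited logger field; not part of this commit):

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            // Empty string when no <meta name="robots"> tag is present.
            String robots = html.getMetaTagValue("robots");
            logger.info("robots directive for {}: {}", page.getWebURL().getURL(), robots);
        }
    }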

src/main/java/edu/uci/ics/crawler4j/parser/Parser.java (+1)

@@ -138,6 +138,7 @@ public void parse(Page page, String contextURL)
                 webURL.setURL(url);
                 webURL.setTag(urlAnchorPair.getTag());
                 webURL.setAnchor(urlAnchorPair.getAnchor());
+                webURL.setAttributes(urlAnchorPair.getAttributes());
                 outgoingUrls.add(webURL);
                 urlCount++;
                 if (urlCount > config.getMaxOutgoingLinksToFollow()) {

src/main/java/edu/uci/ics/crawler4j/url/WebURL.java (+18)

@@ -19,6 +19,8 @@
 
 import java.io.Serializable;
 
+import java.util.Map;
+
 import com.sleepycat.persist.model.Entity;
 import com.sleepycat.persist.model.PrimaryKey;
 
@@ -44,6 +46,7 @@ public class WebURL implements Serializable {
     private String anchor;
     private byte priority;
     private String tag;
+    private Map<String, String> attributes;
 
     /**
      * @return unique document id assigned to this Url.
@@ -192,6 +195,21 @@ public void setTag(String tag) {
         this.tag = tag;
     }
 
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    public void setAttributes(Map<String, String> attributes) {
+        this.attributes = attributes;
+    }
+
+    public String getAttribute(String name) {
+        if (attributes == null) {
+            return "";
+        }
+        return attributes.getOrDefault(name, "");
+    }
+
     @Override
     public int hashCode() {
         return url.hashCode();
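Because every anchor attribute is now carried on the outgoing WebURL, a crawler can filter on attributes beyond rel="nofollow". A hypothetical sketch that also skips rel="sponsored" links (the extra rule is illustrative only):

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // getAttribute returns "" when the link had no such attribute, so no null check is needed.
        if (url.getAttribute("rel").contains("sponsored")) {
            return false;
        }
        return super.shouldVisit(referringPage, url);
    }
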
New test: NoFollowTest (Spock, +93)

@@ -0,0 +1,93 @@
+package edu.uci.ics.crawler4j.crawler
+
+import com.github.tomakehurst.wiremock.junit.WireMockRule
+import edu.uci.ics.crawler4j.fetcher.PageFetcher
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer
+import edu.uci.ics.crawler4j.url.WebURL
+import org.junit.Rule
+import org.junit.rules.TemporaryFolder
+import spock.lang.Specification
+
+import static com.github.tomakehurst.wiremock.client.WireMock.*
+
+class NoFollowTest extends Specification {
+
+    @Rule
+    public TemporaryFolder temp = new TemporaryFolder()
+
+    @Rule
+    public WireMockRule wireMockRule = new WireMockRule()
+
+    def "ignore nofollow links"() {
+        given: "an index page with two links"
+        stubFor(get(urlEqualTo("/some/index.html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <body>
+                                <a href="/some/page1.html" rel="nofollow">should not visit this</a>
+                                <a href="/some/page2.html">link to a nofollow page</a>
+                            </body>
+                          </html>/$
+                        )))
+        stubFor(get(urlPathMatching("/some/page(1|3).html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <body>
+                                <h1>title</h1>
+                            </body>
+                          </html>/$)))
+        stubFor(get(urlPathMatching("/some/page2.html"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/html")
+                        .withBody(
+                        $/<html>
+                            <head>
+                                <meta name="robots" content="nofollow">
+                            </head>
+                            <body>
+                                <a href="/some/page3.html">should not visit this</a>
+                            </body>
+                          </html>/$)))
+
+        and: "an allow everything robots.txt"
+        stubFor(get(urlPathMatching("/robots.txt"))
+                .willReturn(aResponse()
+                        .withStatus(200)
+                        .withHeader("Content-Type", "text/plain")
+                        .withBody(
+                        $/User-agent: *
+                        Allow: /
+                        /$)))
+
+        when:
+        CrawlConfig config = new CrawlConfig(
+                crawlStorageFolder: temp.getRoot().getAbsolutePath()
+                , politenessDelay: 100
+                , maxConnectionsPerHost: 1
+                , threadShutdownDelaySeconds: 1
+                , threadMonitoringDelaySeconds: 1
+                , cleanupDelaySeconds: 1
+        )
+
+        PageFetcher pageFetcher = new PageFetcher(config)
+        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher)
+        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)
+        controller.addSeed "http://localhost:8080/some/index.html"
+
+        controller.start(WebCrawler.class, 1)
+
+        then: "nofollow links should not be visited"
+        verify(exactly(1), getRequestedFor(urlEqualTo("/robots.txt")))
+        verify(exactly(0), getRequestedFor(urlEqualTo("/some/page1.html")))
+        verify(exactly(1), getRequestedFor(urlEqualTo("/some/page2.html")))
+        verify(exactly(0), getRequestedFor(urlEqualTo("/some/page3.html")))
+    }
+}
