More comments for changes in the shouldVisit method
yasserg committed Feb 4, 2015
1 parent 3319b95 commit dc207db
Showing 8 changed files with 30 additions and 26 deletions.
34 changes: 18 additions & 16 deletions README.md
@@ -12,7 +12,7 @@ To use the latest release of crawler4j, please use the following snippet in your
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
- <version>4.0</version>
+ <version>4.1</version>
</dependency>
```

@@ -32,30 +32,32 @@ implementation:
```java
public class MyCrawler extends WebCrawler {

- private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g"
+ private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g"
+ "|png|mp3|mp3|zip|gz))$");

/**
- * You should implement this function to specify whether
- * the given url should be crawled or not (based on your
- * crawling logic). In this example, we are instructing
- * the crawler to ignore urls that have css, js, git, ...
- * extensions and to only accept urls that start with
- * "http://www.ics.uci.edu/".
+ * This method receives two parameters. The first parameter is the page
+ * in which we have discovered this new url and the second parameter is
+ * the new url. You should implement this function to specify whether
+ * the given url should be crawled or not (based on your crawling logic).
+ * In this example, we are instructing the crawler to ignore urls that
+ * have css, js, git, ... extensions and to only accept urls that start
+ * with "http://www.ics.uci.edu/". In this case, we didn't need the
+ * referringPage parameter to make the decision.
*/
@Override
- public boolean shouldVisit(WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
- return !FILTERS.matcher(href).matches()
+ return !FILTERS.matcher(href).matches()
&& href.startsWith("http://www.ics.uci.edu/");
}

/**
- * This function is called when a page is fetched and ready
+ * This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
- public void visit(Page page) {
+ public void visit(Page page) {
String url = page.getWebURL().getURL();
System.out.println("URL: " + url);

@@ -74,13 +76,13 @@
```
As can be seen in the above code, there are two main functions that should be overridden:

- - shouldVisit: This function decides whether the given URL should be crawled or not. In
+ - shouldVisit: This function decides whether the given URL should be crawled or not. In
the above example, the crawler ignores .css, .js and media files and only allows
pages within the 'www.ics.uci.edu' domain.
- visit: This function is called after the content of a URL is downloaded successfully.
You can easily get the URL, text, links, HTML, and unique id of the downloaded page (see the sketch below).
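
The body of visit is collapsed in the diff above. As a rough sketch (assuming the HtmlParseData API that ships with crawler4j 4.x), a typical implementation looks like this:

```java
// Sketch only: assumes edu.uci.ics.crawler4j.parser.HtmlParseData and
// edu.uci.ics.crawler4j.url.WebURL from crawler4j 4.x.
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();               // plain text of the page
        String html = htmlParseData.getHtml();                // raw HTML
        Set<WebURL> links = htmlParseData.getOutgoingUrls();  // links found on the page

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
}
```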

- You should also implement a controller class which specifies the seeds of the crawl,
+ You should also implement a controller class which specifies the seeds of the crawl,
the folder in which intermediate crawl data should be stored and the number of concurrent
threads:

@@ -114,7 +116,7 @@ public class Controller {
* Start the crawl. This is a blocking operation, meaning that your code
* will reach the line after this only when crawling is finished.
*/
- controller.start(MyCrawler.class, numberOfCrawlers);
+ controller.start(MyCrawler.class, numberOfCrawlers);
}
}
```
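
The top of the Controller class is collapsed in the diff. A minimal sketch of the setup that usually precedes the start(...) call, assuming the crawler4j 4.x API (CrawlConfig, PageFetcher, RobotstxtServer, CrawlController), is:

```java
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class Controller {
    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root"; // folder for intermediate crawl data
        int numberOfCrawlers = 7;                       // number of concurrent crawler threads

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        // Wire up the fetcher, robots.txt handling and the controller.
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed URLs: crawling starts from these pages.
        controller.addSeed("http://www.ics.uci.edu/");

        // Blocking call; returns only when the crawl is finished.
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
```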
@@ -169,7 +171,7 @@ crawlConfig.setProxyPort(8080);
```
If your proxy also needs authentication:
```java
- crawlConfig.setProxyUsername(username);
+ crawlConfig.setProxyUsername(username);
crawlConfig.setProxyPassword(password);
```
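
These setters all live on CrawlConfig, so a proxied crawl is configured before the fetcher and controller are built. A short sketch with placeholder host and credentials (not part of this commit):

```java
CrawlConfig crawlConfig = new CrawlConfig();
crawlConfig.setCrawlStorageFolder("/data/crawl/root");
crawlConfig.setProxyHost("proxy.example.com"); // placeholder proxy host
crawlConfig.setProxyPort(8080);
crawlConfig.setProxyUsername("proxyUser");     // placeholder credentials
crawlConfig.setProxyPassword("proxyPass");

// The configured object is then passed on exactly as in the controller example above.
PageFetcher pageFetcher = new PageFetcher(crawlConfig);
```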

10 changes: 6 additions & 4 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -273,17 +273,19 @@ public void run() {
/**
* Classes that extend WebCrawler should override this function to tell the
* crawler whether the given url should be crawled or not. The following
- * implementation indicates that all urls should be included in the crawl.
+ * default implementation indicates that all urls should be included in the crawl.
*
* @param url
* the url which we are interested to know whether it should be
* included in the crawl or not.
- * @param page
- *          Page context from which this URL was scraped
+ * @param referringPage
+ *          The Page in which this url was found or null if the url has been
+ *          a seed url.
* @return if the url should be included in the crawl it returns true,
* otherwise false is returned.
*/
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
// By default allow all urls to be crawled.
return true;
}
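
To illustrate why the parameter was renamed, here is a hypothetical subclass (not part of this commit) that actually consults referringPage, only following links that were discovered on pages of the target site; seeds pass through because their referringPage is null, as documented above:

```java
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class SameSiteCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        if (referringPage == null) {
            return true; // seed url: there is no referring page yet
        }
        // Only follow links that were found on pages of the target domain.
        String referrer = referringPage.getWebURL().getURL().toLowerCase();
        return referrer.startsWith("http://www.ics.uci.edu/");
    }
}
```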

@@ -44,7 +44,7 @@ public class BasicCrawler extends WebCrawler {
* should be crawled or not (based on your crawling logic).
*/
@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();

return !BINARY_FILES_EXTENSIONS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
@@ -59,7 +59,7 @@ public static void configure(String[] domain, String storageFolderName) {
}

@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
if (filters.matcher(href).matches()) {
return false;
@@ -43,7 +43,7 @@ public LocalDataCollectorCrawler() {
}

@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
@@ -46,7 +46,7 @@ public void onStart() {
}

@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
if (FILTERS.matcher(href).matches()) {
return false;
@@ -42,7 +42,7 @@ public class BasicCrawler extends WebCrawler {
private static final String DOMAIN = "http://www.ics.uci.edu/";

@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith(DOMAIN);
}
@@ -43,7 +43,7 @@ public class StatusHandlerCrawler extends WebCrawler {
* crawling logic).
*/
@Override
- public boolean shouldVisit(Page page, WebURL url) {
+ public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
}
