diff --git a/README.md b/README.md
index 0512c2b1a..7b00b997a 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ To use the latest release of crawler4j, please use the following snippet in your
 <dependency>
     <groupId>edu.uci.ics</groupId>
     <artifactId>crawler4j</artifactId>
-    <version>4.0</version>
+    <version>4.1</version>
 </dependency>
 ```
@@ -32,30 +32,32 @@ implementation:
 ```java
 public class MyCrawler extends WebCrawler {
 
-    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g" 
+    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g"
                                                            + "|png|mp3|mp4|zip|gz))$");
 
     /**
-     * You should implement this function to specify whether
-     * the given url should be crawled or not (based on your
-     * crawling logic). In this example, we are instructing
-     * the crawler to ignore urls that have css, js, git, ...
-     * extensions and to only accept urls that start with
-     * "http://www.ics.uci.edu/".
+     * This method receives two parameters. The first parameter is the page
+     * in which we have discovered this new url and the second parameter is
+     * the new url. You should implement this function to specify whether
+     * the given url should be crawled or not (based on your crawling logic).
+     * In this example, we are instructing the crawler to ignore urls that
+     * have css, js, gif, ... extensions and to only accept urls that start
+     * with "http://www.ics.uci.edu/". In this case, we didn't need the
+     * referringPage parameter to make the decision.
      */
     @Override
-    public boolean shouldVisit(WebURL url) {
+    public boolean shouldVisit(Page referringPage, WebURL url) {
         String href = url.getURL().toLowerCase();
-        return !FILTERS.matcher(href).matches() 
+        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.ics.uci.edu/");
     }
 
     /**
-     * This function is called when a page is fetched and ready 
+     * This function is called when a page is fetched and ready
      * to be processed by your program.
      */
     @Override
-    public void visit(Page page) { 
+    public void visit(Page page) {
         String url = page.getWebURL().getURL();
         System.out.println("URL: " + url);
@@ -74,13 +76,13 @@ public class MyCrawler extends WebCrawler {
 ```
 As can be seen in the above code, there are two main functions that should be overridden:
-- shouldVisit: This function decides whether the given URL should be crawled or not. In 
+- shouldVisit: This function decides whether the given URL should be crawled or not. In
 the above example, this example is not allowing .css, .js and media files and only allows
 pages within 'www.ics.uci.edu' domain.
 - visit: This function is called after the content of a URL is downloaded successfully.
 You can easily get the url, text, links, html, and unique id of the downloaded page.
 
-You should also implement a controller class which specifies the seeds of the crawl, 
+You should also implement a controller class which specifies the seeds of the crawl,
 the folder in which intermediate crawl data should be stored and the number of
 concurrent threads:
@@ -114,7 +116,7 @@ public class Controller {
          * Start the crawl. This is a blocking operation, meaning that your code
          * will reach the line after this only when crawling is finished.
          */
-        controller.start(MyCrawler.class, numberOfCrawlers); 
+        controller.start(MyCrawler.class, numberOfCrawlers);
     }
 }
 ```
@@ -169,7 +171,7 @@ crawlConfig.setProxyPort(8080);
 ```
 If your proxy also needs authentication:
 ```java
-crawlConfig.setProxyUsername(username); 
+crawlConfig.setProxyUsername(username);
 crawlConfig.setProxyPassword(password);
 ```
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 5ac15a9d3..8d55d6e36 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -273,17 +273,19 @@ public void run() {
   /**
    * Classes that extends WebCrawler should overwrite this function to tell the
    * crawler whether the given url should be crawled or not. The following
-   * implementation indicates that all urls should be included in the crawl.
+   * default implementation indicates that all urls should be included in the crawl.
    *
    * @param url
    *            the url which we are interested to know whether it should be
    *            included in the crawl or not.
-   * @param page
-   *            Page context from which this URL was scraped
+   * @param referringPage
+   *            The Page in which this url was found or null if the url has been
+   *            a seed url.
    * @return if the url should be included in the crawl it returns true,
    *         otherwise false is returned.
    */
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
+    // By default allow all urls to be crawled.
     return true;
   }
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java
index 0eb36436e..ef69280b5 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java
@@ -44,7 +44,7 @@ public class BasicCrawler extends WebCrawler {
    * should be crawled or not (based on your crawling logic).
    */
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     return !BINARY_FILES_EXTENSIONS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
index d36899a5a..48f02fd05 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
@@ -59,7 +59,7 @@ public static void configure(String[] domain, String storageFolderName) {
   }
 
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     if (filters.matcher(href).matches()) {
       return false;
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java
index 3164fe058..c92dffcce 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/LocalDataCollectorCrawler.java
@@ -43,7 +43,7 @@ public LocalDataCollectorCrawler() {
   }
 
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
   }
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
index 607715ce8..22afc281f 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/multiple/BasicCrawler.java
@@ -46,7 +46,7 @@ public void onStart() {
   }
 
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     if (FILTERS.matcher(href).matches()) {
       return false;
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java
index 2150b7822..850acd98c 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/BasicCrawler.java
@@ -42,7 +42,7 @@ public class BasicCrawler extends WebCrawler {
 
   private static final String DOMAIN = "http://www.ics.uci.edu/";
 
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     return !FILTERS.matcher(href).matches() && href.startsWith(DOMAIN);
   }
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java
index a742a0b1c..3ab6f991c 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/statushandler/StatusHandlerCrawler.java
@@ -43,7 +43,7 @@ public class StatusHandlerCrawler extends WebCrawler {
    * crawling logic).
    */
   @Override
-  public boolean shouldVisit(Page page, WebURL url) {
+  public boolean shouldVisit(Page referringPage, WebURL url) {
     String href = url.getURL().toLowerCase();
     return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
   }
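
After this change, shouldVisit receives the page on which each URL was discovered, so a subclass can base its decision on the referrer rather than on the URL alone. None of the examples in the patch do so; the sketch below shows what the new signature makes possible. The class name is hypothetical and not part of this patch, and the import paths assume the crawler4j 4.1 package layout visible in the file headers above:

```java
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class ReferrerAwareCrawler extends WebCrawler {

    private static final Pattern FILTERS =
        Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz))$");

    private static final String DOMAIN = "http://www.ics.uci.edu/";

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        if (FILTERS.matcher(href).matches()) {
            return false;
        }
        // Per the new javadoc, referringPage is null when the url is a seed,
        // so seeds are judged on their own address alone.
        if (referringPage == null) {
            return href.startsWith(DOMAIN);
        }
        // For discovered links, also require that the referring page itself
        // lives under the target domain, so links found on external pages
        // are never followed.
        String referrer = referringPage.getWebURL().getURL().toLowerCase();
        return referrer.startsWith(DOMAIN) && href.startsWith(DOMAIN);
    }
}
```

Note that existing subclasses must move to the two-argument signature, as every example in this patch does: an old shouldVisit(WebURL url) override becomes an unrelated overload that the crawler no longer calls, and the default implementation (which accepts every url) takes over silently.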