Skip to content

Commit

Permalink
Cleaned basic example.
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Jan 1, 2012
1 parent 52f65cc commit 497d71e
Showing 1 changed file with 10 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,21 @@ public class BasicCrawler extends WebCrawler {
/**
 * Matches URLs that end in a file extension this crawler does not fetch:
 * stylesheets, scripts, images, audio/video, PDF and archive files.
 */
private static final Pattern FILTERS = Pattern.compile(
        ".*(\\.(css|js|bmp|gif|jpe?g"
        + "|png|tiff?|mid|mp2|mp3|mp4"
        + "|wav|avi|mov|mpeg|ram|m4v|pdf"
        + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

/** URL prefix that a (lower-cased) URL must start with to be crawled. */
private static final String DOMAIN = "http://www.ics.uci.edu/";

/**
 * Decides whether the given URL should be crawled. Adapt this to your own
 * crawling logic.
 *
 * <p>The URL is lower-cased and accepted only when it does not match
 * {@code FILTERS} and starts with {@code DOMAIN}.
 *
 * @param url the URL to evaluate
 * @return {@code true} if the URL should be visited, {@code false} otherwise
 */
@Override
public boolean shouldVisit(WebURL url) {
    // Locale.ROOT keeps lower-casing independent of the JVM default locale;
    // under e.g. a Turkish locale 'I' would not map to 'i' and the prefix
    // and extension checks below could silently fail.
    String href = url.getURL().toLowerCase(java.util.Locale.ROOT);
    // Single return: a second, equivalent return statement here would be
    // unreachable code and is rejected by the Java compiler.
    return !FILTERS.matcher(href).matches() && href.startsWith(DOMAIN);
}

/**
* This function is called when a page is fetched and ready
* to be processed by your program.
*/
@Override
public void visit(Page page) {
int docid = page.getWebURL().getDocid();
Expand Down

0 comments on commit 497d71e

Please sign in to comment.