Commit d08bf87

Add shouldFollowLinksInPage(Page) method, to allow easy configuration for certain crawling policies.

Fixes yasserg#129
bgoldowsky committed Oct 27, 2016
1 parent 16c0b74 commit d08bf87
Showing 1 changed file with 11 additions and 6 deletions.
17 changes: 11 additions & 6 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -302,15 +302,20 @@ public boolean shouldVisit(Page referringPage, WebURL url) {
     }

     /**
-     * Determine whether links in the given page should be added to the queue for crawling.
+     * Determine whether links found at the given URL should be added to the queue for crawling.
      * By default this method returns true always, but classes that extend WebCrawler can
      * override it in order to implement particular policies about which pages should be
      * mined for outgoing links and which should not.
      *
-     * @param page the page currently being visited
+     * If links from the URL are not being followed, then we are not operating as
+     * a web crawler and need not check robots.txt before fetching the single URL.
+     * (see definition at http://www.robotstxt.org/faq/what.html). Thus URLs that
+     * return false from this method will not be subject to robots.txt filtering.
+     *
+     * @param url the URL of the page under consideration
      * @return true if outgoing links from this page should be added to the queue.
      */
-    protected boolean shouldFollowLinksInPage(Page page) {
+    protected boolean shouldFollowLinksIn(WebURL url) {
         return true;
     }

@@ -368,7 +373,7 @@ private void processPage(WebURL curURL) {
                     webURL.setDocid(-1);
                     webURL.setAnchor(curURL.getAnchor());
                     if (shouldVisit(page, webURL)) {
-                        if (robotstxtServer.allows(webURL)) {
+                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                             webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                             frontier.schedule(webURL);
                         } else {
@@ -401,7 +406,7 @@ private void processPage(WebURL curURL) {

             parser.parse(page, curURL.getURL());

-            if (shouldFollowLinksInPage(page)) {
+            if (shouldFollowLinksIn(page.getWebURL())) {
                 ParseData parseData = page.getParseData();
                 List<WebURL> toSchedule = new ArrayList<>();
                 int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
@@ -418,7 +423,7 @@ private void processPage(WebURL curURL) {
                     webURL.setDepth((short) (curURL.getDepth() + 1));
                     if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                         if (shouldVisit(page, webURL)) {
-                            if (robotstxtServer.allows(webURL)) {
+                            if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                                 webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                 toSchedule.add(webURL);
                             } else {
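For context, the new hook is intended to be overridden by subclasses. The sketch below is illustrative only and is not part of this commit: it assumes the WebCrawler and WebURL APIs shown in the diff above, and the class name SingleDomainCrawler and the domain example.com are made up for the example. It fetches every page it is linked to, but only mines pages on the seed domain for further links; per the change above, the off-site URLs it schedules are also exempt from the robots.txt check.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

// Hypothetical subclass illustrating the new hook (not part of this commit).
public class SingleDomainCrawler extends WebCrawler {

    private static final String SEED_DOMAIN = "example.com";

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Fetch everything that is linked, including off-site pages.
        return true;
    }

    @Override
    protected boolean shouldFollowLinksIn(WebURL url) {
        // Only pages on the seed domain are mined for outgoing links.
        // With this commit, URLs for which this returns false are scheduled
        // without consulting robots.txt.
        return SEED_DOMAIN.equalsIgnoreCase(url.getDomain());
    }
}

The effect is a crawl that stays on the seed domain but still visits external pages one link away, treating each of those as a single fetch rather than a crawl.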
