
Commit

Added support for handling http status codes for fetched pages
yasserg committed Jan 22, 2012
1 parent a74d2c0 commit 0161a71
Showing 12 changed files with 412 additions and 157 deletions.
16 changes: 0 additions & 16 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -101,11 +101,6 @@ public class CrawlConfig {
*/
private boolean followRedirects = true;

/**
* Should we log the 404 (Not Found) pages?
*/
private boolean show404PagesInLogs = false;

/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy host.
@@ -330,17 +325,6 @@ public void setFollowRedirects(boolean followRedirects) {
this.followRedirects = followRedirects;
}

public boolean isShow404PagesInLogs() {
return show404PagesInLogs;
}

/**
* Should we log the 404 (Not Found) pages?
*/
public void setShow404PagesInLogs(boolean show404PagesInLogs) {
this.show404PagesInLogs = show404PagesInLogs;
}

public String getProxyHost() {
return proxyHost;
}
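Note that the show404PagesInLogs flag and its accessors are removed outright rather than deprecated; the replacement is the protected handlePageStatusCode hook added to WebCrawler.java further down in this commit (a usage sketch follows that file's diff). Code that previously enabled the flag only needs to drop the call, roughly:

CrawlConfig config = new CrawlConfig();
// config.setShow404PagesInLogs(true);   // removed by this commit; no longer compiles
// 404s and other status codes are now surfaced through
// WebCrawler.handlePageStatusCode(...) instead.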
125 changes: 63 additions & 62 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -33,9 +33,9 @@
import java.util.List;

/**
* The controller that manages a crawling session. This class creates
* the crawler threads and monitors their progress.
*
* The controller that manages a crawling session. This class creates the
* crawler threads and monitors their progress.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class CrawlController extends Configurable {
@@ -48,26 +48,24 @@ public class CrawlController {
*/
protected Object customData;

/**
* Once the crawling session finishes the controller
* collects the local data of the crawler threads and stores
* them in this List.
*/
protected List<Object> crawlersLocalData = new ArrayList<Object>();

/**
* Is the crawling of this session finished?
*/
protected boolean finished;

/**
* Is the crawling session set to 'shutdown'.
* Crawler threads monitor this flag and when it is set
* they will no longer process new pages.
*/
protected boolean shuttingDown;

protected PageFetcher pageFetcher;
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in this List.
*/
protected List<Object> crawlersLocalData = new ArrayList<Object>();

/**
* Is the crawling of this session finished?
*/
protected boolean finished;

/**
* Is the crawling session set to 'shutdown'. Crawler threads monitor this
* flag and when it is set they will no longer process new pages.
*/
protected boolean shuttingDown;

protected PageFetcher pageFetcher;
protected RobotstxtServer robotstxtServer;
protected Frontier frontier;
protected DocIDServer docIdServer;
@@ -82,8 +80,8 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (!folder.mkdirs()) {
throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
}
throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
}
}

boolean resumable = config.isResumableCrawling();
@@ -96,8 +94,8 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
if (!envHome.exists()) {
if (!envHome.mkdir()) {
throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
}
throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
}
}
if (!resumable) {
IO.deleteFolderContents(envHome);
@@ -114,24 +112,28 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
shuttingDown = false;
}

/**
* Start the crawling session and wait for it to finish.
*
* @param _c the class that implements the logic for crawler threads
* @param numberOfCrawlers the number of concurrent threads that will be
* contributing in this crawling session.
*/
/**
* Start the crawling session and wait for it to finish.
*
* @param _c
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing in
* this crawling session.
*/
public <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers) {
this.start(_c, numberOfCrawlers, true);
}

/**
* Start the crawling session and return immediately.
*
* @param _c the class that implements the logic for crawler threads
* @param numberOfCrawlers the number of concurrent threads that will be
* contributing in this crawling session.
*/
/**
* Start the crawling session and return immediately.
*
* @param _c
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing in
* this crawling session.
*/
public <T extends WebCrawler> void startNonBlocking(final Class<T> _c, final int numberOfCrawlers) {
this.start(_c, numberOfCrawlers, false);
}
@@ -256,9 +258,9 @@ public void run() {
}
}

/**
* Wait until this crawling session finishes.
*/
/**
* Wait until this crawling session finishes.
*/
public void waitUntilFinish() {
while (!finished) {
synchronized (waitingLock) {
@@ -274,12 +276,11 @@ public void waitUntilFinish() {
}
}

/**
* Once the crawling session finishes the controller
* collects the local data of the crawler threads and stores
* them in a List. This function returns the reference to this
* list.
*/
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in a List. This function returns
* the reference to this list.
*/
public List<Object> getCrawlersLocalData() {
return crawlersLocalData;
}
@@ -291,13 +292,13 @@ protected void sleep(int seconds) {
}
}

/**
* Adds a new seed URL. A seed URL is a URL that
* is fetched by the crawler to extract new URLs
* in it and follow them for crawling.
*
* @param pageUrl the URL of the seed
*/
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling.
*
* @param pageUrl
* the URL of the seed
*/
public void addSeed(String pageUrl) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
if (canonicalUrl == null) {
@@ -370,11 +371,11 @@ public boolean isShuttingDown() {
return shuttingDown;
}

/**
* Set the current crawling session set to 'shutdown'.
* Crawler threads monitor the shutdown flag and when it is set
* to true, they will no longer process new pages.
*/
/**
* Set the current crawling session set to 'shutdown'. Crawler threads
* monitor the shutdown flag and when it is set to true, they will no longer
* process new pages.
*/
public void Shutdown() {
logger.info("Shutting down...");
this.shuttingDown = true;
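For orientation, a complete crawl session using the signatures visible in this diff looks roughly like the sketch below; the RobotstxtConfig setup, the storage path, and the MyStatusCrawler class (defined after the WebCrawler.java diff below) are assumptions about the rest of the library at this revision, not part of this commit.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class CrawlerLauncher {

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j-root");   // hypothetical storage folder

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        // Constructor shown in this diff: CrawlController(CrawlConfig, PageFetcher, RobotstxtServer).
        // It throws if the crawl storage folder cannot be created.
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed URLs are fetched first and their outgoing links are followed (see addSeed above).
        controller.addSeed("http://www.ics.uci.edu/");

        // start(...) blocks until the crawl finishes; startNonBlocking(...) returns immediately
        // and can be combined with waitUntilFinish().
        controller.start(MyStatusCrawler.class, 10);
    }
}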
38 changes: 25 additions & 13 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -18,7 +18,7 @@
package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetchStatus;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
@@ -27,6 +27,8 @@
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

import java.util.ArrayList;
@@ -144,6 +146,14 @@ public void onStart() {
*/
public void onBeforeExit() {
}

/**
* This function is called once the header of a page is fetched.
* It can be overwritten by sub-classes to perform custom logic
* for different status codes. For example, 404 pages can be logged, etc.
*/
protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
}

/**
* The CrawlController instance that has created this crawler instance will
@@ -213,23 +223,26 @@ public boolean shouldVisit(WebURL url) {
public void visit(Page page) {
}

private int processPage(WebURL curURL) {
private void processPage(WebURL curURL) {
if (curURL == null) {
return -1;
return;
}
PageFetchResult fetchResult = null;
try {
fetchResult = pageFetcher.fetchHeader(curURL);
if (fetchResult.getStatusCode() != PageFetchStatus.OK) {
if (fetchResult.getStatusCode() == PageFetchStatus.Moved) {
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
if (statusCode != HttpStatus.SC_OK) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
if (myController.getConfig().isFollowRedirects()) {
String movedToUrl = curURL.getURL();
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
return PageFetchStatus.MovedToUnknownLocation;
return;
}
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
return PageFetchStatus.RedirectedPageIsSeen;
// Redirect page is already seen
return;
} else {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
Expand All @@ -242,16 +255,16 @@ private int processPage(WebURL curURL) {
}
}
}
return PageFetchStatus.Moved;
} else if (fetchResult.getStatusCode() == PageFetchStatus.PageTooBig) {
} else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
}
return fetchResult.getStatusCode();
return;
}

if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
return PageFetchStatus.RedirectedPageIsSeen;
// Redirect page is already seen
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
@@ -298,7 +311,6 @@ private int processPage(WebURL curURL) {
fetchResult.discardContentIfNotConsumed();
}
}
return 0;
}

public Thread getThread() {
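The new protected handlePageStatusCode hook is the replacement for the deleted show404PagesInLogs option: processPage now invokes it for every fetched header before acting on the status code, so a subclass can log or record whichever codes it cares about. A minimal sketch, with the class name and log wording being illustrative rather than part of the commit:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

public class MyStatusCrawler extends WebCrawler {

    private static final Logger logger = Logger.getLogger(MyStatusCrawler.class);

    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        if (statusCode == HttpStatus.SC_NOT_FOUND) {
            // Covers what the removed show404PagesInLogs flag used to log.
            logger.info("404 Not Found: " + webUrl.getURL());
        } else if (statusCode != HttpStatus.SC_OK) {
            logger.info("Status " + statusCode + " (" + statusDescription + "): " + webUrl.getURL());
        }
    }

    @Override
    public void visit(Page page) {
        // Only pages that came back 200 OK and parsed successfully reach visit(...).
        logger.info("Visited: " + page.getWebURL().getURL());
    }
}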
(Diffs for the remaining 9 changed files did not load and are not shown here.)
