
Commit

Added support for handling http status codes for fetched pages
yasserg committed Jan 22, 2012
1 parent a74d2c0 commit 0161a71
Showing 12 changed files with 412 additions and 157 deletions.
16 changes: 0 additions & 16 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -101,11 +101,6 @@ public class CrawlConfig {
*/
private boolean followRedirects = true;

/**
* Should we log the 404 (Not Found) pages?
*/
private boolean show404PagesInLogs = false;

/**
* If crawler should run behind a proxy, this parameter can be used for
* specifying the proxy host.
@@ -330,17 +325,6 @@ public void setFollowRedirects(boolean followRedirects) {
this.followRedirects = followRedirects;
}

public boolean isShow404PagesInLogs() {
return show404PagesInLogs;
}

/**
* Should we log the 404 (Not Found) pages?
*/
public void setShow404PagesInLogs(boolean show404PagesInLogs) {
this.show404PagesInLogs = show404PagesInLogs;
}

public String getProxyHost() {
return proxyHost;
}
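Note that the show404PagesInLogs flag and its accessors are removed outright rather than deprecated; the replacement is the protected handlePageStatusCode hook added to WebCrawler.java further down in this commit (a usage sketch follows that file's diff). Code that previously enabled the flag only needs to drop the call, roughly:

CrawlConfig config = new CrawlConfig();
// config.setShow404PagesInLogs(true);   // removed by this commit; no longer compiles
// 404s and other status codes are now surfaced through
// WebCrawler.handlePageStatusCode(...) instead.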
125 changes: 63 additions & 62 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -33,9 +33,9 @@
import java.util.List;

/**
* The controller that manages a crawling session. This class creates
* the crawler threads and monitors their progress.
*
* The controller that manages a crawling session. This class creates the
* crawler threads and monitors their progress.
*
* @author Yasser Ganjisaffar <lastname at gmail dot com>
*/
public class CrawlController extends Configurable {
@@ -48,26 +48,24 @@ public class CrawlController {
*/
protected Object customData;

/**
* Once the crawling session finishes the controller
* collects the local data of the crawler threads and stores
* them in this List.
*/
protected List<Object> crawlersLocalData = new ArrayList<Object>();

/**
* Is the crawling of this session finished?
*/
protected boolean finished;

/**
* Is the crawling session set to 'shutdown'.
* Crawler threads monitor this flag and when it is set
* they will no longer process new pages.
*/
protected boolean shuttingDown;

protected PageFetcher pageFetcher;
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in this List.
*/
protected List<Object> crawlersLocalData = new ArrayList<Object>();

/**
* Is the crawling of this session finished?
*/
protected boolean finished;

/**
* Is the crawling session set to 'shutdown'. Crawler threads monitor this
* flag and when it is set they will no longer process new pages.
*/
protected boolean shuttingDown;

protected PageFetcher pageFetcher;
protected RobotstxtServer robotstxtServer;
protected Frontier frontier;
protected DocIDServer docIdServer;
@@ -82,8 +80,8 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
File folder = new File(config.getCrawlStorageFolder());
if (!folder.exists()) {
if (!folder.mkdirs()) {
throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
}
throw new Exception("Couldn't create this folder: " + folder.getAbsolutePath());
}
}

boolean resumable = config.isResumableCrawling();
@@ -96,8 +94,8 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
File envHome = new File(config.getCrawlStorageFolder() + "/frontier");
if (!envHome.exists()) {
if (!envHome.mkdir()) {
throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
}
throw new Exception("Couldn't create this folder: " + envHome.getAbsolutePath());
}
}
if (!resumable) {
IO.deleteFolderContents(envHome);
@@ -114,24 +112,28 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtSer
shuttingDown = false;
}

/**
* Start the crawling session and wait for it to finish.
*
* @param _c the class that implements the logic for crawler threads
* @param numberOfCrawlers the number of concurrent threads that will be
* contributing in this crawling session.
*/
/**
* Start the crawling session and wait for it to finish.
*
* @param _c
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing in
* this crawling session.
*/
public <T extends WebCrawler> void start(final Class<T> _c, final int numberOfCrawlers) {
this.start(_c, numberOfCrawlers, true);
}

/**
* Start the crawling session and return immediately.
*
* @param _c the class that implements the logic for crawler threads
* @param numberOfCrawlers the number of concurrent threads that will be
* contributing in this crawling session.
*/
/**
* Start the crawling session and return immediately.
*
* @param _c
* the class that implements the logic for crawler threads
* @param numberOfCrawlers
* the number of concurrent threads that will be contributing in
* this crawling session.
*/
public <T extends WebCrawler> void startNonBlocking(final Class<T> _c, final int numberOfCrawlers) {
this.start(_c, numberOfCrawlers, false);
}
@@ -256,9 +258,9 @@ public void run() {
}
}

/**
* Wait until this crawling session finishes.
*/
/**
* Wait until this crawling session finishes.
*/
public void waitUntilFinish() {
while (!finished) {
synchronized (waitingLock) {
@@ -274,12 +276,11 @@ public void waitUntilFinish() {
}
}

/**
* Once the crawling session finishes the controller
* collects the local data of the crawler threads and stores
* them in a List. This function returns the reference to this
* list.
*/
/**
* Once the crawling session finishes the controller collects the local data
* of the crawler threads and stores them in a List. This function returns
* the reference to this list.
*/
public List<Object> getCrawlersLocalData() {
return crawlersLocalData;
}
@@ -291,13 +292,13 @@ protected void sleep(int seconds) {
}
}

/**
* Adds a new seed URL. A seed URL is a URL that
* is fetched by the crawler to extract new URLs
* in it and follow them for crawling.
*
* @param pageUrl the URL of the seed
*/
/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling.
*
* @param pageUrl
* the URL of the seed
*/
public void addSeed(String pageUrl) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
if (canonicalUrl == null) {
@@ -370,11 +371,11 @@ public boolean isShuttingDown() {
return shuttingDown;
}

/**
* Set the current crawling session set to 'shutdown'.
* Crawler threads monitor the shutdown flag and when it is set
* to true, they will no longer process new pages.
*/
/**
* Set the current crawling session set to 'shutdown'. Crawler threads
* monitor the shutdown flag and when it is set to true, they will no longer
* process new pages.
*/
public void Shutdown() {
logger.info("Shutting down...");
this.shuttingDown = true;
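For orientation, a complete crawl session using the signatures visible in this diff looks roughly like the sketch below; the RobotstxtConfig setup, the storage path, and the MyStatusCrawler class (defined after the WebCrawler.java diff below) are assumptions about the rest of the library at this revision, not part of this commit.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class CrawlerLauncher {

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j-root");   // hypothetical storage folder

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        // Constructor shown in this diff: CrawlController(CrawlConfig, PageFetcher, RobotstxtServer).
        // It throws if the crawl storage folder cannot be created.
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        // Seed URLs are fetched first and their outgoing links are followed (see addSeed above).
        controller.addSeed("http://www.ics.uci.edu/");

        // start(...) blocks until the crawl finishes; startNonBlocking(...) returns immediately
        // and can be combined with waitUntilFinish().
        controller.start(MyStatusCrawler.class, 10);
    }
}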
38 changes: 25 additions & 13 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -18,7 +18,7 @@
package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetchStatus;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
@@ -27,6 +27,8 @@
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

import java.util.ArrayList;
@@ -144,6 +146,14 @@ public void onStart() {
*/
public void onBeforeExit() {
}

/**
* This function is called once the header of a page is fetched.
* It can be overwritten by sub-classes to perform custom logic
* for different status codes. For example, 404 pages can be logged, etc.
*/
protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
}

/**
* The CrawlController instance that has created this crawler instance will
@@ -213,23 +223,26 @@ public boolean shouldVisit(WebURL url) {
public void visit(Page page) {
}

private int processPage(WebURL curURL) {
private void processPage(WebURL curURL) {
if (curURL == null) {
return -1;
return;
}
PageFetchResult fetchResult = null;
try {
fetchResult = pageFetcher.fetchHeader(curURL);
if (fetchResult.getStatusCode() != PageFetchStatus.OK) {
if (fetchResult.getStatusCode() == PageFetchStatus.Moved) {
int statusCode = fetchResult.getStatusCode();
handlePageStatusCode(curURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
if (statusCode != HttpStatus.SC_OK) {
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
if (myController.getConfig().isFollowRedirects()) {
String movedToUrl = curURL.getURL();
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
return PageFetchStatus.MovedToUnknownLocation;
return;
}
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
return PageFetchStatus.RedirectedPageIsSeen;
// Redirect page is already seen
return;
} else {
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
Expand All @@ -242,16 +255,16 @@ private int processPage(WebURL curURL) {
}
}
}
return PageFetchStatus.Moved;
} else if (fetchResult.getStatusCode() == PageFetchStatus.PageTooBig) {
} else if (fetchResult.getStatusCode() == CustomFetchStatus.PageTooBig) {
logger.info("Skipping a page which was bigger than max allowed size: " + curURL.getURL());
}
return fetchResult.getStatusCode();
return;
}

if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
return PageFetchStatus.RedirectedPageIsSeen;
// Redirect page is already seen
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
@@ -298,7 +311,6 @@ private int processPage(WebURL curURL) {
fetchResult.discardContentIfNotConsumed();
}
}
return 0;
}

public Thread getThread() {
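The new protected handlePageStatusCode hook is the replacement for the deleted show404PagesInLogs option: processPage now invokes it for every fetched header before acting on the status code, so a subclass can log or record whichever codes it cares about. A minimal sketch, with the class name and log wording being illustrative rather than part of the commit:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;
import org.apache.log4j.Logger;

public class MyStatusCrawler extends WebCrawler {

    private static final Logger logger = Logger.getLogger(MyStatusCrawler.class);

    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        if (statusCode == HttpStatus.SC_NOT_FOUND) {
            // Covers what the removed show404PagesInLogs flag used to log.
            logger.info("404 Not Found: " + webUrl.getURL());
        } else if (statusCode != HttpStatus.SC_OK) {
            logger.info("Status " + statusCode + " (" + statusDescription + "): " + webUrl.getURL());
        }
    }

    @Override
    public void visit(Page page) {
        // Only pages that came back 200 OK and parsed successfully reach visit(...).
        logger.info("Visited: " + page.getWebURL().getURL());
    }
}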
(Diffs for the remaining 9 changed files did not load and are not shown here.)
