Skip to content

Commit

Permalink
Create handler for 3xx pages
Browse files Browse the repository at this point in the history
  • Loading branch information
JCotton1123 committed Dec 27, 2016
1 parent 0940b99 commit 4e6cdaa
Showing 1 changed file with 19 additions and 8 deletions.
27 changes: 19 additions & 8 deletions src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,15 @@ protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
pageSize);
}

/**
 * Hook invoked when the crawler fetches a page that answered with a 3xx
 * (redirect) status code. The default implementation does nothing.
 *
 * <p>At the call site in {@code processPage}, this hook runs after the page
 * has been marked as a redirect and its redirected-to URL has been set, and
 * before the follow-redirects configuration is checked. It is not reached
 * when the fetch result reports no redirect target.
 *
 * @param page Partial page object describing the redirecting URL
 */
protected void onRedirectedStatusCode(Page page) {
    // Intentionally empty: subclasses may override to add custom handling.
}

/**
* This function is called if the crawler encountered an unexpected HTTP status code (a
* status code other than 3xx).
Expand Down Expand Up @@ -361,15 +370,17 @@ private void processPage(WebURL curURL) {
// follow https://issues.apache.org/jira/browse/HTTPCORE-389

page.setRedirect(true);
if (myController.getConfig().isFollowRedirects()) {
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
curURL);
return;
}
page.setRedirectedToUrl(movedToUrl);

String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
curURL);
return;
}
page.setRedirectedToUrl(movedToUrl);
onRedirectedStatusCode(page);

if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
Expand Down

0 comments on commit 4e6cdaa

Please sign in to comment.