Skip to content

Commit 4e6cdaa

Browse files
committed
Create handler for 3xx pages
1 parent 0940b99 commit 4e6cdaa

File tree

1 file changed

+19
-8
lines changed

1 file changed

+19
-8
lines changed

src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java

+19-8
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,15 @@ protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
187187
pageSize);
188188
}
189189

190+
/**
191+
* Hook called from processPage when a fetched page returned a 3xx (redirect) status code. It runs after the page has been flagged as a redirect and its redirect target URL has been set, and before the follow-redirects setting is checked; it is not called when the redirect target is missing. The default implementation does nothing.
192+
*
193+
* @param page Partial page object; when this is called it is marked as a redirect and carries the URL it was redirected to
194+
*/
195+
protected void onRedirectedStatusCode(Page page) {
196+
// Intentionally empty: subclasses can override this to add their custom functionality
197+
}
198+
190199
/**
191200
* This function is called if the crawler encountered an unexpected HTTP status code (a
192201
* status code other than 3xx)
@@ -361,15 +370,17 @@ private void processPage(WebURL curURL) {
361370
// follow https://issues.apache.org/jira/browse/HTTPCORE-389
362371

363372
page.setRedirect(true);
364-
if (myController.getConfig().isFollowRedirects()) {
365-
String movedToUrl = fetchResult.getMovedToUrl();
366-
if (movedToUrl == null) {
367-
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
368-
curURL);
369-
return;
370-
}
371-
page.setRedirectedToUrl(movedToUrl);
372373

374+
String movedToUrl = fetchResult.getMovedToUrl();
375+
if (movedToUrl == null) {
376+
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
377+
curURL);
378+
return;
379+
}
380+
page.setRedirectedToUrl(movedToUrl);
381+
onRedirectedStatusCode(page);
382+
383+
if (myController.getConfig().isFollowRedirects()) {
373384
int newDocId = docIdServer.getDocId(movedToUrl);
374385
if (newDocId > 0) {
375386
logger.debug("Redirect page: {} is already seen", curURL);

0 commit comments

Comments
 (0)