diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
index ceea2ce4e..c2788d385 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -155,6 +155,21 @@ public interface WebCrawlerFactory<T extends WebCrawler> {
         T newInstance() throws Exception;
     }
 
+    private static class SingleInstanceFactory<T extends WebCrawler>
+        implements WebCrawlerFactory<T> {
+
+        final T instance;
+
+        SingleInstanceFactory(T instance) {
+            this.instance = instance;
+        }
+
+        @Override
+        public T newInstance() throws Exception {
+            return this.instance;
+        }
+    }
+
     private static class DefaultWebCrawlerFactory<T extends WebCrawler>
         implements WebCrawlerFactory<T> {
         final Class<T> clazz;
@@ -188,6 +203,18 @@ public <T extends WebCrawler> void start(Class<T> clazz, int numberOfCrawlers) {
         this.start(new DefaultWebCrawlerFactory<>(clazz), numberOfCrawlers, true);
     }
 
+    /**
+     * Start the crawling session and wait for it to finish.
+     * This method depends on a single instance of a crawler. Only that instance will be used for crawling.
+     *
+     * @param instance
+     *            the instance of a class that implements the logic for crawler threads
+     * @param <T> Your class extending WebCrawler
+     */
+    public <T extends WebCrawler> void start(T instance) {
+        this.start(new SingleInstanceFactory<>(instance), 1, true);
+    }
+
     /**
      * Start the crawling session and wait for it to finish.
      *
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 786afe752..9f43aae57 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -204,6 +204,15 @@ protected void onRedirectedStatusCode(Page page) {
         //Subclasses can override this to add their custom functionality
     }
 
+    /**
+     * Emitted when the crawler is redirected to an invalid Location.
+     * @param page the page being processed when the fetch result reported no moved-to URL
+     */
+    protected void onRedirectedToInvalidUrl(Page page) {
+        logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
+            page.url.getURL());
+    }
+
     /**
      * This function is called if the crawler encountered an unexpected http status code ( a
      * status code other than 3xx)
@@ -405,8 +414,7 @@ private void processPage(WebURL curURL) {
 
                     String movedToUrl = fetchResult.getMovedToUrl();
                     if (movedToUrl == null) {
-                        logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
-                                    curURL);
+                        onRedirectedToInvalidUrl(page);
                         return;
                     }
                     page.setRedirectedToUrl(movedToUrl);