Skip to content

Commit

Permalink
Merge pull request yasserg#378 from s17t/master
Browse files Browse the repository at this point in the history
  • Loading branch information
s17t authored Dec 15, 2018
2 parents 1a0df4d + d321979 commit b873207
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,21 @@ public interface WebCrawlerFactory<T extends WebCrawler> {
T newInstance() throws Exception;
}

/**
 * A {@link WebCrawlerFactory} that always hands back the same, pre-built
 * crawler object instead of constructing a new one per call.
 *
 * <p>NOTE(review): because every {@code newInstance()} call returns the SAME
 * object, this factory is only safe when exactly one crawler thread is
 * started (as {@code start(T instance)} does) — a single {@code WebCrawler}
 * must not be shared across crawler threads.
 *
 * @param <T> the concrete {@link WebCrawler} type held by this factory
 */
private static class SingleInstanceFactory<T extends WebCrawler>
implements WebCrawlerFactory<T> {

    /** The single crawler returned by every {@code newInstance()} call. */
    private final T instance;

    SingleInstanceFactory(T instance) {
        this.instance = instance;
    }

    /**
     * Returns the one instance supplied at construction time; never creates
     * a new crawler. The {@code throws Exception} clause of the interface is
     * narrowed away since no exception can occur here.
     */
    @Override
    public T newInstance() {
        return this.instance;
    }
}

private static class DefaultWebCrawlerFactory<T extends WebCrawler>
implements WebCrawlerFactory<T> {
final Class<T> clazz;
Expand Down Expand Up @@ -188,6 +203,18 @@ public <T extends WebCrawler> void start(Class<T> clazz, int numberOfCrawlers) {
this.start(new DefaultWebCrawlerFactory<>(clazz), numberOfCrawlers, true);
}

/**
 * Starts the crawling session with exactly one crawler thread backed by the
 * supplied, pre-constructed crawler object, and blocks until crawling
 * finishes. Only that single instance is ever used for crawling.
 *
 * @param instance the crawler object implementing the crawling logic
 * @param <T> the concrete {@link WebCrawler} subclass of the instance
 */
public <T extends WebCrawler> void start(T instance) {
    WebCrawlerFactory<T> singleInstanceFactory = new SingleInstanceFactory<>(instance);
    this.start(singleInstanceFactory, 1, true);
}

/**
* Start the crawling session and wait for it to finish.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,15 @@ protected void onRedirectedStatusCode(Page page) {
//Subclasses can override this to add their custom functionality
}

/**
 * Invoked when a redirect (3xx) response carried no usable Location, i.e.
 * the crawler was redirected to an invalid URL. The default implementation
 * only logs a warning; subclasses may override it to react differently.
 *
 * @param page the page whose fetch produced the invalid redirect
 */
protected void onRedirectedToInvalidUrl(Page page) {
    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", page.url.getURL());
}

/**
* This function is called if the crawler encountered an unexpected http status code ( a
* status code other than 3xx)
Expand Down Expand Up @@ -405,8 +414,7 @@ private void processPage(WebURL curURL) {

String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
curURL);
onRedirectedToInvalidUrl(page);
return;
}
page.setRedirectedToUrl(movedToUrl);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package edu.uci.ics.crawler4j.crawler

import com.github.tomakehurst.wiremock.core.WireMockConfiguration
import com.github.tomakehurst.wiremock.junit.WireMockRule
import edu.uci.ics.crawler4j.fetcher.PageFetcher
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer
import org.junit.Rule
import org.junit.rules.TemporaryFolder
import spock.lang.Specification

import static com.github.tomakehurst.wiremock.client.WireMock.*

// Spock specification verifying that WebCrawler.onRedirectedToInvalidUrl(...)
// is triggered whenever a 3xx response redirects to an unusable Location.
// Parameterized over every redirect status code via the where: table below.
class OnRedirectedToInvalidTest extends Specification {

// Throw-away folder for the crawl storage; cleaned up after each feature.
@Rule
public TemporaryFolder temp = new TemporaryFolder()

// WireMock HTTP server on a random free port, serving the stubbed pages.
@Rule
public WireMockRule wireMockRule = new WireMockRule(new WireMockConfiguration().dynamicPort())


def "intercept redirect to invalid url"() {
given: "an index page with links to a redirect"

// Deliberately malformed target URL ("asd://" scheme, invalid host).
String redirectToNothing = "asd://-invalid-/varybadlocation"

// Seed page: a single link pointing at the redirecting page.
stubFor(get(urlEqualTo("/some/index.html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html")
.withBody(
$/<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<a href="/some/redirect.html">link to a redirected page to nothing</a>
</body>
</html>/$
)))

when: "the redirect point to an invalid url"
// Redirecting page: replies with the parameterized 3xx code and the
// invalid Location header defined above.
stubFor(get(urlPathMatching("/some/redirect.html"))
.willReturn(aResponse()
.withStatus(redirectHttpCode)
.withHeader("Content-Type", "text/html")
.withHeader("Location", redirectToNothing)
.withBody(
$/<html>
<head>
<title>Moved</title>
</head>
<body>
<h1>Moved</h1>
<p>This page has moved to <a href="${redirectToNothing}">Some invalid location</a>.</p>
</body>
</html>/$)))

and:
// Short delays keep the crawl session fast for a unit test.
CrawlConfig config = new CrawlConfig(
crawlStorageFolder: temp.getRoot().getAbsolutePath()
, politenessDelay: 100
, maxConnectionsPerHost: 1
, threadShutdownDelaySeconds: 1
, threadMonitoringDelaySeconds: 1
, cleanupDelaySeconds: 1
)

// robots.txt checking is disabled so the stubbed server needs no robots stub.
PageFetcher pageFetcher = new PageFetcher(config)
RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
robotstxtConfig.setEnabled false
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)
controller.addSeed "http://localhost:" + wireMockRule.port() + "/some/index.html"

// Single-instance start(...) overload: the same crawler object handles
// the whole session, so its recorded state can be inspected afterwards.
HandleInvalidRedirectWebCrawler crawler = new HandleInvalidRedirectWebCrawler()
controller.start(crawler)

then: "the right event must triggered"
// The crawler must have captured the path of the page whose redirect
// target was invalid.
crawler.invalidLocation.equals("/some/redirect.html")

where:
// Every HTTP redirect status code the crawler treats as a redirect.
redirectHttpCode | _
300 | _
301 | _
302 | _
303 | _
307 | _
308 | _
}
}

// Test crawler that remembers the path of the page whose redirect target
// turned out to be invalid, so the specification can assert on it.
class HandleInvalidRedirectWebCrawler extends WebCrawler {

    // Path of the offending page; stays null until an invalid redirect is seen.
    String invalidLocation

    @Override
    void onRedirectedToInvalidUrl(Page page) {
        // Keep the default warning log, then record where it happened
        // (Groovy property access compiles to the same getter calls).
        super.onRedirectedToInvalidUrl(page)
        this.invalidLocation = page.webURL.path
    }
}

0 comments on commit b873207

Please sign in to comment.