Skip to content

Commit

Permalink
Merge pull request yasserg#378 from s17t/master
Browse files Browse the repository at this point in the history
  • Loading branch information
s17t authored Dec 15, 2018
2 parents 1a0df4d + d321979 commit b873207
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,21 @@ public interface WebCrawlerFactory<T extends WebCrawler> {
T newInstance() throws Exception;
}

/**
 * A {@link WebCrawlerFactory} that always hands back the same, pre-built
 * crawler object instead of constructing a new one per call.
 *
 * <p>NOTE(review): because every {@code newInstance()} call returns the SAME
 * object, this factory is only safe when exactly one crawler thread is
 * started (as {@code start(T instance)} does) — a single {@code WebCrawler}
 * must not be shared across crawler threads.
 *
 * @param <T> the concrete {@link WebCrawler} type held by this factory
 */
private static class SingleInstanceFactory<T extends WebCrawler>
implements WebCrawlerFactory<T> {

    /** The single crawler returned by every {@code newInstance()} call. */
    private final T instance;

    SingleInstanceFactory(T instance) {
        this.instance = instance;
    }

    /**
     * Returns the one instance supplied at construction time; never creates
     * a new crawler. The {@code throws Exception} clause of the interface is
     * narrowed away since no exception can occur here.
     */
    @Override
    public T newInstance() {
        return this.instance;
    }
}

private static class DefaultWebCrawlerFactory<T extends WebCrawler>
implements WebCrawlerFactory<T> {
final Class<T> clazz;
Expand Down Expand Up @@ -188,6 +203,18 @@ public <T extends WebCrawler> void start(Class<T> clazz, int numberOfCrawlers) {
this.start(new DefaultWebCrawlerFactory<>(clazz), numberOfCrawlers, true);
}

/**
 * Starts the crawling session with exactly one crawler thread backed by the
 * supplied, pre-constructed crawler object, and blocks until crawling
 * finishes. Only that single instance is ever used for crawling.
 *
 * @param instance the crawler object implementing the crawling logic
 * @param <T> the concrete {@link WebCrawler} subclass of the instance
 */
public <T extends WebCrawler> void start(T instance) {
    WebCrawlerFactory<T> singleInstanceFactory = new SingleInstanceFactory<>(instance);
    this.start(singleInstanceFactory, 1, true);
}

/**
* Start the crawling session and wait for it to finish.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,15 @@ protected void onRedirectedStatusCode(Page page) {
//Subclasses can override this to add their custom functionality
}

/**
 * Invoked when a redirect (3xx) response carried no usable Location, i.e.
 * the crawler was redirected to an invalid URL. The default implementation
 * only logs a warning; subclasses may override it to react differently.
 *
 * @param page the page whose fetch produced the invalid redirect
 */
protected void onRedirectedToInvalidUrl(Page page) {
    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", page.url.getURL());
}

/**
* This function is called if the crawler encountered an unexpected http status code ( a
* status code other than 3xx)
Expand Down Expand Up @@ -405,8 +414,7 @@ private void processPage(WebURL curURL) {

String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
logger.warn("Unexpected error, URL: {} is redirected to NOTHING",
curURL);
onRedirectedToInvalidUrl(page);
return;
}
page.setRedirectedToUrl(movedToUrl);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package edu.uci.ics.crawler4j.crawler

import com.github.tomakehurst.wiremock.core.WireMockConfiguration
import com.github.tomakehurst.wiremock.junit.WireMockRule
import edu.uci.ics.crawler4j.fetcher.PageFetcher
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer
import org.junit.Rule
import org.junit.rules.TemporaryFolder
import spock.lang.Specification

import static com.github.tomakehurst.wiremock.client.WireMock.*

// Spock specification verifying that WebCrawler.onRedirectedToInvalidUrl(...)
// is triggered whenever a 3xx response redirects to an unusable Location.
// Parameterized over every redirect status code via the where: table below.
class OnRedirectedToInvalidTest extends Specification {

// Throw-away folder for the crawl storage; cleaned up after each feature.
@Rule
public TemporaryFolder temp = new TemporaryFolder()

// WireMock HTTP server on a random free port, serving the stubbed pages.
@Rule
public WireMockRule wireMockRule = new WireMockRule(new WireMockConfiguration().dynamicPort())


def "intercept redirect to invalid url"() {
given: "an index page with links to a redirect"

// Deliberately malformed target URL ("asd://" scheme, invalid host).
String redirectToNothing = "asd://-invalid-/varybadlocation"

// Seed page: a single link pointing at the redirecting page.
stubFor(get(urlEqualTo("/some/index.html"))
.willReturn(aResponse()
.withStatus(200)
.withHeader("Content-Type", "text/html")
.withBody(
$/<html>
<head>
<meta charset="UTF-8">
</head>
<body>
<a href="/some/redirect.html">link to a redirected page to nothing</a>
</body>
</html>/$
)))

when: "the redirect point to an invalid url"
// Redirecting page: replies with the parameterized 3xx code and the
// invalid Location header defined above.
stubFor(get(urlPathMatching("/some/redirect.html"))
.willReturn(aResponse()
.withStatus(redirectHttpCode)
.withHeader("Content-Type", "text/html")
.withHeader("Location", redirectToNothing)
.withBody(
$/<html>
<head>
<title>Moved</title>
</head>
<body>
<h1>Moved</h1>
<p>This page has moved to <a href="${redirectToNothing}">Some invalid location</a>.</p>
</body>
</html>/$)))

and:
// Short delays keep the crawl session fast for a unit test.
CrawlConfig config = new CrawlConfig(
crawlStorageFolder: temp.getRoot().getAbsolutePath()
, politenessDelay: 100
, maxConnectionsPerHost: 1
, threadShutdownDelaySeconds: 1
, threadMonitoringDelaySeconds: 1
, cleanupDelaySeconds: 1
)

// robots.txt checking is disabled so the stubbed server needs no robots stub.
PageFetcher pageFetcher = new PageFetcher(config)
RobotstxtConfig robotstxtConfig = new RobotstxtConfig()
robotstxtConfig.setEnabled false
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher)
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer)
controller.addSeed "http://localhost:" + wireMockRule.port() + "/some/index.html"

// Single-instance start(...) overload: the same crawler object handles
// the whole session, so its recorded state can be inspected afterwards.
HandleInvalidRedirectWebCrawler crawler = new HandleInvalidRedirectWebCrawler()
controller.start(crawler)

then: "the right event must triggered"
// The crawler must have captured the path of the page whose redirect
// target was invalid.
crawler.invalidLocation.equals("/some/redirect.html")

where:
// Every HTTP redirect status code the crawler treats as a redirect.
redirectHttpCode | _
300 | _
301 | _
302 | _
303 | _
307 | _
308 | _
}
}

// Test crawler that remembers the path of the page whose redirect target
// turned out to be invalid, so the specification can assert on it.
class HandleInvalidRedirectWebCrawler extends WebCrawler {

    // Path of the offending page; stays null until an invalid redirect is seen.
    String invalidLocation

    @Override
    void onRedirectedToInvalidUrl(Page page) {
        // Keep the default warning log, then record where it happened
        // (Groovy property access compiles to the same getter calls).
        super.onRedirectedToInvalidUrl(page)
        this.invalidLocation = page.webURL.path
    }
}

0 comments on commit b873207

Please sign in to comment.