forked from yasserg/crawler4j
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
28 changed files
with
492 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Module POM for crawler4j-examples-base.
  It inherits from the crawler4j-examples parent one directory up and only adds
  the crawler4j library itself as a dependency.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>edu.uci.ics</groupId>
        <artifactId>crawler4j-examples</artifactId>
        <version>4.4.0-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <artifactId>crawler4j-examples-base</artifactId>

    <dependencies>
        <!-- The crawler library, pinned to the same version as the parent POM. -->
        <dependency>
            <groupId>edu.uci.ics</groupId>
            <artifactId>crawler4j</artifactId>
            <version>${project.parent.version}</version>
        </dependency>
    </dependencies>

</project>
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
A sample that shows how to save crawled pages into a JDBC repository. | ||
|
||
Shamelessly grabbed, with rzo1's permission, from [the original repo](https://github.com/rzo1/crawler4j-postgres-sample). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Module POM for the Postgres sample: a crawler that stores fetched pages through JDBC.
  It inherits from the crawler4j-examples parent one directory up.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>crawler4j-examples</artifactId>
        <groupId>edu.uci.ics</groupId>
        <version>4.4.0-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <groupId>crawler4j-examples</groupId>
    <artifactId>crawler4j-examples-postgres</artifactId>

    <packaging>jar</packaging>

    <properties>
        <!-- 3rd party libs -->
        <postgresql.version>42.2.1</postgresql.version>
        <!--
          Follow the parent's version instead of repeating the literal, so a version bump
          on the parent cannot leave this module building against a stale crawler4j.
        -->
        <crawler4j.version>${project.parent.version}</crawler4j.version>
        <c3p0.version>0.9.5.2</c3p0.version>
    </properties>

    <!--
      Extra repository used to resolve the Palantir docker-compose-rule test dependency.
      NOTE(review): JFrog retired Bintray in 2021; confirm this URL still resolves or
      switch to a repository that hosts the artifact.
    -->
    <repositories>
        <repository>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <id>bintray-palantir-releases</id>
            <name>bintray</name>
            <url>https://palantir.bintray.com/releases</url>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <id>bintray-palantir-releases</id>
            <name>bintray-plugins</name>
            <url>https://palantir.bintray.com/releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>
        <!-- JDBC driver -->
        <dependency>
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>${postgresql.version}</version>
        </dependency>
        <!-- Connection pool -->
        <dependency>
            <groupId>com.mchange</groupId>
            <artifactId>c3p0</artifactId>
            <version>${c3p0.version}</version>
        </dependency>
        <dependency>
            <groupId>edu.uci.ics</groupId>
            <artifactId>crawler4j</artifactId>
            <version>${crawler4j.version}</version>
        </dependency>
        <!-- Schema migrations, run by the launcher before crawling -->
        <dependency>
            <groupId>org.flywaydb</groupId>
            <artifactId>flyway-core</artifactId>
            <version>5.0.7</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
            <version>4.12</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.palantir.docker.compose/docker-compose-rule-junit4 -->
        <dependency>
            <groupId>com.palantir.docker.compose</groupId>
            <artifactId>docker-compose-rule-junit4</artifactId>
            <scope>test</scope>
            <version>0.33.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <!-- java.source / java.target are not defined in this POM; presumably inherited from the parent. -->
                    <source>${java.source}</source>
                    <target>${java.target}</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
71 changes: 71 additions & 0 deletions
71
...wler4j-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/SampleLauncher.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package edu.uci.ics.crawler4j.examples; | ||
|
||
import com.google.common.io.Files; | ||
import com.mchange.v2.c3p0.ComboPooledDataSource; | ||
import edu.uci.ics.crawler4j.crawler.CrawlConfig; | ||
import edu.uci.ics.crawler4j.crawler.CrawlController; | ||
import edu.uci.ics.crawler4j.examples.crawler.PostgresCrawlerFactory; | ||
import edu.uci.ics.crawler4j.fetcher.PageFetcher; | ||
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; | ||
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; | ||
import org.flywaydb.core.Flyway; | ||
|
||
public class SampleLauncher { | ||
|
||
public static void main(String[] args) throws Exception { | ||
|
||
String crawlStorageFolder = Files.createTempDir().getAbsolutePath(); | ||
int numberOfCrawlers = Integer.valueOf(args[2]); | ||
|
||
CrawlConfig config = new CrawlConfig(); | ||
|
||
config.setPolitenessDelay(100); | ||
|
||
config.setCrawlStorageFolder(crawlStorageFolder); | ||
|
||
config.setMaxPagesToFetch(Integer.valueOf(args[0])); | ||
|
||
/* | ||
* Instantiate the controller for this crawl. | ||
*/ | ||
PageFetcher pageFetcher = new PageFetcher(config); | ||
RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); | ||
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); | ||
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); | ||
|
||
/* | ||
* For each crawl, you need to add some seed urls. These are the first | ||
* URLs that are fetched and then the crawler starts following links | ||
* which are found in these pages | ||
*/ | ||
controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity"); | ||
controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank"); | ||
controller.addSeed("https://pt.wikipedia.org/wiki/JDBC"); | ||
controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo"); | ||
controller.addSeed("https://de.wikipedia.org/wiki/Datenbank"); | ||
|
||
|
||
Flyway flyway = new Flyway(); | ||
flyway.setDataSource(args[1], "crawler4j", "crawler4j"); | ||
flyway.migrate(); | ||
|
||
|
||
ComboPooledDataSource pool = new ComboPooledDataSource(); | ||
pool.setDriverClass("org.postgresql.Driver"); | ||
pool.setJdbcUrl(args[1]); | ||
pool.setUser("crawler4j"); | ||
pool.setPassword("crawler4j"); | ||
pool.setMaxPoolSize(numberOfCrawlers); | ||
pool.setMinPoolSize(numberOfCrawlers); | ||
pool.setInitialPoolSize(numberOfCrawlers); | ||
|
||
/* | ||
* Start the crawl. This is a blocking operation, meaning that your code | ||
* will reach the line after this only when crawling is finished. | ||
*/ | ||
controller.start(new PostgresCrawlerFactory(pool), numberOfCrawlers); | ||
|
||
pool.close(); | ||
} | ||
|
||
} |
23 changes: 23 additions & 0 deletions
23
...postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresCrawlerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package edu.uci.ics.crawler4j.examples.crawler; | ||
|
||
import com.mchange.v2.c3p0.ComboPooledDataSource; | ||
import edu.uci.ics.crawler4j.crawler.CrawlController; | ||
import edu.uci.ics.crawler4j.examples.db.impl.PostgresDBServiceImpl; | ||
|
||
import java.beans.PropertyVetoException; | ||
|
||
/** | ||
* Created by rz on 03.06.2016. | ||
*/ | ||
public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> { | ||
|
||
private ComboPooledDataSource comboPooledDataSource; | ||
|
||
public PostgresCrawlerFactory(ComboPooledDataSource comboPooledDataSource) { | ||
this.comboPooledDataSource = comboPooledDataSource; | ||
} | ||
|
||
public PostgresWebCrawler newInstance() throws Exception { | ||
return new PostgresWebCrawler(new PostgresDBServiceImpl(comboPooledDataSource)); | ||
} | ||
} |
69 changes: 69 additions & 0 deletions
69
...les-postgres/src/main/java/edu/uci/ics/crawler4j/examples/crawler/PostgresWebCrawler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package edu.uci.ics.crawler4j.examples.crawler; | ||
|
||
import edu.uci.ics.crawler4j.crawler.Page; | ||
import edu.uci.ics.crawler4j.crawler.WebCrawler; | ||
import edu.uci.ics.crawler4j.examples.db.PostgresDBService; | ||
import edu.uci.ics.crawler4j.parser.HtmlParseData; | ||
import edu.uci.ics.crawler4j.url.WebURL; | ||
import org.slf4j.Logger; | ||
|
||
import java.util.Set; | ||
import java.util.regex.Pattern; | ||
|
||
public class PostgresWebCrawler extends WebCrawler { | ||
|
||
private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresWebCrawler.class); | ||
|
||
private static Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(".*(\\.(" + | ||
"css|js" + | ||
"|bmp|gif|jpe?g|JPE?G|png|tiff?|ico|nef|raw" + | ||
"|mid|mp2|mp3|mp4|wav|wma|flv|mpe?g" + | ||
"|avi|mov|mpeg|ram|m4v|wmv|rm|smil" + | ||
"|pdf|doc|docx|pub|xls|xlsx|vsd|ppt|pptx" + | ||
"|swf" + | ||
"|zip|rar|gz|bz2|7z|bin" + | ||
"|xml|txt|java|c|cpp|exe" + | ||
"))$"); | ||
|
||
|
||
private final PostgresDBService postgresDBService; | ||
|
||
public PostgresWebCrawler(PostgresDBService postgresDBService) { | ||
this.postgresDBService = postgresDBService; | ||
} | ||
|
||
@Override | ||
public boolean shouldVisit(Page referringPage, WebURL url) { | ||
String href = url.getURL().toLowerCase(); | ||
return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches(); | ||
} | ||
|
||
@Override | ||
public void visit(Page page) { | ||
String url = page.getWebURL().getURL(); | ||
logger.info("URL: " + url); | ||
|
||
if (page.getParseData() instanceof HtmlParseData) { | ||
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); | ||
String text = htmlParseData.getText(); | ||
String html = htmlParseData.getHtml(); | ||
Set<WebURL> links = htmlParseData.getOutgoingUrls(); | ||
|
||
logger.info("Text length: " + text.length()); | ||
logger.info("Html length: " + html.length()); | ||
logger.info("Number of outgoing links: " + links.size()); | ||
|
||
try { | ||
postgresDBService.store(page); | ||
} catch (RuntimeException e) { | ||
logger.error("Storing failed", e); | ||
} | ||
} | ||
} | ||
|
||
public void onBeforeExit() { | ||
if (postgresDBService != null) { | ||
postgresDBService.close(); | ||
} | ||
} | ||
} |
10 changes: 10 additions & 0 deletions
10
...-examples-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/PostgresDBService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package edu.uci.ics.crawler4j.examples.db; | ||
|
||
import edu.uci.ics.crawler4j.crawler.Page; | ||
|
||
public interface PostgresDBService { | ||
|
||
void store(Page webPage); | ||
|
||
void close(); | ||
} |
52 changes: 52 additions & 0 deletions
52
...-postgres/src/main/java/edu/uci/ics/crawler4j/examples/db/impl/PostgresDBServiceImpl.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
package edu.uci.ics.crawler4j.examples.db.impl; | ||
|
||
import com.mchange.v2.c3p0.ComboPooledDataSource; | ||
import edu.uci.ics.crawler4j.crawler.Page; | ||
import edu.uci.ics.crawler4j.examples.db.PostgresDBService; | ||
import edu.uci.ics.crawler4j.parser.HtmlParseData; | ||
import org.slf4j.Logger; | ||
import java.sql.PreparedStatement; | ||
import java.sql.SQLException; | ||
import java.sql.Timestamp; | ||
|
||
public class PostgresDBServiceImpl implements PostgresDBService { | ||
|
||
private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresDBServiceImpl.class); | ||
|
||
private ComboPooledDataSource comboPooledDataSource; | ||
|
||
private PreparedStatement insertKeyStatement; | ||
|
||
public PostgresDBServiceImpl(ComboPooledDataSource comboPooledDataSource) throws SQLException { | ||
this.comboPooledDataSource = comboPooledDataSource; | ||
insertKeyStatement = comboPooledDataSource.getConnection().prepareStatement("insert into webpage values " + | ||
"(nextval('id_master_seq'),?,?,?,?)"); | ||
} | ||
|
||
@Override | ||
public void store(Page page) { | ||
|
||
if (page.getParseData() instanceof HtmlParseData) { | ||
try { | ||
|
||
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); | ||
|
||
insertKeyStatement.setString(1, htmlParseData.getHtml()); | ||
insertKeyStatement.setString(2, htmlParseData.getText()); | ||
insertKeyStatement.setString(3, page.getWebURL().getURL()); | ||
insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime())); | ||
insertKeyStatement.executeUpdate(); | ||
} catch (SQLException e) { | ||
logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e); | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void close() { | ||
if (comboPooledDataSource != null) { | ||
comboPooledDataSource.close(); | ||
} | ||
} | ||
} |
Oops, something went wrong.