Integrate rzo1's postgres example.
s17t committed Mar 1, 2018
1 parent c7811d7 commit 349a42d
Showing 28 changed files with 492 additions and 23 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -197,12 +197,12 @@ To use a factory just call the right method in the `CrawlController` (probably y
controller.startNonBlocking(factory, numberOfCrawlers);
```
## More Examples
-- [Basic crawler](crawler4j-examples/src/test/java/edu/uci/ics/crawler4j/examples/basic/): the full source code of the above example with more details.
-- [Image crawler](crawler4j-examples/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/): a simple image crawler that downloads image content from the crawling domain and stores them in a folder. This example demonstrates how binary content can be fetched using crawler4j.
-- [Collecting data from threads](crawler4j-examples/src/test/java/edu/uci/ics/crawler4j/examples/localdata/): this example demonstrates how the controller can collect data/statistics from crawling threads.
-- [Multiple crawlers](crawler4j-examples/src/test/java/edu/uci/ics/crawler4j/examples/multiple/): this is a sample that shows how two distinct crawlers can run concurrently. For example, you might want to split your crawling into different domains and then take different crawling policies for each group. Each crawling controller can have its own configurations.
-- [Shutdown crawling](crawler4j-examples/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/): this example shows how crawling can be terminated gracefully by sending the 'shutdown' command to the controller.
-- [Postgres/JDBC integration](https://github.com/rzo1/crawler4j-postgres-sample): this shows how to save the crawled content into a Postgres database (or any other JDBC repository).
+- [Basic crawler](crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/basic/): the full source code of the above example with more details.
+- [Image crawler](crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/imagecrawler/): a simple image crawler that downloads image content from the crawling domain and stores them in a folder. This example demonstrates how binary content can be fetched using crawler4j.
+- [Collecting data from threads](crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/localdata/): this example demonstrates how the controller can collect data/statistics from crawling threads.
+- [Multiple crawlers](crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/multiple/): this is a sample that shows how two distinct crawlers can run concurrently. For example, you might want to split your crawling into different domains and then take different crawling policies for each group. Each crawling controller can have its own configurations.
+- [Shutdown crawling](crawler4j-examples/crawler4j-examples-base/src/test/java/edu/uci/ics/crawler4j/examples/shutdown/): this example shows how crawling can be terminated gracefully by sending the 'shutdown' command to the controller.
+- [Postgres/JDBC integration](crawler4j-examples/crawler4j-examples-postgres/): this shows how to save the crawled content into a Postgres database (or any other JDBC repository), thanks to [rzo1](https://github.com/rzo1/).

## Configuration Details
The controller class has a mandatory parameter of type [CrawlConfig](crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java).
23 changes: 23 additions & 0 deletions crawler4j-examples/crawler4j-examples-base/pom.xml
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>crawler4j-examples</artifactId>
<groupId>edu.uci.ics</groupId>
<version>4.4.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>crawler4j-examples-base</artifactId>
<modelVersion>4.0.0</modelVersion>

<dependencies>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>${project.parent.version}</version>
</dependency>
</dependencies>


</project>
3 changes: 3 additions & 0 deletions crawler4j-examples/crawler4j-examples-postgres/README.md
@@ -0,0 +1,3 @@
A sample that shows how to save crawled pages into a JDBC repository.

Shamelessly grabbed, with rzo1's permission, from [the original repo](https://github.com/rzo1/crawler4j-postgres-sample).
95 changes: 95 additions & 0 deletions crawler4j-examples/crawler4j-examples-postgres/pom.xml
@@ -0,0 +1,95 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>crawler4j-examples</artifactId>
<groupId>edu.uci.ics</groupId>
<version>4.4.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>

<groupId>crawler4j-examples</groupId>
<artifactId>crawler4j-examples-postgres</artifactId>

<packaging>jar</packaging>

<properties>
<!-- 3rd party libs -->
<postgresql.version>42.2.1</postgresql.version>
<crawler4j.version>4.4.0-SNAPSHOT</crawler4j.version>
<c3p0.version>0.9.5.2</c3p0.version>
</properties>

<repositories>
<repository>
<snapshots>
<enabled>false</enabled>
</snapshots>
<id>bintray-palantir-releases</id>
<name>bintray</name>
<url>https://palantir.bintray.com/releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<snapshots>
<enabled>false</enabled>
</snapshots>
<id>bintray-palantir-releases</id>
<name>bintray-plugins</name>
<url>https://palantir.bintray.com/releases</url>
</pluginRepository>
</pluginRepositories>

<dependencies>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>${postgresql.version}</version>
</dependency>
<dependency>
<groupId>com.mchange</groupId>
<artifactId>c3p0</artifactId>
<version>${c3p0.version}</version>
</dependency>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>${crawler4j.version}</version>
</dependency>
<dependency>
<groupId>org.flywaydb</groupId>
<artifactId>flyway-core</artifactId>
<version>5.0.7</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
<version>4.12</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.palantir.docker.compose/docker-compose-rule-junit4 -->
<dependency>
<groupId>com.palantir.docker.compose</groupId>
<artifactId>docker-compose-rule-junit4</artifactId>
<scope>test</scope>
<version>0.33.0</version>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${java.source}</source>
<target>${java.target}</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
@@ -0,0 +1,71 @@
package edu.uci.ics.crawler4j.examples;

import com.google.common.io.Files;
import com.mchange.v2.c3p0.ComboPooledDataSource;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.examples.crawler.PostgresCrawlerFactory;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import org.flywaydb.core.Flyway;

public class SampleLauncher {

public static void main(String[] args) throws Exception {

// args: [0] = max pages to fetch, [1] = JDBC connection url, [2] = number of crawlers
String crawlStorageFolder = Files.createTempDir().getAbsolutePath();
int numberOfCrawlers = Integer.valueOf(args[2]);

CrawlConfig config = new CrawlConfig();

config.setPolitenessDelay(100);

config.setCrawlStorageFolder(crawlStorageFolder);

config.setMaxPagesToFetch(Integer.valueOf(args[0]));

/*
* Instantiate the controller for this crawl.
*/
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

/*
* For each crawl, you need to add some seed urls. These are the first
* URLs that are fetched and then the crawler starts following links
* which are found in these pages
*/
controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");


Flyway flyway = new Flyway();
flyway.setDataSource(args[1], "crawler4j", "crawler4j");
flyway.migrate();


ComboPooledDataSource pool = new ComboPooledDataSource();
pool.setDriverClass("org.postgresql.Driver");
pool.setJdbcUrl(args[1]);
pool.setUser("crawler4j");
pool.setPassword("crawler4j");
pool.setMaxPoolSize(numberOfCrawlers);
pool.setMinPoolSize(numberOfCrawlers);
pool.setInitialPoolSize(numberOfCrawlers);

/*
* Start the crawl. This is a blocking operation, meaning that your code
* will reach the line after this only when crawling is finished.
*/
controller.start(new PostgresCrawlerFactory(pool), numberOfCrawlers);

pool.close();
}

}
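The launcher above takes its settings from the command line; a minimal sketch of the implied argument handling (the argument order is inferred from the code, while the class and field names are illustrative):

```java
public class LauncherArgs {

    final int maxPagesToFetch;   // args[0], passed to config.setMaxPagesToFetch(...)
    final String jdbcUrl;        // args[1], used by both Flyway and the c3p0 pool
    final int numberOfCrawlers;  // args[2], also sizes the connection pool

    LauncherArgs(String[] args) {
        if (args.length != 3) {
            throw new IllegalArgumentException(
                    "usage: SampleLauncher <maxPagesToFetch> <jdbcUrl> <numberOfCrawlers>");
        }
        this.maxPagesToFetch = Integer.parseInt(args[0]);
        this.jdbcUrl = args[1];
        this.numberOfCrawlers = Integer.parseInt(args[2]);
    }
}
```

An invocation might then look like `SampleLauncher 50 jdbc:postgresql://localhost:5432/crawler4j 8` (the JDBC URL here is illustrative; it must point at the database the Flyway migration and c3p0 pool should use).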
@@ -0,0 +1,23 @@
package edu.uci.ics.crawler4j.examples.crawler;

import com.mchange.v2.c3p0.ComboPooledDataSource;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.examples.db.impl.PostgresDBServiceImpl;

import java.beans.PropertyVetoException;

/**
* Created by rz on 03.06.2016.
*/
public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> {

private ComboPooledDataSource comboPooledDataSource;

public PostgresCrawlerFactory(ComboPooledDataSource comboPooledDataSource) {
this.comboPooledDataSource = comboPooledDataSource;
}

public PostgresWebCrawler newInstance() throws Exception {
return new PostgresWebCrawler(new PostgresDBServiceImpl(comboPooledDataSource));
}
}
@@ -0,0 +1,69 @@
package edu.uci.ics.crawler4j.examples.crawler;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.examples.db.PostgresDBService;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import org.slf4j.Logger;

import java.util.Set;
import java.util.regex.Pattern;

public class PostgresWebCrawler extends WebCrawler {

private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresWebCrawler.class);

private static final Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(".*(\\.(" +
"css|js" +
"|bmp|gif|jpe?g|JPE?G|png|tiff?|ico|nef|raw" +
"|mid|mp2|mp3|mp4|wav|wma|flv|mpe?g" +
"|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
"|pdf|doc|docx|pub|xls|xlsx|vsd|ppt|pptx" +
"|swf" +
"|zip|rar|gz|bz2|7z|bin" +
"|xml|txt|java|c|cpp|exe" +
"))$");


private final PostgresDBService postgresDBService;

public PostgresWebCrawler(PostgresDBService postgresDBService) {
this.postgresDBService = postgresDBService;
}

@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
String href = url.getURL().toLowerCase();
return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
}

@Override
public void visit(Page page) {
String url = page.getWebURL().getURL();
logger.info("URL: " + url);

if (page.getParseData() instanceof HtmlParseData) {
HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
String text = htmlParseData.getText();
String html = htmlParseData.getHtml();
Set<WebURL> links = htmlParseData.getOutgoingUrls();

logger.info("Text length: " + text.length());
logger.info("Html length: " + html.length());
logger.info("Number of outgoing links: " + links.size());

try {
postgresDBService.store(page);
} catch (RuntimeException e) {
logger.error("Storing failed", e);
}
}
}

@Override
public void onBeforeExit() {
if (postgresDBService != null) {
postgresDBService.close();
}
}
}
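The exclusion pattern in `shouldVisit` can be exercised on its own; a small self-contained sketch (the pattern string is copied from the crawler above, the sample URLs are made up):

```java
import java.util.regex.Pattern;

public class ExclusionPatternDemo {

    // Same pattern as PostgresWebCrawler.FILE_ENDING_EXCLUSION_PATTERN:
    // matches URLs ending in a static/binary file extension.
    static final Pattern EXCLUDE = Pattern.compile(".*(\\.(" +
            "css|js" +
            "|bmp|gif|jpe?g|JPE?G|png|tiff?|ico|nef|raw" +
            "|mid|mp2|mp3|mp4|wav|wma|flv|mpe?g" +
            "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
            "|pdf|doc|docx|pub|xls|xlsx|vsd|ppt|pptx" +
            "|swf" +
            "|zip|rar|gz|bz2|7z|bin" +
            "|xml|txt|java|c|cpp|exe" +
            "))$");

    // Mirrors shouldVisit: the URL is lower-cased before matching,
    // which makes the upper-case JPE?G alternative in the pattern redundant.
    static boolean shouldVisit(String url) {
        return !EXCLUDE.matcher(url.toLowerCase()).matches();
    }

    public static void main(String[] args) {
        System.out.println(shouldVisit("https://example.org/article.html")); // true  (crawled)
        System.out.println(shouldVisit("https://example.org/logo.PNG"));     // false (excluded)
    }
}
```

Note the pattern only inspects the end of the URL, so a URL carrying an extension in a query string (e.g. `?format=pdf`) would still be visited.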
@@ -0,0 +1,10 @@
package edu.uci.ics.crawler4j.examples.db;

import edu.uci.ics.crawler4j.crawler.Page;

public interface PostgresDBService {

void store(Page webPage);

void close();
}
@@ -0,0 +1,52 @@
package edu.uci.ics.crawler4j.examples.db.impl;

import com.mchange.v2.c3p0.ComboPooledDataSource;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.examples.db.PostgresDBService;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import org.slf4j.Logger;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Timestamp;

public class PostgresDBServiceImpl implements PostgresDBService {

private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresDBServiceImpl.class);

private ComboPooledDataSource comboPooledDataSource;

private PreparedStatement insertKeyStatement;

public PostgresDBServiceImpl(ComboPooledDataSource comboPooledDataSource) throws SQLException {
this.comboPooledDataSource = comboPooledDataSource;
insertKeyStatement = comboPooledDataSource.getConnection().prepareStatement("insert into webpage values " +
"(nextval('id_master_seq'),?,?,?,?)");
}

@Override
public void store(Page page) {

if (page.getParseData() instanceof HtmlParseData) {
try {

HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

insertKeyStatement.setString(1, htmlParseData.getHtml());
insertKeyStatement.setString(2, htmlParseData.getText());
insertKeyStatement.setString(3, page.getWebURL().getURL());
insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime()));
insertKeyStatement.executeUpdate();
} catch (SQLException e) {
logger.error("SQL Exception while storing webpage for url '{}'", page.getWebURL().getURL(), e);
throw new RuntimeException(e);
}
}
}

@Override
public void close() {
if (comboPooledDataSource != null) {
comboPooledDataSource.close();
}
}
}
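The insert statement above implies a `webpage` table and an `id_master_seq` sequence, which the Flyway migration (not part of this excerpt) must create; a hypothetical reconstruction from the bind order, with illustrative column names and types:

```java
public class WebpageSchemaSketch {

    // Columns 2-5 follow the bind order in PostgresDBServiceImpl.store:
    // html (1), text (2), url (3), timestamp (4). Only the table and
    // sequence names come from the code; column names are guesses.
    static final String[] SCHEMA = {
            "CREATE SEQUENCE id_master_seq",
            "CREATE TABLE webpage ("
                    + "id BIGINT PRIMARY KEY, "
                    + "html TEXT, "
                    + "text TEXT, "
                    + "url VARCHAR(4096), "
                    + "seen TIMESTAMP)"
    };

    public static void main(String[] args) {
        for (String statement : SCHEMA) {
            System.out.println(statement + ";");
        }
    }
}
```

In the real example the migration would live in Flyway's default classpath location, `db/migration`; the actual migration file is not shown in this diff.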