
Commit 949af32

Cleanup image crawler example

yasserg committed Feb 24, 2019
1 parent 8c0fce6 commit 949af32

Showing 2 changed files with 34 additions and 76 deletions.
edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawlController.java
@@ -1,61 +1,37 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package edu.uci.ics.crawler4j.examples.imagecrawler;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
 
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 import edu.uci.ics.crawler4j.crawler.CrawlController;
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 
 /**
  * @author Yasser Ganjisaffar
  */
 public class ImageCrawlController {
     private static final Logger logger = LoggerFactory.getLogger(ImageCrawlController.class);
 
     public static void main(String[] args) throws Exception {
-        if (args.length < 3) {
-            logger.info("Needed parameters: ");
-            logger.info("\t rootFolder (it will contain intermediate crawl data)");
-            logger.info("\t numberOfCrawlers (number of concurrent threads)");
-            logger.info("\t storageFolder (a folder for storing downloaded images)");
-            return;
-        }
-
-        String rootFolder = args[0];
-        int numberOfCrawlers = Integer.parseInt(args[1]);
-        String storageFolder = args[2];
-
-        CrawlConfig config = new CrawlConfig();
-
-        config.setCrawlStorageFolder(rootFolder);
-
-        /*
-         * Since images are binary content, we need to set this parameter to
-         * true to make sure they are included in the crawl.
-         */
+        CrawlConfig config = new CrawlConfig();
+
+        // Set the folder where intermediate crawl data is stored (e.g. list of urls that are extracted from previously
+        // fetched pages and need to be crawled later).
+        config.setCrawlStorageFolder("/tmp/crawler4j/");
+
+        // Number of threads to use during crawling. Increasing this typically makes crawling faster. But crawling
+        // speed depends on many other factors as well. You can experiment with this to figure out what number of
+        // threads works best for you.
+        int numberOfCrawlers = 8;
+
+        // Where should the downloaded images be stored?
+        File storageFolder = new File("/tmp/crawled-images/");
+
+        // Since images are binary content, we need to set this parameter to
+        // true to make sure they are included in the crawl.
         config.setIncludeBinaryContentInCrawling(true);
 
-        String[] crawlDomains = {"https://uci.edu/"};
+        List<String> crawlDomains = Arrays.asList("https://uci.edu/");
 
         PageFetcher pageFetcher = new PageFetcher(config);
         RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
@@ -65,8 +41,12 @@ public static void main(String[] args) throws Exception {
             controller.addSeed(domain);
         }
 
-        ImageCrawler.configure(crawlDomains, storageFolder);
+        if (!storageFolder.exists()) {
+            storageFolder.mkdirs();
+        }
 
-        controller.start(ImageCrawler.class, numberOfCrawlers);
+        CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(storageFolder, crawlDomains);
+        controller.start(factory, numberOfCrawlers);
     }
 
 }
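Why the controller change matters: controller.start(ImageCrawler.class, numberOfCrawlers) makes the controller instantiate crawlers reflectively, which requires a no-argument constructor and is what forced the old example to smuggle its settings in through static fields via ImageCrawler.configure(...). The factory overload of start(...) instead calls the factory once per crawler thread, so each instance receives its configuration through the constructor and can keep it final. A minimal sketch of the same pattern, assuming the single-method CrawlController.WebCrawlerFactory interface used in this diff; the CountingCrawler class and its AtomicInteger are hypothetical, invented for illustration:

    import java.util.concurrent.atomic.AtomicInteger;

    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;

    public class CountingCrawler extends WebCrawler {

        // Shared, thread-safe state injected through the constructor;
        // with start(CountingCrawler.class, n) this would have to be static.
        private final AtomicInteger pagesVisited;

        public CountingCrawler(AtomicInteger pagesVisited) {
            this.pagesVisited = pagesVisited;
        }

        @Override
        public void visit(Page page) {
            pagesVisited.incrementAndGet();
        }
    }

    // Usage, given a controller configured as in ImageCrawlController above:
    // AtomicInteger pagesVisited = new AtomicInteger();
    // CrawlController.WebCrawlerFactory<CountingCrawler> factory =
    //     () -> new CountingCrawler(pagesVisited);
    // controller.start(factory, numberOfCrawlers);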
edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
@@ -1,27 +1,12 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package edu.uci.ics.crawler4j.examples.imagecrawler;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.List;
 import java.util.UUID;
 import java.util.regex.Pattern;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.Files;
 
 import edu.uci.ics.crawler4j.crawler.Page;
@@ -30,10 +15,6 @@
 import edu.uci.ics.crawler4j.url.WebURL;
 
-/**
- * @author Yasser Ganjisaffar
- */
-
 /*
  * This class shows how you can crawl images on the web and store them in a
  * folder. This is just for demonstration purposes and doesn't scale for large
  * number of images. For crawling millions of images you would need to store
@@ -47,16 +28,12 @@ public class ImageCrawler extends WebCrawler {
 
     private static final Pattern imgPatterns = Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$");
 
-    private static File storageFolder;
-    private static String[] crawlDomains;
-
-    public static void configure(String[] domain, String storageFolderName) {
-        crawlDomains = domain;
-
-        storageFolder = new File(storageFolderName);
-        if (!storageFolder.exists()) {
-            storageFolder.mkdirs();
-        }
-    }
+    private final File storageFolder;
+    private final List<String> crawlDomains;
+
+    public ImageCrawler(File storageFolder, List<String> crawlDomains) {
+        this.storageFolder = storageFolder;
+        this.crawlDomains = ImmutableList.copyOf(crawlDomains);
+    }
 
     @Override
@@ -89,17 +66,18 @@ public void visit(Page page) {
             return;
         }
 
-        // get a unique name for storing this image
+        // Get a unique name for storing this image
        String extension = url.substring(url.lastIndexOf('.'));
        String hashedName = UUID.randomUUID() + extension;
 
-        // store image
-        String filename = storageFolder.getAbsolutePath() + "/" + hashedName;
+        // Store image
+        String filename = storageFolder.getAbsolutePath() + '/' + hashedName;
         try {
             Files.write(page.getContentData(), new File(filename));
+            WebCrawler.logger.info("Stored: {}", url);
         } catch (IOException iox) {
-            WebCrawler.logger.error("Failed to write file: " + filename, iox);
+            WebCrawler.logger.error("Failed to write file: {}", filename, iox);
         }
     }
 
 }
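Two details of ImageCrawler are worth noting. First, imgPatterns matches URLs ending in common raster-image extensions (bmp, gif, jpg/jpeg, png, tif/tiff), and it is the controller's setIncludeBinaryContentInCrawling(true) that lets those binary responses reach visit() at all. Second, the shouldVisit override that consults crawlDomains sits in a hunk this diff leaves collapsed (between the constructor and visit()), so the following is only a plausible reconstruction from the visible fields, not the committed code:

    // Hypothetical sketch of a filter consistent with the fields above;
    // it would live inside ImageCrawler, which this diff shows only partially.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        // Follow image URLs so visit() gets a chance to store them.
        if (imgPatterns.matcher(href).matches()) {
            return true;
        }
        // Otherwise stay inside the configured seed domains.
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }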
