
Commit 949af32

Cleanup image crawler example

yasserg committed Feb 24, 2019
1 parent 8c0fce6 commit 949af32

Showing 2 changed files with 34 additions and 76 deletions.
edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawlController.java
@@ -1,61 +1,37 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package edu.uci.ics.crawler4j.examples.imagecrawler;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
 
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
 import edu.uci.ics.crawler4j.crawler.CrawlController;
 import edu.uci.ics.crawler4j.fetcher.PageFetcher;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 
 /**
  * @author Yasser Ganjisaffar
  */
 public class ImageCrawlController {
     private static final Logger logger = LoggerFactory.getLogger(ImageCrawlController.class);
 
     public static void main(String[] args) throws Exception {
-        if (args.length < 3) {
-            logger.info("Needed parameters: ");
-            logger.info("\t rootFolder (it will contain intermediate crawl data)");
-            logger.info("\t numberOfCrawlers (number of concurrent threads)");
-            logger.info("\t storageFolder (a folder for storing downloaded images)");
-            return;
-        }
-
-        String rootFolder = args[0];
-        int numberOfCrawlers = Integer.parseInt(args[1]);
-        String storageFolder = args[2];
-
-        CrawlConfig config = new CrawlConfig();
-
-        config.setCrawlStorageFolder(rootFolder);
-
-        /*
-         * Since images are binary content, we need to set this parameter to
-         * true to make sure they are included in the crawl.
-         */
+        CrawlConfig config = new CrawlConfig();
+
+        // Set the folder where intermediate crawl data is stored (e.g. list of urls that are extracted from previously
+        // fetched pages and need to be crawled later).
+        config.setCrawlStorageFolder("/tmp/crawler4j/");
+
+        // Number of threads to use during crawling. Increasing this typically makes crawling faster. But crawling
+        // speed depends on many other factors as well. You can experiment with this to figure out what number of
+        // threads works best for you.
+        int numberOfCrawlers = 8;
+
+        // Where should the downloaded images be stored?
+        File storageFolder = new File("/tmp/crawled-images/");
+
+        // Since images are binary content, we need to set this parameter to
+        // true to make sure they are included in the crawl.
         config.setIncludeBinaryContentInCrawling(true);
 
-        String[] crawlDomains = {"https://uci.edu/"};
+        List<String> crawlDomains = Arrays.asList("https://uci.edu/");
 
         PageFetcher pageFetcher = new PageFetcher(config);
         RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
@@ -65,8 +41,12 @@ public static void main(String[] args) throws Exception {
             controller.addSeed(domain);
         }
 
-        ImageCrawler.configure(crawlDomains, storageFolder);
+        if (!storageFolder.exists()) {
+            storageFolder.mkdirs();
+        }
 
-        controller.start(ImageCrawler.class, numberOfCrawlers);
+        CrawlController.WebCrawlerFactory<ImageCrawler> factory = () -> new ImageCrawler(storageFolder, crawlDomains);
+        controller.start(factory, numberOfCrawlers);
     }
 
 }
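Why the controller change matters: controller.start(ImageCrawler.class, numberOfCrawlers) makes the controller instantiate crawlers reflectively, which requires a no-argument constructor and is what forced the old example to smuggle its settings in through static fields via ImageCrawler.configure(...). The factory overload of start(...) instead calls the factory once per crawler thread, so each instance receives its configuration through the constructor and can keep it final. A minimal sketch of the same pattern, assuming the single-method CrawlController.WebCrawlerFactory interface used in this diff; the CountingCrawler class and its AtomicInteger are hypothetical, invented for illustration:

    import java.util.concurrent.atomic.AtomicInteger;

    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;

    public class CountingCrawler extends WebCrawler {

        // Shared, thread-safe state injected through the constructor;
        // with start(CountingCrawler.class, n) this would have to be static.
        private final AtomicInteger pagesVisited;

        public CountingCrawler(AtomicInteger pagesVisited) {
            this.pagesVisited = pagesVisited;
        }

        @Override
        public void visit(Page page) {
            pagesVisited.incrementAndGet();
        }
    }

    // Usage, given a controller configured as in ImageCrawlController above:
    // AtomicInteger pagesVisited = new AtomicInteger();
    // CrawlController.WebCrawlerFactory<CountingCrawler> factory =
    //     () -> new CountingCrawler(pagesVisited);
    // controller.start(factory, numberOfCrawlers);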
edu/uci/ics/crawler4j/examples/imagecrawler/ImageCrawler.java
@@ -1,27 +1,12 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
 package edu.uci.ics.crawler4j.examples.imagecrawler;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.List;
 import java.util.UUID;
 import java.util.regex.Pattern;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.io.Files;
 
 import edu.uci.ics.crawler4j.crawler.Page;
@@ -30,10 +15,6 @@
 import edu.uci.ics.crawler4j.url.WebURL;
 
-/**
- * @author Yasser Ganjisaffar
- */
-
 /*
  * This class shows how you can crawl images on the web and store them in a
  * folder. This is just for demonstration purposes and doesn't scale for large
  * number of images. For crawling millions of images you would need to store
@@ -47,16 +28,12 @@ public class ImageCrawler extends WebCrawler {
 
     private static final Pattern imgPatterns = Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$");
 
-    private static File storageFolder;
-    private static String[] crawlDomains;
-
-    public static void configure(String[] domain, String storageFolderName) {
-        crawlDomains = domain;
-
-        storageFolder = new File(storageFolderName);
-        if (!storageFolder.exists()) {
-            storageFolder.mkdirs();
-        }
-    }
+    private final File storageFolder;
+    private final List<String> crawlDomains;
+
+    public ImageCrawler(File storageFolder, List<String> crawlDomains) {
+        this.storageFolder = storageFolder;
+        this.crawlDomains = ImmutableList.copyOf(crawlDomains);
+    }
 
     @Override
@@ -89,17 +66,18 @@ public void visit(Page page) {
             return;
         }
 
-        // get a unique name for storing this image
+        // Get a unique name for storing this image
        String extension = url.substring(url.lastIndexOf('.'));
        String hashedName = UUID.randomUUID() + extension;
 
-        // store image
-        String filename = storageFolder.getAbsolutePath() + "/" + hashedName;
+        // Store image
+        String filename = storageFolder.getAbsolutePath() + '/' + hashedName;
         try {
             Files.write(page.getContentData(), new File(filename));
+            WebCrawler.logger.info("Stored: {}", url);
         } catch (IOException iox) {
-            WebCrawler.logger.error("Failed to write file: " + filename, iox);
+            WebCrawler.logger.error("Failed to write file: {}", filename, iox);
         }
     }
 
 }
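Two details of ImageCrawler are worth noting. First, imgPatterns matches URLs ending in common raster-image extensions (bmp, gif, jpg/jpeg, png, tif/tiff), and it is the controller's setIncludeBinaryContentInCrawling(true) that lets those binary responses reach visit() at all. Second, the shouldVisit override that consults crawlDomains sits in a hunk this diff leaves collapsed (between the constructor and visit()), so the following is only a plausible reconstruction from the visible fields, not the committed code:

    // Hypothetical sketch of a filter consistent with the fields above;
    // it would live inside ImageCrawler, which this diff shows only partially.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        // Follow image URLs so visit() gets a chance to store them.
        if (imgPatterns.matcher(href).matches()) {
            return true;
        }
        // Otherwise stay inside the configured seed domains.
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }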
