From 3367ea7f2602189d7620aac4ec34be3c098627c1 Mon Sep 17 00:00:00 2001 From: Douglass Cutting Date: Wed, 12 Jul 2006 08:16:37 +0000 Subject: [PATCH] Patch a bug introduced by Hadoop 0.4.0, which requires specified input directories to exist. git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@421185 13f79535-47bb-0310-9956-ffa450edef68 --- src/java/org/apache/nutch/crawl/CrawlDb.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 7042274259..20482f1a62 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -65,7 +65,8 @@ public void update(Path crawlDb, Path segment) throws IOException { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); } } - public static JobConf createJob(Configuration config, Path crawlDb) { + public static JobConf createJob(Configuration config, Path crawlDb) + throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -73,7 +74,11 @@ public static JobConf createJob(Configuration config, Path crawlDb) { JobConf job = new NutchJob(config); job.setJobName("crawldb " + crawlDb); - job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME)); + + Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME); + if (FileSystem.get(job).exists(current)) { + job.addInputPath(current); + } job.setInputFormat(SequenceFileInputFormat.class); job.setInputKeyClass(UTF8.class); job.setInputValueClass(CrawlDatum.class);