Skip to content

Commit

Permalink
Added support for adding seeds with specific docids.
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Jan 23, 2012
1 parent 0161a71 commit 7a7efd2
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 6 deletions.
41 changes: 35 additions & 6 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,21 +300,50 @@ protected void sleep(int seconds) {
* the URL of the seed
*/
public void addSeed(String pageUrl) {
	// Delegate with docId = -1, which requests an auto-assigned document id.
	this.addSeed(pageUrl, -1);
}

/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
 * document id needs to be unique. Also, note that if you add three seeds
 * with document ids 1, 2, and 7, then the next URL that is found during the
 * crawl will get a doc id of 8. Also, you need to add seeds in
 * increasing order of document ids.
*
 * Specifying doc ids is mainly useful when you have performed a previous
 * crawl, have stored its results, and want to start a new crawl whose seeds
 * receive the same document ids as in the previous crawl.
*
* @param pageUrl the URL of the seed
* @param docId the document id that you want to be assigned to this seed URL.
*
*/
public void addSeed(String pageUrl, int docId) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
if (canonicalUrl == null) {
logger.error("Invalid seed URL: " + pageUrl);
return;
}
int docid = docIdServer.getDocId(canonicalUrl);
if (docid > 0) {
// This URL is already seen.
return;
if (docId < 0) {
docId = docIdServer.getDocId(canonicalUrl);
if (docId > 0) {
// This URL is already seen.
return;
}
docId = docIdServer.getNewDocID(canonicalUrl);
} else {
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
} catch (Exception e) {
logger.error("Could not add seed: " + e.getMessage());
}
}

WebURL webUrl = new WebURL();
webUrl.setURL(canonicalUrl);
docid = docIdServer.getNewDocID(canonicalUrl);
webUrl.setDocid(docid);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (!robotstxtServer.allows(webUrl)) {
logger.info("Robots.txt does not allow this seed: " + pageUrl);
Expand Down
20 changes: 20 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,26 @@ public int getNewDocID(String url) {
}
}

/**
 * Registers the given URL under a caller-chosen document id instead of an
 * auto-assigned one. Intended for re-crawls that must reuse the doc ids of a
 * previous crawl. Ids must be added in strictly increasing order.
 *
 * @param url   the (canonical) URL to register
 * @param docId the document id to assign; must be strictly greater than the
 *              last id assigned so far
 * @throws Exception if {@code docId} is not larger than the last assigned id,
 *                   or if the URL is already registered under a different id
 */
public void addUrlAndDocId(String url, int docId) throws Exception {
	synchronized (mutex) {
		// Ids must arrive in strictly increasing order so that the next
		// auto-assigned id (based on lastDocID) can never collide with one.
		if (docId <= lastDocID) {
			throw new IllegalArgumentException(
					"Requested doc id: " + docId + " is not larger than: " + lastDocID);
		}

		// Make sure that we have not already assigned a docid for this URL
		int prevDocid = getDocId(url);
		if (prevDocid > 0) {
			if (prevDocid == docId) {
				// Re-adding the exact same mapping is a harmless no-op.
				return;
			}
			throw new IllegalStateException(
					"Doc id: " + prevDocid + " is already assigned to URL: " + url);
		}

		// NOTE(review): url.getBytes() uses the platform default charset; kept
		// as-is so keys stay byte-compatible with the other docIDsDB accesses
		// (e.g. getDocId) — confirm before switching to an explicit charset.
		docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
				new DatabaseEntry(Util.int2ByteArray(docId)));
		lastDocID = docId;
	}
}

public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}
Expand Down

0 comments on commit 7a7efd2

Please sign in to comment.