Skip to content

Commit

Permalink
Added support for adding seeds with specific docids.
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Jan 23, 2012
1 parent 0161a71 commit 7a7efd2
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 6 deletions.
41 changes: 35 additions & 6 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,21 +300,50 @@ protected void sleep(int seconds) {
* the URL of the seed
*/
public void addSeed(String pageUrl) {
	// Delegate with docId = -1, which requests an auto-assigned document id.
	this.addSeed(pageUrl, -1);
}

/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
 * document id needs to be unique. Also, note that if you add three seeds
 * with document ids 1, 2, and 7, then the next URL that is found during the
 * crawl will get a doc id of 8. Also, you need to add seeds in
 * increasing order of document ids.
*
 * Specifying doc ids is mainly useful when you have performed a previous
 * crawl, have stored its results, and want to start a new crawl whose seeds
 * receive the same document ids as in the previous crawl.
*
* @param pageUrl the URL of the seed
* @param docId the document id that you want to be assigned to this seed URL.
*
*/
public void addSeed(String pageUrl, int docId) {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
if (canonicalUrl == null) {
logger.error("Invalid seed URL: " + pageUrl);
return;
}
int docid = docIdServer.getDocId(canonicalUrl);
if (docid > 0) {
// This URL is already seen.
return;
if (docId < 0) {
docId = docIdServer.getDocId(canonicalUrl);
if (docId > 0) {
// This URL is already seen.
return;
}
docId = docIdServer.getNewDocID(canonicalUrl);
} else {
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
} catch (Exception e) {
logger.error("Could not add seed: " + e.getMessage());
}
}

WebURL webUrl = new WebURL();
webUrl.setURL(canonicalUrl);
docid = docIdServer.getNewDocID(canonicalUrl);
webUrl.setDocid(docid);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (!robotstxtServer.allows(webUrl)) {
logger.info("Robots.txt does not allow this seed: " + pageUrl);
Expand Down
20 changes: 20 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,26 @@ public int getNewDocID(String url) {
}
}

/**
 * Registers the given URL under a caller-chosen document id instead of an
 * auto-assigned one. Intended for re-crawls that must reuse the doc ids of a
 * previous crawl. Ids must be added in strictly increasing order.
 *
 * @param url   the (canonical) URL to register
 * @param docId the document id to assign; must be strictly greater than the
 *              last id assigned so far
 * @throws Exception if {@code docId} is not larger than the last assigned id,
 *                   or if the URL is already registered under a different id
 */
public void addUrlAndDocId(String url, int docId) throws Exception {
	synchronized (mutex) {
		// Ids must arrive in strictly increasing order so that the next
		// auto-assigned id (based on lastDocID) can never collide with one.
		if (docId <= lastDocID) {
			throw new IllegalArgumentException(
					"Requested doc id: " + docId + " is not larger than: " + lastDocID);
		}

		// Make sure that we have not already assigned a docid for this URL
		int prevDocid = getDocId(url);
		if (prevDocid > 0) {
			if (prevDocid == docId) {
				// Re-adding the exact same mapping is a harmless no-op.
				return;
			}
			throw new IllegalStateException(
					"Doc id: " + prevDocid + " is already assigned to URL: " + url);
		}

		// NOTE(review): url.getBytes() uses the platform default charset; kept
		// as-is so keys stay byte-compatible with the other docIDsDB accesses
		// (e.g. getDocId) — confirm before switching to an explicit charset.
		docIDsDB.put(null, new DatabaseEntry(url.getBytes()),
				new DatabaseEntry(Util.int2ByteArray(docId)));
		lastDocID = docId;
	}
}

public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}
Expand Down

0 comments on commit 7a7efd2

Please sign in to comment.