Skip to content

Commit

Permalink
Minor refactoring and improving code styles
Browse files Browse the repository at this point in the history
  • Loading branch information
yasserg committed Feb 27, 2015
1 parent 54db52a commit 6bd194a
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 156 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ implementation:
```java
public class MyCrawler extends WebCrawler {

private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g"
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
+ "|png|mp3|mp4|zip|gz))$");

/**
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@
* @author Yasser Ganjisaffar
*/
public class PageFetcher extends Configurable {

protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);

protected PoolingHttpClientConnectionManager connectionManager;
Expand Down Expand Up @@ -240,6 +239,15 @@ public PageFetchResult fetchPage(WebURL webUrl)
// Checking maximum size
if (fetchResult.getEntity() != null) {
long size = fetchResult.getEntity().getContentLength();
if (size == -1) {
Header length = response.getLastHeader("Content-Length");
if (length == null) {
length = response.getLastHeader("Content-length");
}
if (length != null) {
size = Integer.parseInt(length.getValue());
}
}
if (size > config.getMaxDownloadSize()) {
throw new PageBiggerThanMaxSizeException(size);
}
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/frontier/Counters.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@ public Counters(Environment env, CrawlConfig config) {

/**
 * Returns the current value of the named counter, or 0 when the counter
 * has never been set.
 *
 * @param name the counter name to look up
 * @return the stored value, or 0 if absent
 */
public long getValue(String name) {
    synchronized (mutex) {
        // Single map lookup (the old ternary form queried the map twice).
        Long value = counterValues.get(name);
        if (value == null) {
            return 0;
        }
        return value;
    }
}

Expand Down
63 changes: 27 additions & 36 deletions src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,31 +36,29 @@
*/

public class DocIDServer extends Configurable {
private static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);

private final Database docIDsDB;
private static final String DATABASE_NAME = "DocIDs";
protected static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);

protected Database docIDsDB = null;
private final Object mutex = new Object();

protected final Object mutex = new Object();

protected int lastDocID;
private int lastDocID;

/**
 * Opens (or creates) the doc-ID database. When resumable crawling is
 * enabled, the database is transactional and doc IDs persisted by a
 * previous crawl are reloaded so numbering continues where it left off.
 *
 * @param env    the Berkeley DB environment in which to open the database
 * @param config crawl configuration; {@code isResumableCrawling()} controls
 *               transactional vs. deferred-write storage
 */
public DocIDServer(Environment env, CrawlConfig config) {
    super(config);
    DatabaseConfig dbConfig = new DatabaseConfig();
    dbConfig.setAllowCreate(true);
    // Transactional storage only when the crawl must survive restarts;
    // otherwise deferred writes trade durability for speed.
    dbConfig.setTransactional(config.isResumableCrawling());
    dbConfig.setDeferredWrite(!config.isResumableCrawling());
    lastDocID = 0;
    docIDsDB = env.openDatabase(null, DATABASE_NAME, dbConfig);
    if (config.isResumableCrawling()) {
        int docCount = getDocCount();
        if (docCount > 0) {
            logger.info("Loaded {} URLs that had been detected in previous crawl.", docCount);
            // Continue numbering after the highest ID from the previous run.
            lastDocID = docCount;
        }
    }
}

Expand All @@ -72,46 +70,41 @@ public DocIDServer(Environment env, CrawlConfig config) {
*/
public int getDocId(String url) {
synchronized (mutex) {
int docID = -1;

if (docIDsDB != null) {
OperationStatus result = null;
DatabaseEntry value = new DatabaseEntry();
try {
DatabaseEntry key = new DatabaseEntry(url.getBytes());
result = docIDsDB.get(null, key, value, null);
OperationStatus result = null;
DatabaseEntry value = new DatabaseEntry();
try {
DatabaseEntry key = new DatabaseEntry(url.getBytes());
result = docIDsDB.get(null, key, value, null);

} catch (Exception e) {
logger.error("Exception thrown while getting DocID", e);
}
} catch (Exception e) {
logger.error("Exception thrown while getting DocID", e);
return -1;
}

if ((result != null) && (result == OperationStatus.SUCCESS) && (value.getData().length > 0)) {
docID = Util.byteArray2Int(value.getData());
}
if ((result == OperationStatus.SUCCESS) && (value.getData().length > 0)) {
return Util.byteArray2Int(value.getData());
}

return docID;
return -1;
}
}

public int getNewDocID(String url) {

synchronized (mutex) {
int docID = -1;
try {
// Make sure that we have not already assigned a docid for this URL
docID = getDocId(url);

if (docID <= 0) {
lastDocID++;
docIDsDB.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(Util.int2ByteArray(lastDocID)));
docID = lastDocID;
int docID = getDocId(url);
if (docID > 0) {
return docID;
}

++lastDocID;
docIDsDB.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(Util.int2ByteArray(lastDocID)));
return lastDocID;
} catch (Exception e) {
logger.error("Exception thrown while getting new DocID", e);
return -1;
}

return docID;
}
}

Expand Down Expand Up @@ -140,14 +133,12 @@ public boolean isSeenBefore(String url) {
}

/**
 * Returns the number of doc IDs stored in the database.
 *
 * @return the record count, or -1 when the count cannot be read
 *         (the error is logged, not rethrown)
 */
public final int getDocCount() {
    try {
        // Database.count() reports long; the crawl's ID space is int-sized.
        return (int) docIDsDB.count();
    } catch (DatabaseException e) {
        logger.error("Exception thrown while getting DOC Count", e);
        return -1;
    }
}

public void close() {
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
*/

public class Frontier extends Configurable {

protected static final Logger logger = LoggerFactory.getLogger(Frontier.class);

private static final String DATABASE_NAME = "PendingURLsDB";
protected WorkQueues workQueues;

Expand Down
42 changes: 13 additions & 29 deletions src/main/java/edu/uci/ics/crawler4j/frontier/InProcessPagesDB.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.Transaction;
Expand All @@ -38,9 +37,10 @@
* @author Yasser Ganjisaffar
*/
public class InProcessPagesDB extends WorkQueues {

private static final Logger logger = LoggerFactory.getLogger(InProcessPagesDB.class);

private static final String DATABASE_NAME = "InProcessPagesDB";

public InProcessPagesDB(Environment env) {
super(env, DATABASE_NAME, true);
long docCount = getLength();
Expand All @@ -51,37 +51,21 @@ public InProcessPagesDB(Environment env) {

public boolean removeURL(WebURL webUrl) {
synchronized (mutex) {
try {
DatabaseEntry key = getDatabaseEntryKey(webUrl);
Cursor cursor = null;
DatabaseEntry value = new DatabaseEntry();
Transaction txn = env.beginTransaction(null, null);
try {
cursor = urlsDB.openCursor(txn, null);
OperationStatus result = cursor.getSearchKey(key, value, null);
DatabaseEntry key = getDatabaseEntryKey(webUrl);
DatabaseEntry value = new DatabaseEntry();
Transaction txn = beginTransaction();
try (Cursor cursor = openCursor(txn)) {
OperationStatus result = cursor.getSearchKey(key, value, null);

if (result == OperationStatus.SUCCESS) {
result = cursor.delete();
if (result == OperationStatus.SUCCESS) {
result = cursor.delete();
if (result == OperationStatus.SUCCESS) {
return true;
}
}
} catch (DatabaseException e) {
if (txn != null) {
txn.abort();
txn = null;
}
throw e;
} finally {
if (cursor != null) {
cursor.close();
}
if (txn != null) {
txn.commit();
return true;
}
}
} catch (Exception e) {
logger.error("Error while manipulating the DB of links from previous crawls", e);
}
if (txn != null) {
txn.commit();
}
}
return false;
Expand Down
Loading

0 comments on commit 6bd194a

Please sign in to comment.