Skip to content

Commit 9199a2c

Browse files
author
mashuai
committed
Merge remote-tracking branch 'crawler4j/master'
2 parents f65b4a2 + 6bd194a commit 9199a2c

File tree

8 files changed

+91
-156
lines changed

8 files changed

+91
-156
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ implementation:
3232
```java
3333
public class MyCrawler extends WebCrawler {
3434

35-
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g"
35+
private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg"
3636
+ "|png|mp3|mp4|zip|gz))$");
3737

3838
/**

src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@
6969
* @author Yasser Ganjisaffar
7070
*/
7171
public class PageFetcher extends Configurable {
72-
7372
protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);
7473

7574
protected PoolingHttpClientConnectionManager connectionManager;
@@ -240,6 +239,15 @@ public PageFetchResult fetchPage(WebURL webUrl)
240239
// Checking maximum size
241240
if (fetchResult.getEntity() != null) {
242241
long size = fetchResult.getEntity().getContentLength();
242+
if (size == -1) {
243+
Header length = response.getLastHeader("Content-Length");
244+
if (length == null) {
245+
length = response.getLastHeader("Content-length");
246+
}
247+
if (length != null) {
248+
size = Integer.parseInt(length.getValue());
249+
}
250+
}
243251
if (size > config.getMaxDownloadSize()) {
244252
throw new PageBiggerThanMaxSizeException(size);
245253
}

src/main/java/edu/uci/ics/crawler4j/frontier/Counters.java

+5-1
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,11 @@ public Counters(Environment env, CrawlConfig config) {
9595

9696
public long getValue(String name) {
9797
synchronized (mutex) {
98-
return (counterValues.get(name) == null) ? 0 : counterValues.get(name);
98+
Long value = counterValues.get(name);
99+
if (value == null) {
100+
return 0;
101+
}
102+
return value;
99103
}
100104
}
101105

src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java

+27-36
Original file line numberDiff line numberDiff line change
@@ -36,31 +36,29 @@
3636
*/
3737

3838
public class DocIDServer extends Configurable {
39+
private static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);
3940

41+
private final Database docIDsDB;
4042
private static final String DATABASE_NAME = "DocIDs";
41-
protected static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);
4243

43-
protected Database docIDsDB = null;
44+
private final Object mutex = new Object();
4445

45-
protected final Object mutex = new Object();
46-
47-
protected int lastDocID;
46+
private int lastDocID;
4847

4948
public DocIDServer(Environment env, CrawlConfig config) {
5049
super(config);
5150
DatabaseConfig dbConfig = new DatabaseConfig();
5251
dbConfig.setAllowCreate(true);
5352
dbConfig.setTransactional(config.isResumableCrawling());
5453
dbConfig.setDeferredWrite(!config.isResumableCrawling());
54+
lastDocID = 0;
5555
docIDsDB = env.openDatabase(null, DATABASE_NAME, dbConfig);
5656
if (config.isResumableCrawling()) {
5757
int docCount = getDocCount();
5858
if (docCount > 0) {
5959
logger.info("Loaded {} URLs that had been detected in previous crawl.", docCount);
6060
lastDocID = docCount;
6161
}
62-
} else {
63-
lastDocID = 0;
6462
}
6563
}
6664

@@ -72,46 +70,41 @@ public DocIDServer(Environment env, CrawlConfig config) {
7270
*/
7371
public int getDocId(String url) {
7472
synchronized (mutex) {
75-
int docID = -1;
76-
77-
if (docIDsDB != null) {
78-
OperationStatus result = null;
79-
DatabaseEntry value = new DatabaseEntry();
80-
try {
81-
DatabaseEntry key = new DatabaseEntry(url.getBytes());
82-
result = docIDsDB.get(null, key, value, null);
73+
OperationStatus result = null;
74+
DatabaseEntry value = new DatabaseEntry();
75+
try {
76+
DatabaseEntry key = new DatabaseEntry(url.getBytes());
77+
result = docIDsDB.get(null, key, value, null);
8378

84-
} catch (Exception e) {
85-
logger.error("Exception thrown while getting DocID", e);
86-
}
79+
} catch (Exception e) {
80+
logger.error("Exception thrown while getting DocID", e);
81+
return -1;
82+
}
8783

88-
if ((result != null) && (result == OperationStatus.SUCCESS) && (value.getData().length > 0)) {
89-
docID = Util.byteArray2Int(value.getData());
90-
}
84+
if ((result == OperationStatus.SUCCESS) && (value.getData().length > 0)) {
85+
return Util.byteArray2Int(value.getData());
9186
}
9287

93-
return docID;
88+
return -1;
9489
}
9590
}
9691

9792
public int getNewDocID(String url) {
98-
9993
synchronized (mutex) {
100-
int docID = -1;
10194
try {
10295
// Make sure that we have not already assigned a docid for this URL
103-
docID = getDocId(url);
104-
105-
if (docID <= 0) {
106-
lastDocID++;
107-
docIDsDB.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(Util.int2ByteArray(lastDocID)));
108-
docID = lastDocID;
96+
int docID = getDocId(url);
97+
if (docID > 0) {
98+
return docID;
10999
}
100+
101+
++lastDocID;
102+
docIDsDB.put(null, new DatabaseEntry(url.getBytes()), new DatabaseEntry(Util.int2ByteArray(lastDocID)));
103+
return lastDocID;
110104
} catch (Exception e) {
111105
logger.error("Exception thrown while getting new DocID", e);
106+
return -1;
112107
}
113-
114-
return docID;
115108
}
116109
}
117110

@@ -140,14 +133,12 @@ public boolean isSeenBefore(String url) {
140133
}
141134

142135
public final int getDocCount() {
143-
int count = -1;
144-
145136
try {
146-
count = (int) docIDsDB.count();
137+
return (int) docIDsDB.count();
147138
} catch (DatabaseException e) {
148139
logger.error("Exception thrown while getting DOC Count", e);
140+
return -1;
149141
}
150-
return count;
151142
}
152143

153144
public void close() {

src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,8 @@
3434
*/
3535

3636
public class Frontier extends Configurable {
37-
3837
protected static final Logger logger = LoggerFactory.getLogger(Frontier.class);
38+
3939
private static final String DATABASE_NAME = "PendingURLsDB";
4040
protected WorkQueues workQueues;
4141

src/main/java/edu/uci/ics/crawler4j/frontier/InProcessPagesDB.java

+13-29
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323

2424
import com.sleepycat.je.Cursor;
2525
import com.sleepycat.je.DatabaseEntry;
26-
import com.sleepycat.je.DatabaseException;
2726
import com.sleepycat.je.Environment;
2827
import com.sleepycat.je.OperationStatus;
2928
import com.sleepycat.je.Transaction;
@@ -38,9 +37,10 @@
3837
* @author Yasser Ganjisaffar
3938
*/
4039
public class InProcessPagesDB extends WorkQueues {
41-
4240
private static final Logger logger = LoggerFactory.getLogger(InProcessPagesDB.class);
41+
4342
private static final String DATABASE_NAME = "InProcessPagesDB";
43+
4444
public InProcessPagesDB(Environment env) {
4545
super(env, DATABASE_NAME, true);
4646
long docCount = getLength();
@@ -51,37 +51,21 @@ public InProcessPagesDB(Environment env) {
5151

5252
public boolean removeURL(WebURL webUrl) {
5353
synchronized (mutex) {
54-
try {
55-
DatabaseEntry key = getDatabaseEntryKey(webUrl);
56-
Cursor cursor = null;
57-
DatabaseEntry value = new DatabaseEntry();
58-
Transaction txn = env.beginTransaction(null, null);
59-
try {
60-
cursor = urlsDB.openCursor(txn, null);
61-
OperationStatus result = cursor.getSearchKey(key, value, null);
54+
DatabaseEntry key = getDatabaseEntryKey(webUrl);
55+
DatabaseEntry value = new DatabaseEntry();
56+
Transaction txn = beginTransaction();
57+
try (Cursor cursor = openCursor(txn)) {
58+
OperationStatus result = cursor.getSearchKey(key, value, null);
6259

60+
if (result == OperationStatus.SUCCESS) {
61+
result = cursor.delete();
6362
if (result == OperationStatus.SUCCESS) {
64-
result = cursor.delete();
65-
if (result == OperationStatus.SUCCESS) {
66-
return true;
67-
}
68-
}
69-
} catch (DatabaseException e) {
70-
if (txn != null) {
71-
txn.abort();
72-
txn = null;
73-
}
74-
throw e;
75-
} finally {
76-
if (cursor != null) {
77-
cursor.close();
78-
}
79-
if (txn != null) {
80-
txn.commit();
63+
return true;
8164
}
8265
}
83-
} catch (Exception e) {
84-
logger.error("Error while manipulating the DB of links from previous crawls", e);
66+
}
67+
if (txn != null) {
68+
txn.commit();
8569
}
8670
}
8771
return false;

0 commit comments

Comments
 (0)