Skip to content

Commit

Permalink
支持文件下载完成
Browse files Browse the repository at this point in the history
  • Loading branch information
liye committed Mar 30, 2019
1 parent 7735f3b commit e8399a9
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ private List<Info> getLinksFromUrlBasedQuery(UrlBasedQuery query, LinkCollector
private List<Info> getLinksFromApiBasedQuery(ApiBasedQuery query, LinkCollector collector) {
WebClient client = threadClient.get();
HtmlPage page = retryGetPage(client, query.getUrl());
if (page == null) {
return Collections.emptyList();
}
//if can't find the input, directly exit
if (StringUtils.isBlank(Constant.apiBaseConf.getInputXpath())) {
return Collections.emptyList();
Expand All @@ -222,11 +225,13 @@ private List<Info> getLinksFromApiBasedQuery(ApiBasedQuery query, LinkCollector

logger.info("get the keyword input");
input = (HtmlTextInput) inputList.get(0);
page.setFocusedElement(input);
input.setText(query.getKeyword());


//below has locate the main page or iframe, here is no need to locate it again
if (StringUtils.isBlank(Constant.apiBaseConf.getSubmitXpath())) {//if submitXpath is a empty string, use the keyboard enter
//the keyboard enter operation is invalid, temporarily don't use it
logger.info("undefined submit button xpath, use the keyboard return");
input.fireEvent(Event.TYPE_KEY_UP);
page = (HtmlPage) client.getCurrentWindow().getEnclosedPage();
Expand All @@ -247,11 +252,6 @@ private List<Info> getLinksFromApiBasedQuery(ApiBasedQuery query, LinkCollector
}
}

try {
Thread.sleep(3_000);
} catch (InterruptedException ex) {
//ignored
}

//try 5 times to wait .3 second each for filling the page.
List<Info> links = null;
Expand All @@ -264,24 +264,24 @@ private List<Info> getLinksFromApiBasedQuery(ApiBasedQuery query, LinkCollector
}
}
if (!(links = collector.collect(page.asXml(), page.getUrl(), Constant.apiBaseConf.getInfoLinkXpath(), Constant.apiBaseConf.getPayloadXpath())).isEmpty()) {
logger.info("have collect infos, continue to execute");
break;
}
logger.info("can't collect infos from page, retry");
}
return links;
}


private HtmlPage retryGetPage(WebClient client, String url) {
return new Try<HtmlPage>(3).run(new RetryOperation<HtmlPage>() {
@Override
public HtmlPage execute() throws Exception {
return new Try<HtmlPage>(3).run(() -> {
HtmlPage ans = null;
ans = client.getPage(url);
if(ans.getBody() == null) {
ans = null;
}
return ans;
}

});
}
public static void main(String[] args) {
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/com/cufe/deepweb/common/index/IndexClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,10 @@ public void addDocument(Map<String, String> fieldContentPairs) {
doc.add(new TextField(entry.getKey(), entry.getValue(), Field.Store.YES));
}
try{
if (!indexWriter.isOpen()) {
indexWriter = null;
updateIndexWriter();
}
indexWriter.addDocument(doc);
}catch (IOException ex){
logger.error("error happen when add document",ex);
Expand Down Expand Up @@ -456,6 +460,16 @@ public Map<String, Set<Integer>> getDocSetMap(String field,double low,double up)
docSetMap.put(term, new HashSet<>());
}
}

if (docSetMap.size() == 0) {
//backup operation, sometimes here docSetMap's size is zero, should make sure it is not zero
terms = MultiFields.getTerms(indexReader, field);
termsEnum = terms.iterator();
while (termsEnum.next() != null) {
docSetMap.put(termsEnum.term().utf8ToString(), new HashSet<>());
}
}

}catch (IOException ex){
logger.error("IOException in read lucene index", ex);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
package com.cufe.deepweb.common.retry;

/**
 * A single retryable unit of work producing a {@code T}.
 * <p>
 * Converted from an abstract class to a functional interface so call sites can supply
 * the operation as a lambda (see {@code Try#run}). {@code @FunctionalInterface} makes
 * the single-abstract-method contract compiler-enforced.
 *
 * @param <T> the result type of the operation
 */
@FunctionalInterface
public interface RetryOperation<T> {
    /**
     * Execute one attempt of the operation.
     *
     * @return the result of this attempt (may be {@code null} to signal failure,
     *         depending on the caller's retry policy)
     * @throws Exception if the attempt fails; the retry driver decides whether to retry
     */
    T execute() throws Exception;
}
1 change: 1 addition & 0 deletions src/main/java/com/cufe/deepweb/common/retry/Try.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public T run(RetryOperation<T> op) {
T ans = null;
Exception exception = null;
for (int i = 0; i < num; i++) {
exception = null;
try {
ans = op.execute();
} catch (Exception ex) {
Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/cufe/deepweb/crawler/Launcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ private static void init(final String[] args) {
//config website info
Sql2o sql2o = Orm.getSql2o();
try (Connection conn=sql2o.open()) {
conn.setRollbackOnException(true);
//there should have one row corresponding to the webID in database
String sql = "select * from website where webId=:webID";
Constant.webSite = conn.createQuery(sql).addParameter("webID", webID).executeAndFetchFirst(WebSite.class);
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/com/cufe/deepweb/crawler/branch/Scheduler.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.slf4j.LoggerFactory;
import org.sql2o.Connection;
import org.sql2o.Sql2o;
import org.sql2o.Sql2oException;

import java.util.*;
import java.util.concurrent.*;
Expand Down Expand Up @@ -162,7 +163,7 @@ private int round() {
threadPool.shutdown();

//loop here until all the thread in thread pool exit
int stopCount = 3;//a flag to indicate whether to force stop the thread pool
int stopCount = 10;//a flag to indicate whether to force stop the thread pool
while (true) {
try {
//most of the situation, the thread pool would close after the following block, and jump out the while loop
Expand Down Expand Up @@ -325,6 +326,7 @@ public void fixStatus(int pre, int cur) {
public synchronized int dynamicUpdate() {
int sLinkNum = 0;
try (Connection conn = sql2o.open()) {
conn.setRollbackOnException(true);
String sql = null;

//update the last round's fLinkNum and sLinkNum in database's status table
Expand All @@ -333,6 +335,7 @@ public synchronized int dynamicUpdate() {
//update the last round's fLinkNum and sLinkNum in database's status table
int fLinkNum = queryLinkService.getFailedLinkNum();
sLinkNum = queryLinkService.getTotalLinkNum() - fLinkNum;
logger.trace("fLinkNum:{},sLinkNum:{},totalLinkNum:{}", fLinkNum, sLinkNum, queryLinkService.getTotalLinkNum());
conn.createQuery(sql)
.addParameter("fLinkNum", fLinkNum)
.addParameter("sLinkNum", sLinkNum)
Expand Down Expand Up @@ -361,6 +364,8 @@ public synchronized int dynamicUpdate() {
.executeUpdate();
lastSInfoLink = sLinkNum;
lastFInfoLink = fLinkNum;
} catch (Sql2oException ex) {

}
return sLinkNum;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

public class ApiBaseQueryLinkService extends QueryLinkService {
private Logger logger = LoggerFactory.getLogger(ApiBaseQueryLinkService.class);
Expand All @@ -24,6 +25,7 @@ private ApiBasedQuery buildQuery(String keyword) {
return Query.asApiBased(Constant.apiBaseConf.getPrefix(), Constant.apiBaseConf.getInputXpath(), Constant.apiBaseConf.getSubmitXpath(), keyword);
}
public List<Info> getInfoLinks(String keyword) {
this.totalLinkNum++;
ApiBasedQuery query = buildQuery(keyword);
List<Info> links = browser.getAllLinks(query, collector);
if (links.size() == 0) {
Expand All @@ -38,6 +40,9 @@ class InfoLinkCollector extends LinkCollector {
//TODO: should implement in detail
@Override
public List<Info> privateOp(List<Info> links) {
links = links.stream().filter(link -> {//remove the repeated links
return dedu.add(link.getUrl());
}).collect(Collectors.toList());
return links;
}
}
Expand Down

0 comments on commit e8399a9

Please sign in to comment.