Skip to content

Commit

Permalink
支持302跳转,默认的下载器改成HttpClientDownloader
Browse files Browse the repository at this point in the history
  • Loading branch information
xtuhcy committed Jan 28, 2016
1 parent 5e6d7fd commit c6a105b
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 27 deletions.
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@
<url>https://github.com/xtuhcy/gecco</url>

<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.6</version>
</dependency>
<dependency>
<groupId>com.mashape.unirest</groupId>
<artifactId>unirest-java</artifactId>
<version>1.4.7</version>
<version>1.4.8-SNAPSHOT</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/com/geccocrawler/gecco/GeccoEngine.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import com.alibaba.fastjson.JSON;
import com.geccocrawler.gecco.downloader.Downloader;
import com.geccocrawler.gecco.downloader.UnirestDownloader;
import com.geccocrawler.gecco.downloader.HttpClientDownloader;
import com.geccocrawler.gecco.pipeline.PipelineFactory;
import com.geccocrawler.gecco.request.HttpGetRequest;
import com.geccocrawler.gecco.request.HttpRequest;
Expand Down Expand Up @@ -129,7 +129,7 @@ public void run() {
scheduler = new StartScheduler();
}
if(downloader == null) {
downloader = new UnirestDownloader();
downloader = new HttpClientDownloader();
downloader.userAgent(userAgent);
downloader.timeout(timeout);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,8 @@ public DownloaderException(Throwable cause) {
super(cause);
}

/**
 * Creates a downloader exception that carries only a detail message.
 *
 * @param message description of the failure; passed unchanged to the superclass constructor
 */
public DownloaderException(String message) {
super(message);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
package com.geccocrawler.gecco.downloader;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import com.geccocrawler.gecco.request.HttpGetRequest;
import com.geccocrawler.gecco.request.HttpPostRequest;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.response.HttpResponse;

/**
 * {@link Downloader} implementation backed by Apache HttpClient.
 *
 * <p>Automatic redirect following is disabled: a 301/302 answer is returned to the
 * caller with the redirect target (the {@code Location} header value) as the response
 * content. A 200 answer is returned with its decoded body. Any other status is
 * reported as a {@link DownloaderException}.
 *
 * <p>The user agent, timeout and proxy can be changed at any time through
 * {@link #userAgent(String)}, {@link #timeout(long)} and {@link #proxy(String, int)};
 * the new values apply to downloads started afterwards.
 *
 * @author huchengyi
 */
public class HttpClientDownloader implements Downloader {

    private static final Log log = LogFactory.getLog(HttpClientDownloader.class);

    /** Encoding of POST form bodies and last-resort charset for response bodies. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");

    private final CloseableHttpClient httpClient;

    /** Timeout handed to HttpClient for connect, socket read and pool lease. */
    private long timeout;

    private String userAgent;

    /** Proxy applied to every request, or {@code null} for a direct connection. */
    private HttpHost proxy;

    /**
     * Creates a downloader with a pooled connection manager (1000 connections in
     * total, 50 per route) and redirect following switched off.
     */
    public HttpClientDownloader() {
        RequestConfig clientConfig = RequestConfig.custom().setRedirectsEnabled(false).build();
        PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
        connectionManager.setMaxTotal(1000);
        connectionManager.setDefaultMaxPerRoute(50);
        httpClient = HttpClientBuilder.create()
                .setDefaultRequestConfig(clientConfig)
                .setConnectionManager(connectionManager)
                .build();
    }

    /**
     * Executes the request and converts the answer into a gecco {@link HttpResponse}.
     *
     * @param request GET or POST request to execute
     * @return the response; for 301/302 its content is the redirect target, for 200
     *         it is the decoded body
     * @throws DownloaderException if the transfer fails, a redirect carries no
     *         {@code Location} header, or the status is not 200, 301 or 302
     */
    @Override
    public HttpResponse download(HttpRequest request) throws DownloaderException {
        if(log.isDebugEnabled()) {
            log.debug("downloading..." + request.getUrl());
        }
        HttpRequestBase reqObj = createRequest(request);
        try {
            org.apache.http.HttpResponse response = httpClient.execute(reqObj);
            return toResponse(request, response);
        } catch (DownloaderException e) {
            // Already the declared type: rethrow as-is instead of wrapping it again.
            throw e;
        } catch (Exception e) {
            throw new DownloaderException(e);
        } finally {
            reqObj.releaseConnection();
        }
    }

    /** Builds the HttpClient request (method, form body, headers, per-request config). */
    private HttpRequestBase createRequest(HttpRequest request) throws DownloaderException {
        HttpRequestBase reqObj;
        if(request instanceof HttpPostRequest) {
            HttpPostRequest post = (HttpPostRequest) request;
            List<NameValuePair> fields = new ArrayList<NameValuePair>();
            for(Map.Entry<String, Object> entry : post.getFields().entrySet()) {
                Object value = entry.getValue();
                // A null value is sent as a bare field name instead of failing with an NPE.
                fields.add(new BasicNameValuePair(entry.getKey(), value == null ? null : value.toString()));
            }
            HttpEntityEnclosingRequestBase postObj = new HttpPost(post.getUrl());
            try {
                postObj.setEntity(new UrlEncodedFormEntity(fields, DEFAULT_CHARSET));
            } catch (UnsupportedEncodingException e) {
                // Never send a POST whose body silently went missing.
                throw new DownloaderException(e);
            }
            reqObj = postObj;
        } else {
            reqObj = new HttpGet(request.getUrl());
        }
        if(userAgent != null) {
            reqObj.addHeader("User-Agent", userAgent);
        }
        for(Map.Entry<String, String> entry : request.getHeaders().entrySet()) {
            reqObj.addHeader(entry.getKey(), entry.getValue());
        }
        reqObj.setConfig(buildRequestConfig());
        return reqObj;
    }

    /** Builds the per-request configuration from the current timeout and proxy. */
    private RequestConfig buildRequestConfig() {
        // Clamp instead of truncating so an oversized long cannot wrap around.
        int timeoutValue = (int) Math.max((long) Integer.MIN_VALUE, Math.min((long) Integer.MAX_VALUE, timeout));
        return RequestConfig.custom()
                .setConnectionRequestTimeout(timeoutValue)
                .setSocketTimeout(timeoutValue)
                .setConnectTimeout(timeoutValue)
                .setRedirectsEnabled(false)
                .setProxy(proxy)
                .build();
    }

    /** Converts the HttpClient answer into a gecco response, reading the body fully. */
    private HttpResponse toResponse(HttpRequest request, org.apache.http.HttpResponse response)
            throws IOException, DownloaderException {
        int status = response.getStatusLine().getStatusCode();
        HttpResponse resp = new HttpResponse();
        resp.setStatus(status);
        if(status == 302 || status == 301) {
            org.apache.http.Header location = response.getFirstHeader("Location");
            if(location == null) {
                throw new DownloaderException("ERROR : " + status + " without Location header");
            }
            resp.setContent(location.getValue());
        } else if(status == 200) {
            HttpEntity responseEntity = response.getEntity();
            String contentType = null;
            byte[] body = null;
            if(responseEntity != null) {
                org.apache.http.Header contentTypeHeader = responseEntity.getContentType();
                if(contentTypeHeader != null) {
                    contentType = contentTypeHeader.getValue();
                }
                body = EntityUtils.toByteArray(responseEntity);
            }
            if(body == null) {
                body = new byte[0];
            }
            if(contentType != null) {
                resp.setContentType(contentType);
            }
            String charset = getCharset(request, contentType);
            resp.setCharset(charset);
            // Expose the raw bytes through an in-memory stream: the connection's own
            // stream is consumed here and released before the caller could read it.
            resp.setRaw(new java.io.ByteArrayInputStream(body));
            resp.setContent(new String(body, charset));
        } else {
            throw new DownloaderException("ERROR : " + status);
        }
        return resp;
    }

    /**
     * Parse out a charset from a content type header.
     *
     * @param contentType e.g. "text/html; charset=EUC-JP"
     * @return "EUC-JP", or null if not found or empty. Charset is trimmed and uppercased.
     */
    public String getCharsetFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            // Locale.ROOT keeps the result stable under locales with special casing rules.
            String charset = m.group(1).trim().toUpperCase(java.util.Locale.ROOT);
            // "charset=" with no value matches the pattern but names no charset.
            return charset.isEmpty() ? null : charset;
        }
        return null;
    }

    /**
     * Picks the charset for decoding a body: the Content-Type header first, then the
     * charset configured on the request, finally UTF-8.
     */
    private String getCharset(HttpRequest request, String contentType) {
        String charset = getCharsetFromContentType(contentType);
        if(charset == null || charset.isEmpty()) {
            charset = request.getCharset();
        }
        if(charset == null || charset.isEmpty()) {
            charset = DEFAULT_CHARSET;
        }
        return charset;
    }

    /**
     * Sets the timeout used for connecting, reading and leasing a pooled connection.
     *
     * @param timeout timeout value passed to HttpClient's request configuration
     */
    @Override
    public void timeout(long timeout) {
        this.timeout = timeout;
    }

    /**
     * Sets the {@code User-Agent} header sent with every request.
     *
     * @param userAgent header value; when {@code null} no header is added by this class
     */
    @Override
    public void userAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * Routes subsequent downloads through the given HTTP proxy.
     *
     * @param host proxy host name; {@code null} or blank switches back to direct connections
     * @param port proxy port
     */
    @Override
    public void proxy(String host, int port) {
        if(host == null || host.trim().isEmpty()) {
            this.proxy = null;
        } else {
            this.proxy = new HttpHost(host, port);
        }
    }

    /** Closes the underlying HTTP client; a failure to close is logged, not thrown. */
    @Override
    public void shutdown() {
        try {
            httpClient.close();
        } catch (IOException e) {
            log.warn("failed to close http client", e);
        }
    }

    /** Manual smoke test: downloads one fixed page and prints its content. */
    public static void main(String[] args) throws Exception {
        HttpClientDownloader hd = new HttpClientDownloader();
        hd.timeout(3000);
        //http://temai.tuniu.com/tours/212032167
        //http://san-yun.iteye.com/blog/2065732
        try {
            HttpResponse resp = hd.download(new HttpGetRequest("http://temai.tuniu.com/tours/212032167"));
            System.out.println(resp.getContent());
        } finally {
            hd.shutdown();
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpHost;

import com.geccocrawler.gecco.request.HttpGetRequest;
import com.geccocrawler.gecco.request.HttpPostRequest;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.response.HttpResponse;
Expand Down Expand Up @@ -37,9 +38,9 @@ public HttpResponse download(HttpRequest request) throws DownloaderException {
HttpRequestWithBody httpRequestWithBody = Unirest.post(post.getUrl());
httpRequestWithBody.headers(post.getHeaders());
httpRequestWithBody.fields(post.getFields());
response = httpRequestWithBody.asString();
response = httpRequestWithBody.redirectsEnabled(false).asString();
} else {
response = Unirest.get(request.getUrl()).headers(request.getHeaders()).asString();
response = Unirest.get(request.getUrl()).headers(request.getHeaders()).redirectsEnabled(false).asString();
}
String contentType = response.getHeaders().getFirst("Content-Type");
HttpResponse resp = new HttpResponse();
Expand Down Expand Up @@ -92,4 +93,10 @@ public void userAgent(String userAgent) {
public void proxy(String host, int port) {
Unirest.setProxy(new HttpHost(host, port));
}

/**
 * Manual smoke test: downloads one fixed page with a new {@code UnirestDownloader}
 * and prints the response content to standard output.
 *
 * @param args ignored
 * @throws Exception if the download fails
 */
public static void main(String[] args) throws Exception {
UnirestDownloader ud = new UnirestDownloader();
HttpResponse resp = ud.download(new HttpGetRequest("http://temai.tuniu.com/tours/212032167"));
System.out.println(resp.getContent());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;

import net.sf.cglib.beans.BeanCopier;

public abstract class AbstractHttpRequest implements HttpRequest, Comparable<HttpRequest>, Serializable {
Expand All @@ -30,7 +32,7 @@ public AbstractHttpRequest() {

public AbstractHttpRequest(String url) {
this();
this.url = url;
this.setUrl(url);
}

public void addCookie(String name, String value) {
Expand Down Expand Up @@ -132,7 +134,7 @@ public Map<String, String> getCookies() {

@Override
public void setUrl(String url) {
this.url = url;
this.url = StringUtils.substringBefore(url, "#");
}

/**
Expand Down
46 changes: 26 additions & 20 deletions src/main/java/com/geccocrawler/gecco/spider/Spider.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,27 +53,31 @@ public void run() {
}
currSpiderBeanClass = engine.getSpiderBeanFactory().matchSpider(request);
if(currSpiderBeanClass == null) {
log.info("cant't match url : " + request.getUrl());
log.error("cant't match url : " + request.getUrl());
continue;
}
//bean config:beforeDownloader,afterDownloader,render,pipelines
SpiderBeanContext context = engine.getSpiderBeanFactory().getContext(currSpiderBeanClass);
//download
HttpResponse response = download(context, request);
HttpResponse response = download(context.getBeforeDownload(), context.getAfterDownload(), request);
if(response != null) {
//render
Render render = context.getRender();
SpiderBean spiderBean = render.inject(currSpiderBeanClass, request, response);
//pipelines
List<Pipeline> pipelines = context.getPipelines();
if(pipelines != null) {
for(Pipeline pipeline : pipelines) {
try {
pipeline.process(spiderBean);
} catch(Exception ex) {
ex.printStackTrace();
if(response.getStatus() == 200) {
//render
Render render = context.getRender();
SpiderBean spiderBean = render.inject(currSpiderBeanClass, request, response);
//pipelines
List<Pipeline> pipelines = context.getPipelines();
if(pipelines != null) {
for(Pipeline pipeline : pipelines) {
try {
pipeline.process(spiderBean);
} catch(Exception ex) {
ex.printStackTrace();
}
}
}
} else if(response.getStatus() == 302 || response.getStatus() == 301){
spiderScheduler.into(request.subRequest(response.getContent()));
}
} else {
//如果没有抓取到任何信息,重新加入请求队列??重试次数
Expand All @@ -92,22 +96,24 @@ public void run() {
}
}

private HttpResponse download(SpiderBeanContext config, HttpRequest startRequest) {
private HttpResponse download(BeforeDownload before, AfterDownload after, HttpRequest request) {
try {
BeforeDownload before = config.getBeforeDownload();
if(before != null) {
before.process(startRequest);
before.process(request);
}
HttpResponse response = engine.getDownloader().download(request);
int status = response.getStatus();
if(status != 200 && status != 301 && status != 302) {
log.error("download error " + request.getUrl() + " : " + response.getStatus());
return null;
}
HttpResponse response = engine.getDownloader().download(startRequest);
AfterDownload after = config.getAfterDownload();
if(after != null) {
after.process(response);
}
return response;
} catch(Exception ex) {
//ex.printStackTrace();
//下载失败,加入jmx监控
log.error("download error " + startRequest.getUrl() + " : " + ex.getMessage());
log.error("download error " + request.getUrl() + " : " + ex.getMessage());
return null;
}
}
Expand Down

0 comments on commit c6a105b

Please sign in to comment.