Upgrade the project to Python 3
awofly9 committed Jun 23, 2017
1 parent 974fc73 commit 1744917
Showing 40 changed files with 234 additions and 232 deletions.
38 changes: 15 additions & 23 deletions README.md
@@ -5,23 +5,10 @@
Thanks to [youngjeff](https://github.com/youngjeff) for maintaining this project with me

## Runtime environment
python 2.7.12

### Dependencies
* scrapy
* BeautifulSoup
* requests
* pymysql
* web.py
* scrapydo
* lxml
* pymongo
* Install and start MySQL

Installation command:
Install Python 3 and the MySQL database

```
$ pip install Scrapy BeautifulSoup requests pymysql web.py scrapydo lxml
$ pip install -r requirements.txt
```
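
For reference, a requirements.txt consistent with the dependency list above and this commit's web.py → flask switch might look like the sketch below; it is an assumption (unpinned, package names only), and the file shipped in the repository is authoritative.

```
# hypothetical requirements.txt sketch, based on the README's dependency list
Scrapy
beautifulsoup4
requests
pymysql
flask
scrapydo
lxml
pymongo
```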


@@ -124,10 +111,10 @@ $ python run_server.py
| sort | str | asc for ascending, desc for descending ||
| count | int | number of proxies to return, default 100 ||
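
A minimal usage sketch with Python requests is shown below. The delete URL comes straight from the "Delete" section that follows; the fetch endpoint path (`/select` here) and its full parameter list are hidden in the collapsed hunk above, so treat that part as an assumption.

```
# Sketch of calling the local proxy API; /select is an assumed path,
# while /delete and the sort/count parameters come from this README.
import requests

BASE = 'http://127.0.0.1:8000'

# fetch up to 10 proxies, sorted ascending (sort/count per the table above)
resp = requests.get(BASE + '/select', params={'sort': 'asc', 'count': 10})
print(resp.text)

# remove a dead proxy (see the Delete section below)
requests.get(BASE + '/delete', params={'name': 'httpbin', 'ip': '27.197.144.181'})
```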




#### Delete
<http://127.0.0.1:8000/delete?name=httpbin&ip=27.197.144.181>

Parameters
Expand Down Expand Up @@ -182,20 +169,25 @@ $ python run_server.py


## Project updates
-----------------------------2017-6-23----------------------------<br>
1. python2 -> python3<br>
2. web.py -> flask<br>
<br>
-----------------------------2017-5-17----------------------------<br>
1. Added Docker support on top of the existing system. Usage is described below; for more on Docker itself, see the official site http://www.docker.com.<br>
<br>
-----------------------------2017-3-30----------------------------<br>
1. Revised and improved the README<br>
2. Data inserts now support transactions<br>
<br>
-----------------------------2017-3-14----------------------------<br>
1. Changed the server API and added sort options<br>
2. Added multiprocess validation of proxy IP availability<br>
<br>
-----------------------------2017-2-20----------------------------<br>
1. Added more filter options to the server's fetch API<br>
<br>
-----------------------------2017-2-16----------------------------<br>
1. Validate the anonymity level of proxy IPs<br>
2. Validate proxy IP HTTPS support<br>
3. Added a concurrency setting for httpbin validation, default 4
2 changes: 1 addition & 1 deletion config.py
@@ -29,4 +29,4 @@
free_ipproxy_table = 'free_ipproxy'
httpbin_table = 'httpbin'

data_port = '8000'
data_port = 8000
4 changes: 2 additions & 2 deletions ipproxytool.py
@@ -7,8 +7,6 @@
import run_validator

if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding('utf-8')

# change into the project directory
os.chdir(sys.path[0])
@@ -26,3 +24,5 @@
subprocess.Popen(['python', 'run_server.py'])

run_validator.validator()
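
The two lines removed above, `reload(sys)` and `sys.setdefaultencoding('utf-8')`, were a common Python 2 workaround for implicit str/unicode conversions; `sys.setdefaultencoding` no longer exists in Python 3, where `str` is already Unicode and conversions to and from bytes are always explicit. A minimal illustration of the Python 3 behaviour:

```
# Python 3: str is Unicode by default; no setdefaultencoding hack is needed.
text = '代理'                    # a Unicode str
data = text.encode('utf-8')      # explicit encode -> bytes
assert data.decode('utf-8') == text
```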


15 changes: 6 additions & 9 deletions ipproxytool/spiders/proxy/basespider.py
@@ -9,9 +9,6 @@
from scrapy.http import Request
from sql import SqlManager

reload(sys)
sys.setdefaultencoding('utf8')


class BaseSpider(Spider):
name = 'basespider'
@@ -38,12 +35,12 @@ def init(self):
def start_requests(self):
for i, url in enumerate(self.urls):
yield Request(
url = url,
headers = self.headers,
meta = self.meta,
dont_filter = True,
callback = self.parse_page,
errback = self.error_parse,
url=url,
headers=self.headers,
meta=self.meta,
dont_filter=True,
callback=self.parse_page,
errback=self.error_parse,
)

def parse_page(self, response):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/data5u.py
@@ -1,7 +1,7 @@
#-*- coding: utf-8 -*-

from scrapy import Selector
from basespider import BaseSpider
from .basespider import BaseSpider
from proxy import Proxy

class data5uSpider(BaseSpider):
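
The `from basespider import BaseSpider` → `from .basespider import BaseSpider` change above repeats across the spider and validator modules below: Python 3 removed implicit relative imports, so an intra-package import must be written either as an explicit relative import or with the full package path. A short sketch of the two Python 3-compatible forms (module paths follow this repository's layout):

```
# Python 2 accepted the implicit form "from basespider import BaseSpider"
# inside a package; Python 3 does not. Use an explicit relative import:
from .basespider import BaseSpider
# ...or, equivalently, the absolute package path:
# from ipproxytool.spiders.proxy.basespider import BaseSpider
```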
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/freeproxylists.py
@@ -4,7 +4,7 @@
import re

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider
from bs4 import BeautifulSoup


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/gatherproxy.py
@@ -6,7 +6,7 @@
import requests

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider


class GatherproxySpider(BaseSpider):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/hidemy.py
@@ -3,7 +3,7 @@
import utils

from scrapy import Selector
from basespider import BaseSpider
from .basespider import BaseSpider
from proxy import Proxy


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/ip181.py
@@ -1,7 +1,7 @@
#-*- coding: utf-8 -*-

from scrapy import Selector
from basespider import BaseSpider
from .basespider import BaseSpider
from proxy import Proxy


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/kuaidaili.py
@@ -3,7 +3,7 @@
import re

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider


class KuaiDaiLiSpider(BaseSpider):
4 changes: 2 additions & 2 deletions ipproxytool/spiders/proxy/peuland.py
@@ -9,7 +9,7 @@
from scrapy.http import Request
from proxy import Proxy
from utils import log
from basespider import BaseSpider
from .basespider import BaseSpider


# the target site is no longer available
@@ -96,7 +96,7 @@ def parse_page(self, response):
log('PeulandSpider parse_page req.text:%s' % req.url)
ret = True
break
except Exception, e:
except Exception as e:
log('PeulandSpider parse_page exception:%s' % str(e), logging.WARNING)
continue

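
The `except Exception, e:` → `except Exception as e:` change above is another mandatory Python 3 fix: the comma form is Python 2-only syntax, while the `as` form works in Python 2.6+ and Python 3. A minimal sketch:

```
# Python 3 exception handling uses "as"; the old comma form is a syntax error.
import logging

try:
    1 / 0
except Exception as e:
    logging.warning('caught: %s', e)
```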
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/proxydb.py
@@ -1,7 +1,7 @@
# coding=utf-8

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider
from scrapy.selector import Selector


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/proxylistplus.py
@@ -1,7 +1,7 @@
#-*- coding: utf-8 -*-

from scrapy import Selector
from basespider import BaseSpider
from .basespider import BaseSpider
from proxy import Proxy


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/proxyrox.py
@@ -1,7 +1,7 @@
# coding=utf-8

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider
from scrapy.selector import Selector


2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/sixsixip.py
@@ -3,7 +3,7 @@
import re

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider


class SixSixIpSpider(BaseSpider):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/usproxy.py
@@ -3,7 +3,7 @@
import re

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider


class UsProxySpider(BaseSpider):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/proxy/xicidaili.py
@@ -1,7 +1,7 @@
#-*- coding: utf-8 -*-

from proxy import Proxy
from basespider import BaseSpider
from .basespider import BaseSpider
from scrapy.selector import Selector


2 changes: 1 addition & 1 deletion ipproxytool/spiders/validator/assetstore.py
@@ -5,7 +5,7 @@
import config

from scrapy.http import Request
from validator import Validator
from .validator import Validator


class AssetStoreSpider(Validator):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/validator/baidu.py
@@ -1,6 +1,6 @@
#-*- coding: utf-8 -*-

from validator import Validator
from .validator import Validator


class BaiduSpider(Validator):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/validator/boss.py
@@ -1,6 +1,6 @@
#-*- coding: utf-8 -*-

from validator import Validator
from .validator import Validator


class BossSpider(Validator):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/validator/douban.py
@@ -1,6 +1,6 @@
#-*- coding: utf-8 -*-

from validator import Validator
from .validator import Validator


class DoubanSpider(Validator):
2 changes: 1 addition & 1 deletion ipproxytool/spiders/validator/gather.py
@@ -1,6 +1,6 @@
#-*- coding: utf-8 -*-

from validator import Validator
from .validator import Validator


class GatherSpider(Validator):
50 changes: 25 additions & 25 deletions ipproxytool/spiders/validator/httpbin.py
@@ -1,19 +1,19 @@
#-*- coding: utf-8 -*-
# -*- coding: utf-8 -*-

import json
import time
import requests
import config

from scrapy import Request
from validator import Validator
from .validator import Validator


class HttpBinSpider(Validator):
name = 'httpbin'
concurrent_requests = 16

def __init__(self, name = None, **kwargs):
def __init__(self, name=None, **kwargs):
super(HttpBinSpider, self).__init__(name, **kwargs)
self.timeout = 20
self.urls = [
@@ -36,7 +36,7 @@ def __init__(self, name = None, **kwargs):
def init(self):
super(HttpBinSpider, self).init()

r = requests.get(url = self.urls[0], timeout = 20)
r = requests.get(url=self.urls[0], timeout=20)
data = json.loads(r.text)
self.origin_ip = data.get('origin', '')
self.log('origin ip:%s' % self.origin_ip)
@@ -60,21 +60,21 @@ def start_requests(self):
https = 'yes' if 'https' in url else 'no'

yield Request(
url = url,
headers = self.headers,
dont_filter = True,
priority = 0 if https == 'yes' else 10,
meta = {
'cur_time': time.time(),
'download_timeout': self.timeout,
'proxy_info': proxy,
'table': table,
'https': https,
'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
'vali_count': proxy.vali_count,
},
callback = self.success_parse,
errback = self.error_parse,
url=url,
headers=self.headers,
dont_filter=True,
priority=0 if https == 'yes' else 10,
meta={
'cur_time': time.time(),
'download_timeout': self.timeout,
'proxy_info': proxy,
'table': table,
'https': https,
'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
'vali_count': proxy.vali_count,
},
callback=self.success_parse,
errback=self.error_parse,
)

def success_parse(self, response):
@@ -84,7 +84,7 @@ def success_parse(self, response):

self.save_page(proxy.ip, response.body)

if response.body.find(self.success_mark) or self.success_mark is '':
if self.success_mark in response.text or self.success_mark == '':
proxy.speed = time.time() - response.meta.get('cur_time')
proxy.vali_count += 1
self.log('proxy_info:%s' % (str(proxy)))
@@ -106,14 +106,14 @@ def success_parse(self, response):

if table == self.name:
if proxy.speed > self.timeout:
self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
else:
self.sql.update_proxy(table_name = table, proxy = proxy)
self.sql.update_proxy(table_name=table, proxy=proxy)
else:
if proxy.speed < self.timeout:
self.sql.insert_proxy(table_name = self.name, proxy = proxy)
self.sql.insert_proxy(table_name=self.name, proxy=proxy)
else:
self.sql.update_proxy(table_name = table, proxy = proxy)
self.sql.update_proxy(table_name=table, proxy=proxy)

self.sql.commit()

@@ -126,7 +126,7 @@ def error_parse(self, failure):
proxy = request.meta.get('proxy_info')

if table == self.name:
self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
else:
# TODO... when IP validation fails, handle the specific error type
pass
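
The switch above from `response.body.find(self.success_mark)` to `self.success_mark in response.text` reflects one more Python 3 difference: in Scrapy, `response.body` is `bytes` while `response.text` is the decoded `str`, so a `str` marker can only be searched for in `response.text` (or after an explicit decode). A small illustration of the distinction:

```
# Scrapy on Python 3: response.body is bytes, response.text is the decoded str.
body = '{"origin": "1.2.3.4"}'.encode('utf-8')   # shape of response.body
text = body.decode('utf-8')                      # shape of response.text

marker = 'origin'
assert marker in text                  # str-in-str test, as used in the diff
assert marker.encode('utf-8') in body  # the bytes equivalent needs an encode
```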
