Skip to content

Commit

Permalink
add proxy getter kuaidaili daili66 youdaili
Browse files Browse the repository at this point in the history
  • Loading branch information
jhao104 committed Nov 25, 2016
1 parent c4f7034 commit 5304447
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 30 deletions.
63 changes: 33 additions & 30 deletions ProxyGetter/getFreeProxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,17 @@
-------------------------------------------------
"""
import re
import sys
import requests
from lxml import etree

# NOTE(review): Python 2-only idiom (the `reload` builtin and
# `sys.setdefaultencoding` do not exist on Python 3). Presumably `sys` is
# reloaded so that `setdefaultencoding` is reachable again before UTF-8 is
# made the process-wide default string encoding -- confirm before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

def robust(func):
    """Decorate a crawler so that failures are reported instead of raised.

    Any ``Exception`` raised by ``func`` is printed and swallowed, and the call
    returns ``None``. When ``func`` returns a generator (the crawlers decorated
    in this module are generator functions), the generator is guarded as well:
    a generator body only runs while it is being iterated, i.e. after the
    decorated call has already returned, so a ``try`` around the call alone
    would never see its exceptions.

    :param func: callable to protect
    :return: wrapper with the same signature and metadata as ``func``
    """
    import functools
    import types

    def report(error):
        print(u"sorry, 抓取出错。错误原因:")
        print(error)

    def guarded(generator):
        # Iterate inside the try so lazy failures are caught too; items
        # produced before the failure are still delivered to the consumer.
        try:
            for item in generator:
                yield item
        except Exception as e:
            report(e)

    @functools.wraps(func)
    def decorate(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            report(e)
            return None
        if isinstance(result, types.GeneratorType):
            return guarded(result)
        return result

    return decorate


def verifyProxy(proxy):
    """
    Check that ``proxy`` is formatted as ``ip:port``.

    The whole string has to match: four dot-separated groups of 1-3 digits,
    a colon, and a port of 1-5 digits. Numeric ranges (octets <= 255,
    port <= 65535) are not checked.

    :param proxy: candidate proxy string, e.g. ``"1.2.3.4:8080"``
    :return: True when the string has the expected shape, otherwise False
    """
    # re.match anchors at the start and \Z at the very end, so text that merely
    # contains a proxy (or carries trailing junk/newline) is rejected.
    verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}\Z"
    return re.match(verify_regex, proxy) is not None
from Util.utilFunction import robustCrawl, getHtmlTree


# 快代理
# noinspection PyPep8Naming
@robust
@robustCrawl
def freeProxyFirst(page=10):
"""
抓取快代理IP http://www.kuaidaili.com/
Expand All @@ -48,25 +31,45 @@ def freeProxyFirst(page=10):
url_list = ('http://www.kuaidaili.com/proxylist/{page}/'.format(page=page) for page in range(1, page + 1))
# 页数不用太多, 后面的全是历史IP, 可用性不高
for url in url_list:
html = requests.get(url).content
tree = etree.HTML(html)
tree = getHtmlTree(url)
proxy_list = tree.xpath('.//div[@id="index_free_list"]//tbody/tr')
for proxy in proxy_list:
yield ':'.join(proxy.xpath('./td/text()')[0:2])


# 66ip
@robustCrawl
def freeProxySecond(proxy_number=100):
    """
    Fetch proxies from 66ip http://www.66ip.cn/
    :param proxy_number: number of proxies to request from the site
    :return: generator yielding "ip:port" strings found in the response
    """
    url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
        proxy_number)
    html = requests.get(url).content
    # Ports can have up to five digits (max 65535); a four-digit limit would
    # silently truncate e.g. ":10080" to ":1008" and yield a wrong proxy.
    for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html):
        yield proxy

# Youdaili
@robustCrawl
def freeProxyThird(days=1):
    """
    Fetch proxies from Youdaili http://www.youdaili.net/Daili/http/
    :param days: how many article links to follow, counted from the top of the
                 index page (presumably one article per day -- TODO confirm)
    :return: generator yielding "ip:port" strings found in those articles
    """
    url = "http://www.youdaili.net/Daili/http/"
    tree = getHtmlTree(url)
    page_url_list = tree.xpath('.//div[@class="chunlist"]/ul//a/@href')[0:days]
    for page_url in page_url_list:
        html = requests.get(page_url).content
        # Ports can have up to five digits (max 65535); a four-digit limit
        # would silently truncate e.g. ":10080" to ":1008".
        proxy_list = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html)
        for proxy in proxy_list:
            yield proxy

if __name__ == '__main__':
    # Manual smoke test: print every proxy produced by freeProxyThird.
    for e in freeProxyThird():
        print(e)



12 changes: 12 additions & 0 deletions Util/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: __init__.py
Description :
Author : JHao
date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25:
-------------------------------------------------
"""
Binary file added Util/__init__.pyc
Binary file not shown.
48 changes: 48 additions & 0 deletions Util/utilFunction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: utilFunction.py
Description : 工具函数
Author : JHao
date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree
-------------------------------------------------
"""


# noinspection PyPep8Naming
def robustCrawl(func):
    """Decorate a crawler so that failures are reported instead of raised.

    Any ``Exception`` raised by ``func`` is printed and swallowed, and the call
    returns ``None``. When ``func`` returns a generator (as generator-style
    crawlers do), the generator is guarded as well: a generator body only runs
    while it is being iterated, i.e. after the decorated call has already
    returned, so a ``try`` around the call alone would never see its
    exceptions.

    :param func: callable to protect
    :return: wrapper with the same signature and metadata as ``func``
    """
    import functools
    import types

    def report(error):
        print(u"sorry, 抓取出错。错误原因:")
        print(error)

    def guarded(generator):
        # Iterate inside the try so lazy failures are caught too; items
        # produced before the failure are still delivered to the consumer.
        try:
            for item in generator:
                yield item
        except Exception as e:
            report(e)

    @functools.wraps(func)
    def decorate(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            report(e)
            return None
        if isinstance(result, types.GeneratorType):
            return guarded(result)
        return result

    return decorate


def verifyProxy(proxy):
    """
    Check that ``proxy`` is formatted as ``ip:port``.

    The whole string has to match: four dot-separated groups of 1-3 digits,
    a colon, and a port of 1-5 digits. Numeric ranges (octets <= 255,
    port <= 65535) are not checked.

    :param proxy: candidate proxy string, e.g. ``"1.2.3.4:8080"``
    :return: True when the string has the expected shape, otherwise False
    """
    import re
    # re.match anchors at the start and \Z at the very end, so text that merely
    # contains a proxy (or carries trailing junk/newline) is rejected.
    verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}\Z"
    return re.match(verify_regex, proxy) is not None

def getHtmlTree(url, **kwargs):
    """
    Download a page and parse it into an lxml HTML tree.
    :param url: address of the page to fetch
    :param kwargs: extra keyword arguments forwarded to ``requests.get``
                   (for example ``timeout``, ``headers`` or ``proxies``)
    :return: the result of ``etree.HTML`` applied to the response body
    """
    import requests
    from lxml import etree
    # Forward kwargs to the request; they used to be accepted but ignored,
    # so callers had no way to set a timeout or custom headers.
    html = requests.get(url=url, **kwargs).content
    return etree.HTML(html)

Binary file added Util/utilFunction.pyc
Binary file not shown.

0 comments on commit 5304447

Please sign in to comment.