forked from jhao104/proxy_pool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilFunction.py
83 lines (73 loc) · 2.22 KB
/
utilFunction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
File Name: utilFunction.py
Description : tool function
Author : JHao
date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree
-------------------------------------------------
"""
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# noinspection PyPep8Naming
def robustCrawl(func):
    """Decorator: run *func*, swallowing any exception it raises.

    On failure the error is printed and the wrapper returns None, so a
    single failed crawl never crashes the caller.

    :param func: the crawl function to wrap
    :return: wrapped function that returns func's result, or None on error
    """
    from functools import wraps

    @wraps(func)  # preserve func's __name__/__doc__ for debugging
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # best-effort: report and continue (implicitly returns None)
            print(u"sorry, 抓取出错。错误原因:")
            print(e)
    return decorate
def verifyProxy(proxy):
    """Check that *proxy* is formatted as an ``ip:port`` string.

    :param proxy: proxy address string, e.g. ``"127.0.0.1:8080"``
    :return: True if the whole string matches ``d.d.d.d:port``, else False
    """
    import re
    # fullmatch anchors the pattern at both ends: the old findall-based
    # check accepted any string merely *containing* an ip:port substring
    verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
    return re.fullmatch(verify_regex, proxy) is not None
def getHtmlTree(url, **kwargs):
    """Fetch *url* and parse the response body into an lxml element tree.

    :param url: page URL to fetch
    :param kwargs: unused; kept for interface compatibility with callers
    :return: lxml ``etree.HTML`` root element, or None on any failure
    """
    # requests is already imported at module level; only lxml is local
    from lxml import etree
    # browser-like headers to get past trivial anti-crawler checks
    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    try:
        response = requests.get(url=url, headers=header, timeout=30)
        # force UTF-8: target pages are Chinese and may mis-declare charset
        response.encoding = "UTF-8"
        html = response.text
        return etree.HTML(html)
    except Exception as e:
        # best-effort: report the failure and signal it with None
        print("getHtmlTree fail %s, error: %s" % (url, str(e)))
        return None
def validUsefulProxy(proxy):
    """Check whether *proxy* can actually carry HTTPS traffic.

    :param proxy: proxy address string, ``"ip:port"``
    :return: True if an HTTPS request through the proxy returns 200,
             else False (including timeouts and connection errors)
    """
    proxies = {"https": "https://{proxy}".format(proxy=proxy)}
    try:
        # drop proxies slower than 20s; verify=False because many free
        # proxies sit behind self-signed certificates
        r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=20, verify=False)
        # bug fix: original fell through (returned None) on non-200;
        # return an explicit bool so callers get a consistent type
        return r.status_code == 200
    except Exception:
        return False