
Commit

add new feature, validate if ip is accessible to specific websites
liya2001 committed Jan 3, 2017
1 parent 5c50a1a commit c2af34c
Showing 6 changed files with 93 additions and 55 deletions.
11 changes: 9 additions & 2 deletions IPProxyPool_py2/config.py
@@ -120,7 +120,7 @@
no need to check whether an IP already exists, because stale entries are cleared out on a schedule
'''
UPDATE_TIME=60*60 # check once an hour whether any proxy IPs have gone stale
MINNUM = 40 # when the number of valid IPs drops below this value, start the spider to crawl more
MINNUM = 80 # when the number of valid IPs drops below this value, start the spider to crawl more
MAXTIME = 3*24*60 # maximum time since a proxy was crawled and stored; entries older than this are deleted

TIMEOUT = 5 # socket timeout
@@ -187,4 +187,11 @@
TEST_URL='http://ip.chinaz.com/getip.aspx'
# # detection keyword, added to verify that the proxy under test can actually reach the target site
# TEST_KEY = '站长工具'
TEST_PROXY='http://www.lagado.com/proxy-test'

# used to detect the proxy type; note: this is a foreign site and may be unreachable in some network environments
TEST_PROXY='http://www.lagado.com/proxy-test'

# very few proxy IPs can reach specific target sites, so the goal is to check, per proxy, whether those sites are accessible
# for every site that is reachable, the corresponding key of the CHECK_SITES dict is added to a list of accessible sites
# this is a custom feature; the current idea is to keep the spider simple and do this validation here
CHECK_SITES={'JD':'https://list.jd.com/list.html?cat=9987,653,655'}
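
A hedged sketch of what this yields per proxy (the record layout mirrors MongoHelper.insert below; the sample address, area, and speed are illustrative only):

# If a proxy can fetch the JD listing page, checkSites returns ['JD'] and the
# stored document ends up looking roughly like this:
example_proxy = {
    'ip': '120.52.73.97', 'port': 84, 'types': 0, 'protocol': 0,
    'country': u'中国', 'area': u'河北省', 'speed': 0.5,
    'sites': ['JD'],  # keys of every CHECK_SITES entry this proxy could reach
    'score': 0,
}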
2 changes: 1 addition & 1 deletion IPProxyPool_py2/db/MongoHelper.py
@@ -21,7 +21,7 @@ def drop_db(self):
    def insert(self,value=None):
        if value:
            proxy = dict(ip=value['ip'],port=value['port'],types=value['types'],protocol=value['protocol'],country = value['country'],
                         area=value['area'],speed=value['speed'],score=0)
                         area=value['area'],speed=value['speed'],sites=value['sites'],score=0)
            self.proxys.insert(proxy)
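
Persisting the reachable-site keys makes it cheap to pull only the proxies that work for one target. A hedged pymongo sketch (the proxys collection comes from the code above; the query itself is an assumption, not part of this commit):

# pymongo matches a scalar against array fields, so this finds every document
# whose sites list contains 'JD':
jd_proxies = self.proxys.find({'sites': 'JD'})
for doc in jd_proxies:
    print doc['ip'], doc['port']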


9 changes: 6 additions & 3 deletions IPProxyPool_py2/spider/HtmlPraser.py
@@ -77,6 +77,7 @@ def XpathPraser(self,response,parser):
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            country = ''
            area = ''
            sites = []
            if addr.find(u'省')!=-1 or self.AuthCountry(addr):
                country = u'中国'
                area = addr
@@ -89,7 +90,7 @@
            # ip, port, type (0 high-anonymity, 1 transparent), protocol (0 http, 1 https and http), country, area (province/city), updatetime

            # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
            proxy ={'ip':ip,'port':int(port),'types':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
            proxy ={'ip':ip,'port':int(port),'types':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100,'sites':sites}
            proxylist.append(proxy)
        return proxylist

@@ -120,13 +121,14 @@ def RegularPraser(self,response,parser):
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            country = ''
            area = ''
            sites = []
            if addr.find(u'省')!=-1 or self.AuthCountry(addr):
                country = u'中国'
                area = addr
            else:
                country = addr
                area = ''
            proxy ={'ip':ip,'port':port,'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
            proxy ={'ip':ip,'port':port,'types':type,'protocol':protocol,'country':country,'area':area,'speed':100,'sites':sites}

            proxylist.append(proxy)
        return proxylist
@@ -161,13 +163,14 @@ def proxy_listPraser(self,response,parser):
            addr = self.ips.getIpAddr(self.ips.str2ip(ip))
            country = ''
            area = ''
            sites = []
            if addr.find(u'省')!=-1 or self.AuthCountry(addr):
                country = u'中国'
                area = addr
            else:
                country = addr
                area = ''
            proxy ={'ip':ip,'port':int(port),'types':type,'protocol':protocol,'country':country,'area':area,'speed':100}
            proxy ={'ip':ip,'port':int(port),'types':type,'protocol':protocol,'country':country,'area':area,'speed':100,'sites':sites}

            proxylist.append(proxy)
        return proxylist
29 changes: 20 additions & 9 deletions IPProxyPool_py2/test/test.py
@@ -7,11 +7,11 @@
import requests
import json
from lxml import etree
import config

r = requests.get('http://127.0.0.1:8000/?')
ip_ports = json.loads(r.text)
print ip_ports
i = 0
https_ips = []
https_anony_ips = []
for ip_port in ip_ports:
@@ -25,17 +25,28 @@
    '''
    proxies={
        'http':'http://%s:%s'%(ip,port),
        'https':'https://%s:%s'%(ip,port)
        'https':'http://%s:%s'%(ip,port)
    }
    try:
        requests.get('https://list.jd.com/list.html?cat=9987,653,655',proxies=proxies, timeout=6)
        https_ips.append(ip_port)

        jdr = requests.get('https://list.jd.com/list.html?cat=9987,653,655',headers=config.HEADER, proxies=proxies, timeout=6)
        if jdr.ok:
            https_ips.append(ip_port)
        '''
        r = requests.get(url='http://ipaddress.com/')
        root = etree.HTML(r.text)
        proxy = root.xpath('/html/body/div[1]/div[3]/div[2]/table/tr[6]/td/text()')[0]
        if proxy == 'No Proxy Detected':
            https_anony_ips.append(ip_port)
        if r.ok:
            root = etree.HTML(r.text)
            proxy = root.xpath('/html/body/div[1]/div[3]/div[2]/table/tr[6]/td/text()')[0]
            if proxy == 'No Proxy Detected':
                https_anony_ips.append(ip_port)
        '''
        r = requests.get(url='http://www.lagado.com/proxy-test',headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)

        if r.ok:
            root = etree.HTML(r.text)
            proxy = root.xpath('//*[@id="summary"]/p[1]/text()')[0]
            print proxy
            if proxy==test_str:
                https_anony_ips.append(ip_port)
    except requests.exceptions.RequestException:
        print 'cannot access JD!'

64 changes: 32 additions & 32 deletions IPProxyPool_py2/test/testIPType.py
@@ -6,43 +6,43 @@
import config


def checkProxyType(selfip,proxies):
    '''
    Detects the proxy type. It turns out the information published by the free proxy sites is unreliable, so we have to check each proxy's type ourselves.
    :param proxies: proxy (0 high-anonymity, 1 anonymous, 2 transparent, 3 invalid proxy)
    :return:
    '''

'''
Detects the proxy type. It turns out the information published by the free proxy sites is unreliable, so we have to check each proxy's type ourselves.
:param proxies: proxy (0 high-anonymity, 1 anonymous, 2 transparent, 3 invalid proxy)
:return:
'''
r = requests.get('http://127.0.0.1:8000/?')
ip_ports = json.loads(r.text)
print ip_ports
https_ips = []
test_str = '\nThis request appears NOT to have come via a proxy.\n'

for ip_port in ip_ports:
    print ip_port
    ip = ip_port[0]
    port = ip_port[1]

    proxies={
        'http':'http://%s:%s'%(ip,port),
        'https':'http://%s:%s'%(ip,port)
    }

    try:
        r = requests.get(url='http://www.lagado.com/proxy-test/',headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
        print r.text
        '''
        r = requests.get(url='http://www.lagado.com/proxy-test',headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)

        if r.ok:
            root = etree.HTML(r.text)
            ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
            http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
            http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
            # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
            if ip==selfip:
                return 3
            if http_x_forwared_for is None and http_via is None:
                return 0
            if http_via != None and http_x_forwared_for.find(selfip)== -1:
                return 1
            if http_via != None and http_x_forwared_for.find(selfip)!= -1:
                return 2
            return 3
        '''

        root = etree.HTML(r.text)
        proxy = root.xpath('//*[@id="summary"]/p[1]/text()')[0]
        print proxy
        if proxy==test_str:
            https_ips.append(ip_port)

    except Exception,e:
        print str(e)
        return 3

print len(https_ips)
print https_ips



if __name__=='__main__':
    ip = '120.52.73.97'
    port = '84'
    proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
    checkProxyType(None,proxies)
33 changes: 25 additions & 8 deletions IPProxyPool_py2/validator/Validator.py
@@ -7,7 +7,7 @@
from lxml import etree
import requests
import time
from config import TEST_URL
from config import TEST_URL, CHECK_SITES
import config
from db.DataStore import sqlhelper
from util.exception import Test_URL_Fail
@@ -20,15 +20,15 @@

def detect_from_db(myip,proxy,proxies_set):
    proxy_dict = {'ip':proxy[0],'port':proxy[1]}
    result = detect_list(myip,proxy_dict)
    if result:
    sites = detect_list(myip,proxy_dict)
    if sites:
        if proxy[2]<60000:
            score = proxy[2] + 1
        else:
            score = 60000
        proxy_str ='%s:%s'%(proxy[0],proxy[1])
        proxies_set.add(proxy_str)
        sqlhelper.update({'ip':proxy[0],'port':proxy[1]},{'score':score})
        sqlhelper.update({'ip':proxy[0],'port':proxy[1]},{'score':score,'sites':sites})
    else:
        sqlhelper.delete({'ip':proxy[0],'port':proxy[1]})
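
Note that detect_list now returns the sites list rather than the proxy dict, so list truthiness decides whether the row survives. A hedged sketch of one validation round (the (ip, port, score) tuple layout is inferred from the indexing above; the sample values are illustrative):

# proxy row as read from the DB: (ip, port, score)
proxy = ('120.52.73.97', 84, 41)
sites = detect_list(myip, {'ip': proxy[0], 'port': proxy[1]})
# sites == ['JD']  -> score becomes 42, row updated with the new sites list
# sites == []      -> falsy, so the row is deleted even though TEST_URL worked
# sites is None    -> the proxy failed TEST_URL entirely; row deleted as well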

@@ -69,9 +69,10 @@ def detect_list(selfip,proxy,queue2=None):
    '''
    ip = proxy['ip']
    port = proxy['port']
    proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}

    start = time.time()
    sites = None
    try:
        r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)

@@ -82,6 +83,8 @@
        proxy['speed']=speed
        proxyType = checkProxyType(selfip,proxies)
        proxy['type'] = proxyType
        sites = checkSites(proxies, CHECK_SITES)
        proxy['sites'] = sites
        '''
        if proxyType==3:
            logger.info('failed %s:%s'%(ip,port))
@@ -98,7 +101,7 @@

    if queue2:
        queue2.put(proxy)
    return proxy
    return sites

def checkProxyType(selfip,proxies):
'''
@@ -116,18 +119,32 @@ def checkProxyType(selfip,proxies):
        if r.ok:
            root = etree.HTML(r.text)
            proxy = root.xpath('//*[@id="summary"]/p[1]/text()')[0]
            print proxy
            #print proxy
            if proxy==test_str:
                return 0
            else:
                return 1
        return 3

    except Exception,e:
        print 'The proxy test website becomes invalid! or not'
        #print 'The proxy test website becomes invalid! or not'
        return 3


def checkSites(proxies, CHECK_SITES):
    '''
    Check whether the proxy can reach specific sites; for every site that is reachable, append its key from the CHECK_SITES dict to the returned list.
    '''
    sites = []
    for key, value in CHECK_SITES.iteritems():
        try:
            r = requests.get(url=value,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
            if r.ok:
                sites.append(key)
        except Exception,e:
            pass
    return sites

def getMyIP():
    try:
        r = requests.get(url=config.TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT)
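
A quick usage sketch of checkSites (the sample proxy address is illustrative; feeding the result into the proxy's sites field follows detect_list above):

sample_proxies = {"http": "http://120.52.73.97:84", "https": "http://120.52.73.97:84"}
reachable = checkSites(sample_proxies, CHECK_SITES)
print reachable  # e.g. ['JD'] if the JD listing page loads through this proxy, else []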
