From af71fe1c67cec2322d5c54b8da0da4697c329133 Mon Sep 17 00:00:00 2001
From: awolfly9
Date: Mon, 20 Feb 2017 15:41:11 +0800
Subject: [PATCH] Add proxy website https://hidemy.name/en/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ipproxytool/spiders/proxy/hidemy.py | 57 +++++++++++++++++++++++++++++
 proxy.py                            |  6 +--
 runspider.py                        |  2 +
 3 files changed, 62 insertions(+), 3 deletions(-)
 create mode 100644 ipproxytool/spiders/proxy/hidemy.py

diff --git a/ipproxytool/spiders/proxy/hidemy.py b/ipproxytool/spiders/proxy/hidemy.py
new file mode 100644
index 0000000..fbb144b
--- /dev/null
+++ b/ipproxytool/spiders/proxy/hidemy.py
@@ -0,0 +1,57 @@
+#-*- coding: utf-8 -*-
+
+import utils
+
+from scrapy import Selector
+from basespider import BaseSpider
+from proxy import Proxy
+
+
+class HidemySpider(BaseSpider):
+    name = 'hidemy'
+
+    def __init__(self, *a, **kw):
+        super(HidemySpider, self).__init__(*a, **kw)
+
+        self.urls = ['https://hidemy.name/en/proxy-list/?start=%s' % n for n in range(0, 5 * 64, 64)]
+        self.headers = {
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Connection': 'keep-alive',
+            'Host': 'hidemy.name',
+            'Referer': 'https://hidemy.name/en/proxy-list/?start=0',
+            'Upgrade-Insecure-Requests': '1',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
+        }
+
+        self.init()
+
+    def parse_page(self, response):
+        utils.log(dir(response))
+        utils.log('body type:%s' % type(response.body))
+        utils.log('body_as_unicode type:%s' % type(response.body_as_unicode))
+        self.write(response.body)
+
+        sel = Selector(response)
+        infos = sel.xpath('//tbody/tr').extract()
+        for i, info in enumerate(infos):
+            if i == 0:
+                continue
+
+            val = Selector(text = info)
+            ip = val.xpath('//td[1]/text()').extract_first()
+            port = val.xpath('//td[2]/text()').extract_first()
+            country = val.xpath('//td[3]/div/text()').extract_first()
+            anonymity = val.xpath('//td[6]/text()').extract_first()
+
+            proxy = Proxy()
+            proxy.set_value(
+                ip = ip,
+                port = port,
+                country = country,
+                anonymity = anonymity,
+                source = self.name,
+            )
+
+            self.add_proxy(proxy = proxy)
diff --git a/proxy.py b/proxy.py
index 74e1035..af15239 100644
--- a/proxy.py
+++ b/proxy.py
@@ -44,11 +44,11 @@ def get_anonymity_type(self, anonymity):
 
         '''
         if anonymity == u'高匿代理' or anonymity == u'高匿名' or anonymity == 'elite proxy' or \
-                anonymity == u'超级匿名':
+                anonymity == u'超级匿名' or anonymity == u'High':
             return '1'
-        elif anonymity == u'匿名' or anonymity == 'anonymous' or anonymity == u'普通匿名':
+        elif anonymity == u'匿名' or anonymity == 'anonymous' or anonymity == u'普通匿名' or anonymity == u'Medium':
             return '2'
-        elif anonymity == u'透明' or anonymity == 'transparent':
+        elif anonymity == u'透明' or anonymity == 'transparent' or anonymity == u'No':
             return '3'
         else:
             return '3'
diff --git a/runspider.py b/runspider.py
index 6dcea48..71bee87 100644
--- a/runspider.py
+++ b/runspider.py
@@ -14,6 +14,7 @@
 from ipproxytool.spiders.proxy.ip181 import IpOneEightOneSpider
 from ipproxytool.spiders.proxy.kuaidaili import KuaiDaiLiSpider
 from ipproxytool.spiders.proxy.gatherproxy import GatherproxySpider
+from ipproxytool.spiders.proxy.hidemy import HidemySpider
 
 scrapydo.setup()
 
@@ -45,6 +46,7 @@
         items = scrapydo.run_spider(IpOneEightOneSpider)
         items = scrapydo.run_spider(KuaiDaiLiSpider)
         items = scrapydo.run_spider(GatherproxySpider)
+        items = scrapydo.run_spider(HidemySpider)
 
         utils.log('*******************run spider waiting...*******************')
         time.sleep(300)
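
A quick way to exercise only the new spider is to reuse the same scrapydo calls that runspider.py makes, outside the main loop. This is a minimal sketch, not part of the patch; it assumes the repository root is on PYTHONPATH so the module path from the diff above is importable:

    # sketch: run HidemySpider standalone via scrapydo (assumes repo root on PYTHONPATH)
    import scrapydo

    from ipproxytool.spiders.proxy.hidemy import HidemySpider

    scrapydo.setup()                           # start the reactor once per process
    items = scrapydo.run_spider(HidemySpider)  # blocks until the crawl finishes
    print(len(items))                          # number of proxies scraped in this run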