From 65c4798ba2891513c09db4d567037820aeb097a3 Mon Sep 17 00:00:00 2001 From: cwjokaka <1250217678@qq.com> Date: Fri, 6 Sep 2019 18:10:00 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=B4=E6=97=B6=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setting.py | 2 +- src/spider/abs_spider.py | 11 ++++++++--- src/spider/spider_66_ip.py | 8 ++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/setting.py b/setting.py index f91420a..8f26b16 100644 --- a/setting.py +++ b/setting.py @@ -1,4 +1,4 @@ -from src.spider import Spider66Ip +from src.spider.spider_66_ip import Spider66Ip DB_TYPE = 'memory' # memory/redis diff --git a/src/spider/abs_spider.py b/src/spider/abs_spider.py index cfc5a10..a3eb553 100644 --- a/src/spider/abs_spider.py +++ b/src/spider/abs_spider.py @@ -1,13 +1,18 @@ +from typing import List, Iterable + +from src.entity.proxy_entity import ProxyEntity + + class AbsSpider(object): def __init__(self, name='unknown') -> None: self._name = name def crawl(self): - print('开始爬取...') + print(f'{self._name}开始爬取...') res = self.do_crawl() - print(f'爬取完毕!共:{len(res)}个代理') + print(f'{self._name}爬取完毕!共:{len(res)}个代理') return res - def do_crawl(self): + def do_crawl(self) -> List[ProxyEntity]: raise RuntimeError('do_crawl方法没有实现!') diff --git a/src/spider/spider_66_ip.py b/src/spider/spider_66_ip.py index 711f0f2..72fc32d 100644 --- a/src/spider/spider_66_ip.py +++ b/src/spider/spider_66_ip.py @@ -1,3 +1,5 @@ +from typing import List, Iterable + import requests from src.entity.proxy_entity import ProxyEntity @@ -14,10 +16,10 @@ def __init__(self) -> None: super().__init__('66IP代理爬虫') self._base_url = 'http://www.66ip.cn' - def do_crawl(self): + def do_crawl(self) -> List[ProxyEntity]: result = [] for page in range(1, 5): - print(f'第{page}页...') + # print(f'第{page}页...') resp = requests.get(f'{self._base_url}/{page}.html') resp.encoding = 'gb2312' soup = BeautifulSoup(resp.text, 'lxml') @@ -34,3 +36,5 @@ def do_crawl(self): print(f'{ip}:{port}/{region}/{proxy_type}/{check_time}') result.append(ProxyEntity(ip, port, self._name, type=proxy_type, region=region)) return result + +