Commit

Temporary commit (临时提交)
cwjokaka committed Sep 6, 2019
1 parent 6b51f91 commit 65c4798
Showing 3 changed files with 15 additions and 6 deletions.
setting.py (2 changes: 1 addition & 1 deletion)

@@ -1,4 +1,4 @@
-from src.spider import Spider66Ip
+from src.spider.spider_66_ip import Spider66Ip
 
 DB_TYPE = 'memory' # memory/redis
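The setting.py change swaps a package-level import for a direct module import: Spider66Ip is defined in src/spider/spider_66_ip.py, and importing it via the package only works if the package's __init__.py re-exports it. A minimal sketch of the distinction (the __init__.py behavior described here is an assumption, not something this commit shows):

# Package import: only resolves if src/spider/__init__.py re-exports the class,
# e.g. via a line like `from src.spider.spider_66_ip import Spider66Ip` (assumed).
# from src.spider import Spider66Ip   # raises ImportError without that re-export

# Module import: resolves from the file path alone, regardless of __init__.py.
from src.spider.spider_66_ip import Spider66Ip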
src/spider/abs_spider.py (11 changes: 8 additions & 3 deletions)

@@ -1,13 +1,18 @@
+from typing import List, Iterable
+
 from src.entity.proxy_entity import ProxyEntity
 
 
 class AbsSpider(object):
 
     def __init__(self, name='unknown') -> None:
         self._name = name
 
     def crawl(self):
-        print('开始爬取...')
+        print(f'{self._name}开始爬取...')
         res = self.do_crawl()
-        print(f'爬取完毕!共:{len(res)}个代理')
+        print(f'{self._name}爬取完毕!共:{len(res)}个代理')
         return res
 
-    def do_crawl(self):
+    def do_crawl(self) -> List[ProxyEntity]:
         raise RuntimeError('do_crawl方法没有实现!')
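The abs_spider.py change threads the spider's name into both log lines (the Chinese strings read roughly '{name} starting crawl...' and '{name} crawl finished! Total: N proxies') and annotates the do_crawl hook with its List[ProxyEntity] return type; the RuntimeError message means 'do_crawl is not implemented!'. The class is a template method: crawl() wraps fixed logging around the subclass-supplied do_crawl(). A runnable sketch of that pattern with the log strings translated and a stand-in ProxyEntity (the real one lives in src/entity/proxy_entity.py; its constructor here is assumed):

from typing import List


class ProxyEntity:
    """Stand-in for src.entity.proxy_entity.ProxyEntity (assumed shape)."""

    def __init__(self, ip: str, port: str) -> None:
        self.ip = ip
        self.port = port


class AbsSpider(object):

    def __init__(self, name: str = 'unknown') -> None:
        self._name = name

    def crawl(self) -> List[ProxyEntity]:
        # Template method: fixed logging around the subclass-supplied hook.
        print(f'{self._name} starting crawl...')                      # 开始爬取...
        print_res = res = self.do_crawl()
        print(f'{self._name} finished! {len(res)} proxies in total')  # 爬取完毕!
        return res

    def do_crawl(self) -> List[ProxyEntity]:
        raise RuntimeError('do_crawl is not implemented!')            # do_crawl方法没有实现!


class DummySpider(AbsSpider):
    """Minimal concrete spider, just to exercise the hook."""

    def __init__(self) -> None:
        super().__init__('dummy')

    def do_crawl(self) -> List[ProxyEntity]:
        return [ProxyEntity('127.0.0.1', '8080')]


if __name__ == '__main__':
    DummySpider().crawl()  # dummy starting crawl... / dummy finished! 1 proxies in total

NotImplementedError (or an abc.abstractmethod) would be the more idiomatic way to mark the unimplemented hook, but the commit keeps RuntimeError.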
src/spider/spider_66_ip.py (8 changes: 6 additions & 2 deletions)

@@ -1,3 +1,5 @@
+from typing import List, Iterable
+
 import requests
 
 from src.entity.proxy_entity import ProxyEntity
@@ -14,10 +16,10 @@ def __init__(self) -> None:
         super().__init__('66IP代理爬虫')
         self._base_url = 'http://www.66ip.cn'
 
-    def do_crawl(self):
+    def do_crawl(self) -> List[ProxyEntity]:
         result = []
         for page in range(1, 5):
-            print(f'第{page}页...')
+            # print(f'第{page}页...')
             resp = requests.get(f'{self._base_url}/{page}.html')
             resp.encoding = 'gb2312'
             soup = BeautifulSoup(resp.text, 'lxml')
@@ -34,3 +36,5 @@ def do_crawl(self):
                 print(f'{ip}:{port}/{region}/{proxy_type}/{check_time}')
                 result.append(ProxyEntity(ip, port, self._name, type=proxy_type, region=region))
         return result
+
+
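Spider66Ip fetches pages 1 through 4 of www.66ip.cn (the name passed to super().__init__, 66IP代理爬虫, means '66IP proxy spider'), decodes each response as gb2312, and parses the proxy table with BeautifulSoup; the commit adds the List[ProxyEntity] return annotation and comments out the per-page progress print ('第N页' = 'page N'). The diff collapses the table-parsing lines, so the following do_crawl body is a hedged reconstruction: the main-table lookup, the header-row skip, and the five-column layout (ip, port, region, type, check time) are assumptions about the site's markup, not code confirmed by this commit.

from typing import List

import requests
from bs4 import BeautifulSoup

from src.entity.proxy_entity import ProxyEntity
from src.spider.abs_spider import AbsSpider


class Spider66Ip(AbsSpider):

    def __init__(self) -> None:
        super().__init__('66IP代理爬虫')
        self._base_url = 'http://www.66ip.cn'

    def do_crawl(self) -> List[ProxyEntity]:
        result = []
        for page in range(1, 5):
            resp = requests.get(f'{self._base_url}/{page}.html')
            resp.encoding = 'gb2312'  # the site serves GB-encoded HTML
            soup = BeautifulSoup(resp.text, 'lxml')
            # Assumption: proxies sit in the rows of the page's main table,
            # first row is a header; columns: ip, port, region, type, check time.
            for row in soup.find('table').find_all('tr')[1:]:
                cells = [td.text.strip() for td in row.find_all('td')]
                if len(cells) < 5:
                    continue
                ip, port, region, proxy_type, check_time = cells[:5]
                result.append(ProxyEntity(ip, port, self._name,
                                          type=proxy_type, region=region))
        return result

Wired through the base class, a caller only ever invokes the template method: Spider66Ip().crawl().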