Skip to content

Commit

Permalink
update struct
Browse files Browse the repository at this point in the history
  • Loading branch information
Germey committed Feb 23, 2017
1 parent 05277b9 commit 72e024a
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 176 deletions.
201 changes: 123 additions & 78 deletions .idea/workspace.xml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion proxypool/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
__author__ = 'WiseDoge'
13 changes: 0 additions & 13 deletions proxypool/api.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,3 @@
"""
-------------------------------------------------
File Name: api.py
Description: API模块,运行后打开浏览器,访问
http://127.0.0.1:5000/进入主页。
访问 http://127.0.0.1:5000/get
从代理池中获取一个代理。
访问 http://127.0.0.1:5000/count
获取代理池中可用代理的总数。
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""
from flask import Flask, g

from .db import RedisClient
Expand Down
35 changes: 15 additions & 20 deletions proxypool/db.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
"""
-------------------------------------------------
File Name: db.py
Description: 数据库操作模块,负责对象与底层数据库
的交互。
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""
import redis

from .error import PoolEmptyError
from .setting import HOST, PORT
from proxypool.error import PoolEmptyError
from proxypool.setting import HOST, PORT


class RedisClient(object):
Expand All @@ -19,21 +9,21 @@ class RedisClient(object):
"""

def __init__(self, host=HOST, port=PORT):
    """Open a Redis connection for the proxy pool.

    Args:
        host: Redis host, defaulting to proxypool.setting.HOST.
        port: Redis port, defaulting to proxypool.setting.PORT.
    """
    # Single leading underscore keeps the attribute conventionally
    # private without double-underscore name mangling (the diff paste
    # had left both the old ``__db`` and new ``_db`` lines in place).
    self._db = redis.Redis(host, port)

def get(self, count=1):
    """Remove and return up to *count* proxies from the pool head.

    Args:
        count: maximum number of proxies to take (default 1).

    Returns:
        The raw values stored in the Redis list ``proxies``
        (bytes, as returned by redis-py). The returned entries are
        deleted from the pool via ltrim.
    """
    proxies = self._db.lrange("proxies", 0, count - 1)
    # Keep only the remainder of the list, i.e. consume what we read.
    self._db.ltrim("proxies", count, -1)
    return proxies

def put(self, proxy):
    """Push *proxy* into the pool, skipping duplicates.

    The Redis set ``set`` is a dedup index: ``sadd`` returns 1 only
    when the member was not already present, so each proxy is queued
    in the ``proxies`` list at most once.
    """
    if self._db.sadd("set", proxy):
        self._db.rpush("proxies", proxy)
    # A duplicate proxy is silently ignored (the former ``else: pass``
    # branch was redundant and has been dropped).

def pop(self):
    """Block (up to 30 seconds) for a proxy and return it as ``str``.

    Returns:
        The proxy string, UTF-8 decoded from the stored bytes.

    Raises:
        PoolEmptyError: if no proxy becomes available within 30 s
            (blpop returns None and the subscript raises TypeError).
    """
    try:
        # blpop returns a (key, value) pair; value is bytes.
        return self._db.blpop("proxies", 30)[1].decode('utf-8')
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        raise PoolEmptyError

@property
def queue_len(self):
    """Number of proxies currently waiting in the ``proxies`` list."""
    return self._db.llen("proxies")

def flush(self):
    """Wipe the entire Redis instance. Intended for testing only.

    NOTE(review): flushall clears every database on the server, not
    just the proxy-pool keys — do not call this in production.
    """
    self._db.flushall()


if __name__ == '__main__':
    # Quick manual smoke test: grab a batch of proxies and show them.
    client = RedisClient()
    batch = client.get(20)
    print(batch)
10 changes: 0 additions & 10 deletions proxypool/error.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
"""
-------------------------------------------------
File Name: error.py
Description: 异常模块
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""


class ResourceDepletionError(Exception):
"""
资源枯竭异常,如果从所有抓取网站都抓不到可用的代理资源,
Expand Down
17 changes: 1 addition & 16 deletions proxypool/getter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,3 @@
"""
-------------------------------------------------
File Name: proxyGetter.py
Description: 代理抓取模块,负责与网络的交互。
注意,代理网站的HTML结构可能会时常的更新,
会导致本文件下的抓取函数失效,所以,在运行
代理池之前,需要更新一下FreeProxyGetter类
中以crawl_开头的方法。
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""

import time

from .utils import get_page
from pyquery import PyQuery as pq

Expand Down Expand Up @@ -43,7 +28,7 @@ class FreeProxyGetter(object, metaclass=ProxyMetaclass):
添加器会自动识别并调用此类函数。
"""

def get_raw_proxies(self, callback, count=40):
def get_raw_proxies(self, callback):
proxies = []
print('Callback', callback)
for proxy in eval("self.{}()".format(callback)):
Expand Down
36 changes: 13 additions & 23 deletions proxypool/schedule.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,49 @@
"""
-------------------------------------------------
File Name: schedule.py
Description: 调度器模块,
包含ValidityTester,PoolAdder,
Schedule三个类,负责维护代理池。
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""
import time
from multiprocessing import Process
import asyncio
import aiohttp
from .db import RedisClient
from .error import ResourceDepletionError
from .getter import FreeProxyGetter
from .setting import *
from proxypool.db import RedisClient
from proxypool.error import ResourceDepletionError
from proxypool.getter import FreeProxyGetter
from proxypool.setting import *


class ValidityTester(object):
"""
检验器,负责对未知的代理进行异步检测。
"""
# 用百度的首页来检验
# Probe URL used to validate proxies; now taken from proxypool.setting
# (the diff paste had kept both the old literal and the new constant).
test_api = TEST_API

def __init__(self):
    """Start with no batch queued for testing and no validated results."""
    self._usable_proxies = []
    self._raw_proxies = None

def set_raw_proxies(self, proxies):
    """
    Load a fresh batch of proxies to be tested.

    Resets ``_usable_proxies`` so results from an earlier round do
    not leak into this one. (The pasted diff had left both the old
    and new docstring variants in the body, which did not parse.)
    """
    self._raw_proxies = proxies
    self._usable_proxies = []

async def test_single_proxy(self, proxy):
    """
    Probe one proxy against ``test_api``; append it to
    ``_usable_proxies`` when the request succeeds.

    NOTE(review): reconstructed from diff residue — the pre-image
    ``await response`` / ``print('Response from', ...)`` lines and the
    post-image ``print('Valid proxy', ...)`` lines were interleaved;
    this keeps the post-commit behavior. Confirm against the repo.
    """
    async with aiohttp.ClientSession() as session:
        try:
            real_proxy = 'http://' + proxy
            print('Testing', real_proxy)
            async with session.get(self.test_api, proxy=real_proxy, timeout=15) as response:
                self._usable_proxies.append(proxy)
                print('Valid proxy', proxy)
        except Exception:
            # Any connection/timeout error simply marks the proxy invalid.
            print('Invalid proxy', proxy)

def test(self):
"""异步检测_raw_proxies中的全部代理。
"""
异步检测_raw_proxies中的全部代理。
"""
print('ValidityTester is working')
loop = asyncio.get_event_loop()
Expand Down Expand Up @@ -92,9 +85,6 @@ def add_to_queue(self):
for callback_label in range(self._crawler.__CrawlFuncCount__):
callback = self._crawler.__CrawlFunc__[callback_label]
raw_proxies = self._crawler.get_raw_proxies(callback)
self._tester.set_raw_proxies(raw_proxies)
self._tester.test()
self._conn.put_many(self._tester.get_usable_proxies())
proxy_count += len(raw_proxies)
if proxy_count == 0:
raise ResourceDepletionError
Expand Down
11 changes: 2 additions & 9 deletions proxypool/setting.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
"""
-------------------------------------------------
File Name: setting.py
Description: 设置模块,包含了一些常量。
Author: Liu
Date: 2016/12/9
-------------------------------------------------
"""

# Redis Host
HOST = 'localhost'
# Redis PORT
Expand All @@ -18,3 +9,5 @@

# NOTE(review): presumably the interval, in seconds, between validity
# re-checks of pooled proxies — confirm against schedule.py usage.
VALID_CHECK_CYCLE = 600
# NOTE(review): presumably the interval, in seconds, between pool-size
# checks that trigger a refill — confirm against schedule.py usage.
POOL_LEN_CHECK_CYCLE = 20

TEST_API='http://www.baidu.com'
8 changes: 2 additions & 6 deletions proxypool/utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
import requests
import lxml
import asyncio
import time
import aiohttp
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# Default request headers; the pasted diff had left both the old
# (line-wrapped) and new (single-line) User-Agent entries in the dict.
base_headers = {
    # Desktop Chrome UA so free-proxy sites serve their normal pages.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}


def get_page(url, options={}):
headers = dict(base_headers, **options)
print('Getting', url, headers)
print('Getting', url)
try:
r = requests.get(url, headers=headers)
print('Getting result', url, r.status_code)
Expand Down

0 comments on commit 72e024a

Please sign in to comment.