forked from jhao104/proxy_pool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProxyRefreshSchedule.py
111 lines (87 loc) · 3.19 KB
/
ProxyRefreshSchedule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
File Name: ProxyRefreshSchedule.py
Description : 代理定时刷新
Author : JHao
date: 2016/12/4
-------------------------------------------------
Change Activity:
2016/12/4: 代理定时刷新
2017/03/06: 使用LogHandler添加日志
2017/04/26: raw_proxy_queue验证通过但useful_proxy_queue中已经存在的代理不在放入
-------------------------------------------------
"""
import sys
import time
import logging
from threading import Thread
from apscheduler.schedulers.background import BackgroundScheduler
sys.path.append('../')
from Util.utilFunction import validUsefulProxy
from Manager.ProxyManager import ProxyManager
from Util.LogHandler import LogHandler
__author__ = 'JHao'
logging.basicConfig()
class ProxyRefreshSchedule(ProxyManager):
"""
代理定时刷新
"""
def __init__(self):
ProxyManager.__init__(self)
self.log = LogHandler('refresh_schedule')
def validProxy(self):
"""
验证raw_proxy_queue中的代理, 将可用的代理放入useful_proxy_queue
:return:
"""
self.db.changeTable(self.raw_proxy_queue)
raw_proxy_item = self.db.pop()
self.log.info('ProxyRefreshSchedule: %s start validProxy' % time.ctime())
# 计算剩余代理,用来减少重复计算
remaining_proxies = self.getAll()
while raw_proxy_item:
raw_proxy = raw_proxy_item.get('proxy')
if isinstance(raw_proxy, bytes):
# 兼容Py3
raw_proxy = raw_proxy.decode('utf8')
if (raw_proxy not in remaining_proxies) and validUsefulProxy(raw_proxy):
self.db.changeTable(self.useful_proxy_queue)
self.db.put(raw_proxy)
self.log.info('ProxyRefreshSchedule: %s validation pass' % raw_proxy)
else:
self.log.info('ProxyRefreshSchedule: %s validation fail' % raw_proxy)
self.db.changeTable(self.raw_proxy_queue)
raw_proxy_item = self.db.pop()
remaining_proxies = self.getAll()
self.log.info('ProxyRefreshSchedule: %s validProxy complete' % time.ctime())
def refreshPool():
pp = ProxyRefreshSchedule()
pp.validProxy()
def batchRefresh(process_num=30):
# 检验新代理
pl = []
for num in range(process_num):
proc = Thread(target=refreshPool, args=())
pl.append(proc)
for num in range(process_num):
pl[num].daemon = True
pl[num].start()
for num in range(process_num):
pl[num].join()
def fetchAll():
p = ProxyRefreshSchedule()
# 获取新代理
p.refresh()
def run():
scheduler = BackgroundScheduler()
# 不用太快, 网站更新速度比较慢, 太快会加大验证压力, 导致raw_proxy积压
scheduler.add_job(fetchAll, 'interval', minutes=10, id="fetch_proxy")
scheduler.add_job(batchRefresh, "interval", minutes=1) # 每分钟检查一次
scheduler.start()
fetchAll()
while True:
time.sleep(3)
if __name__ == '__main__':
run()