Skip to content

Commit

Permalink
[Record] 2016/12/6 22:32
Browse files — browse the repository at this point in the history
  • Loading branch information
liuslnlp committed Dec 6, 2016
1 parent 721c899 commit 71f7948
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Editor settings. NOTE: "*.vscode" only matches files whose *name* ends in
# ".vscode"; ".vscode/" is added to actually ignore the settings directory.
*.vscode
.vscode/
# Python bytecode
*.pyc
# Local databases
*.db
28 changes: 28 additions & 0 deletions proxypool/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from flask import Flask, jsonify
import os
from db import RedisClient

# Flask application exposing the proxy pool over HTTP.
app = Flask(__name__)

# Single Redis-backed pool connection shared by all request handlers.
# (Dead commented-out app.config HOST/PORT block removed; the connection
# settings live in db.py.)
conn = RedisClient()


@app.route('/')
def index():
    """Landing page; confirms the API server is up."""
    greeting = '<h1>Hello World</h1>'
    return greeting

@app.route('/get')
def get_proxy():
    """Pop one proxy off the pool and return it as the response body."""
    proxy = conn.pop()
    return proxy

@app.route('/counts')
def get_counts():
    """Return the number of proxies currently queued in the pool.

    ``conn.queue_len`` is an int (redis LLEN); Flask view functions may not
    return a bare int, so it is converted to ``str`` here.
    """
    return str(conn.queue_len)

if __name__ == '__main__':
    # Start the Flask development server (127.0.0.1:5000 by default).
    app.run()
51 changes: 51 additions & 0 deletions proxypool/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import redis

# Redis server connection settings (default local instance).
HOST='localhost'
PORT=6379

class RedisClient(object):
    """Thin wrapper around a Redis connection that stores the proxy pool.

    Two Redis structures are used:
      * ``"proxies"``   -- a list acting as a FIFO queue of proxy addresses.
      * ``"proxy_set"`` -- a set used to deduplicate proxies on insertion.
    """

    def __init__(self):
        self.__db = redis.Redis(host=HOST, port=PORT)

    def get(self, count=1):
        """Remove and return up to ``count`` proxies from the head of the queue."""
        proxies = self.__db.lrange("proxies", 0, count - 1)
        self.__db.ltrim("proxies", count, -1)
        return proxies

    def put(self, proxy):
        """Append ``proxy`` to the queue unless it was added before.

        ``sadd`` returns 1 only when the member is new, so duplicates are
        silently skipped.
        """
        if self.__db.sadd("proxy_set", proxy):
            self.__db.rpush("proxies", proxy)

    def put_many(self, proxies):
        """Add every proxy in ``proxies`` (duplicates are dropped by put())."""
        for proxy in proxies:
            self.put(proxy)

    def pop(self):
        """Block up to 7 seconds for a proxy; return it decoded, or "111".

        Previously a bare ``except:`` masked every failure; now only the
        blpop timeout (which returns None) triggers the fallback.
        NOTE(review): "111" looks like a debug placeholder -- confirm what
        callers should receive when the pool is empty.
        """
        item = self.__db.blpop("proxies", 7)
        if item is None:
            # blpop timed out: the pool is empty.
            return "111"
        return item[1].decode('utf-8')

    @property
    def queue_len(self):
        """Current number of proxies waiting in the queue."""
        return self.__db.llen("proxies")

    def flush(self):
        """Drop the entire Redis database (queue, dedup set, everything)."""
        self.__db.flushall()




if __name__ == '__main__':
    # Smoke test: push two proxies, pop three times (the third pop hits the
    # empty-pool fallback), then wipe the database.
    conn = RedisClient()
    for sample in ("aaa", "bbb"):
        conn.put(sample)

    for _ in range(3):
        print(conn.pop())
    conn.flush()
28 changes: 28 additions & 0 deletions proxypool/poolAdder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from db import RedisClient
from vaildityTester import VaildityTester
from proxyGetter import CrawlFreeProxy


class PoolAdder(object):
    """Tops the proxy pool up to a target size by re-testing candidate proxies."""

    def __init__(self, threshold):
        # Target queue length; add() keeps working until it is reached.
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = VaildityTester()
        self._crawler = CrawlFreeProxy()

    def is_over_threshold(self):
        """Return True when the pool already holds at least ``threshold`` proxies."""
        return self._conn.queue_len >= self._threshold

    def add(self):
        """Test raw proxies and push usable ones until the threshold is met.

        Rewritten as an explicit loop: the original recursed into itself,
        risking RecursionError when the pool fills slowly. At least one
        round always runs, matching the original call order.
        """
        # TODO: add crawl pacing so the same source page is not fetched
        # repeatedly (translated from the original Chinese note).
        while True:
            raw_proxies = self._conn.get()
            self._tester.set_raw_proxies(raw_proxies)
            self._tester.test()
            self._conn.put_many(self._tester.get_usable_proxies())
            if self.is_over_threshold():
                return
51 changes: 51 additions & 0 deletions proxypool/proxyGetter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from utils import get_page, Downloader
from bs4 import BeautifulSoup

class CrawlFreeProxy(object):
    """Crawls free-proxy listing sites and yields "ip:port" strings."""

    def __init__(self):
        # No per-instance state yet; kept for interface stability.
        pass

    def get_raw_proxies(self, engine='1', count=50):
        """Dispatcher over the crawl_* sources -- not implemented yet."""
        pass

    def crawl_kuaidaili(self, page_count=8):
        """Yield proxies scraped from the first ``page_count`` kuaidaili pages.

        Each table row on the listing page contributes one "ip:port" string.
        (The dead commented-out Downloader-based variant was removed.)
        """
        start_url = 'http://www.kuaidaili.com/proxylist/{}/'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            soup = get_page(url)
            proxy_list = soup.find('div', {'id': 'index_free_list'}).find('tbody')
            for proxy in proxy_list.find_all('tr'):
                # Hoist the cell lookup: the original called find_all('td')
                # twice per row.
                cells = proxy.find_all('td')
                ip = cells[0].get_text()
                port = cells[1].get_text()
                yield ':'.join([ip, port])

    def crawl_daili66(self):
        # Placeholder for a second source.
        pass

    def crawl_xici(self):
        # Placeholder for a third source.
        pass

if __name__ == '__main__':
    # Manual smoke test: print every proxy found and the elapsed time.
    import time
    a = CrawlFreeProxy()
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for elapsed-time measurement.
    start = time.perf_counter()
    for proxy in a.crawl_kuaidaili():
        print(proxy)

    print(time.perf_counter() - start)
42 changes: 42 additions & 0 deletions proxypool/schedule.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from threading import Thread
from proxyGetter import CrawlFreeProxy
from vaildityTester import VaildityTester
from db import RedisClient
import time


class Schedule(object):
    """Background scheduler that keeps the proxy pool validated and topped up."""

    def __init__(self):
        self.__conn = RedisClient()
        self.__crawler = CrawlFreeProxy()

    def vaild_proxy(self, cycle=10):
        """Periodically pull a quarter of the pool for re-validation.

        NOTE(review): validation is unfinished -- proxies are pulled off the
        queue but never tested or re-inserted yet.
        """
        while True:
            count = int(0.25 * self.__conn.queue_len)
            raw_proxies = self.__conn.get(count)
            # TODO: test raw_proxies and push the usable ones back.
            # Sleep so the loop does not busy-spin at 100% CPU; the
            # ``cycle`` parameter was previously unused.
            time.sleep(cycle)

    def check_pool(self, threshold=10, cycle=10):
        """Every ``cycle`` seconds, refill the pool when it drops below ``threshold``."""
        while True:
            if self.__conn.queue_len < threshold:
                self.add_proxy()
            time.sleep(cycle)

    # Original author's note (translated): "everything below needs a rewrite".

    def add_proxy(self, threshold=30):
        """Crawl, test and insert proxies until the pool reaches ``threshold``.

        Rewritten as a loop: the original recursed into itself, risking
        RecursionError when few crawled proxies pass validation.
        """
        while True:
            raw_proxies = self.__crawler.get_raw_proxies()
            tester = VaildityTester(raw_proxies)
            tester.test()
            for proxy in tester.get_usable_proxies():
                self.__conn.put(proxy)
            if self.__conn.queue_len >= threshold:
                break

    def start_schedule(self):
        """Launch the validation and pool-check loops on background threads."""
        vaild_thread = Thread(target=self.vaild_proxy)
        check_thread = Thread(target=self.check_pool)
        vaild_thread.start()
        check_thread.start()
38 changes: 38 additions & 0 deletions proxypool/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import requests
import lxml
import asyncio
import aiohttp
from bs4 import BeautifulSoup

# Browser-like User-Agent so target sites serve normal pages to the scraper.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

def get_page(url):
    """Fetch ``url`` and return its body parsed as a BeautifulSoup document."""
    response = requests.get(url, headers=headers)
    html = response.content.decode("utf-8")
    return BeautifulSoup(html, 'lxml')




class Downloader(object):
    """Fetches a list of URLs concurrently with aiohttp.

    Results are cached after the first download, so reading ``htmls`` twice
    no longer re-fetches (and duplicates) every page, which the original did.
    Completion order is not guaranteed to match ``urls`` order.
    """

    def __init__(self, urls):
        self.urls = urls
        self.__htmls = []

    async def download_single_page(self, url):
        """Fetch one page and append its body text to the result list."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                self.__htmls.append(await resp.text())

    def download(self):
        """Download every URL; blocks until all requests complete.

        ``asyncio.run`` + ``gather`` replaces the deprecated
        get_event_loop / asyncio.wait(coroutines) combination, which is an
        error on modern Python (wait() requires Tasks, not bare coroutines).
        """
        async def _fetch_all():
            await asyncio.gather(
                *(self.download_single_page(url) for url in self.urls))
        asyncio.run(_fetch_all())

    @property
    def htmls(self):
        """Downloaded page bodies; the download runs lazily on first access."""
        if not self.__htmls:
            self.download()
        return self.__htmls
20 changes: 20 additions & 0 deletions proxypool/vaildityTester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import asyncio
import aiohttp

class VaildityTester(object):
    """Holds candidate proxies and (eventually) filters them for usability.

    ("Vaildity" is the project's existing spelling; kept for import
    compatibility.)
    """

    def __init__(self, raw_proxies=None):
        # ``is None`` instead of ``== None`` (PEP 8); copy the caller's list
        # so later extend() calls cannot mutate the caller's object.
        if raw_proxies is None:
            self.raw_proxies = []
        else:
            self.raw_proxies = list(raw_proxies)
        self.usable_proxies = []

    def set_raw_proxies(self, proxies):
        """Append ``proxies`` to the candidate list (extends, does not replace)."""
        self.raw_proxies.extend(proxies)

    def test(self):
        """Validate the candidates -- not implemented yet (no-op)."""
        pass

    def get_usable_proxies(self):
        """Return proxies that passed test() (empty until test() is written)."""
        return self.usable_proxies

0 comments on commit 71f7948

Please sign in to comment.