Skip to content

Commit

Permalink
知乎漂亮小姐姐爬虫 队列+多线程
Browse files Browse the repository at this point in the history
Signed-off-by: daacheng <[email protected]>
  • Loading branch information
daacheng committed Dec 24, 2018
1 parent 20fe76b commit 0eaef44
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 2 deletions.
4 changes: 2 additions & 2 deletions agent_pool/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def crawl_66(self):
'Accept-Language': 'zh-CN,zh;q=0.9',
'Host': 'www.66ip.cn',
'Referer': 'http://www.66ip.cn/',
'Cookie': 'yd_cookie=b04f4f2e-bece-46a1ac6ee50ce2ffd7b607de356fce654d8e; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1543800567,1543892144,1544411555,1544411564; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1544411564; _ydclearance=cdc56c50fac95ebdcc5cb44a-f476-4ed6-9412-40debf56f8a6-1544418766',
'Cookie': 'yd_cookie=ff2bb1a7-e805-4cddc1e057fab8bce8141159750163bcaf81; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1543754108,1544017881,1545407427; _ydclearance=2fd37a257ebe5b81bd666d7c-1713-4506-8d7d-ff5b5647189c-1545414626; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1545407429',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

Expand Down Expand Up @@ -104,7 +104,7 @@ def crawl_kuaidaili(self):
'Accept-Language': 'zh-CN,zh;q=0.9',
'Host': 'www.kuaidaili.com',
'Referer': 'http://www.66ip.cn/',
'Cookie': 'channelid=0; sid=1543828498576461; _ga=GA1.2.2019923346.1543828501; _gid=GA1.2.613511733.1544411613; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1543828501,1543892257,1544068211,1544411614; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1544411614',
'Cookie': 'channelid=0; sid=1544017933009606; _ga=GA1.2.493718697.1544017936; _gid=GA1.2.1906934372.1545407524; _gat=1; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1544017936,1545407524; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1545407525',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

Expand Down
130 changes: 130 additions & 0 deletions spider/zhihu_spider/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import csv
import os
import threading
import time
from queue import Empty, Queue

import requests

# Shared work queue drained by the downloader threads.
# Each item is one CSV row -- presumably [image_url, question_id]
# (that is how the workers unpack it); TODO confirm the column order
# against the script that writes image_urls.csv.
url_queue = Queue()
# Loaded once at import time; the file must exist in the working directory.
with open('image_urls.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        url_queue.put((row))

# Maps a Zhihu question id to the question's title.  The title doubles as
# the name of the folder the question's images are saved into, so the
# values are used verbatim as filesystem path components.
question_id_dict = {'292901966': '有着一双大长腿是什么感觉',
                    '26297181': '大胸女生如何穿衣搭配',
                    '274143680': '男生会主动搭讪一个长得很高并且长得好看的女生吗',
                    '266695575': '当你有一双好看的腿之后会不会觉得差一张好看的脸',
                    '297715922': '有一副令人羡慕的好身材是怎样的体验',
                    '26037846': '身材好是一种怎样的体验',
                    '28997505': '有个漂亮女朋友是什么体验',
                    '29815334': '女生腿长是什么感觉',
                    '35255031': '你的身材不配你的脸是一种怎样的体验',
                    '274638737': '大胸妹子夏季如何穿搭',
                    '264568089': '你坚持健身的理由是什么现在身材怎么样敢不敢发一张照片来看看',
                    '49075464': '在知乎上爆照是一种什么样的体验',
                    '22918070': '女生如何健身练出好身材',
                    '56378769': '女生身高170cm以上是什么样的体验',
                    '22132862': '女生如何选购适合自己的泳装',
                    '46936305': '为什么包臀裙大部分人穿都不好看',
                    '266354731': '被人关注胸部是种怎样的体验',
                    '51863354': '你觉得自己身体哪个部位最漂亮',
                    '66313867': '身为真正的素颜美女是种怎样的体验',
                    '34243513': '你见过最漂亮的女生长什么样',
                    '21052148': '有哪些评价女性身材好的标准'
                    }

# Base request headers.  NOTE(review): download_pictures() writes a
# per-request 'referer' into this dict, and the dict is shared by all
# worker threads -- treat it as read-only template data when modifying.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

# Root directory where downloaded pictures are stored (one subfolder
# per question title).
base_dir = r'D:\zhihu_picture'


def get_proxy():
    """Fetch one proxy from the local proxy-pool web service.

    Queries the proxy pool's web interface (see the agent_pool project)
    at http://127.0.0.1:5000/random.

    Returns:
        dict | None: A ``proxies`` mapping for ``requests``
        (``{'http': 'http://<ip:port>'}``), or ``None`` when the pool is
        unreachable or answers with a non-200 status.
    """
    get_proxy_url = 'http://127.0.0.1:5000/random'
    try:
        # timeout added: without it a dead/hung proxy-pool service would
        # block the calling worker thread indefinitely.
        res = requests.get(get_proxy_url, timeout=5)
    except Exception as e:
        print('从代理池中获取代理IP出错了!! %s' % e)
        return None
    if res.status_code == 200:
        print('从代理池中获取代理IP: %s' % res.text)
        return {'http': 'http://' + res.text}
    return None


def _fetch_image(image_url, question_id):
    """Download one image, retrying through fresh proxies.

    Makes up to 6 proxied attempts, then one final direct (proxy-less)
    attempt.  Returns the raw image bytes, or None if every attempt failed.
    """
    # Per-call copy: the module-level `headers` dict is shared by all
    # worker threads, so it must never be mutated in place (the original
    # code wrote headers['referer'] from 10 threads concurrently).
    request_headers = dict(headers)
    request_headers['referer'] = 'https://www.zhihu.com/question/' + question_id

    attempts = 0
    while attempts <= 5:
        try:
            proxies = get_proxy()  # may be None -> requests goes direct
            res = requests.get(image_url, proxies=proxies,
                               headers=request_headers, timeout=3)
            print('返回值: %s, url: %s' % (res.status_code, image_url))
            if res.status_code == 200:
                return res.content
        except Exception as e:
            print('请求次数: %s (%s)' % (attempts, e))
        attempts += 1

    # Last resort: one direct request without a proxy.  Guarded by
    # try/except -- the original fallback was unprotected inside an
    # except handler, so any network error here killed the whole thread.
    try:
        res = requests.get(image_url, headers=request_headers, timeout=3)
        if res.status_code == 200:
            return res.content
    except Exception as e:
        print('请求次数: %s (%s)' % (attempts, e))
    return None


def _save_image(data, image_url, question_id):
    """Write image bytes to base_dir/<question title>/<url basename>.

    Silently skips ids missing from question_id_dict (there is no folder
    name to use for them).
    """
    question_name = question_id_dict.get(question_id)
    if question_name is None:
        # Unknown question id: previously os.path.join(base_dir, None)
        # raised TypeError and was swallowed by a broad except.
        return
    pic_dir = os.path.join(base_dir, question_name)
    # exist_ok handles the race where two workers create the same folder.
    os.makedirs(pic_dir, exist_ok=True)
    pic_name = image_url.split('/')[-1]
    pic_path = os.path.join(pic_dir, pic_name)
    with open(pic_path, 'wb') as f:
        f.write(data)
    print('下载成功: ', pic_name)


def download_pictures():
    """Worker loop: drain (image_url, question_id) items from url_queue.

    Each item is downloaded via _fetch_image and stored via _save_image.
    The worker exits once the queue has been idle for 180 seconds.
    """
    while True:
        try:
            item = url_queue.get(block=True, timeout=180)
        except Empty:
            # Queue has been empty for 3 minutes: this worker is done.
            break

        try:
            image_url, question_id = item
        except ValueError:
            # Malformed CSV row.  Skip it; the original bare `except`
            # around the unpack terminated the whole worker instead.
            continue

        data = _fetch_image(image_url, question_id)
        if data is None:
            continue

        try:
            _save_image(data, image_url, question_id)
        except Exception as e:
            print('存入图片数据出错, (%s)' % e)


def main():
    """Spawn ten worker threads that concurrently drain the image queue."""
    workers = [threading.Thread(target=download_pictures) for _ in range(10)]
    for worker in workers:
        worker.start()


if __name__ == '__main__':
    main()

0 comments on commit 0eaef44

Please sign in to comment.