1_QuestionInfo_essence.py (forked from Glenn1Q84/zhihu-TopicSpider)
import os
import re
import json
import time

import requests

from Package.encrypt import encrypt
from utils import json_write, txt_save, ensure_dir

def timestamp2time(timestamp):
    """Convert a Unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string in local time."""
    time_array = time.localtime(timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", time_array)
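
# For example (the result depends on the machine's local timezone; in UTC+8):
# >>> timestamp2time(1609459200)
# '2021-01-01 08:00:00'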

class GetOneQuestion:
    """Fetch one page of a topic's essence feed and extract question metadata."""

    def __init__(self, topic_id, offset, proxies):
        self.topic_id = topic_id
        self.proxies = proxies
        self.offset = offset

    def __get_url(self):
        # The long, URL-encoded `include` parameter selects which fields the API
        # returns; `offset` and `limit` control pagination (10 items per page).
        url = "https://www.zhihu.com/api/v4/topics/{}/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&offset={}&limit=10&after_id=0".format(
            self.topic_id, self.offset)
        print(url)
        return url
    def __get_html(self):
        try:
            url = self.__get_url()
            headers = get_headers(url)
            r = requests.get(url=url, headers=headers, proxies=self.proxies)
            r.encoding = 'utf-8'
            return json.loads(r.text)
        except Exception as e:
            print("Failed to fetch the response:", e)  # implicitly returns None
    def parse_html(self):
        # If __get_html() failed and returned None, the subscript below raises,
        # which the retry loop in GetQuestion() catches.
        data = self.__get_html()["data"]
        question_lst = []
        for i, item in enumerate(data):
            print("Current time: {}, fetching question #{}".format(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), i + self.offset))
            target = item.get("target").get("question")
            if target:
                dict_question = {
                    "question_id": target.get("id"),
                    "question_title": target.get("title"),
                    "created_time": timestamp2time(target.get("created_time")),
                    "type": target.get("type"),
                }
                question_lst.append(dict_question)
        return question_lst
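
# Shape of the list parse_html() returns (illustrative values, not real data):
# [{"question_id": 123456789,
#   "question_title": "...",
#   "created_time": "2021-01-01 08:00:00",
#   "type": "question"}, ...]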

# Reference: the analogous top_question endpoint for a topic:
# https://www.zhihu.com/api/v4/topics/19550937/feeds/top_question?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Canswer_type%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.paid_info%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&limit=10&after_id=0

def GetQuestion(topic_id, question_count, start_offset, proxies, MAX_ATTEMPTS, save_dir):
    """Page through the essence feed from start_offset up to question_count,
    saving each page to its own JSON file and retrying failed pages."""
    ensure_dir(save_dir)
    offset = start_offset
    error_offset = []
    while offset <= question_count:
        attempts = 0
        success = False
        while attempts < MAX_ATTEMPTS and not success:
            try:
                print("Current time: {}, fetching questions at offset={}".format(
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), offset))
                s1_time = time.perf_counter()
                question = GetOneQuestion(topic_id=topic_id, offset=offset, proxies=proxies)
                question_lst = question.parse_html()
                filename = os.path.join(save_dir, "offset_{}.json".format(offset))
                json_write(data=question_lst, filename=filename)
                print("Saved offset={} to:".format(offset), filename)
                e1_time = time.perf_counter()
                print("Time spent on this page:", e1_time - s1_time)
                success = True
                # Note: offset advances by 5 while each request asks for limit=10,
                # so consecutive pages overlap.
                offset = offset + 5
                time.sleep(5)
            except Exception as e:
                attempts = attempts + 1
                print("Failed to fetch offset={} ({}), sleeping 20 seconds...".format(offset, e))
                time.sleep(20)
                print("Resuming offset={}, attempts={}".format(offset, attempts))
                if attempts == MAX_ATTEMPTS:
                    print("Giving up on offset={}, skipping it".format(offset))
                    error_offset.append(offset)
                    offset = offset + 5
                    break
            finally:
                # Re-persist the failed offsets after every attempt so they
                # survive an interrupted run.
                if len(error_offset) != 0:
                    error_filename = os.path.join(save_dir, "error_offsets.txt")
                    txt_save(input_list=error_offset, filename=error_filename)
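
# A possible resume helper (hypothetical, not part of the original script):
# re-read error_offsets.txt and give each failed offset another round of
# attempts. Assumes txt_save() wrote one offset per line.
def retry_failed_offsets(topic_id, proxies, save_dir, max_attempts=5):
    error_filename = os.path.join(save_dir, "error_offsets.txt")
    if not os.path.exists(error_filename):
        return
    with open(error_filename, encoding="utf-8") as f:
        failed = [int(line) for line in f if line.strip()]
    for off in failed:
        # question_count == start_offset makes GetQuestion process exactly one page
        GetQuestion(topic_id=topic_id, question_count=off, start_offset=off,
                    proxies=proxies, MAX_ATTEMPTS=max_attempts, save_dir=save_dir)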

def get_sublist_all_elements(input_lst):
    """Flatten a list of lists by one level."""
    out_lst = []
    for item in input_lst:
        out_lst.extend(item)
    return out_lst
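
# e.g. get_sublist_all_elements([[1, 2], [3]]) -> [1, 2, 3]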

def get_headers(url):
    """Build request headers, signing the request path with Zhihu's x-zse-96 scheme."""
    X_ZSE_93 = '101_3_2.0'
    useragent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'
    # encrypt() signs the path-and-query part of the URL (everything after
    # zhihu.com) and returns the x-zse-96 signature plus a d_c0 cookie value.
    sign, cookies = encrypt(X_ZSE_93, re.sub(r'.*zhihu\.com', '', url))
    headers = {
        'cookie': f'd_c0={cookies}',
        'user-agent': useragent,
        'x-zse-93': X_ZSE_93,
        'x-zse-96': sign
    }
    return headers
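
# Illustrative call (actual header values depend on Package.encrypt):
# headers = get_headers("https://www.zhihu.com/api/v4/topics/19550937/feeds/essence?offset=0&limit=10")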

if __name__ == '__main__':
    proxy_account = "fee2dyq3og:[email protected]:23128"
    # requests matches proxy keys against the lowercase URL scheme, and the API
    # is served over https, so register the proxy under both schemes.
    PROXIES = {"http": "http://" + proxy_account,
               "https": "http://" + proxy_account}
    for topic_id in ["19651260", "19629961", "19684571"]:
        save_dir = os.path.join("output", topic_id, "essence/questions_info")
        GetQuestion(topic_id=topic_id, question_count=1100, start_offset=0,
                    proxies=PROXIES, MAX_ATTEMPTS=5, save_dir=save_dir)
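
# Expected output layout per topic, from the paths above:
#   output/<topic_id>/essence/questions_info/offset_<n>.json
#   output/<topic_id>/essence/questions_info/error_offsets.txt  (only if failures occurred)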