# get_parks_companies_threads.py
# Forked from DropsDevopsOrg/ECommerceCrawlers.
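"""Multithreaded scraper for companies registered in industrial parks on qichacha.com.

Reads one row per park from ./csv/全国工业园区信息.csv, then runs a pool of worker
threads that fetch each park's more_zonecompany.html listing pages and append
one CSV row per company to ./csv/<companies_name>.csv. Failed (id, page) pairs
are recorded in ./csv/error.csv so they can be retried later.
"""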
import csv
import os
import random
import threading
import time

import pandas as pd
import requests
from lxml import etree
# from fake_useragent import UserAgent

from config import *  # must provide cookie, proxies, companies_name


def log(txt):
    """Print a message and append it to log.txt."""
    print(txt)
    with open('log.txt', 'a', encoding='utf-8') as f:
        f.write(txt + '\n')


class QiChaCha:
    """Scrape the companies listed under each industrial park on qichacha.com."""

    def __init__(self, cookie, proxies, companies_name):
        self.cookie = cookie
        self.proxies = proxies
        self.companies_name = companies_name
        # ua = UserAgent(verify_ssl=False)
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'DNT': '1',
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/more_zonecompany.html?id=000c85b2a120712454f4c5b74e4fdfae&p=2',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
            # 'User-Agent': ua.random
        }
        self.path = './csv/'
        self.file_name = self.path + self.companies_name + '.csv'
        self.ListTask = []
        # One row per industrial park (national industrial-park info CSV).
        self.csv_data = pd.read_csv('./csv/全国工业园区信息.csv')
        self.length = len(self.csv_data)
        self.work()

    def get_companies(self, id, page_no):
        """Fetch one listing page for a park id and return it as an lxml tree."""
        url = 'https://www.qichacha.com/more_zonecompany.html?id={}&p={}'.format(
            id, page_no)
        while True:
            try:
                # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
                with requests.get(url, headers=self.headers) as response:
                    html = response.text
                    parseHtml = etree.HTML(html)
                    return parseHtml
            except Exception:
                # log('Proxy request failed, retrying task!')
                log('Connection failed, retrying task!')

    def get_companies_all(self, name_thread, id, province, city, county, park, area, numcop):
        """Scrape every listing page of one park and append the rows to the output CSV."""
        num_page = (numcop + 9) // 10  # 10 companies per page, so ceil(numcop / 10)
        for i in range(1, num_page + 1):
            num_writer = 0  # rows written for this page; 0 suggests anti-crawling kicked in
            parseHtml = self.get_companies(id, i)
            # sample: '/firm_2468290f38f4601299b29acdf6eccce9.html'
            rUrls = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/a/@href')
            # sample: '临海市互通汽车销售有限公司' (company name)
            rTitle = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/a/text()')
            # sample: '黄剑勇' (legal representative)
            rPerson = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/a/text()')
            # sample: '注册资本:1000万元人民币' (registered capital)
            rCapital = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/span[1]/text()')
            # sample: '成立日期:2017-09-08' (date founded)
            rSetTime = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/span[2]/text()')
            # sample: '\n 邮箱:[email protected]\n ' (email)
            rEmail = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[2]/text()')
            # sample: '电话:0576-85323665' (phone)
            rPhone = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[2]/span/text()')
            # sample: '\n 地址:浙江省台州市临海市江南街道恒大家居建材城(靖江南路112号)\n ' (address)
            rAddress = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[3]/text()')
            # sample: '存续' (registration status)
            rState = parseHtml.xpath(
                '//div[@class="e_zone-company"]/section/table/tbody/tr/td[3]/span/text()')
            num_current = len(rUrls)
            for num in range(num_current):
                try:
                    url = 'https://www.qichacha.com' + rUrls[num]
                    company = rTitle[num]
                    person = rPerson[num]
                    capital = rCapital[num].replace('注册资本:', '')
                    settime = rSetTime[num].replace('成立日期:', '')
                    email = rEmail[num].replace(
                        '\n', '').replace('邮箱:', '').strip()
                    phone = rPhone[num].replace('电话:', '')
                    address = rAddress[num].replace(
                        '\n', '').replace('地址:', '').strip()
                    state = rState[num]
                    L = [province, city, county, park, area, numcop, company,
                         person, capital, settime, email, phone, address, state, url]
                    # print(L)
                    with open(self.file_name, 'a', newline='', encoding='utf-8') as f:
                        writer = csv.writer(f)
                        writer.writerow(L)
                    num_writer += 1
                except Exception:
                    self.err_log(id, i)
                    log('{} error at ID: {} , page: {} / {}'.format(
                        name_thread, id, i, num_page))
            if num_writer == 0:
                log('{} wrote nothing for ID: {} , page: {} / {}, check anti-crawling measures'.format(name_thread, id, i, num_page))
                self.err_log(id, i)
            else:
                log('{} finished scraping ID: {} , page: {} / {}'.format(name_thread, id, i, num_page))

    def err_log(self, id, page):
        """Record a failed (id, page) pair in ./csv/error.csv for later retry."""
        err_file = self.path + 'error.csv'
        if not os.path.exists(err_file):
            header = ['id', 'page']
            with open(err_file, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(header)
        with open(err_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([id, page])

    def thread_task(self):
        """Worker loop: pop park rows off self.ListTask until the list stays empty."""
        name_thread = threading.current_thread().name
        n = 3  # if self.ListTask comes up empty three times in a row, assume all data is scraped
        while True:
            if n == 0:
                break
            try:
                i = self.ListTask.pop(0)
                province = self.csv_data.loc[i, 'province']
                city = self.csv_data.loc[i, 'city']
                county = self.csv_data.loc[i, 'county']
                park = self.csv_data.loc[i, 'park']
                area = self.csv_data.loc[i, 'area']
                numcop = self.csv_data.loc[i, 'numcop']
                id = self.csv_data.loc[i, 'url'].split('_')[-1]
                self.get_companies_all(name_thread, id, province,
                                       city, county, park, area, numcop)
                log('\n\n{} finished park ID: {}, overall progress: {} / {}\n\n=============================\n'.format(
                    name_thread, id, i + 1, self.length))
                n = 3
            except Exception:
                n -= 1
                time.sleep(random.randint(3, 10))

    def work(self):
        # Create the output folder if it does not exist.
        if not os.path.exists(self.path):
            os.makedirs(self.path)
            log(self.path + ' folder created')
        # Create the output file with a header row if it does not exist.
        if not os.path.exists(self.file_name):
            header = ['province', 'city', 'county', 'park', 'area', 'numcop', 'company',
                      'person', 'capital', 'settime', 'email', 'phone', 'address', 'state', 'url']
            with open(self.file_name, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(header)
        for i in range(self.length):
            self.ListTask.append(i)
        threads = []
        for i in range(200):
            thread = threading.Thread(target=self.thread_task, args=())
            threads.append(thread)
        # Start the worker threads, then wait for all of them to finish.
        for t in threads:
            t.start()
            log('Started thread: ' + t.name)
        for t in threads:
            t.join()
            log('Thread finished: ' + t.name)
        log('Main thread finished! ' + threading.current_thread().name)


if __name__ == "__main__":
    QiChaCha(cookie, proxies, companies_name)
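
# A minimal sketch of the config.py that `from config import *` above expects.
# The values shown are placeholders and assumptions, not part of this file;
# only the three names cookie, proxies, companies_name are actually required.
#
# cookie = '<qichacha.com session cookie string copied from a logged-in browser>'
# proxies = {'http': 'http://127.0.0.1:1080',   # requests-style proxy mapping; only used if
#            'https': 'http://127.0.0.1:1080'}  # the proxied call in get_companies() is uncommented
# companies_name = 'parks_companies'            # base name of the output CSV under ./csv/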