🍻 [feat] Add Amap POI query
卜俊杰 committed Mar 3, 2020
1 parent 8121a4a commit b47dcda
Showing 13 changed files with 510 additions and 440 deletions.
26 changes: 23 additions & 3 deletions QiChaCha/README.md
@@ -2,8 +2,15 @@

## Cookie strategy

1. Buy a one-day QiChaCha membership on taobao, log in and run a query, then read the cookie
2. Verified: QCCSESSID=i1bie6vpgudru9g56pkf8a5141 alone is enough
- QiChaCha

1. For a few yuan on taobao, buy a one-day or one-month QiChaCha membership, log in and run a query, then take `QCCSESSID` and `acw_tc` from the cookie

- Amap

1. Amap has no IP-based anti-crawling mechanism
2. For Amap POI queries, if the data volume is under 30,000 records, register as a verified individual developer to get the free API tier: 30,000 calls per day with 50-200 concurrent requests (see the sketch after this list)
3. If the volume is far above 30,000, scrape the web pages instead; with enough CPU, memory, and bandwidth you can run 50-100 concurrent threads (CPython threads are constrained by the GIL, so benchmark your own concurrency: more threads is not always faster)
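
If you stay under the free quota, the official Web Service geocoding API is simpler than scraping. A minimal sketch, assuming a key registered in the Amap open-platform console (`YOUR_AMAP_KEY` is a placeholder; the response fields follow the documented geocode format):

```python
import requests

AMAP_KEY = 'YOUR_AMAP_KEY'  # placeholder: your free individual-developer key


def geocode(address):
    """Look up (longitude, latitude) via the free Amap geocoding API."""
    resp = requests.get(
        'https://restapi.amap.com/v3/geocode/geo',
        params={'address': address, 'key': AMAP_KEY},
        timeout=10,
    )
    data = resp.json()
    # status '1' means success; geocodes[0].location is 'lng,lat'
    if data.get('status') == '1' and data.get('geocodes'):
        lng, lat = data['geocodes'][0]['location'].split(',')
        return float(lng), float(lat)
    return None


if __name__ == '__main__':
    print(geocode('上海市嘉定区真新街道金沙江路3131号'))
```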

- Note

@@ -16,7 +23,20 @@

1. First fetch the park info (province, city-district, land area, company count, detail link) and save it as csv
2. Visit the detail links one by one to collect every company (a minimal pagination sketch follows this list)
3. Merge all the data into one table
3. Merge all the data into one table (anti-crawling blocks and resume-after-interruption leave duplicate rows, so deduplicate and sort the csv)
4. Crawl Amap POI info (parks and companies)
5. Online visualization with Tableau & echarts, django, mysql, etc.
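
A hypothetical sketch of the paging loop behind step 2; the URL pattern and XPath are taken from `deal_error.py` in this commit, while the real crawler (`get_parks_companies_threads.py`) adds cookies, proxies, threading, and error logging on top:

```python
import requests
from lxml import etree


def iter_company_pages(park_id, headers):
    # Walk one park's company list page by page until an empty page,
    # which means we are past the last page (or have been blocked).
    page = 1
    while True:
        url = ('https://www.qichacha.com/more_zonecompany.html'
               '?id={}&p={}'.format(park_id, page))
        html = etree.HTML(requests.get(url, headers=headers).text)
        names = html.xpath('//div[@class="e_zone-company"]'
                           '/section/table/tbody/tr/td[2]/a/text()')
        if not names:
            break
        yield page, names
        page += 1
```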

## File descriptions
1. `config.py` configuration (QiChaCha needs a proxy, Amap does not)
2. `get_parks.py` fetch park info
3. `get_parks_companies.py` fetch company info (single-threaded, has an unfixed bug)
4. `get_parks_companies_threads.py` fetch company info (multi-threaded, bug fixed)
5. `deal_error.py` handle errors from the QiChaCha crawl (re-crawl the failed pages, fall back to other strategies, etc.)
6. `deal_result.py` post-process the company csv: deduplicate and sort
7. `get_addr_longitude_latitude.py` Amap POI lookup (address, longitude, latitude), imported as a helper by the other files (see the sketch below)
8. `get_parks_addr_long_lati.py` Amap POI lookup for parks (address, longitude, latitude)
9. `get_companies_addr_long_lati.py` Amap POI lookup for companies (address [already known], longitude, latitude)
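
A hypothetical illustration of how the helper in item 7 is reused by items 8 and 9 (only `get_addr_longitude_latitude.py` itself appears in this commit; the function returns `[address, longitude, latitude]`):

```python
from get_addr_longitude_latitude import get_addr_longitude_latitude

# the helper falls back to the query keyword as the address when Amap
# returns no address, and to ['', '', ''] when the lookup fails entirely
address, longitude, latitude = get_addr_longitude_latitude('上海市嘉定区真新街道金沙江路3131号')
print(address, longitude, latitude)
```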

## About the author

2 changes: 1 addition & 1 deletion QiChaCha/config.py
@@ -14,7 +14,7 @@


# cookie: open the browser dev console and copy it from a logged-in query
_QCCSESSID = 'ep8acmi9t19bbntrvvq520q3m3'  # fill in the value from a VIP account
_QCCSESSID = 'viajssfpmd8aohi8msai5a11p0'  # fill in the value from a VIP account
# a cookie-less Postman request returns this value, valid for months; the one from a VIP account also works but expires quickly
_acw_tc = 'b683069715829049003957779e854980863847b379248edf39287f83c4'
cookie = 'acw_tc={}; QCCSESSID={}'.format(_acw_tc, _QCCSESSID)
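
# A minimal sketch of the proxies dict this module also exports (elided
# from this hunk); assumed to follow the standard requests proxy format:
# proxies = {'http': 'http://user:pass@host:port',
#            'https': 'http://user:pass@host:port'}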
31 changes: 0 additions & 31 deletions QiChaCha/csv示例/error.csv
@@ -2,34 +2,3 @@ id,page
000c85b2a120712454f4c5b74e4fdfae,6
0011bb96f23642524fa9021aed17a260,9
0011bb96f23642524fa9021aed17a260,9
0011bb96f23642524fa9021aed17a260,21
0011bb96f23642524fa9021aed17a260,29
0011bb96f23642524fa9021aed17a260,36
0011bb96f23642524fa9021aed17a260,37
0011bb96f23642524fa9021aed17a260,37
0011bb96f23642524fa9021aed17a260,37
0011bb96f23642524fa9021aed17a260,37
0011bb96f23642524fa9021aed17a260,38
0011bb96f23642524fa9021aed17a260,38
0011bb96f23642524fa9021aed17a260,39
0011bb96f23642524fa9021aed17a260,39
0011bb96f23642524fa9021aed17a260,39
0011bb96f23642524fa9021aed17a260,40
0011bb96f23642524fa9021aed17a260,40
0011bb96f23642524fa9021aed17a260,40
0011bb96f23642524fa9021aed17a260,41
0011bb96f23642524fa9021aed17a260,41
001aaabefb0ddaa8b98a42d87440a82f,1
001aaabefb0ddaa8b98a42d87440a82f,159
001aaabefb0ddaa8b98a42d87440a82f,163
003cdf3fa19a0970e007201c1a0f5da0,1
003cdf3fa19a0970e007201c1a0f5da0,1
003cdf3fa19a0970e007201c1a0f5da0,2
003cdf3fa19a0970e007201c1a0f5da0,5
003cdf3fa19a0970e007201c1a0f5da0,6
003cdf3fa19a0970e007201c1a0f5da0,17
003cdf3fa19a0970e007201c1a0f5da0,27
003cdf3fa19a0970e007201c1a0f5da0,36
003cdf3fa19a0970e007201c1a0f5da0,37
003cdf3fa19a0970e007201c1a0f5da0,37
003cdf3fa19a0970e007201c1a0f5da0,38
261 changes: 0 additions & 261 deletions QiChaCha/csv示例/全国工业区企业简要信息.csv

Large diffs are not rendered by default.

138 changes: 0 additions & 138 deletions QiChaCha/csv示例/全国工业园区信息.csv

Large diffs are not rendered by default.

225 changes: 225 additions & 0 deletions QiChaCha/deal_error.py
@@ -0,0 +1,225 @@
import os
import requests
from config import *
from lxml import etree
import csv
# from fake_useragent import UserAgent
import pandas as pd
import threading
import time
import random

# deduplicate the error log before re-crawling
_csv_data = pd.read_csv('./csv/error.csv')
# print(csv_data.duplicated(), len(csv_data))
_csv_data_single = _csv_data.drop_duplicates()
# print(csv_data_single.duplicated(), len(csv_data_single))

# rebuild a clean 0..n-1 index after dropping duplicates
_csv_data_single['index'] = range(_csv_data_single.shape[0])
csv_data_res = _csv_data_single.set_index('index')
# print(csv_data_res)


class DealError:
    def __init__(self, cookie, proxies, companies_name, csv_data_res):
        self.cookie = cookie
        self.proxies = proxies
        self.companies_name = companies_name
        # ua = UserAgent(verify_ssl=False)
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # 'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': self.cookie,
            'DNT': '1',
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/more_zonecompany.html?id=000c85b2a120712454f4c5b74e4fdfae&p=2',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
            # 'User-Agent': ua.random
        }
        self.path = './csv/'
        self.file_name = self.path + self.companies_name + '.csv'
        self.ListTask = []
        self.csv_data = pd.read_csv('./csv/全国工业园区信息.csv')
        self.csv_data_res = csv_data_res
        self.length = len(self.csv_data_res)

        # print(self.csv_data)
        self.work()

    def get_companies(self, id, page_no):
        url = 'https://www.qichacha.com/more_zonecompany.html?id={}&p={}'.format(
            id, page_no)
        # print(url)
        while True:
            try:
                # with requests.get(url, headers=self.headers, proxies=self.proxies) as response:
                with requests.get(url, headers=self.headers) as response:
                    # response = requests.get(url, headers=self.headers)
                    html = response.text
                    parseHtml = etree.HTML(html)
                    # print(1)

                    return parseHtml
            except Exception as e:
                # log('proxy request failed, retrying the task!')
                print('connection failure, retrying the task!')

    def get_companies_all(self, name_thread, id, i, province, city, county, park, area, numcop):
        # i = page_no
        num_writer = 0  # counts rows written; zero suggests the anti-crawling mechanism kicked in
        # for i in range(1, 2):
        parseHtml = self.get_companies(id, i)
        # print(parseHtml)
        # '/firm_2468290f38f4601299b29acdf6eccce9.html'
        rUrls = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/a/@href')
        # '临海市互通汽车销售有限公司'
        rTitle = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/a/text()')
        # '黄剑勇'
        rPerson = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/a/text()')
        # '注册资本:1000万元人民币'
        rCapital = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/span[1]/text()')
        # '成立日期:2017-09-08'
        rSetTime = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[1]/span[2]/text()')
        # '\n 邮箱:[email protected]\n '
        rEmail = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[2]/text()')
        # '电话:0576-85323665'
        rPhone = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[2]/span/text()')
        # '\n 地址:浙江省台州市临海市江南街道恒大家居建材城(靖江南路112号)\n '
        rAddress = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[2]/p[3]/text()')
        # '存续'
        rState = parseHtml.xpath(
            '//div[@class="e_zone-company"]/section/table/tbody/tr/td[3]/span/text()')

        # print(rUrls)
        num_current = len(rUrls)
        for num in range(num_current):
            try:
                url = 'https://www.qichacha.com' + rUrls[num]
                company = rTitle[num]
                person = rPerson[num]
                capital = rCapital[num].replace('注册资本:', '')
                settime = rSetTime[num].replace('成立日期:', '')
                email = rEmail[num].replace(
                    '\n', '').replace('邮箱:', '').strip()
                phone = rPhone[num].replace('电话:', '')
                address = rAddress[num].replace(
                    '\n', '').replace('地址:', '').strip()
                state = rState[num]
                L = [province, city, county, park, area, numcop, company,
                     person, capital, settime, email, phone, address, state, url]
                # print(L)
                with open(self.file_name, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.writer(f)
                    writer.writerow(L)
                    print(L)
                num_writer += 1
            except Exception as e:
                self.err_log(id, i)
                print('{} error at ID: {}, page: {}'.format(name_thread, id, i))
        if num_writer == 0:
            print('{} wrote nothing for ID: {}, page: {}; check the anti-crawling mechanism'.format(name_thread, id, i))
            self.err_log(id, i)
        else:
            print('{} finished crawling ID: {}, page: {}'.format(name_thread, id, i))

    def err_log(self, id, page):
        err_file = self.path + 'error1.csv'
        if not os.path.exists(err_file):
            header = ['id', 'page']
            with open(err_file, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(header)
        with open(err_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow([id, page])

    def thread_task(self):
        name_thread = threading.current_thread().name
        n = 3  # after three consecutive empty pops from self.ListTask, assume the crawl is complete
        while True:
            if n == 0:
                break
            try:
                i = self.ListTask.pop(0)
                # print(i)
                id = self.csv_data_res.loc[i, 'id']
                # print(id)
                page_no = self.csv_data_res.loc[i, 'page']

                index = self.get_index(id)  # index of the original park row
                # print(index)
                province = self.csv_data.loc[index, 'province']
                city = self.csv_data.loc[index, 'city']
                county = self.csv_data.loc[index, 'county']
                park = self.csv_data.loc[index, 'park']
                area = self.csv_data.loc[index, 'area']
                numcop = self.csv_data.loc[index, 'numcop']

                self.get_companies_all(name_thread, id, page_no, province,
                                       city, county, park, area, numcop)
                print('\n\n{} finished ID: {}, overall progress: {} / {}\n\n=============================\n'.format(
                    name_thread, id, i + 1, self.length))
                n = 3
            except Exception as e:
                n -= 1
                time.sleep(random.randint(3, 10))

    def get_index(self, id):
        # linear scan of the parks csv for the row whose detail url matches this id
        url = 'https://www.qichacha.com/zonecompany_' + id
        l_csv_data = len(self.csv_data)
        for index in range(l_csv_data):
            if self.csv_data.loc[index, 'url'] == url:
                return index

    def work(self):
        # create the output folder if missing
        if not os.path.exists(self.path):
            os.makedirs(self.path)
            print(self.path + ' folder created')

        # create the output csv with a header row if missing
        if not os.path.exists(self.file_name):
            header = ['province', 'city', 'county', 'park', 'area', 'numcop', 'company',
                      'person', 'capital', 'settime', 'email', 'phone', 'address', 'state', 'url']
            with open(self.file_name, 'a', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(header)

        # 18200 is a hard-coded resume point from the previous interrupted run
        for i in range(18200, self.length):
            self.ListTask.append(i)

        threads = []
        for i in range(200):
            thread = threading.Thread(target=self.thread_task, args=())
            threads.append(thread)

        # start the worker threads
        for t in threads:
            t.start()
            print('started thread: ' + t.name)

        for t in threads:
            t.join()
            print('joined thread: ' + t.name)

        print('main thread done! ' + threading.current_thread().name)


if __name__ == "__main__":
    DealError(cookie, proxies, companies_name, csv_data_res)
18 changes: 18 additions & 0 deletions QiChaCha/deal_result.py
@@ -0,0 +1,18 @@
import pandas as pd

# deduplicate
csv_data = pd.read_csv('./csv/全国工业园区企业简要信息.csv')
print(csv_data.duplicated(), len(csv_data))
csv_data = csv_data.drop_duplicates()
print(csv_data.duplicated(), len(csv_data))

# rebuild a clean 0..n-1 index after dropping duplicates
csv_data['index'] = range(csv_data.shape[0])
csv_data = csv_data.set_index('index')

# province,city,county,park,area,numcop,company,person,capital,settime,email,phone,address,state,url
# sort; in the ascending list 1 means ascending and 0 means descending
csv_data = csv_data.sort_values(['province', 'city', 'county', 'park', 'area',
                                 'numcop', 'capital', 'settime'], ascending=[1, 1, 1, 1, 0, 0, 0, 1])
print(csv_data, len(csv_data))
csv_data.to_csv('./csv/去重_全国工业园区企业简要信息.csv', index=None)
37 changes: 37 additions & 0 deletions QiChaCha/get_addr_longitude_latitude.py
@@ -0,0 +1,37 @@
import requests
from urllib.parse import quote
import json


def get_addr_longitude_latitude(keyword):
    # undocumented search endpoint used by the ditu.amap.com web page
    url = "https://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=12&keywords={}".format(
        quote(keyword))

    payload = {}
    headers = {}
    try:
        with requests.request("GET", url, headers=headers, data=payload) as response:
            js = json.loads(response.text)
            target = js['data']['poi_list'][0]
            # fall back to the query keyword when the POI has no address field
            try:
                if target['address']:
                    address = target['address']
                else:
                    address = keyword
            except Exception as e:
                address = keyword
            L = [
                address,
                target['longitude'],
                target['latitude']
            ]

    except Exception as e:
        # the lookup failed entirely: return empty strings
        L = ['', '', '']

    return L


if __name__ == "__main__":
    L = get_addr_longitude_latitude('上海市嘉定区真新街道金沙江路3131号4幢中区123室')
    print(L)