# company.py (forked from luyishisi/Nyspider)
import copy
import json
import re

import requests
from bs4 import BeautifulSoup

# Browser-like headers; X-Requested-With marks the requests as XHR so the
# server answers them like the site's own AJAX calls.
headers = {
    'Host': '210.12.219.18',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/44.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': 'http://210.12.219.18/jianguanfabuweb/companies.html',
    'Connection': 'keep-alive',
}
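
# GetCompanyData.ashx is assumed, from how its response is used below, to
# return JSON whose 'tb' field is an HTML <table> fragment of company rows;
# any other fields in that JSON are unknown to this script.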


def getCompanyUrl():
    """Page through the company list and save '名称||详情链接' lines to urls.txt."""
    f = open('urls.txt', 'a')
    page = 1
    while page < 926:
        try:
            html = requests.get(
                'http://210.12.219.18/jianguanfabuweb/handler/GetCompanyData.ashx'
                '?method=GetCorpData&corpname=&certid=&endtime=&cert=5&name=-1'
                '&PageIndex=%s&PageSize=' % page,
                headers=headers).text
        except requests.RequestException:
            # Retry the same page on a network error.
            continue
        # Parse the HTML fragment from the JSON 'tb' field (the original parsed
        # the raw JSON text and left 'data' unused, which was clearly a bug).
        data = json.loads(html)['tb']
        table = BeautifulSoup(data, 'lxml').find_all('tr')
        for tr in table:
            f.write(tr.find('a').get('title') + '||' + tr.find('a').get('href') + '\n')
        print(page)
        page += 1
    f.close()
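
# Each urls.txt line is therefore '公司名称||详情页相对链接' (the title and href
# of the row's <a> tag); companyInfor() below consumes these pairs.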


def companyInfor(url, name):
    """Fetch a company's detail page; return one record per registered engineer.

    Every record copies the company's basic info and adds the engineer's URL.
    """
    keys = ['法人代表', '所属省市', '联系地址', '工程监理资质', '招标代理', '造价咨询', '注册人员']
    company = {'name': name}
    for key in keys:
        company[key] = []
    html = requests.get('http://210.12.219.18/jianguanfabuweb/' + url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml').find('div', {'class': 'content'})
    # corpid is embedded in an encodeURI(...) call in the page's inline script.
    corpid = re.findall(r'encodeURI\((\d+)\)', html)[0]
    basic = soup.find('table', {'class': 'company_basic_infor_table'}).get_text()
    basic = basic.replace('\r', '').replace('\n', '').replace(' ', '')
    basic_re = '法人代表:(.*?)企业.*所属省市:(.*?)联系地址:(.*?)备注'
    infor = re.findall(basic_re, basic)[0]
    company['法人代表'] = infor[0]
    company['所属省市'] = infor[1]
    company['联系地址'] = infor[2]
    # Each 'zizhi' block is one qualification category; its table's last <td>
    # holds a comma-separated list of grades.
    zizhi = soup.find_all('div', {'class': 'zizhi'})
    for item in zizhi:
        header = item.find('div', {'class': 'zizhi_header'}).get_text()
        if '监理' in header:
            grades = item.find('table').find_all('td')[-1].get_text().split(',')
            company['工程监理资质'] += grades
        if '招标代理' in header:
            grades = item.find('table').find_all('td')[-1].get_text().split(',')
            company['招标代理'] += grades
        if '造价' in header:
            grades = item.find('table').find_all('td')[-1].get_text().split(',')
            company['造价咨询'] += grades
    # Engineer links come from a second AJAX handler; the raw response carries
    # escaped quotes inside href values, stripped below.
    html = requests.get(
        'http://210.12.219.18/jianguanfabuweb/handler/Company_Details_CertifiedEngineers.ashx'
        '?method=getStaff&corpid=' + corpid,
        headers=headers).text
    table = BeautifulSoup(html, 'lxml').find_all('a')
    result = []
    for item in table:
        person = copy.deepcopy(company)
        person['url'] = item.get('href').replace('\\"', '')
        result.append(person)
    return result
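
# Shape of each returned record (inferred from the code above; values are
# hypothetical placeholders):
# {'name': '某监理公司', '法人代表': '…', '所属省市': '…', '联系地址': '…',
#  '工程监理资质': ['…'], '招标代理': [], '造价咨询': [], '注册人员': [],
#  'url': '…'}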


def getPerson():
    """Resolve every saved company URL; append one line per engineer record."""
    f = open('person.txt', 'a')
    for line in open('urls.txt', 'r').readlines():
        lists = line.replace('\n', '').split('||')
        try:
            result = companyInfor(lists[1], lists[0])
        except Exception:
            # Log the failed company so the run can be repeated for it later.
            failed = open('failed_company.txt', 'a')
            failed.write(line)
            failed.close()
            print(line, 'failed')
            continue
        for item in result:
            f.write(str(item) + '\n')
    f.close()


getPerson()