forked from queensun/Nyspider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path18ladys.py
77 lines (71 loc) · 2.4 KB
/
18ladys.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import ast
import re
import time

import openpyxl
import requests
from bs4 import BeautifulSoup
# HTTP request headers passed to every requests.get() call in this file;
# the User-Agent string identifies the client as Firefox 39 on Ubuntu.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) '
        'Gecko/20100101 Firefox/39.0'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,'
        'application/xml;q=0.9,*/*;q=0.8'
    ),
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
}
def get_names():
    """Collect entry names and detail-page URLs from listing pages 1-20.

    Each listing page (``index.asp?page=N``) is fetched, decoded as GBK, and
    every ``<a>`` found inside the ``div`` with class ``tb1`` is appended to
    ``names.txt`` as one ``name|url`` line.  The page number is printed once
    a page has been processed.

    Errors from the HTTP request, from decoding, or from a page that lacks
    the expected ``div`` are not handled here and propagate to the caller.
    """
    base_url = 'http://www.18ladys.com/cyzy/'
    for page in range(1, 21):
        response = requests.get(base_url + 'index.asp?page=' + str(page),
                                headers=headers)
        # Decode the raw bytes directly.  Round-tripping ``.text`` through
        # ISO-8859-1 only works when requests happened to guess that charset
        # and raises UnicodeEncodeError otherwise.
        html = response.content.decode('gbk')
        links = BeautifulSoup(html, 'lxml').find('div', {'class': 'tb1'}).find_all('a')
        # The file is reopened in append mode for every page so that pages
        # already written are kept if a later page fails.
        with open('names.txt', 'a') as f:
            for link in links:
                href = link.get('href')
                if href is None:
                    # Anchor without a target: there is no URL to record.
                    continue
                try:
                    f.write(link.get_text() + '|' + base_url + href + '\n')
                except UnicodeEncodeError:
                    # Best effort: skip entries the file's default encoding
                    # cannot represent instead of aborting the whole run.
                    continue
        print(page)
def get_infor(name,url):
    """Fetch one detail page and split its text into labelled sections.

    The page is decoded as GBK (undecodable bytes are dropped), the text of
    the first ``<p>`` inside the ``dd`` element with class ``f14 jl4`` is
    taken with CR/LF characters removed, and that text is cut at every
    ``【``.  A piece of the form ``label】value`` becomes ``result[label] =
    value``; pieces without a ``】`` are skipped.

    Args:
        name: Entry name, stored under the ``'name'`` key of the result.
        url: Address of the detail page to download.

    Returns:
        dict with ``'name'`` plus one key per ``【label】`` found on the page.

    Raises:
        Nothing is caught here: request errors, and the AttributeError that
        results when the expected ``dd``/``p`` elements are missing,
        propagate to the caller.
    """
    response = requests.get(url, headers=headers)
    # Decode the raw bytes directly; re-encoding ``.text`` as ISO-8859-1
    # fails whenever requests picked a different charset for the response.
    html = response.content.decode('gbk', 'ignore')
    paragraph = BeautifulSoup(html, 'lxml').find('dd', {'class': 'f14 jl4'}).find('p')
    text = paragraph.get_text().replace('\r', '').replace('\n', '')
    result = {'name': name}
    # Split on the opening bracket itself rather than on an inserted '||'
    # sentinel, so a literal '||' inside a value cannot truncate it.
    for chunk in text.split('【'):
        label, closing, value = chunk.partition('】')
        if not closing:
            # Text before the first label, or a piece with no closing bracket.
            continue
        # partition keeps everything after the first '】', so a stray second
        # '】' inside the value no longer cuts the value short.
        result[label] = value
    return result
def crawler():
    """Fetch details for every ``name|url`` line in names.txt.

    For each line the first two ``|``-separated fields are passed to
    ``get_infor``.  On success the returned dict is appended to
    ``result.txt`` as ``str(dict)`` on its own line and ``<line> ok`` is
    printed.  If the line has fewer than two fields, or ``get_infor`` raises,
    the line is appended to the file ``failed`` and nothing is written to
    ``result.txt`` for it.  Empty lines are ignored.
    """
    with open('names.txt', 'r') as names:
        for line in names:
            line = line.replace('\n', '')
            if not line:
                continue
            fields = line.split('|')
            item = None
            if len(fields) >= 2:
                try:
                    item = get_infor(fields[0], fields[1])
                except Exception:
                    # Any fetch/parse failure is recorded below; the crawl
                    # carries on with the next entry.
                    item = None
            if item is None:
                with open('failed', 'a') as failed:
                    failed.write(line + '\n')
                # Skip the result write: without this a failed entry would
                # re-emit the previous entry's data (or raise NameError on
                # the very first line).
                continue
            # Append per entry so finished work is on disk if the run stops.
            with open('result.txt', 'a') as f:
                f.write(str(item) + '\n')
            print(line, 'ok')
def write_to_excel():
    """Convert result.txt into the spreadsheet result.xlsx.

    Every non-blank line of ``result.txt`` is parsed as a Python dict
    literal.  The sheet gets a header row with the fixed column keys below,
    then one row per line; a key missing from a dict yields an empty cell.

    Raises:
        ValueError or SyntaxError: if a line is not a valid Python literal.
    """
    keys = ['name', '异名', '别名', '来源', '植物形态', '功用主治', '用法与用量', '炮制']
    excel = openpyxl.Workbook(write_only=True)
    sheet = excel.create_sheet()
    sheet.append(keys)
    with open('result.txt', 'r') as results:
        for line in results:
            line = line.strip()
            if not line:
                continue
            # literal_eval only accepts literals, so a tampered result file
            # cannot execute arbitrary code the way eval() would.
            item = ast.literal_eval(line)
            sheet.append([item.get(key, '') for key in keys])
    excel.save('result.xlsx')
# Script entry: run the detail crawl, which reads names.txt and appends to
# result.txt.  get_names() and write_to_excel() are defined above but are
# not invoked here.
crawler()