-
Notifications
You must be signed in to change notification settings - Fork 12
/
toutiaoPage3.py
84 lines (73 loc) · 2.22 KB
/
toutiaoPage3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding:utf-8 -*-
from MySQLdb import *
import re
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser
'''
使用模拟点击(selenium+PhantomJS())的方式,抓取今日头条文章内容
'''
def _extract_article_content(script_text):
    """Cut the article body out of the text of the page's inline data script.

    Returns the cleaned article text when the expected markers are present.
    When a marker is missing, returns ``str()`` of the intermediate list of
    pieces (unchanged legacy fallback, so stored values stay the same).
    """
    # Drop a fixed-size prefix and suffix around the payload (presumably the
    # <script> tag and the variable assignment wrapping it -- TODO confirm).
    payload = script_text[28:-12].strip()

    sections = payload.split('},')
    if len(sections) <= 2:
        return str(sections)

    # The third '},'-delimited section is expected to hold the article fields.
    pieces = sections[2].strip().split('content:')
    if len(pieces) < 2:
        return str(pieces)

    # The article body runs from 'content:' up to the next field, 'groupId:'.
    body = pieces[1].split('groupId:')[0].strip()
    # Drop the last character (presumably the comma separating the fields).
    body = body[:-1]
    return body.replace('div><', '').replace('</div>', '')


def loadLink(url):
    """Load ``url`` in a headless PhantomJS browser and return the article text.

    The page is opened, refreshed once, and its rendered source is parsed
    with BeautifulSoup. The article body is then extracted from the seventh
    ``<script>`` element of the page.

    Args:
        url: Address of the article page to load.

    Returns:
        The extracted article text; ``'[]'`` when the page has fewer than
        seven ``<script>`` elements or they cannot be listed; or the string
        form of the intermediate pieces when an expected marker is missing.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(0.1)
        driver.refresh()
        driver.implicitly_wait(0.1)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process behind.
        driver.quit()

    response = BeautifulSoup(page_source, 'lxml')
    try:
        scripts = response.find_all('script')
    except Exception:
        return '[]'

    # Index 6 is read below, so at least seven scripts are required.
    if len(scripts) <= 6:
        return '[]'

    time.sleep(0.02)
    return _extract_article_content(str(scripts[6]))
def main():
    """Scrape every article listed in the Article table and store its content.

    Reads ``id`` and ``source_url`` for all rows of ``Article``, loads each
    URL with ``loadLink`` and writes the result to ``article_content`` of the
    same row. A failed update is rolled back and reported, and the run
    continues with the next row. The connection is closed in all cases.
    """
    db = connect(host="secret", port=3306, db="Spider", user="root", passwd="secret", charset="utf8")
    try:
        cursor = db.cursor()
        # Fetch the real primary key with each URL so the update below hits
        # the row the URL came from, instead of assuming id == position + 1.
        cursor.execute('SELECT id, source_url FROM Article')
        rows = cursor.fetchall()

        for i, (article_id, url) in enumerate(rows):
            print(url)
            print(i)
            time.sleep(0.1)
            page = loadLink(url)
            try:
                cursor.execute(
                    """update Article set article_content=%s WHERE id=%s""",
                    [page, article_id],
                )
                db.commit()
            except Exception as exc:
                db.rollback()
                print('update failed for id %s: %s' % (article_id, exc))
            time.sleep(0.1)
    finally:
        db.close()


if __name__ == "__main__":
    main()