-
Notifications
You must be signed in to change notification settings - Fork 4
/
main.py
135 lines (131 loc) · 4.24 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from cmspider.config import config
# Site-specific crawler settings for www.neeq.com.cn. This mapping is merged
# into the shared cmspider `config` object by the script statements below.
conf = {
    # ---- Database connection ----
    "db": {
        "type": "mongodb",
        "host": "q.213.name",
        "port": 27017,
        "db_name": "neeq",
        "user": "",
        "password": "",
        # Names of the individual tables/collections.
        "table_column": "column",
        "table_list": "url",
        "table_article": "article",
        "table_file": "file",
    },
    # ---- Basic crawler settings ----
    "basic": {
        "timeout": 5,  # connection timeout
        "sleep": 0,  # pause between fetches
        "max_thread": 1,  # maximum threads; single-threaded is recommended
        "header": {
            # NOTE(review): there is no space between "AppleWebKit/537.36"
            # and "(KHTML", unlike the User-Agent strings browsers send.
            # Kept exactly as-is so the request headers do not change.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
                "AppleWebKit/537.36(KHTML, like Gecko) "
                "Chrome/59.0.3071.86 Safari/537.36"
            ),
            "cookie": "",
        },
        "entry_url": "http://www.neeq.com.cn",
        "start_page": 0,  # first page to crawl
        "max_page": 999,  # maximum number of pages to crawl
        "total_page": 2500,  # total page count; best adjusted dynamically
    },
    # ---- Column / section (API) scraping rules ----
    "column": {
        "css": "",
        "my_filter": "MyFilter.rule_html_column",
    },
    # ---- URL-list scraping rules ----
    "list": {
        # Stop fetching URLs after this many consecutive duplicates.
        # 0 means unlimited (use 0 when re-crawling everything).
        "max_replicate": 100,
        # Listing obtained through the API.
        "api": {
            # "filetype": "file",  # html | file
            "method": "post",
            # Alternative endpoints:
            # "url": "http://www.neeq.com.cn/disclosureInfoController/infoResult.do",
            # "url": "http://open.tool.hexun.com/MongodbNewsService/newsListPageByJson.jsp?id=128367438&s=10"
            #        "&cp=##page##&priority=1",
            "url": "http://www.neeq.com.cn/info/list.do",
            # POST payload.
            "post_data": {
                # Fields for list.do (articles).
                "page": "##page##",
                "pageSize": "10",
                "keywords": "",
                "publishDate": "",
                "nodeId": "93",
                # Fields for infoResult.do (files):
                # "disclosureType": "5",
                # "page": "##page##",
                # "isNewThree": "1",
                # "startTime": "2016-08-07",
                # "endTime": "2017-08-07",
            },
            "my_filter": "rule_api_list",
            "store": "store_article_list",
            # "store": "store_file_list"
        },
        # Listing obtained by crawling HTML pages.
        "html": {
            "url": "http://forex.hexun.com/market/index-##page##.html",
            "filter": {},
            "css": ".mainboxcontent ul li",
            "regular": "",
            "my_filter": "MyFilter.rule_html_list",
            "store": "store_article_list",
            "recursive": {
                # Settings used when recursing.
                "next_page": {
                    "css": "",
                    "my_filter": "rule_html_list_page",
                },
            },
        },
    },
    # ---- Article scraping rules ----
    "article": {
        # Stop fetching URLs after this many consecutive duplicates.
        # 0 means unlimited (use 0 when re-crawling everything).
        "max_replicate": 4,
        "api": {
            # Nothing yet.
        },
        "html": {
            # URL source: empty means read URLs from the database,
            # non-empty means crawl recursively.
            # Example source: http://www.neeq.com.cn/notice/20000482.html
            "source": "http://www.neeq.com.cn/notice/20000482.html",
            "filter": {},
            "css": ".newstext",  # DOM node holding the article body
            "regular": "",
            "my_filter": "rule_html_article",
            "store": "store_article",
            "recursive": {
                # Pagination settings used when recursing.
                "next_page": {
                    "css": ".next a",
                    "my_filter": "rule_html_article_page",
                },
                "max_num": 1,
            },
        },
    },
    # ---- File scraping rules ----
    "file": {
        "basic_path": "./file/",
        "hash_path": "hash_path",
        "hash_filename": "hash_filename",
        "recursive": {
            # Settings used when recursing.
            "next_page": {
                "css": "",
                "my_filter": "rule_file_page",
            },
            "max_num": 10,
        },
    },
}
# Script entry: apply the site-specific settings, then run the crawl stages.

# Merge the overrides defined in `conf` into the shared cmspider config object.
config.update(conf)
# Dump the resulting configuration to stdout.
# NOTE(review): this output includes the "db" section (user/password fields).
print(config)
# NOTE(review): Spider is imported only after config.update(conf) has run --
# presumably so that cmspider.spider sees the updated config at import time.
# Confirm before hoisting this import to the top of the file.
from cmspider.spider import Spider
s = Spider()
# The calls below run in this fixed order. Their behaviour is defined in
# cmspider.spider and is not visible from this file; going by the names they
# presumably restore saved progress, then fetch the URL list, files and
# articles -- TODO confirm against the Spider class.
s.recover_status()
s.fetch_url()
s.fetch_file()
s.fetch_article()