-
Notifications
You must be signed in to change notification settings - Fork 28
/
settings.py
48 lines (38 loc) · 1.41 KB
/
settings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# -*- coding: utf-8 -*-
# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Project identity and module wiring for the Scrapy crawler.
BOT_NAME = 'crawler'

# Where Scrapy discovers spiders and project-specific commands.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
COMMANDS_MODULE = 'crawler.commands'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawler (+http://www.yourdomain.com)'

# Swap out the stock user-agent middleware (None disables it) for one that
# randomizes the UA per request; RetryMiddleware keeps its default slot.
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 300,
    'crawler.middleware.random_user_agent.RandomUserAgentMiddleware': 500,
    # 'crawler.middleware.proxy.StaticProxyMiddleware': 100,
}

# Pipelines that scraped items flow through (lower number = earlier).
ITEM_PIPELINES = {
    'crawler.pipelines.ProxyScanPipeline': 500,
    # 'crawler.pipelines.PrintPipeline': 800,
}

# Keep the log readable and space requests out by one second.
LOG_LEVEL = 'INFO'
DOWNLOAD_DELAY = 1
# Detect this machine's externally visible IP once at import time.
#
# The original did an unguarded module-level request
# (urllib2.urlopen(...).read()[:-1]) which:
#   * crashed the whole crawler at import when the network or
#     ifconfig.io was unavailable (any exception propagated),
#   * used [:-1] to drop the trailing newline, which eats a real
#     character if the response has none,
#   * had no timeout, so startup could hang indefinitely,
#   * only ran on Python 2 (urllib2 module).
try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 shim: urlopen lives here


def detect_local_ip(url='http://ifconfig.io/ip', timeout=5,
                    fallback='127.0.0.1'):
    """Return this host's public IP as reported by *url*, or *fallback*.

    Best-effort and never raises: any network, timeout, or decode error
    yields *fallback*, so importing settings cannot take the crawler down.
    """
    try:
        raw = urllib2.urlopen(url, timeout=timeout).read()
        if isinstance(raw, bytes):  # py3 returns bytes, py2 returns str
            raw = raw.decode('ascii', 'ignore')
        ip = raw.strip()  # robust against missing/extra trailing newline
        return ip or fallback
    except Exception:
        return fallback


LOCAL_IP = detect_local_ip()
# Fixed proxy endpoint, consumed by StaticProxyMiddleware when routing the
# whole crawl through a single proxy server.
# PROXY = "http://61.53.143.179:80"
PROXY = "http://localhost:8088"

# Proxy pool consumed by RandomProxyMiddleware when rotating across several
# proxy servers.
PROXY_LIST = [
    "http://120.83.5.164:18000",
    "http://111.161.126.100:80",
    "http://61.53.143.179:80",
]

# Responses with these status codes are retried by Scrapy's RetryMiddleware.
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 405, 408]