diff --git a/README.md b/README.md index d903cbd..c79eae8 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,10 @@ 1、使用vmware或virtualbox导入虚拟机 2、登录用户名hancool,密码qwe123 - 3、分别进入wooyun_public目录下的wooyun和wooyun_drops,运行爬虫爬取数据(爬取全部数据并且本地离线缓存):scrapy crawl wooyun -a page_max=0 -a local_store=true -a update=true - 4、进入wooyun_publich目录下的flask,运行./app.py,启动web服务 - 5、打开浏览器,输入http://ip:5000,ip为虚拟机的网卡地址(如果是vmware,则使用ifconfig eth0查看,如果是virtualbox为192.168.56.130) + 3、进入wooyun_public目录,先用git更新到最新的代码:git pull + 4、分别进入wooyun_public目录下的wooyun和wooyun_drops,运行爬虫爬取数据(爬取全部数据并且本地离线缓存):scrapy crawl wooyun -a page_max=0 -a local_store=true -a update=true + 5、进入wooyun_public目录下的flask,运行./app.py,启动web服务 + 6、打开浏览器,输入http://ip:5000,ip为虚拟机的网卡地址(如果是vmware,则使用ifconfig eth0查看,如果是virtualbox为192.168.56.130) ### 6.其它 diff --git a/scrapy/wooyun/wooyun/spiders/WooyunSpider.py b/scrapy/wooyun/wooyun/spiders/WooyunSpider.py index c759728..455a4e0 100644 --- a/scrapy/wooyun/wooyun/spiders/WooyunSpider.py +++ b/scrapy/wooyun/wooyun/spiders/WooyunSpider.py @@ -79,6 +79,9 @@ def parse_detail(self,response): for u in image_urls: if u.startswith('https://'): continue + #skip www.quip.com, can't be downloaded + if 'www.quip.com' in u: + continue if u.startswith('/'): u = 'http://www.wooyun.org' + u item['image_urls'].append(u) diff --git a/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc b/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc index d45c438..2d938ae 100644 Binary files a/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc and b/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc differ diff --git a/scrapy/wooyun_drops/wooyun_drops/settings.py b/scrapy/wooyun_drops/wooyun_drops/settings.py index 49077cc..2b60971 100644 --- a/scrapy/wooyun_drops/wooyun_drops/settings.py +++ b/scrapy/wooyun_drops/wooyun_drops/settings.py @@ -22,7 +22,7 @@ } #the crawl default setting PAGE_MAX_DEFAULT = 1 -LOCAL_STORE_DEFAULT = 'false' +LOCAL_STORE_DEFAULT = 'true' UPDATE_DEFAULT = 'false' #save to local 
LOCAL_STORE='../../flask/static/drops/' diff --git a/scrapy/wooyun_drops/wooyun_drops/settings.pyc b/scrapy/wooyun_drops/wooyun_drops/settings.pyc index 652b129..4ea1b8d 100644 Binary files a/scrapy/wooyun_drops/wooyun_drops/settings.pyc and b/scrapy/wooyun_drops/wooyun_drops/settings.pyc differ diff --git a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py b/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py index 4a74366..878071d 100644 --- a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py +++ b/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py @@ -59,9 +59,10 @@ def parse_detail(self,response): item['image_urls'] = [] if self.local_store: image_urls = response.xpath("//p/img/@src").extract() - #scrapy can'nt get https page,so so skip the https image download + #skip the https image download + #skip www.quip.com, can't be downloaded for u in image_urls: - if 'https://' not in u: + if 'https://' not in u and 'www.quip.com' not in u: item['image_urls'].append(u) item['html'] = response.body.decode('utf-8','ignore') diff --git a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc b/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc index 309e5cf..48b248c 100644 Binary files a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc and b/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc differ