diff --git a/scrapy/wooyun/wooyun/__init__.pyc b/scrapy/wooyun/wooyun/__init__.pyc deleted file mode 100644 index c4e556c..0000000 Binary files a/scrapy/wooyun/wooyun/__init__.pyc and /dev/null differ diff --git a/scrapy/wooyun/wooyun/items.pyc b/scrapy/wooyun/wooyun/items.pyc deleted file mode 100644 index 3f2ed18..0000000 Binary files a/scrapy/wooyun/wooyun/items.pyc and /dev/null differ diff --git a/scrapy/wooyun/wooyun/pipelines.pyc b/scrapy/wooyun/wooyun/pipelines.pyc deleted file mode 100644 index 5dcbc8e..0000000 Binary files a/scrapy/wooyun/wooyun/pipelines.pyc and /dev/null differ diff --git a/scrapy/wooyun/wooyun/settings.py b/scrapy/wooyun/wooyun/settings.py index 7616419..63f66f1 100644 --- a/scrapy/wooyun/wooyun/settings.py +++ b/scrapy/wooyun/wooyun/settings.py @@ -33,6 +33,10 @@ MONGODB_PORT = 27017 MONGODB_DB = 'wooyun' MONGODB_COLLECTION = 'wooyun_list' +#images whose URL contains any of these entries are not downloaded +IMAGE_DOWLOAD_IGNORED = { + 'www.quip.com' +} # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'wooyun (+http://www.yourdomain.com)' diff --git a/scrapy/wooyun/wooyun/settings.pyc b/scrapy/wooyun/wooyun/settings.pyc deleted file mode 100644 index a131298..0000000 Binary files a/scrapy/wooyun/wooyun/settings.pyc and /dev/null differ diff --git a/scrapy/wooyun/wooyun/spiders/WooyunSpider.py b/scrapy/wooyun/wooyun/spiders/WooyunSpider.py index 455a4e0..1f74540 100644 --- a/scrapy/wooyun/wooyun/spiders/WooyunSpider.py +++ b/scrapy/wooyun/wooyun/spiders/WooyunSpider.py @@ -77,16 +77,20 @@ def parse_detail(self,response): #同时,在piplines.py存放时,作相应的反向处理 image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract() for u in image_urls: - if u.startswith('https://'): - continue - #skip www.quip.com, can'nt be downloaded - if 'www.quip.com' in u: + if self.__check_ingnored_image(u): continue if u.startswith('/'): u = 'http://www.wooyun.org' + u item['image_urls'].append(u) return item + def 
__check_ingnored_image(self,image_url): + for ignored_url in settings['IMAGE_DOWLOAD_IGNORED']: + if ignored_url in image_url: + return True + + return False + def __search_mongodb(self,wooyun_id): # wooyun_id_exsist = True if self.collection.find({'wooyun_id':wooyun_id}).count()>0 else False diff --git a/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc b/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc deleted file mode 100644 index 2d938ae..0000000 Binary files a/scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc and /dev/null differ diff --git a/scrapy/wooyun/wooyun/spiders/__init__.pyc b/scrapy/wooyun/wooyun/spiders/__init__.pyc deleted file mode 100644 index 4a58b4a..0000000 Binary files a/scrapy/wooyun/wooyun/spiders/__init__.pyc and /dev/null differ diff --git a/scrapy/wooyun_drops/wooyun_drops/__init__.pyc b/scrapy/wooyun_drops/wooyun_drops/__init__.pyc deleted file mode 100644 index e94e697..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/__init__.pyc and /dev/null differ diff --git a/scrapy/wooyun_drops/wooyun_drops/items.pyc b/scrapy/wooyun_drops/wooyun_drops/items.pyc deleted file mode 100644 index ee5f357..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/items.pyc and /dev/null differ diff --git a/scrapy/wooyun_drops/wooyun_drops/pipelines.pyc b/scrapy/wooyun_drops/wooyun_drops/pipelines.pyc deleted file mode 100644 index 2737e81..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/pipelines.pyc and /dev/null differ diff --git a/scrapy/wooyun_drops/wooyun_drops/settings.pyc b/scrapy/wooyun_drops/wooyun_drops/settings.pyc deleted file mode 100644 index 4ea1b8d..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/settings.pyc and /dev/null differ diff --git a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc b/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc deleted file mode 100644 index 48b248c..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.pyc and /dev/null differ diff --git 
a/scrapy/wooyun_drops/wooyun_drops/spiders/__init__.pyc b/scrapy/wooyun_drops/wooyun_drops/spiders/__init__.pyc deleted file mode 100644 index 8e64400..0000000 Binary files a/scrapy/wooyun_drops/wooyun_drops/spiders/__init__.pyc and /dev/null differ