Skip to content

Commit

Permalink
update image download ignored
Browse files Browse the repository at this point in the history
  • Loading branch information
hanc00l committed Jan 7, 2016
1 parent 9789145 commit c93b3f7
Show file tree
Hide file tree
Showing 14 changed files with 12 additions and 4 deletions.
Binary file removed scrapy/wooyun/wooyun/__init__.pyc
Binary file not shown.
Binary file removed scrapy/wooyun/wooyun/items.pyc
Binary file not shown.
Binary file removed scrapy/wooyun/wooyun/pipelines.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions scrapy/wooyun/wooyun/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
MONGODB_PORT = 27017
MONGODB_DB = 'wooyun'
MONGODB_COLLECTION = 'wooyun_list'
# Image URLs containing any of these substrings are skipped by the spider
# (substring match against the whole URL, not only the host part).
# NOTE: the setting name is misspelled ("DOWLOAD"), but the spider looks it up
# under exactly this key, so it must not be renamed here alone.
IMAGE_DOWLOAD_IGNORED = {
'www.quip.com'
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wooyun (+http://www.yourdomain.com)'

Expand Down
Binary file removed scrapy/wooyun/wooyun/settings.pyc
Binary file not shown.
12 changes: 8 additions & 4 deletions scrapy/wooyun/wooyun/spiders/WooyunSpider.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,20 @@ def parse_detail(self,response):
#同时,在piplines.py存放时,作相应的反向处理
image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract()
for u in image_urls:
if u.startswith('https://'):
continue
#skip www.quip.com, can't be downloaded
if 'www.quip.com' in u:
if self.__check_ingnored_image(u):
continue
if u.startswith('/'):
u = 'http://www.wooyun.org' + u
item['image_urls'].append(u)
return item

def __check_ingnored_image(self,image_url):
    """Tell whether an image URL should be skipped.

    Returns True when any entry of settings['IMAGE_DOWLOAD_IGNORED'] occurs
    as a substring of image_url, and False otherwise.
    """
    return any(
        fragment in image_url
        for fragment in settings['IMAGE_DOWLOAD_IGNORED']
    )

def __search_mongodb(self,wooyun_id):
#
wooyun_id_exsist = True if self.collection.find({'wooyun_id':wooyun_id}).count()>0 else False
Expand Down
Binary file removed scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc
Binary file not shown.
Binary file removed scrapy/wooyun/wooyun/spiders/__init__.pyc
Binary file not shown.
Binary file removed scrapy/wooyun_drops/wooyun_drops/__init__.pyc
Binary file not shown.
Binary file removed scrapy/wooyun_drops/wooyun_drops/items.pyc
Binary file not shown.
Binary file removed scrapy/wooyun_drops/wooyun_drops/pipelines.pyc
Binary file not shown.
Binary file removed scrapy/wooyun_drops/wooyun_drops/settings.pyc
Binary file not shown.
Binary file not shown.
Binary file removed scrapy/wooyun_drops/wooyun_drops/spiders/__init__.pyc
Binary file not shown.

0 comments on commit c93b3f7

Please sign in to comment.