Commit

update skip crawl https
hanc00l committed Oct 17, 2015
1 parent 3fc1037 commit 358ce3a
Showing 1 changed file with 13 additions and 17 deletions.
30 changes: 13 additions & 17 deletions scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py
@@ -11,8 +11,7 @@ class WooyunSpider(scrapy.Spider):
     name = "wooyun"
     allowed_domains = ["wooyun.org"]
     start_urls = [
-        'http://drops.wooyun.org/'
-        #'http://drops.wooyun.org/papers/8989'
+        'http://drops.wooyun.org/'
     ]

     def __init__(self,page_max=settings['PAGE_MAX_DEFAULT'],local_store=settings['LOCAL_STORE_DEFAULT'],\
@@ -28,53 +27,50 @@ def __init__(self,page_max=settings['PAGE_MAX_DEFAULT'],local_store=settings['LOCAL_STORE_DEFAULT'],\

     def closed(self,reason):
         self.client.close()

     def parse(self, response):
         # <span class="pages">第 1 页,共 80 页</span>  (i.e. "Page 1 of 80")
         total_pages = response.xpath("//div[@class='wp-pagenavi']/span[@class = 'pages']/text()").re(u"共 (\d+) 页")[0]
         if self.page_max == 0:
             end_page = int(total_pages)
         else:
             end_page = self.page_max

         for page in range(1,end_page + 1):
             page_url = "http://drops.wooyun.org/page/%d"%page
             yield scrapy.Request(page_url, self.parse_post_urls)

     def parse_post_urls(self, response):
         post_urls = response.xpath("//div[@class = 'post']/h2[@class = 'entry-title']/a/@href").extract()
         for url in post_urls:
             url = response.urljoin(url)
             if self.update or not self.__search_mongodb(url):
                 yield scrapy.Request(url, self.parse_detail)
-    #def parse(self,response):
+
     def parse_detail(self,response):
         item = WooyunItem()
         item['url'] = unquote(response.url)
         item['category'] = unquote(response.url).split('//')[1].split('/')[1]
         item['title'] = response.xpath("//title/text()").extract()[0].split(u"|")[0].strip()
         item['author'] = response.xpath("//div[@class = 'entry-meta']/a/@href").extract()[0].split("/")[2]
         dt = response.xpath("//div[@class = 'entry-meta']/time/text()").extract()[0].split(' ')[0].split('/')
         dt_time = response.xpath("//div[@class = 'entry-meta']/time/text()").extract()[0].split(' ')[1].split(':')
         item['datetime'] = datetime(int(dt[0]),int(dt[1]),int(dt[2]),int(dt_time[0]),int(dt_time[1]))
         if self.local_store:
             item['image_urls'] = []
             image_urls = response.xpath("//p/img/@src").extract()
-            '''
-            # downloading the images from 'https://quip.com/blob/' fails, so skip the image download
-            for u in image_urls:
-                if 'https://quip.com/blog' in u:
-                    continue
-                item['image_urls'].append(u)
-            '''
-            item['image_urls'] = image_urls
+            # scrapy can't get https pages, so skip the https image downloads
+            for u in image_urls:
+                if 'https://' not in u:
+                    item['image_urls'].append(u)
         else:
             item['image_urls']=[]

         item['html'] = response.body.decode('utf-8','ignore')

         return item

     def __search_mongodb(self,url):
         #
         wooyun_drops_exsist = True if self.collection.find({'url':url}).count()>0 else False
         #
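The substance of this commit is the image-URL filter in parse_detail: instead of special-casing quip.com links inside a commented-out block, the new code drops every URL containing 'https://' before the list reaches the image pipeline, because the author's comment says this Scrapy setup can't get https pages. A minimal standalone sketch of that filter logic follows; the function name and sample URLs are illustrative assumptions, not names from the repository:

    # Sketch of the commit's filtering idea: keep only plain-http image
    # URLs so the download pipeline never requests an https resource.
    # filter_http_image_urls and the sample data are illustrative
    # assumptions, not names from the repository.
    def filter_http_image_urls(image_urls):
        return [u for u in image_urls if 'https://' not in u]

    if __name__ == '__main__':
        sample = [
            'http://drops.wooyun.org/images/a.png',  # kept
            'https://quip.com/blob/abc/img.png',     # dropped: https
        ]
        print(filter_http_image_urls(sample))
        # ['http://drops.wooyun.org/images/a.png']

Note that 'https://' not in u is a substring test, not a prefix test; a stricter variant would be u.startswith('https://'), which would not also discard an http URL that merely contains the string "https://" somewhere in its path.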

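The crawl-skipping logic that decides whether a post URL is fetched at all lives in parse_post_urls and __search_mongodb: a URL is queued only when self.update is set or no document with that url exists in MongoDB. A rough sketch of that duplicate check, assuming a pymongo collection like the spider's self.collection (the host, database, and collection names are guesses):

    # Sketch of the spider's duplicate check, under assumed connection
    # settings; pymongo's cursor.count() matches the 2015-era API used here.
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)   # assumed host/port
    collection = client['wooyun']['drops']     # assumed db/collection names

    def already_crawled(url):
        # True if a post with this URL is already stored.
        return collection.find({'url': url}).count() > 0

    # parse_post_urls then queues a request only for new URLs:
    #   if self.update or not already_crawled(url):
    #       yield scrapy.Request(url, self.parse_detail)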