Commit

update skip crawl https
hanc00l committed Oct 17, 2015
1 parent 3fc1037 commit 358ce3a
Showing 1 changed file with 13 additions and 17 deletions.
30 changes: 13 additions & 17 deletions scrapy/wooyun_drops/wooyun_drops/spiders/WooyunSpider.py
@@ -11,8 +11,7 @@ class WooyunSpider(scrapy.Spider):
     name = "wooyun"
     allowed_domains = ["wooyun.org"]
     start_urls = [
-        'http://drops.wooyun.org/'
-        #'http://drops.wooyun.org/papers/8989'
+        'http://drops.wooyun.org/'
     ]

     def __init__(self,page_max=settings['PAGE_MAX_DEFAULT'],local_store=settings['LOCAL_STORE_DEFAULT'],\
@@ -28,53 +27,50 @@ def __init__(self,page_max=settings['PAGE_MAX_DEFAULT'],local_store=settings['LOCAL_STORE_DEFAULT'],\

     def closed(self,reason):
         self.client.close()

     def parse(self, response):
         # <span class="pages">第 1 页,共 80 页</span>  (i.e. "Page 1 of 80")
         total_pages = response.xpath("//div[@class='wp-pagenavi']/span[@class = 'pages']/text()").re(u"共 (\d+) 页")[0]
         if self.page_max == 0:
             end_page = int(total_pages)
         else:
             end_page = self.page_max

         for page in range(1,end_page + 1):
             page_url = "http://drops.wooyun.org/page/%d"%page
             yield scrapy.Request(page_url, self.parse_post_urls)

     def parse_post_urls(self, response):
         post_urls = response.xpath("//div[@class = 'post']/h2[@class = 'entry-title']/a/@href").extract()
         for url in post_urls:
             url = response.urljoin(url)
             if self.update or not self.__search_mongodb(url):
                 yield scrapy.Request(url, self.parse_detail)
-    #def parse(self,response):
+
     def parse_detail(self,response):
         item = WooyunItem()
         item['url'] = unquote(response.url)
         item['category'] = unquote(response.url).split('//')[1].split('/')[1]
         item['title'] = response.xpath("//title/text()").extract()[0].split(u"|")[0].strip()
         item['author'] = response.xpath("//div[@class = 'entry-meta']/a/@href").extract()[0].split("/")[2]
         dt = response.xpath("//div[@class = 'entry-meta']/time/text()").extract()[0].split(' ')[0].split('/')
         dt_time = response.xpath("//div[@class = 'entry-meta']/time/text()").extract()[0].split(' ')[1].split(':')
         item['datetime'] = datetime(int(dt[0]),int(dt[1]),int(dt[2]),int(dt_time[0]),int(dt_time[1]))
         if self.local_store:
             item['image_urls'] = []
             image_urls = response.xpath("//p/img/@src").extract()
-            '''
-            # downloading the images from 'https://quip.com/blob/' fails, so skip the image download
-            for u in image_urls:
-                if 'https://quip.com/blog' in u:
-                    continue
-                item['image_urls'].append(u)
-            '''
-            item['image_urls'] = image_urls
+            # scrapy can't get https pages, so skip the https image downloads
+            for u in image_urls:
+                if 'https://' not in u:
+                    item['image_urls'].append(u)
         else:
             item['image_urls']=[]

         item['html'] = response.body.decode('utf-8','ignore')

         return item

     def __search_mongodb(self,url):
         #
         wooyun_drops_exsist = True if self.collection.find({'url':url}).count()>0 else False
         #
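The substance of this commit is the image-URL filter in parse_detail: instead of special-casing quip.com links inside a commented-out block, the new code drops every URL containing 'https://' before the list reaches the image pipeline, because the author's comment says this Scrapy setup can't get https pages. A minimal standalone sketch of that filter logic follows; the function name and sample URLs are illustrative assumptions, not names from the repository:

    # Sketch of the commit's filtering idea: keep only plain-http image
    # URLs so the download pipeline never requests an https resource.
    # filter_http_image_urls and the sample data are illustrative
    # assumptions, not names from the repository.
    def filter_http_image_urls(image_urls):
        return [u for u in image_urls if 'https://' not in u]

    if __name__ == '__main__':
        sample = [
            'http://drops.wooyun.org/images/a.png',  # kept
            'https://quip.com/blob/abc/img.png',     # dropped: https
        ]
        print(filter_http_image_urls(sample))
        # ['http://drops.wooyun.org/images/a.png']

Note that 'https://' not in u is a substring test, not a prefix test; a stricter variant would be u.startswith('https://'), which would not also discard an http URL that merely contains the string "https://" somewhere in its path.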

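The crawl-skipping logic that decides whether a post URL is fetched at all lives in parse_post_urls and __search_mongodb: a URL is queued only when self.update is set or no document with that url exists in MongoDB. A rough sketch of that duplicate check, assuming a pymongo collection like the spider's self.collection (the host, database, and collection names are guesses):

    # Sketch of the spider's duplicate check, under assumed connection
    # settings; pymongo's cursor.count() matches the 2015-era API used here.
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)   # assumed host/port
    collection = client['wooyun']['drops']     # assumed db/collection names

    def already_crawled(url):
        # True if a post with this URL is already stored.
        return collection.find({'url': url}).count() > 0

    # parse_post_urls then queues a request only for new URLs:
    #   if self.update or not already_crawled(url):
    #       yield scrapy.Request(url, self.parse_detail)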