Commit

Fix a bug when crawling some of the images
hanc00l committed Nov 2, 2015
1 parent 1fd2140 commit be143fa
Showing 6 changed files with 71 additions and 49 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -7,10 +7,10 @@
![search](search.png)

### 1. Install the required components
+ Python 2.7.X and pip
+ python 2.7 and pip
+ mongodb
+ scrapy (pip install scrapy)
+ Flask (pip install Flask)
+ flask (pip install Flask)
+ pymongo (pip install pymongo)

### 2. Crawler
92 changes: 53 additions & 39 deletions flask/app.py
@@ -4,58 +4,67 @@
import re
import pymongo
from flask import Flask, request, session, g, redirect, url_for, abort, render_template, flash
#setting:
# setting:
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'wooyun'
MONGODB_COLLECTION_BUGS = 'wooyun_list'
MONGODB_COLLECTION_DROPS = 'wooyun_drops'
ROWS_PER_PAGE = 20
#flask app:
# flask app:
app = Flask(__name__)
app.config.from_object(__name__)
#mongodb connection string
connection_string = "mongodb://%s:%d" % (app.config['MONGODB_SERVER'],app.config['MONGODB_PORT'])
content ={'by_bugs':\
{'mongodb_collection':app.config['MONGODB_COLLECTION_BUGS'],'template_html':'search_bugs.html'},\
'by_drops':\
{'mongodb_collection':app.config['MONGODB_COLLECTION_DROPS'],'template_html':'search_drops.html'},\
}
def get_search_regex(keywords,search_by_html):
# mongodb connection string
connection_string = "mongodb://%s:%d" % (
app.config['MONGODB_SERVER'], app.config['MONGODB_PORT'])
content = {'by_bugs':
{'mongodb_collection': app.config[
'MONGODB_COLLECTION_BUGS'], 'template_html': 'search_bugs.html'},
'by_drops':
{'mongodb_collection': app.config[
'MONGODB_COLLECTION_DROPS'], 'template_html': 'search_drops.html'},
}


def get_search_regex(keywords, search_by_html):
keywords_regex = {}
kws = [ks for ks in keywords.strip().split(' ') if ks!='']
kws = [ks for ks in keywords.strip().split(' ') if ks != '']
field_name = 'html' if search_by_html else 'title'
if len(kws)>0:
reg_pattern = re.compile('|'.join(kws),re.IGNORECASE)
#keywords_regex[field_name]={'$regex':'|'.join(kws)}
keywords_regex[field_name]=reg_pattern
if len(kws) > 0:
reg_pattern = re.compile('|'.join(kws), re.IGNORECASE)
# keywords_regex[field_name]={'$regex':'|'.join(kws)}
keywords_regex[field_name] = reg_pattern

return keywords_regex

def search_mongodb(keywords,page,content_search_by,search_by_html):


def search_mongodb(keywords, page, content_search_by, search_by_html):
client = pymongo.MongoClient(connection_string)
db = client[app.config['MONGODB_DB']]
keywords_regex = get_search_regex(keywords,search_by_html)
keywords_regex = get_search_regex(keywords, search_by_html)
collection = db[content[content_search_by]['mongodb_collection']]
#get the total count and page:
# get the total count and page:
total_rows = collection.find(keywords_regex).count()
total_page = int(math.ceil(total_rows / (app.config['ROWS_PER_PAGE']*1.0)))
page_info={'current':page,'total':total_page,'total_rows':total_rows,'rows':[]}
#get the page rows
if total_page >0 and page <= total_page:
row_start = (page-1)*app.config['ROWS_PER_PAGE']
total_page = int(
math.ceil(total_rows / (app.config['ROWS_PER_PAGE'] * 1.0)))
page_info = {'current': page, 'total': total_page,
'total_rows': total_rows, 'rows': []}
# get the page rows
if total_page > 0 and page <= total_page:
row_start = (page - 1) * app.config['ROWS_PER_PAGE']
cursors = collection.find(keywords_regex)\
.sort('datetime',pymongo.DESCENDING).skip(row_start).limit(app.config['ROWS_PER_PAGE'])
.sort('datetime', pymongo.DESCENDING).skip(row_start).limit(app.config['ROWS_PER_PAGE'])
for c in cursors:
c['datetime']=c['datetime'].strftime('%Y-%m-%d')
c['datetime'] = c['datetime'].strftime('%Y-%m-%d')
if 'url' in c:
urlsep = c['url'].split('//')[1].split('/')
c['url_local'] = '%s-%s.html'%(urlsep[1],urlsep[2])
c['url_local'] = '%s-%s.html' % (urlsep[1], urlsep[2])
page_info['rows'].append(c)
client.close()
#
return page_info


def get_wooyun_total_count():
client = pymongo.MongoClient(connection_string)
db = client[app.config['MONGODB_DB']]
@@ -65,28 +74,33 @@ def get_wooyun_total_count():
total_count_drops = collection_drops.find().count()
client.close()

return (total_count_bugs,total_count_drops)
return (total_count_bugs, total_count_drops)


@app.route('/')
def index():
total_count_bugs,total_count_drops = get_wooyun_total_count()
return render_template('index.html',total_count_bugs=total_count_bugs,total_count_drops=total_count_drops,title=u'乌云公开漏洞、知识库搜索')
total_count_bugs, total_count_drops = get_wooyun_total_count()
return render_template('index.html', total_count_bugs=total_count_bugs, total_count_drops=total_count_drops, title=u'乌云公开漏洞、知识库搜索')


@app.route('/search', methods=['get'])
def search():
keywords = request.args.get('keywords')
page = int(request.args.get('page',1))
search_by_html = True if 'true' == request.args.get('search_by_html','false').lower() else False
content_search_by = request.args.get('content_search_by','by_bugs')
if page<1: page = 1
page = int(request.args.get('page', 1))
search_by_html = True if 'true' == request.args.get(
'search_by_html', 'false').lower() else False
content_search_by = request.args.get('content_search_by', 'by_bugs')
if page < 1:
page = 1
#
page_info = search_mongodb(keywords,page,content_search_by,search_by_html)
page_info = search_mongodb(
keywords, page, content_search_by, search_by_html)
#
return render_template(content[content_search_by]['template_html'],keywords=keywords,page_info=page_info,search_by_html=search_by_html,title=u'搜索结果-乌云公开漏洞、知识库搜索')
return render_template(content[content_search_by]['template_html'], keywords=keywords, page_info=page_info, search_by_html=search_by_html, title=u'搜索结果-乌云公开漏洞、知识库搜索')


def main():
app.run(host='0.0.0.0',debug=True)
app.run(host='0.0.0.0', debug=True)

if __name__ == '__main__':
main()
main()
9 changes: 5 additions & 4 deletions scrapy/wooyun/wooyun/pipelines.py
@@ -16,7 +16,7 @@
class MongoDBPipeline(object):
def __init__(self):
self.connection_string = "mongodb://%s:%d" % (settings['MONGODB_SERVER'],settings['MONGODB_PORT'])

def open_spider(self, spider):
self.client = pymongo.MongoClient(self.connection_string)
self.db = self.client[settings['MONGODB_DB']]
@@ -63,7 +63,7 @@ def process_item(self,item,spider):
#save file as utf-8 format
with codecs.open(path_name,mode='w',encoding='utf-8',errors='ignore') as f:
f.write(post_data['html'])

return item

def __process_html(self,item):
@@ -72,12 +72,13 @@ def __process_html(self,item):
return False
#deal the img
for img in item['images']:
#handle the case where some images are hosted on http://www.wooyun.org and use /upload/.. style paths
if img['url'].startswith('http://www.wooyun.org'):
img['url'] = img['url'].replace('http://www.wooyun.org','')
item['html'] = re.sub('<img src=[\'\"]%s[\'\"]'%img['url'],'<img src=\'%s\''%img['path'],item['html'])
#deal css
item['html'] = re.sub(r'<link href=\"/css/style\.css','<link href=\"css/style.css',item['html'])
#deal script
item['html'] = re.sub(r'<script src=\"https://static\.wooyun\.org/static/js/jquery\-1\.4\.2\.min\.js','<script src=\"js/jquery-1.4.2.min.js',item['html'])

return True


Binary file modified scrapy/wooyun/wooyun/pipelines.pyc
15 changes: 11 additions & 4 deletions scrapy/wooyun/wooyun/spiders/WooyunSpider.py
@@ -46,7 +46,8 @@ def parse_list(self,response):
url = response.urljoin(url)
yield scrapy.Request(url, self.parse_detail)

def parse_detail(self,response):
def parse(self, response):
#def parse_detail(self,response):
item = WooyunItem()
item['wooyun_id'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[1]/a/@href').extract()[0].split('/')[2]
item['title'] = response.xpath('//title/text()').extract()[0].split("|")[0]
@@ -72,10 +73,16 @@
#images url for download
item['image_urls']=[]
if self.local_store:
image_urls = response.xpath("//img[contains(@src, 'http://static.wooyun.org/wooyun/upload/')]/@src").extract()
#WooYun images currently come in two URL formats: one under http://static.wooyun.org/wooyun/upload/, the other as /upload/...
#for the latter, http://www.wooyun.org is prepended at crawl time to form a complete URL
#pipelines.py applies the corresponding reverse processing when the files are saved
image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract()
for u in image_urls:
if 'https://' not in u:
item['image_urls'].append(u)
if u.startswith('https://'):
continue
if u.startswith('/'):
u = 'http://www.wooyun.org' + u
item['image_urls'].append(u)
return item

def __search_mongodb(self,wooyun_id):
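Read together, the spider and pipeline changes above implement a small round trip for WooYun image URLs: the spider prepends http://www.wooyun.org to page-relative /upload/... sources before downloading, and the pipeline strips that prefix again so the saved HTML can be rewritten to point at the local image files. The following is a minimal standalone sketch of that round trip, assuming Python 2.7 as in the repository; the helper names (normalize_image_url, rewrite_img_tag) and the sample paths are illustrative only and do not exist in the repository, which implements the same logic inline in WooyunSpider.parse and MongoDBPipeline.__process_html.

# Illustrative sketch only -- helper names are not part of the repository.
import re

SITE = 'http://www.wooyun.org'

def normalize_image_url(src):
    """Spider side: turn a page-relative /upload/... src into a full URL,
    leave full http:// URLs (e.g. http://static.wooyun.org/...) untouched,
    and skip https:// sources, mirroring the parse() change above."""
    if src.startswith('https://'):
        return None              # not queued for download
    if src.startswith('/'):
        return SITE + src        # /upload/... -> http://www.wooyun.org/upload/...
    return src

def rewrite_img_tag(html, url, local_path):
    """Pipeline side: map the URL back to the form that appears in the saved
    HTML (strip the prepended site prefix), then point the tag at the local
    file. re.escape is used here for safety; the pipeline substitutes the raw
    URL directly."""
    if url.startswith(SITE):
        url = url[len(SITE):]
    return re.sub('<img src=[\'\"]%s[\'\"]' % re.escape(url),
                  "<img src='%s'" % local_path, html)

if __name__ == '__main__':
    full = normalize_image_url('/upload/201510/example.png')
    print(full)  # http://www.wooyun.org/upload/201510/example.png
    print(rewrite_img_tag('<img src="/upload/201510/example.png">',
                          full, 'images/example.png'))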
Binary file modified scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc
