Commit

Fix a bug when crawling some of the images
hanc00l committed Nov 2, 2015
1 parent 1fd2140 commit be143fa
Showing 6 changed files with 71 additions and 49 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -7,10 +7,10 @@
![search](search.png)

### 1. Install the required components
+ Python 2.7.X and pip
+ python 2.7 and pip
+ mongodb
+ scrapy (pip install scrapy)
+ Flask (pip install Flask)
+ flask (pip install Flask)
+ pymongo (pip install pymongo)

### 2. Crawler
92 changes: 53 additions & 39 deletions flask/app.py
@@ -4,58 +4,67 @@
import re
import pymongo
from flask import Flask, request, session, g, redirect, url_for, abort, render_template, flash
#setting:
# setting:
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'wooyun'
MONGODB_COLLECTION_BUGS = 'wooyun_list'
MONGODB_COLLECTION_DROPS = 'wooyun_drops'
ROWS_PER_PAGE = 20
#flask app:
# flask app:
app = Flask(__name__)
app.config.from_object(__name__)
#mongodb connection string
connection_string = "mongodb://%s:%d" % (app.config['MONGODB_SERVER'],app.config['MONGODB_PORT'])
content ={'by_bugs':\
{'mongodb_collection':app.config['MONGODB_COLLECTION_BUGS'],'template_html':'search_bugs.html'},\
'by_drops':\
{'mongodb_collection':app.config['MONGODB_COLLECTION_DROPS'],'template_html':'search_drops.html'},\
}
def get_search_regex(keywords,search_by_html):
# mongodb connection string
connection_string = "mongodb://%s:%d" % (
app.config['MONGODB_SERVER'], app.config['MONGODB_PORT'])
content = {'by_bugs':
{'mongodb_collection': app.config[
'MONGODB_COLLECTION_BUGS'], 'template_html': 'search_bugs.html'},
'by_drops':
{'mongodb_collection': app.config[
'MONGODB_COLLECTION_DROPS'], 'template_html': 'search_drops.html'},
}


def get_search_regex(keywords, search_by_html):
keywords_regex = {}
kws = [ks for ks in keywords.strip().split(' ') if ks!='']
kws = [ks for ks in keywords.strip().split(' ') if ks != '']
field_name = 'html' if search_by_html else 'title'
if len(kws)>0:
reg_pattern = re.compile('|'.join(kws),re.IGNORECASE)
#keywords_regex[field_name]={'$regex':'|'.join(kws)}
keywords_regex[field_name]=reg_pattern
if len(kws) > 0:
reg_pattern = re.compile('|'.join(kws), re.IGNORECASE)
# keywords_regex[field_name]={'$regex':'|'.join(kws)}
keywords_regex[field_name] = reg_pattern

return keywords_regex

def search_mongodb(keywords,page,content_search_by,search_by_html):


def search_mongodb(keywords, page, content_search_by, search_by_html):
client = pymongo.MongoClient(connection_string)
db = client[app.config['MONGODB_DB']]
keywords_regex = get_search_regex(keywords,search_by_html)
keywords_regex = get_search_regex(keywords, search_by_html)
collection = db[content[content_search_by]['mongodb_collection']]
#get the total count and page:
# get the total count and page:
total_rows = collection.find(keywords_regex).count()
total_page = int(math.ceil(total_rows / (app.config['ROWS_PER_PAGE']*1.0)))
page_info={'current':page,'total':total_page,'total_rows':total_rows,'rows':[]}
#get the page rows
if total_page >0 and page <= total_page:
row_start = (page-1)*app.config['ROWS_PER_PAGE']
total_page = int(
math.ceil(total_rows / (app.config['ROWS_PER_PAGE'] * 1.0)))
page_info = {'current': page, 'total': total_page,
'total_rows': total_rows, 'rows': []}
# get the page rows
if total_page > 0 and page <= total_page:
row_start = (page - 1) * app.config['ROWS_PER_PAGE']
cursors = collection.find(keywords_regex)\
.sort('datetime',pymongo.DESCENDING).skip(row_start).limit(app.config['ROWS_PER_PAGE'])
.sort('datetime', pymongo.DESCENDING).skip(row_start).limit(app.config['ROWS_PER_PAGE'])
for c in cursors:
c['datetime']=c['datetime'].strftime('%Y-%m-%d')
c['datetime'] = c['datetime'].strftime('%Y-%m-%d')
if 'url' in c:
urlsep = c['url'].split('//')[1].split('/')
c['url_local'] = '%s-%s.html'%(urlsep[1],urlsep[2])
c['url_local'] = '%s-%s.html' % (urlsep[1], urlsep[2])
page_info['rows'].append(c)
client.close()
#
return page_info


def get_wooyun_total_count():
client = pymongo.MongoClient(connection_string)
db = client[app.config['MONGODB_DB']]
@@ -65,28 +74,33 @@ def get_wooyun_total_count():
total_count_drops = collection_drops.find().count()
client.close()

return (total_count_bugs,total_count_drops)
return (total_count_bugs, total_count_drops)


@app.route('/')
def index():
total_count_bugs,total_count_drops = get_wooyun_total_count()
return render_template('index.html',total_count_bugs=total_count_bugs,total_count_drops=total_count_drops,title=u'乌云公开漏洞、知识库搜索')
total_count_bugs, total_count_drops = get_wooyun_total_count()
return render_template('index.html', total_count_bugs=total_count_bugs, total_count_drops=total_count_drops, title=u'乌云公开漏洞、知识库搜索')


@app.route('/search', methods=['get'])
def search():
keywords = request.args.get('keywords')
page = int(request.args.get('page',1))
search_by_html = True if 'true' == request.args.get('search_by_html','false').lower() else False
content_search_by = request.args.get('content_search_by','by_bugs')
if page<1: page = 1
page = int(request.args.get('page', 1))
search_by_html = True if 'true' == request.args.get(
'search_by_html', 'false').lower() else False
content_search_by = request.args.get('content_search_by', 'by_bugs')
if page < 1:
page = 1
#
page_info = search_mongodb(keywords,page,content_search_by,search_by_html)
page_info = search_mongodb(
keywords, page, content_search_by, search_by_html)
#
return render_template(content[content_search_by]['template_html'],keywords=keywords,page_info=page_info,search_by_html=search_by_html,title=u'搜索结果-乌云公开漏洞、知识库搜索')
return render_template(content[content_search_by]['template_html'], keywords=keywords, page_info=page_info, search_by_html=search_by_html, title=u'搜索结果-乌云公开漏洞、知识库搜索')


def main():
app.run(host='0.0.0.0',debug=True)
app.run(host='0.0.0.0', debug=True)

if __name__ == '__main__':
main()
main()
9 changes: 5 additions & 4 deletions scrapy/wooyun/wooyun/pipelines.py
@@ -16,7 +16,7 @@
class MongoDBPipeline(object):
def __init__(self):
self.connection_string = "mongodb://%s:%d" % (settings['MONGODB_SERVER'],settings['MONGODB_PORT'])

def open_spider(self, spider):
self.client = pymongo.MongoClient(self.connection_string)
self.db = self.client[settings['MONGODB_DB']]
@@ -63,7 +63,7 @@ def process_item(self,item,spider):
#save file as utf-8 format
with codecs.open(path_name,mode='w',encoding='utf-8',errors='ignore') as f:
f.write(post_data['html'])

return item

def __process_html(self,item):
@@ -72,12 +72,13 @@ def __process_html(self,item):
return False
#deal the img
for img in item['images']:
#handle the case where some images are hosted on http://www.wooyun.org and use /upload/.. style paths
if img['url'].startswith('http://www.wooyun.org'):
img['url'] = img['url'].replace('http://www.wooyun.org','')
item['html'] = re.sub('<img src=[\'\"]%s[\'\"]'%img['url'],'<img src=\'%s\''%img['path'],item['html'])
#deal css
item['html'] = re.sub(r'<link href=\"/css/style\.css','<link href=\"css/style.css',item['html'])
#deal script
item['html'] = re.sub(r'<script src=\"https://static\.wooyun\.org/static/js/jquery\-1\.4\.2\.min\.js','<script src=\"js/jquery-1.4.2.min.js',item['html'])

return True


Binary file modified scrapy/wooyun/wooyun/pipelines.pyc
15 changes: 11 additions & 4 deletions scrapy/wooyun/wooyun/spiders/WooyunSpider.py
@@ -46,7 +46,8 @@ def parse_list(self,response):
url = response.urljoin(url)
yield scrapy.Request(url, self.parse_detail)

def parse_detail(self,response):
def parse(self, response):
#def parse_detail(self,response):
item = WooyunItem()
item['wooyun_id'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[1]/a/@href').extract()[0].split('/')[2]
item['title'] = response.xpath('//title/text()').extract()[0].split("|")[0]
@@ -72,10 +73,16 @@
#images url for download
item['image_urls']=[]
if self.local_store:
image_urls = response.xpath("//img[contains(@src, 'http://static.wooyun.org/wooyun/upload/')]/@src").extract()
#WooYun images currently come in two URL formats: one under http://static.wooyun.org/wooyun/upload/, the other as /upload/...
#for the latter, http://www.wooyun.org is prepended at crawl time to form a complete URL
#pipelines.py applies the corresponding reverse processing when the files are saved
image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract()
for u in image_urls:
if 'https://' not in u:
item['image_urls'].append(u)
if u.startswith('https://'):
continue
if u.startswith('/'):
u = 'http://www.wooyun.org' + u
item['image_urls'].append(u)
return item

def __search_mongodb(self,wooyun_id):
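Read together, the spider and pipeline changes above implement a small round trip for WooYun image URLs: the spider prepends http://www.wooyun.org to page-relative /upload/... sources before downloading, and the pipeline strips that prefix again so the saved HTML can be rewritten to point at the local image files. The following is a minimal standalone sketch of that round trip, assuming Python 2.7 as in the repository; the helper names (normalize_image_url, rewrite_img_tag) and the sample paths are illustrative only and do not exist in the repository, which implements the same logic inline in WooyunSpider.parse and MongoDBPipeline.__process_html.

# Illustrative sketch only -- helper names are not part of the repository.
import re

SITE = 'http://www.wooyun.org'

def normalize_image_url(src):
    """Spider side: turn a page-relative /upload/... src into a full URL,
    leave full http:// URLs (e.g. http://static.wooyun.org/...) untouched,
    and skip https:// sources, mirroring the parse() change above."""
    if src.startswith('https://'):
        return None              # not queued for download
    if src.startswith('/'):
        return SITE + src        # /upload/... -> http://www.wooyun.org/upload/...
    return src

def rewrite_img_tag(html, url, local_path):
    """Pipeline side: map the URL back to the form that appears in the saved
    HTML (strip the prepended site prefix), then point the tag at the local
    file. re.escape is used here for safety; the pipeline substitutes the raw
    URL directly."""
    if url.startswith(SITE):
        url = url[len(SITE):]
    return re.sub('<img src=[\'\"]%s[\'\"]' % re.escape(url),
                  "<img src='%s'" % local_path, html)

if __name__ == '__main__':
    full = normalize_image_url('/upload/201510/example.png')
    print(full)  # http://www.wooyun.org/upload/201510/example.png
    print(rewrite_img_tag('<img src="/upload/201510/example.png">',
                          full, 'images/example.png'))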
Binary file modified scrapy/wooyun/wooyun/spiders/WooyunSpider.pyc
