Skip to content

Commit

Permalink
feat: 添加通过文件添加关键词的功能
Browse files Browse the repository at this point in the history
KEYWORD_LIST既可以是关键词列表,如[keyword1, keyword2, keyword3],也可以是保存着关键词的txt文件路径,如'keyword_list.txt'。文件中每个要搜索的关键词占一行;如果一行中包含多个以空格分隔的词,则这些词合在一起作为一个关键词,如内容:
迪丽热巴
尿布 啤酒

表示第一个关键词搜索包含迪丽热巴的微博;第二个表示搜索同时包含尿布和啤酒的微博。

Issue dataabc#33
  • Loading branch information
dataabc committed Sep 25, 2020
1 parent 5354d95 commit e1140e3
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 4 deletions.
13 changes: 11 additions & 2 deletions weibo/spiders/search.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote

import scrapy
import weibo.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings

import weibo.utils.util as util
from weibo.items import WeiboItem


Expand All @@ -16,6 +17,14 @@ class SearchSpider(scrapy.Spider):
allowed_domains = ['weibo.com']
settings = get_project_settings()
keyword_list = settings.get('KEYWORD_LIST')
if not isinstance(keyword_list, list):
if not os.path.isabs(keyword_list):
keyword_list = os.getcwd() + os.sep + keyword_list
if not os.path.isfile(keyword_list):
print('不存在%s文件' % keyword_list)
sys.exit()
keyword_list = util.get_keyword_list(keyword_list)

for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
Expand Down
20 changes: 18 additions & 2 deletions weibo/utils/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

import sys
from datetime import datetime, timedelta

from weibo.utils.region import region_dict
Expand Down Expand Up @@ -38,6 +38,22 @@ def convert_contain_type(contain_type):
return '&suball=1'


def get_keyword_list(file_name):
    """Read search keywords from a UTF-8 text file, one keyword per line.

    The file is read as bytes and split into lines, and each line is decoded
    with ``utf-8-sig`` so a leading byte-order mark is dropped. Empty lines
    are skipped. A line is kept verbatim, so several space-separated words
    on one line stay together as a single keyword.

    Args:
        file_name: Path of the keyword file.

    Returns:
        A list of the non-empty lines, as ``str``, in file order.

    Raises:
        SystemExit: If the file content is not valid UTF-8. A message naming
            the file is printed before exiting.
    """
    with open(file_name, 'rb') as f:
        raw_lines = f.read().splitlines()
    try:
        lines = [raw.decode('utf-8-sig') for raw in raw_lines]
    except UnicodeDecodeError:
        # Use % formatting: passing file_name as a second print() argument
        # would leave the literal "%s" placeholder in the message.
        print(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name)
        sys.exit()
    return [line for line in lines if line]


def get_regions(region):
"""根据区域筛选条件返回符合要求的region"""
new_region = {}
Expand Down Expand Up @@ -68,7 +84,7 @@ def standardize_date(created_at):
created_at = (datetime.now() - hour).strftime("%Y-%m-%d %H:%M")
elif "今天" in created_at:
today = datetime.now().strftime('%Y-%m-%d')
created_at = today+' '+created_at[2:]
created_at = today + ' ' + created_at[2:]
elif '年' not in created_at:
year = datetime.now().strftime("%Y")
month = created_at[:2]
Expand Down

0 comments on commit e1140e3

Please sign in to comment.