Skip to content

Commit

Permalink
添加功能:(哔哩哔哩,快手,小红书)每个视频/帖子抓取评论最大条数限制,评论关键词筛选
Browse files Browse the repository at this point in the history
  • Loading branch information
PeanutSplash committed Dec 13, 2023
1 parent 5c42076 commit f17a853
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 15 deletions.
8 changes: 4 additions & 4 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 10

# 抖音每个视频抓取评论最大条数 (为0则不限制)
DY_MAX_COMMENTS_PER_POST = 10
# 每个视频/帖子抓取评论最大条数 (为0则不限制)
MAX_COMMENTS_PER_POST = 10

# 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
DY_COMMENT_KEYWORDS = [
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
COMMENT_KEYWORDS = [
"我"
# ........................
]
Expand Down
28 changes: 24 additions & 4 deletions media_platform/bilibili/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,18 +146,38 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
"""
async with semaphore:
try:
utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
await self.bili_client.get_video_all_comments(
utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
# Read keyword and quantity from config
keywords = config.COMMENT_KEYWORDS
max_comments = config.MAX_COMMENTS_PER_POST

# Download comments
all_comments = await self.bili_client.get_video_all_comments(
video_id=video_id,
crawl_interval=random.random(),
callback=bilibili.batch_update_bilibili_video_comments
)

# Filter comments by keyword
if keywords:
filtered_comments = [
comment for comment in all_comments if
any(keyword in comment["content"]["message"] for keyword in keywords)
]
else:
filtered_comments = all_comments

# Limit the number of comments
if max_comments > 0:
filtered_comments = filtered_comments[:max_comments]

# Update bilibili video comments
await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)

except DataFetchError as ex:
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
except Exception as e:
utils.logger.error(f"[get_comments] may be been blocked, err:", e)


async def get_specified_videos(self):
"""
get specified videos info
Expand Down
4 changes: 2 additions & 2 deletions media_platform/douyin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
task = asyncio.create_task(
self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
task_list.append(task)
await asyncio.wait(task_list)

Expand All @@ -143,7 +143,7 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co
comments = await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
max_comments=max_comments, # 最大数量
keywords=config.DY_COMMENT_KEYWORDS # 关键词列表
keywords=config.COMMENT_KEYWORDS # 关键词列表
)
# 现在返回的 comments 已经是经过关键词筛选的
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
Expand Down
24 changes: 20 additions & 4 deletions media_platform/kuaishou/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import httpx
from playwright.async_api import BrowserContext, Page

import config
from tools import utils

from .exception import DataFetchError, IPBlockError
Expand Down Expand Up @@ -124,7 +125,7 @@ async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
return await self.post("", post_data)

async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, ):
callback: Optional[Callable] = None):
"""
get video all comments include sub comments
:param photo_id:
Expand All @@ -136,18 +137,33 @@ async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.

result = []
pcursor = ""
while pcursor != "no_more":
count = 0 # 计数器,记录已获取的评论数量

while pcursor != "no_more" and (
config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
comments_res = await self.get_video_comments(photo_id, pcursor)
vision_commen_list = comments_res.get("visionCommentList", {})
pcursor = vision_commen_list.get("pcursor", "")
comments = vision_commen_list.get("rootComments", [])

filtered_comments = [] # 存储经过关键词筛选后的评论

for comment in comments:
content = comment.get("content", "")

if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
filtered_comments.append(comment)

count += 1
if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
break

if callback: # 如果有回调函数,就执行回调函数
await callback(photo_id, comments)
await callback(photo_id, filtered_comments)

result.extend(filtered_comments)
await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments:
result.extend(comments)
continue
# todo handle get sub comments
return result
18 changes: 17 additions & 1 deletion media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,27 @@ async def batch_get_note_comments(self, note_list: List[str]):
await asyncio.gather(*task_list)

async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
    """Fetch a note's comments, filter them, and persist the ones that remain.

    Every comment of ``note_id`` is fetched through ``self.xhs_client`` first;
    the keyword filter (``config.COMMENT_KEYWORDS``) and the per-post cap
    (``config.MAX_COMMENTS_PER_POST``) are applied afterwards. The cap
    therefore bounds how many comments are saved, not how many are crawled.

    :param note_id: id of the note whose comments are fetched
    :param semaphore: entered for the whole fetch/filter/save sequence
    """
    async with semaphore:
        utils.logger.info(f"Begin get note id comments {note_id}")
        all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())

        # Missing settings fall back to "no keyword filter" / "no limit".
        keywords = getattr(config, 'COMMENT_KEYWORDS', [])
        max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
        # Only a positive value is a limit; 0 means unlimited, and a negative
        # value is treated the same way instead of truncating to one comment.
        has_limit = bool(max_comments) and max_comments > 0

        filtered_comments = []
        for comment in all_comments:
            # A comment with no text must not abort the whole note, so it is
            # treated as empty text (assumes comment items are dict-like --
            # TODO confirm against the client).
            content = comment.get('content') or ""
            if keywords and not any(keyword in content for keyword in keywords):
                continue
            filtered_comments.append(comment)
            # Stop as soon as enough matching comments have been collected.
            if has_limit and len(filtered_comments) >= max_comments:
                break

        # Persist the surviving comments one at a time.
        for comment in filtered_comments:
            await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)

@staticmethod
Expand Down

0 comments on commit f17a853

Please sign in to comment.