添加功能:(哔哩哔哩,快手,小红书)每个视频/帖子抓取评论最大条数限制,评论关键词筛选

JackLongKing · Dec 13, 2023 · f17a853
1 parent 5c42076
commit f17a853
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 15 deletions.
diff --git a/config/base_config.py b/config/base_config.py
@@ -29,11 +29,11 @@
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10
 
-# 抖音每个视频抓取评论最大条数 (为0则不限制)
-DY_MAX_COMMENTS_PER_POST = 10
+# 每个视频/帖子抓取评论最大条数 (为0则不限制)
+MAX_COMMENTS_PER_POST = 10
 
-# 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
-DY_COMMENT_KEYWORDS = [
+# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
+COMMENT_KEYWORDS = [
     "我"
     # ........................
 ]

diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
@@ -146,18 +146,38 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
-                await self.bili_client.get_video_all_comments(
+                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                # Read keyword and quantity from config
+                keywords = config.COMMENT_KEYWORDS
+                max_comments = config.MAX_COMMENTS_PER_POST
+
+                # Download comments
+                all_comments = await self.bili_client.get_video_all_comments(
                     video_id=video_id,
                     crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
                 )
+
+                # Filter comments by keyword
+                if keywords:
+                    filtered_comments = [
+                        comment for comment in all_comments if
+                        any(keyword in comment["content"]["message"] for keyword in keywords)
+                    ]
+                else:
+                    filtered_comments = all_comments
+
+                # Limit the number of comments
+                if max_comments > 0:
+                    filtered_comments = filtered_comments[:max_comments]
+
+                # Update bilibili video comments
+                await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
+
             except DataFetchError as ex:
                 utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
                 utils.logger.error(f"[get_comments] may be been blocked, err:", e)
 
-
     async def get_specified_videos(self):
         """
         get specified videos info

diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
@@ -132,7 +132,7 @@ async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
             task = asyncio.create_task(
-                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
+                self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
@@ -143,7 +143,7 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_co
                 comments = await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
                     max_comments=max_comments, # 最大数量
-                    keywords=config.DY_COMMENT_KEYWORDS  # 关键词列表
+                    keywords=config.COMMENT_KEYWORDS  # 关键词列表
                 )
                 # 现在返回的 comments 已经是经过关键词筛选的
                 await douyin.batch_update_dy_aweme_comments(aweme_id, comments)

diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
@@ -7,6 +7,7 @@
 import httpx
 from playwright.async_api import BrowserContext, Page
 
+import config
 from tools import utils
 
 from .exception import DataFetchError, IPBlockError
@@ -124,7 +125,7 @@ async def get_video_comments(self, photo_id: str, pcursor: str = "") -> Dict:
         return await self.post("", post_data)
 
     async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None, ):
+                                     callback: Optional[Callable] = None):
         """
         get video all comments include sub comments
         :param photo_id:
@@ -136,18 +137,33 @@ async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.
 
         result = []
         pcursor = ""
-        while pcursor != "no_more":
+        count = 0  # 计数器，记录已获取的评论数量
+
+        while pcursor != "no_more" and (
+                config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
             comments_res = await self.get_video_comments(photo_id, pcursor)
             vision_commen_list = comments_res.get("visionCommentList", {})
             pcursor = vision_commen_list.get("pcursor", "")
             comments = vision_commen_list.get("rootComments", [])
 
+            filtered_comments = []  # 存储经过关键词筛选后的评论
+
+            for comment in comments:
+                content = comment.get("content", "")
+
+                if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
+                    filtered_comments.append(comment)
+
+                    count += 1
+                    if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
+                        break
+
             if callback:  # 如果有回调函数，就执行回调函数
-                await callback(photo_id, comments)
+                await callback(photo_id, filtered_comments)
 
+            result.extend(filtered_comments)
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
-                result.extend(comments)
                 continue
             # todo handle get sub comments
         return result
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
@@ -152,11 +152,27 @@ async def batch_get_note_comments(self, note_list: List[str]):
         await asyncio.gather(*task_list)
 
     async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
-        """Get note comments"""
+        """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
             utils.logger.info(f"Begin get note id comments {note_id}")
             all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
+
+            # 从配置文件中读取关键词和数量限制
+            keywords = getattr(config, 'COMMENT_KEYWORDS', [])
+            max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
+
+            # 过滤评论
+            filtered_comments = []
             for comment in all_comments:
+                # 检查评论内容是否包含关键词
+                if not keywords or any(keyword in comment['content'] for keyword in keywords):
+                    filtered_comments.append(comment)
+                    # 如果达到最大评论数量限制，则停止添加更多评论
+                    if max_comments and len(filtered_comments) >= max_comments:
+                        break
+
+            # 更新或保存过滤后的评论
+            for comment in filtered_comments:
                 await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
     @staticmethod