feat: weibo支持指定创作者主页

xcc313 · Aug 23, 2024 · ab7d814 · ab7d814
1 parent 61f023e
commit ab7d814
Show file tree

Hide file tree

Showing 9 changed files with 368 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@
 | 抖音  | ✅     | ✅        | ✅    | ✅       | ✅     | ✅     | ✅    |
 | 快手  | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
 | B 站 | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
-| 微博  | ✅     | ✅        | ✅   | ❌      | ✅     | ✅     | ✅    |
+| 微博  | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
 | 贴吧  | ✅     | ✅        | ✅   | ❌      | ✅     | ✅     | ✅    |
 
 

diff --git a/config/base_config.py b/config/base_config.py
@@ -85,6 +85,12 @@
     # ........................
 ]
 
+# List of Weibo creator IDs to crawl when CRAWLER_TYPE is "creator"
+WEIBO_CREATOR_ID_LIST = [
+    "5533390220",
+    # ........................
+]
+
 # 指定贴吧需要爬取的帖子列表
 TIEBA_SPECIFIED_ID_LIST = [
 

diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py
@@ -7,10 +7,11 @@
 import copy
 import json
 import re
-from typing import Any, Callable, Dict, List, Optional
-from urllib.parse import urlencode
+from typing import Callable, Dict, List, Optional, Union
+from urllib.parse import parse_qs, unquote, urlencode
 
 import httpx
+from httpx import Response
 from playwright.async_api import BrowserContext, Page
 
 import config
@@ -38,28 +39,34 @@ def __init__(
         self.cookie_dict = cookie_dict
         self._image_agent_host = "https://i1.wp.com/"
 
-    async def request(self, method, url, **kwargs) -> Any:
+    async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
+        """
+        Send an HTTP request and return either the raw response or the JSON "data" payload.
+
+        Args:
+            method: HTTP method name handed to httpx.
+            url: Full request URL.
+            **kwargs: Extra arguments forwarded to ``client.request``. The optional
+                ``return_response`` flag is consumed here and is not forwarded.
+
+        Returns:
+            The raw ``Response`` when ``return_response`` is truthy; otherwise the value
+            of the "data" key of the decoded JSON body (``{}`` when the key is absent).
+
+        Raises:
+            DataFetchError: When the "ok" field of the JSON body is neither 0 nor 1.
+        """
+        # Pop the flag first so it is not passed through to httpx as a request argument.
+        enable_return_response = kwargs.pop("return_response", False)
         async with httpx.AsyncClient(proxies=self.proxies) as client:
             response = await client.request(
                 method, url, timeout=self.timeout,
                 **kwargs
             )
+
+        # Raw mode: hand back the response untouched (no JSON decoding, no "ok" check).
+        if enable_return_response:
+            return response
+
         data: Dict = response.json()
-        if data.get("ok") != 1:
+        ok_code = data.get("ok")
+        # Both 0 and 1 are accepted as "ok" values; anything else is treated as a failure.
+        if ok_code not in [0, 1]:
             utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
             raise DataFetchError(data.get("msg", "unkonw error"))
         else:
             return data.get("data", {})
 
-    async def get(self, uri: str, params=None, headers=None) -> Dict:
+    async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
+        """
+        Issue a GET request against ``self._host``.
+
+        Args:
+            uri: Path appended to ``self._host``.
+            params: Optional dict that is URL-encoded into the query string;
+                non-dict values are ignored.
+            headers: Request headers; ``self.headers`` is used when None.
+            **kwargs: Forwarded unchanged to ``self.request`` (for example
+                ``return_response=True``).
+
+        Returns:
+            Whatever ``self.request`` returns for the built URL.
+        """
         final_uri = uri
         if isinstance(params, dict):
             final_uri = (f"{uri}?"
                          f"{urlencode(params)}")
 
         if headers is None:
             headers = self.headers
-        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
+        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
 
     async def post(self, uri: str, data: dict) -> Dict:
         json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
@@ -229,3 +236,123 @@ async def get_note_image(self, image_url: str) -> bytes:
                 return None
             else:
                 return response.content
+
+
+
+    async def get_creator_container_info(self, creator_id: str) -> Dict:
+        """
+        获取用户的容器ID, 容器信息代表着真实请求的API路径
+            fid_container_id：用户的微博详情API的容器ID
+            lfid_container_id：用户的微博列表API的容器ID
+        Args:
+            creator_id:
+
+        Returns: {
+
+        """
+        response = await self.get(f"/u/{creator_id}", return_response=True)
+        m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
+        if not m_weibocn_params:
+            raise DataFetchError("get containerid failed")
+        m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
+        return {
+            "fid_container_id": m_weibocn_params_dict.get("fid", [""])[0],
+            "lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]
+        }
+
+    async def get_creator_info_by_id(self, creator_id: str) -> Dict:
+        """
+        根据用户ID获取用户详情
+        Args:
+            creator_id:
+
+        Returns:
+
+        """
+        uri = "/api/container/getIndex"
+        container_info = await self.get_creator_container_info(creator_id)
+        if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
+            utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
+            raise DataFetchError("get containerid failed")
+        params = {
+            "jumpfrom": "weibocom",
+            "type": "uid",
+            "value": creator_id,
+            "containerid": container_info["fid_container_id"],
+        }
+
+        user_res = await self.get(uri, params)
+
+        if user_res.get("tabsInfo"):
+            tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
+            for tab in tabs:
+                if tab.get("tabKey") == "weibo":
+                    container_info["lfid_container_id"] = tab.get("containerid")
+                    break
+
+        user_res.update(container_info)
+        return user_res
+
+    async def get_notes_by_creator(self, creator: str, container_id: str, since_id: str = "0", ) -> Dict:
+        """
+        获取博主的笔记
+        Args:
+            creator: 博主ID
+            container_id: 容器ID
+            since_id: 上一页最后一条笔记的ID
+        Returns:
+
+        """
+
+        uri = "/api/container/getIndex"
+        params = {
+            "jumpfrom": "weibocom",
+            "type": "uid",
+            "value": creator,
+            "containerid": container_id,
+            "since_id": since_id,
+        }
+        return await self.get(uri, params)
+
+    async def get_all_notes_by_creator_id(self, creator_id: str, container_id: str, crawl_interval: float = 1.0,
+                                          callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        获取指定用户下的所有发过的帖子，该方法会一直查找一个用户下的所有帖子信息
+        Args:
+            creator_id:
+            container_id:
+            crawl_interval:
+            callback:
+
+        Returns:
+
+        """
+        result = []
+        notes_has_more = True
+        since_id = ""
+        crawler_total_count = 0
+        while notes_has_more:
+            notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
+            if not notes_res:
+                utils.logger.error(
+                    f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
+                break
+
+            notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
+            since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
+            notes_has_more += 10
+            if "cards" not in notes_res:
+                utils.logger.info(
+                    f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
+                break
+
+            notes = notes_res["cards"]
+            utils.logger.info(
+                f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
+            notes = [note for note  in notes if note.get("card_type") == 9]
+            if callback:
+                await callback(notes)
+            await asyncio.sleep(crawl_interval)
+            result.extend(notes)
+        return result
+
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
@@ -84,6 +84,9 @@ async def start(self):
             elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_notes()
+            elif config.CRAWLER_TYPE == "creator":
+                # Get creator's information and their notes and comments
+                await self.get_creators_and_notes()
             else:
                 pass
             utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
@@ -221,6 +224,41 @@ async def get_note_images(self, mblog: Dict):
                 extension_file_name = url.split(".")[-1]
                 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
 
+
+    async def get_creators_and_notes(self) -> None:
+        """
+        Get creator's information and their notes and comments
+        Returns:
+
+        """
+        utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
+        for user_id in config.WEIBO_CREATOR_ID_LIST:
+            createor_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
+            if createor_info_res:
+                createor_info: Dict = createor_info_res.get("userInfo", {})
+                utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}")
+                if not createor_info:
+                    raise DataFetchError("Get creator info error")
+                await weibo_store.save_creator(user_id, user_info=createor_info)
+
+                # Get all note information of the creator
+                all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
+                    creator_id=user_id,
+                    container_id=createor_info_res.get("lfid_container_id"),
+                    crawl_interval=0,
+                    callback=weibo_store.batch_update_weibo_notes
+                )
+
+                note_ids = [note_item.get("mlog", {}).get("id") for note_item in all_notes_list if
+                            note_item.get("mlog", {}).get("id")]
+                await self.batch_get_notes_comments(note_ids)
+
+            else:
+                utils.logger.error(
+                    f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
+
+
+
     async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
         """Create xhs client"""
         utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")

diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
@@ -185,9 +185,6 @@ async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaph
             async with semaphore:
                 try:
                     _note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
-                    print("------------------------")
-                    print(_note_detail)
-                    print("------------------------")
                     if not _note_detail:
                         utils.logger.error(
                             f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")

diff --git a/schema/tables.sql b/schema/tables.sql
@@ -406,3 +406,22 @@ alter table kuaishou_video add column `source_keyword` varchar(255) default '' c
 alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
 alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
 alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+
+
+-- Weibo creator (profile) table.
+-- NOTE: the DROP statement below removes any existing `weibo_creator` table, including its rows, before it is recreated.
+DROP TABLE IF EXISTS `weibo_creator`;
+CREATE TABLE `weibo_creator`
+(
+    `id`             int         NOT NULL AUTO_INCREMENT COMMENT '自增ID',
+    `user_id`        varchar(64) NOT NULL COMMENT '用户ID',
+    `nickname`       varchar(64)  DEFAULT NULL COMMENT '用户昵称',
+    `avatar`         varchar(255) DEFAULT NULL COMMENT '用户头像地址',
+    -- NOTE(review): the column comment below reads "IP address at the time of commenting"; presumably copied from a comment table -- confirm the intended meaning for creators.
+    `ip_location`    varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
+    `add_ts`         bigint      NOT NULL COMMENT '记录添加时间戳',
+    `last_modify_ts` bigint      NOT NULL COMMENT '记录最后修改时间戳',
+    `desc`           longtext COMMENT '用户描述',
+    `gender`         varchar(1)   DEFAULT NULL COMMENT '性别',
+    -- Follow and fan counts are stored as strings (varchar), not as numeric columns.
+    `follows`        varchar(16)  DEFAULT NULL COMMENT '关注数',
+    `fans`           varchar(16)  DEFAULT NULL COMMENT '粉丝数',
+    `tag_list`       longtext COMMENT '标签列表',
+    PRIMARY KEY (`id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博博主';