Skip to content

Commit

Permalink
feat: weibo支持指定创作者主页
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Aug 23, 2024
1 parent 61f023e commit ab7d814
Show file tree
Hide file tree
Showing 9 changed files with 368 additions and 16 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
| 抖音 ||||||||
| 快手 ||||||||
| B 站 ||||||||
| 微博 |||| ||||
| 微博 |||| ||||
| 贴吧 ||||||||


Expand Down
6 changes: 6 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@
# ........................
]

# Weibo creator IDs to crawl when the crawler runs in "creator" mode
WEIBO_CREATOR_ID_LIST = [
"5533390220",
# ........................
]

# 指定贴吧需要爬取的帖子列表
TIEBA_SPECIFIED_ID_LIST = [

Expand Down
139 changes: 133 additions & 6 deletions media_platform/weibo/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
import copy
import json
import re
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
from typing import Callable, Dict, List, Optional, Union
from urllib.parse import parse_qs, unquote, urlencode

import httpx
from httpx import Response
from playwright.async_api import BrowserContext, Page

import config
Expand Down Expand Up @@ -38,28 +39,34 @@ def __init__(
self.cookie_dict = cookie_dict
self._image_agent_host = "https://i1.wp.com/"

async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
    """
    Send one HTTP request and return either the raw response or its payload.

    Args:
        method: HTTP method name, e.g. "GET" or "POST".
        url: Absolute URL to request.
        **kwargs: Extra keyword arguments forwarded to
            ``httpx.AsyncClient.request``. The key ``return_response``
            (default ``False``) is consumed here and never forwarded.

    Returns:
        The raw response object when ``return_response`` is truthy;
        otherwise the ``"data"`` field of the decoded JSON body, or ``{}``
        when that field is absent.

    Raises:
        DataFetchError: when the JSON body's ``"ok"`` field is neither 0 nor 1.
    """
    # Pop the flag first so it is not passed down to httpx as an unknown kwarg.
    enable_return_response = kwargs.pop("return_response", False)
    async with httpx.AsyncClient(proxies=self.proxies) as client:
        response = await client.request(
            method, url, timeout=self.timeout,
            **kwargs
        )

    # Callers that need headers/cookies (not the JSON body) ask for the raw response.
    if enable_return_response:
        return response

    data: Dict = response.json()
    ok_code = data.get("ok")
    # NOTE(review): 0 is accepted alongside 1; presumably it marks an
    # empty-but-valid answer rather than a failure -- confirm against the API.
    if ok_code not in [0, 1]:
        utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
        raise DataFetchError(data.get("msg", "unkonw error"))
    return data.get("data", {})

async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
    """
    Issue a GET request against the client's host.

    Args:
        uri: Path (starting with "/") appended to ``self._host``.
        params: Optional dict of query parameters; URL-encoded and appended
            after "?" when it is a dict, ignored otherwise.
        headers: Request headers; ``self.headers`` is used when ``None``.
        **kwargs: Extra keyword arguments passed through to ``self.request``
            (for example ``return_response=True``).

    Returns:
        Whatever ``self.request`` returns for the assembled URL.
    """
    final_uri = uri
    if isinstance(params, dict):
        final_uri = f"{uri}?{urlencode(params)}"

    if headers is None:
        headers = self.headers

    # Forward **kwargs so flags such as return_response actually reach request().
    return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)

async def post(self, uri: str, data: dict) -> Dict:
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
Expand Down Expand Up @@ -229,3 +236,123 @@ async def get_note_image(self, image_url: str) -> bytes:
return None
else:
return response.content



async def get_creator_container_info(self, creator_id: str) -> Dict:
    """
    Look up the container IDs associated with a creator's profile page.

    The IDs are read from the ``M_WEIBOCN_PARAMS`` cookie set on the
    response of ``/u/{creator_id}``. Per the original notes, a container ID
    identifies the real API path to request:
        fid_container_id:  container ID for the user's detail API
        lfid_container_id: container ID for the user's post-list API

    Args:
        creator_id: Weibo user ID of the creator.

    Returns:
        Dict with keys ``fid_container_id`` and ``lfid_container_id``; a
        value is ``""`` when the cookie does not carry that field.

    Raises:
        DataFetchError: when the cookie is missing or empty.
    """
    profile_resp = await self.get(f"/u/{creator_id}", return_response=True)
    raw_cookie = profile_resp.cookies.get("M_WEIBOCN_PARAMS")
    if not raw_cookie:
        raise DataFetchError("get containerid failed")

    # The cookie value is a percent-encoded query string.
    cookie_fields = parse_qs(unquote(raw_cookie))
    return {
        f"{field}_container_id": cookie_fields.get(field, [""])[0]
        for field in ("fid", "lfid")
    }

async def get_creator_info_by_id(self, creator_id: str) -> Dict:
    """
    Fetch a creator's profile payload by user ID.

    Args:
        creator_id: Weibo user ID of the creator.

    Returns:
        The ``/api/container/getIndex`` payload, extended with the keys
        ``fid_container_id`` and ``lfid_container_id``. When the payload
        lists a tab whose ``tabKey`` is "weibo", that tab's ``containerid``
        replaces ``lfid_container_id``.

    Raises:
        DataFetchError: when either container ID resolved for the creator
            is an empty string.
    """
    container_info = await self.get_creator_container_info(creator_id)
    resolved_ids = (
        container_info.get("fid_container_id"),
        container_info.get("lfid_container_id"),
    )
    if "" in resolved_ids:
        utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
        raise DataFetchError("get containerid failed")

    uri = "/api/container/getIndex"
    query = dict(
        jumpfrom="weibocom",
        type="uid",
        value=creator_id,
        containerid=container_info["fid_container_id"],
    )
    user_res = await self.get(uri, query)

    tabs_info = user_res.get("tabsInfo")
    if tabs_info:
        # Prefer the container ID advertised by the first "weibo" tab, if any.
        weibo_tab = next(
            (tab for tab in tabs_info.get("tabs", []) if tab.get("tabKey") == "weibo"),
            None,
        )
        if weibo_tab is not None:
            container_info["lfid_container_id"] = weibo_tab.get("containerid")

    user_res.update(container_info)
    return user_res

async def get_notes_by_creator(self, creator: str, container_id: str, since_id: str = "0", ) -> Dict:
    """
    Fetch one page of a creator's posts.

    Args:
        creator: Weibo user ID of the creator.
        container_id: Container ID sent as the ``containerid`` parameter.
        since_id: Paging cursor; per the original notes, the ID of the last
            post on the previous page.

    Returns:
        Whatever ``self.get`` returns for ``/api/container/getIndex`` with
        these query parameters.
    """
    query = dict(
        jumpfrom="weibocom",
        type="uid",
        value=creator,
        containerid=container_id,
        since_id=since_id,
    )
    page = await self.get("/api/container/getIndex", query)
    return page

async def get_all_notes_by_creator_id(self, creator_id: str, container_id: str, crawl_interval: float = 1.0,
                                      callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect every post card published by a creator, page by page.

    Paging continues while the reported total exceeds the number of items
    assumed to have been consumed, and stops early on an empty response or
    a response without a "cards" key.

    Args:
        creator_id: Weibo user ID of the creator.
        container_id: Container ID of the creator's post list.
        crawl_interval: Seconds to sleep after each page.
        callback: Optional awaitable called with each page's filtered cards.

    Returns:
        All cards whose ``card_type`` is 9, accumulated over every page.
    """
    # NOTE(review): the original stepped by a literal 10 per page; presumably
    # that is the endpoint's page size -- confirm.
    notes_per_page = 10

    result: List[Dict] = []
    since_id = ""
    crawler_total_count = 0
    notes_has_more = True
    while notes_has_more:
        notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
        if not notes_res:
            utils.logger.error(
                "[WeiboClient.get_notes_by_creator] The current creator may have been banned by weibo, so they cannot access the data.")
            break

        card_list_info = notes_res.get("cardlistInfo", {})
        since_id = card_list_info.get("since_id", "0")
        if "cards" not in notes_res:
            utils.logger.info(
                f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
            break

        notes = notes_res["cards"]
        utils.logger.info(
            f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
        # Keep only card_type 9 entries (presumably the actual post cards).
        notes = [note for note in notes if note.get("card_type") == 9]
        if callback:
            await callback(notes)
        await asyncio.sleep(crawl_interval)
        result.extend(notes)

        # Advance the consumed-count (not the loop flag) so the comparison
        # against the reported total can eventually end the loop.
        crawler_total_count += notes_per_page
        notes_has_more = card_list_info.get("total", 0) > crawler_total_count
    return result

38 changes: 38 additions & 0 deletions media_platform/weibo/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ async def start(self):
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
else:
pass
utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
Expand Down Expand Up @@ -221,6 +224,41 @@ async def get_note_images(self, mblog: Dict):
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)


async def get_creators_and_notes(self) -> None:
    """
    Crawl every configured creator: profile, posts, and post comments.

    For each ID in ``config.WEIBO_CREATOR_ID_LIST`` this saves the creator's
    ``userInfo``, pages through all of their posts (storing each page via
    ``weibo_store.batch_update_weibo_notes``), and then fetches comments for
    the collected post IDs. A creator whose info lookup returns nothing is
    logged and skipped.

    Raises:
        DataFetchError: when a creator's response carries no ``userInfo``.
    """
    utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
    for user_id in config.WEIBO_CREATOR_ID_LIST:
        creator_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
        if not creator_info_res:
            utils.logger.error(
                f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
            continue

        creator_info: Dict = creator_info_res.get("userInfo", {})
        utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}")
        if not creator_info:
            raise DataFetchError("Get creator info error")
        await weibo_store.save_creator(user_id, user_info=creator_info)

        # Get all note information of the creator
        all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
            creator_id=user_id,
            container_id=creator_info_res.get("lfid_container_id"),
            crawl_interval=0,
            callback=weibo_store.batch_update_weibo_notes
        )

        # The post payload of a card lives under "mblog"; the former "mlog"
        # key never matched, so no comments were ever requested.
        candidate_ids = (note_item.get("mblog", {}).get("id") for note_item in all_notes_list)
        note_ids = [note_id for note_id in candidate_ids if note_id]
        await self.batch_get_notes_comments(note_ids)



async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
"""Create xhs client"""
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
Expand Down
3 changes: 0 additions & 3 deletions media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,6 @@ async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaph
async with semaphore:
try:
_note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
print("------------------------")
print(_note_detail)
print("------------------------")
if not _note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")
Expand Down
19 changes: 19 additions & 0 deletions schema/tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -406,3 +406,22 @@ alter table kuaishou_video add column `source_keyword` varchar(255) default '' c
-- Add a `source_keyword` column (empty string by default) to each note table;
-- per the column comment it records the search keyword the row came from.
alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';


-- Table `weibo_creator`: one row of profile data per crawled Weibo creator
-- (table comment: "Weibo blogger").
-- WARNING: the DROP below discards any existing `weibo_creator` data when this
-- script is re-run.
-- Column comments, translated:
--   id             auto-increment ID (primary key)
--   user_id        user ID
--   nickname       user nickname
--   avatar         avatar URL
--   ip_location    "IP address at comment time"
--                  NOTE(review): wording looks copied from a comment table -- confirm
--   add_ts         record creation timestamp
--   last_modify_ts record last-modified timestamp
--   desc           user description
--   gender         gender
--   follows        number of accounts followed (stored as text)
--   fans           number of followers (stored as text)
--   tag_list       tag list
-- NOTE(review): there is no unique key on `user_id`; presumably de-duplication
-- is handled by the application -- confirm.
-- NOTE(review): collation utf8mb4_0900_ai_ci is presumably only available on
-- MySQL 8.0+ -- confirm the target server version.
DROP TABLE IF EXISTS `weibo_creator`;
CREATE TABLE `weibo_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`tag_list` longtext COMMENT '标签列表',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博博主';
Loading

0 comments on commit ab7d814

Please sign in to comment.