Skip to content

Commit

Permalink
feat: 支持评论模式是否开启爬取选项
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Mar 16, 2024
1 parent 2d12ecb commit 59cd9f6
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 14 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
### 运行爬虫程序

```shell
# 默认没有开启评论爬取模式,有需要请到配置文件中指定
# 从配置文件中读取关键词搜索相关的帖子并爬取帖子信息与评论
python main.py --platform xhs --lt qrcode --type search

Expand Down
14 changes: 5 additions & 9 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
SORT_TYPE="popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
CRAWLER_TYPE = "search"

# 是否开启 IP 代理
Expand All @@ -19,7 +19,7 @@
SAVE_LOGIN_STATE = True

# 数据保存类型选项配置,支持三种类型:csv、db、json
SAVE_DATA_OPTION = "json" # csv or db or json
SAVE_DATA_OPTION = "json" # csv or db or json

# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
Expand All @@ -30,12 +30,8 @@
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 4


# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
COMMENT_KEYWORDS = [
# "真棒"
# ........................
]
# 是否开启爬评论模式, 默认不开启爬评论
ENABLE_GET_COMMENTS = False

# 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [
Expand Down Expand Up @@ -78,4 +74,4 @@
"61b87386000000001000b18b",
"5e8558100000000001005bc5",
# ........................
]
]
4 changes: 4 additions & 0 deletions media_platform/bilibili/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ async def batch_get_video_comments(self, video_id_list: List[str]):
:param video_id_list:
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return

utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
Expand Down
6 changes: 5 additions & 1 deletion media_platform/douyin/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) ->
return None

async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return

task_list: List[Task] = []
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
Expand All @@ -145,7 +149,7 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Non
async with semaphore:
try:
# 获取该视频的全部评论,并通过 callback 批量保存
comments = await self.dy_client.get_aweme_all_comments(
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
crawl_interval=random.random(),
callback=douyin_store.batch_update_dy_aweme_comments
Expand Down
4 changes: 4 additions & 0 deletions media_platform/kuaishou/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ async def batch_get_video_comments(self, video_id_list: List[str]):
:param video_id_list:
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return

utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
Expand Down
4 changes: 4 additions & 0 deletions media_platform/weibo/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,10 @@ async def batch_get_notes_comments(self, note_id_list: List[str]):
:param note_id_list:
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return

utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
Expand Down
14 changes: 10 additions & 4 deletions media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ async def search(self) -> None:
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,
page=page,
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE!='' else SearchSortType.GENERAL,
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
)
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
Expand All @@ -122,7 +122,7 @@ async def search(self) -> None:
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)

async def get_creators_and_notes(self) -> None:
"""Get creator's notes and retrieve their comment information."""
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
Expand Down Expand Up @@ -151,7 +151,8 @@ async def get_creators_and_notes(self) -> None:

# save creator info
await xhs_store.save_creator(creator, creator_and_notes_info.get('creator'))
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
utils.logger.info(
f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}")
else:
# get notes
notes = await self.xhs_client.get_notes_by_creator(creator, cursor)
Expand All @@ -164,7 +165,8 @@ async def get_creators_and_notes(self) -> None:
cursor = notes.get('cursor')
has_more_notes = notes.get('has_more_notes')
notes_res = notes.get('notes')
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")
utils.logger.info(
f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}")

semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
Expand Down Expand Up @@ -211,6 +213,10 @@ async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> O

async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
return

utils.logger.info(
f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
Expand Down

0 comments on commit 59cd9f6

Please sign in to comment.