feat: Weibo crawler post search completed
NanmiCoder committed Dec 24, 2023
1 parent 9785abb commit c5b64fd
Showing 14 changed files with 671 additions and 19 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -4,15 +4,16 @@
 # Repository Description
 
-**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**...
-Currently scrapes videos, images, comments, likes, reposts, and more from Xiaohongshu, Douyin, Kuaishou, and Bilibili.
+**Xiaohongshu crawler**, **Douyin crawler**, **Kuaishou crawler**, **Bilibili crawler**, **Weibo crawler**...
+Currently scrapes videos, images, comments, likes, reposts, and more from Xiaohongshu, Douyin, Kuaishou, Bilibili, and Weibo.
 
 How it works: [playwright](https://playwright.dev/) acts as a bridge, keeping the browser context of a successful login alive so that certain encrypted parameters can be obtained by evaluating JS expressions.
 This approach avoids re-implementing the core encryption JS code and greatly lowers the reverse-engineering difficulty.
-Crawler tech discussion QQ group: [949715256](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=NFz-oY7Pek3gpG5zbLJFHARlB8lKL94f&authKey=FlxIQK99Uu90wddNV5W%2FBga6T6lXU5BRqyTTc26f2P2ZK5OW%2BDhHp7MwviX%2BbrPa&noverify=0&group_code=949715256); code contributions via PR are also welcome.
+
+Crawler tech discussion QQ group: [949715256](http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=NFz-oY7Pek3gpG5zbLJFHARlB8lKL94f&authKey=FlxIQK99Uu90wddNV5W%2FBga6T6lXU5BRqyTTc26f2P2ZK5OW%2BDhHp7MwviX%2BbrPa&noverify=0&group_code=949715256); code contributions via PR are also welcome.
 
-## SPONSORED BY
+## Sponsors
 IP proxy currently used by the crawler: <a href="https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang">极速HTTP proxy</a>. New users get up to 12,000 free IPs after registration and verification, zero-cost trial<br>
 <a href="https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang" target="_blank"><img src="https://s2.loli.net/2023/11/30/RapQtL8A2w6TGfj.png" alt="极速HTTP代理-官网图"></a>

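To make the playwright-bridge idea above concrete, here is a minimal hypothetical sketch (not part of this commit; the target URL and the evaluated expression are placeholders): a persistent browser context keeps the logged-in state across runs, and JS is evaluated inside the page instead of re-implementing the site's encryption code.

```python
# Hypothetical sketch of the playwright-bridge approach from the README.
# The target URL and the evaluated JS expression are placeholders.
import asyncio

from playwright.async_api import async_playwright


async def get_param_via_logged_in_page() -> str:
    async with async_playwright() as p:
        # A persistent context keeps cookies/localStorage between runs,
        # so a successful login survives restarts.
        context = await p.chromium.launch_persistent_context(
            user_data_dir="./browser_data", headless=True)
        page = await context.new_page()
        await page.goto("https://m.weibo.cn")
        # Evaluate a JS expression in the logged-in page to obtain a value
        # that would otherwise require reverse-engineering the site's JS.
        value = await page.evaluate("() => navigator.userAgent")
        await context.close()
        return value


if __name__ == "__main__":
    print(asyncio.run(get_param_via_logged_in_page()))
```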
2 changes: 0 additions & 2 deletions base/base_crawler.py
@@ -1,7 +1,5 @@
 from abc import ABC, abstractmethod
 
-from proxy.proxy_account_pool import AccountPool
-
 
 class AbstractCrawler(ABC):
     @abstractmethod
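After this change, the abstract base no longer depends on the proxy account pool. For orientation, a rough sketch of the ABC pattern this file follows; only start() and begin() are visible elsewhere in this commit, so the rest of the method set is an assumption.

```python
# Rough sketch of the base/base_crawler.py pattern; method bodies and any
# methods beyond start() and begin() are assumptions, not shown in this diff.
from abc import ABC, abstractmethod


class AbstractCrawler(ABC):
    @abstractmethod
    async def start(self):
        """Entry point that every platform crawler implements."""


class AbstractLogin(ABC):
    @abstractmethod
    async def begin(self):
        """Run the platform-specific login flow."""
```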
9 changes: 5 additions & 4 deletions main.py
@@ -9,15 +9,16 @@
 from media_platform.douyin import DouYinCrawler
 from media_platform.kuaishou import KuaishouCrawler
 from media_platform.xhs import XiaoHongShuCrawler
-from proxy import proxy_account_pool
+from media_platform.weibo import WeiboCrawler
 
 
 class CrawlerFactory:
     CRAWLERS = {
         "xhs": XiaoHongShuCrawler,
         "dy": DouYinCrawler,
         "ks": KuaishouCrawler,
-        "bili": BilibiliCrawler
+        "bili": BilibiliCrawler,
+        "wb": WeiboCrawler
     }
 
     @staticmethod
@@ -31,8 +32,8 @@ def create_crawler(platform: str) -> AbstractCrawler:
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili)',
-                        choices=["xhs", "dy", "ks", "bili"], default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
+                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
     parser.add_argument('--type', type=str, help='crawler type (search | detail)',
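The factory maps each `--platform` value to a crawler class; a condensed sketch of that dispatch follows (the error-handling wording inside create_crawler is an assumption, since the diff collapses that part).

```python
# Condensed sketch of the CrawlerFactory dispatch in main.py; the
# error message wording is an assumption, not shown in this diff.
from typing import Dict, Type


class WeiboCrawler:  # stand-in for media_platform.weibo.WeiboCrawler
    async def start(self):
        print("weibo crawling ...")


class CrawlerFactory:
    CRAWLERS: Dict[str, Type] = {
        "wb": WeiboCrawler,
        # "xhs", "dy", "ks", "bili" map to their crawler classes likewise
    }

    @staticmethod
    def create_crawler(platform: str):
        crawler_class = CrawlerFactory.CRAWLERS.get(platform)
        if not crawler_class:
            raise ValueError(f"Invalid media platform: {platform}")
        return crawler_class()


crawler = CrawlerFactory.create_crawler("wb")
```

With the new entry registered, the Weibo crawler would presumably be launched with `python main.py --platform wb --lt qrcode --type search`.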
6 changes: 3 additions & 3 deletions media_platform/bilibili/core.py
@@ -86,8 +86,8 @@ async def start(self):
             await self.get_specified_videos()
         else:
             pass
-        utils.logger.info("Bilibili Crawler finished ...")
-        pass
+        utils.logger.info("[BilibiliCrawler.start] Bilibili Crawler finished ...")
+
 
     async def search(self):
         """
@@ -220,7 +220,7 @@ async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore):

     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
         """Create xhs client"""
-        utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...")
+        utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         bilibili_client_obj = BilibiliClient(
             proxies=httpx_proxy,
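create_bilibili_client feeds playwright's cookie list through utils.convert_cookies to get both a Cookie header string and a dict. The helper's implementation is not shown in this commit; a plausible sketch, assuming playwright's list-of-dicts cookie shape:

```python
# Plausible sketch of utils.convert_cookies (not shown in this diff).
# BrowserContext.cookies() returns a list of dicts with "name" and "value".
from typing import Dict, List, Optional, Tuple


def convert_cookies(cookies: Optional[List[Dict]]) -> Tuple[str, Dict[str, str]]:
    if not cookies:
        return "", {}
    cookie_dict = {cookie["name"]: cookie["value"] for cookie in cookies}
    cookie_str = "; ".join(f"{k}={v}" for k, v in cookie_dict.items())
    return cookie_str, cookie_dict
```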
10 changes: 5 additions & 5 deletions media_platform/bilibili/login.py
@@ -8,12 +8,10 @@
 import sys
 from typing import Optional
 
-import redis
 from playwright.async_api import BrowserContext, Page
 from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                       wait_fixed)
 
-import config
 from base.base_crawler import AbstractLogin
 from tools import utils
@@ -33,7 +31,7 @@ def __init__(self,
         self.cookie_str = cookie_str
 
     async def begin(self):
-        """Start login xiaohongshu"""
+        """Start login bilibili"""
         utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
@@ -42,7 +40,8 @@ async def begin(self):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError(
+                "[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
 
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
     async def check_login_state(self) -> bool:
@@ -89,7 +88,8 @@ async def login_by_qrcode(self):
             sys.exit()
 
         wait_redirect_seconds = 5
-        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(
+            f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_mobile(self):
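The @retry decorator shown above polls check_login_state once a second, up to 20 attempts, for as long as it returns False. The same pattern in isolation (the succeed-on-third-poll counter is illustrative only):

```python
# Standalone illustration of the tenacity polling pattern used by
# BilibiliLogin.check_login_state; the attempt counter is illustrative only.
import asyncio

from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                      wait_fixed)

attempts = 0


@retry(stop=stop_after_attempt(20), wait=wait_fixed(1),
       retry=retry_if_result(lambda value: value is False))
async def check_login_state() -> bool:
    global attempts
    attempts += 1
    return attempts >= 3  # pretend the QR code is confirmed on the third poll


async def main():
    try:
        await check_login_state()
        print(f"login detected after {attempts} polls")
    except RetryError:
        print("no login detected within 20 polls")


asyncio.run(main())
```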
7 changes: 7 additions & 0 deletions media_platform/weibo/__init__.py
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2023/12/23 15:40
# @Desc :
from .core import WeiboCrawler
from .login import WeiboLogin
from .client import WeiboClient
98 changes: 98 additions & 0 deletions media_platform/weibo/client.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2023/12/23 15:40
# @Desc : Weibo crawler API request client

import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode

import httpx
from playwright.async_api import BrowserContext, Page

from tools import utils

from .exception import DataFetchError
from .field import SearchType


class WeiboClient:
    def __init__(
            self,
            timeout=10,
            proxies=None,
            *,
            headers: Dict[str, str],
            playwright_page: Page,
            cookie_dict: Dict[str, str],
    ):
        self.proxies = proxies
        self.timeout = timeout
        self.headers = headers
        self._host = "https://m.weibo.cn"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    async def request(self, method, url, **kwargs) -> Any:
        async with httpx.AsyncClient(proxies=self.proxies) as client:
            response = await client.request(
                method, url, timeout=self.timeout,
                **kwargs
            )
            data: Dict = response.json()
            if data.get("ok") != 1:
                utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
                raise DataFetchError(data.get("msg", "unknown error"))
            else:
                return data.get("data", {})

    async def get(self, uri: str, params=None) -> Dict:
        final_uri = uri
        if isinstance(params, dict):
            final_uri = (f"{uri}?"
                         f"{urlencode(params)}")
        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers)

    async def post(self, uri: str, data: dict) -> Dict:
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}",
                                  data=json_str, headers=self.headers)

    async def pong(self) -> bool:
        """get a note to check if login state is ok"""
        utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
        ping_flag = False
        try:
            # Placeholder in this commit: no real login-state request is made yet.
            pass
        except Exception as e:
            utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def get_note_by_keyword(
            self,
            keyword: str,
            page: int = 1,
            search_type: SearchType = SearchType.DEFAULT
    ) -> Dict:
        """
        search note by keyword
        :param keyword: keyword to search Weibo for
        :param page: pagination parameter - current page number
        :param search_type: type of search, see the SearchType enum in weibo/field.py
        :return:
        """
        uri = "/api/container/getIndex"
        containerid = f"100103type={search_type.value}&q={keyword}"
        params = {
            "containerid": containerid,
            "page_type": "searchall",
            "page": page,
        }
        return await self.get(uri, params)
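Putting the client together: get_note_by_keyword issues a GET against m.weibo.cn's /api/container/getIndex with a search containerid. A hypothetical driver follows; the header values and the "cards" key in the response are assumptions about Weibo's response shape, and the playwright page is stubbed out.

```python
# Hypothetical usage of WeiboClient; the headers and the "cards" key in
# the response are assumptions, and playwright_page is stubbed with None.
import asyncio

from media_platform.weibo import WeiboClient
from media_platform.weibo.field import SearchType


async def demo():
    client = WeiboClient(
        timeout=10,
        proxies=None,
        headers={"User-Agent": "Mozilla/5.0", "Cookie": ""},
        playwright_page=None,  # a real playwright Page in actual use
        cookie_dict={},
    )
    # Sends GET /api/container/getIndex?containerid=100103type=...&q=python&...
    result = await client.get_note_by_keyword("python", page=1,
                                              search_type=SearchType.DEFAULT)
    print(result.get("cards", []))


asyncio.run(demo())
```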