Skip to content

Commit

Permalink
fix: 增加小红书登录两种形态下弹窗的兼容代码
Browse files Browse the repository at this point in the history
  • Loading branch information
NanmiCoder committed Jun 22, 2023
1 parent 88e8ee3 commit 1085a2a
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 23 deletions.
9 changes: 6 additions & 3 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@

platform = "xhs"
keyword = "健身"
login_type = "handby" # qrcode or phone
login_phone = "13812345678" # your login phone
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
login_type = "cookie" # qrcode or phone or cookie
login_phone = "" # your login phone

# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
cookies = ""

# redis config
redis_db_host = "redis://127.0.0.1"
Expand Down
6 changes: 3 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@ async def main():
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)

args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform)
crawler.init_config(
keywords=args.keywords,
login_phone=args.phone,
login_type=args.lt,
web_session=args.web_session
cookie_str=args.cookies
)
await crawler.start()

Expand Down
67 changes: 50 additions & 17 deletions media_platform/xhs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def __init__(self):
self.login_type = None
self.keywords = None
self.web_session = None
self.cookies: Optional[List[Cookie]] = None
self.cookies: Optional[List[Cookie]] = None # cookies from browser context
self.cookie_str: Optional[str] = None # cookie string from config or command line
self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
Expand Down Expand Up @@ -88,28 +89,51 @@ async def start(self):

async def login(self):
"""login xiaohongshu website and keep webdriver login state"""
# There are two ways to log in:
# There are three ways to log in:
# 1. Semi-automatic: Log in by scanning the QR code.
# 2. Fully automatic: Log in using forwarded text message notifications
# 3. handby automatic: Log in using preset cookie
# which includes mobile phone number and verification code.
# 3. Semi-automatic: Log in using preset cookie
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
await self.login_by_mobile()
elif self.login_type == "handby":
await self.browser_context.add_cookies([{
'name': 'web_session',
'value': self.web_session,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else:
elif self.login_type == "cookie":
# cookie str convert to cookie dict
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else:
pass


async def login_by_mobile(self):
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")

await asyncio.sleep(1)
try:
# After entering the main page of Xiaohongshu,
# the login window may not pop up automatically and you need to manually click the login button.
login_button_ele = await self.context_page.wait_for_selector(
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
timeout=5000
)
await login_button_ele.click()

# There are also two types of login dialog boxes for pop-ups.
# One type directly shows the phone number and verification code.
# Another type requires clicking to switch to mobile login.
element = await self.context_page.wait_for_selector(
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
timeout=5000
)
await element.click()
except:
print("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)

login_container_ele = await self.context_page.wait_for_selector("div.login-container")
# Fill login phone
input_ele = await login_container_ele.query_selector("label.phone > input")
Expand Down Expand Up @@ -158,16 +182,25 @@ async def login_by_mobile(self):
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
print("Start scanning QR code to log in to Xiaohongshu. ...")
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"

# find login qrcode
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector="div.login-container > div.left > div.qrcode > img"
selector=qrcode_img_selector
)
if not base64_qrcode_img:
# todo ...if this website does not automatically popup login dialog box, we will manual click login button
print("login failed , have not found qrcode please check ....")
sys.exit()
print("have not found qrcode and try again get it ....")
# if this website does not automatically popup login dialog box, we will manual click login button
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
await login_button_ele.click()
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
print("login failed , program exit ...")
sys.exit()

# get not logged session
current_cookie = await self.browser_context.cookies()
Expand Down
16 changes: 16 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
return cookies_str, cookie_dict


def convert_str_cookie_to_dict(cookie_str: str) -> Dict[str, str]:
    """Parse a ``Cookie``-header style string into a name -> value mapping.

    The input is expected to look like ``"name1=value1; name2=value2"``.
    Segments are separated by ``;`` and surrounding whitespace of each
    segment is ignored.

    Args:
        cookie_str: Raw cookie string. ``None`` or an empty string yields an
            empty dict.

    Returns:
        A dict mapping each cookie name to its value. If a name occurs more
        than once, the last occurrence wins. Segments that contain no ``=``
        are skipped.
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for segment in cookie_str.split(";"):
        segment = segment.strip()
        if not segment:
            # Trailing ";" or doubled separators produce empty segments.
            continue
        # Split on the FIRST "=" only: cookie values (e.g. base64 padding)
        # may legitimately contain "=" and must not be truncated.
        name, separator, value = segment.partition("=")
        if not separator:
            # Not a name=value pair; nothing meaningful to record.
            continue
        cookie_dict[name] = value
    return cookie_dict


def get_current_timestamp():
return int(time.time() * 1000)

Expand Down

0 comments on commit 1085a2a

Please sign in to comment.