Skip to content

Commit

Permalink
refactor:优化部分代码
Browse files Browse the repository at this point in the history
feat: 增加IP代理账号池
  • Loading branch information
NanmiCoder committed Jun 27, 2023
1 parent 963d9a1 commit b8093a2
Show file tree
Hide file tree
Showing 19 changed files with 614 additions and 253 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- [x] 小红书 笔记、评论
- [x] 小红书 二维码扫描登录 | 手机号+验证码自动登录 | cookies登录
- [x] 爬取抖音视频、评论
- [x] IP代理池,账号池
- [ ] To do 抖音滑块

## 技术栈
Expand All @@ -28,7 +29,7 @@
2. 安装playwright浏览器驱动
`playwright install`
3. 运行爬虫程序
`python main.py --platform xhs --keywords 健身 --lt qrcode`
`python main.py --platform xhs --lt qrcode`
4. 打开小红书扫二维码登录

## 小红书运行截图
Expand All @@ -46,8 +47,8 @@
- 转发软件中配置WEBHOOK相关的信息,主要分为 消息模板(请查看本项目中的recv_sms_notification.py)、一个能push短信通知的API地址
- push的API地址一般是需要绑定一个域名的(当然也可以是内网的IP地址),我用的是内网穿透方式,会有一个免费的域名绑定到内网的web server,内网穿透工具 [ngrok](https://ngrok.com/docs/)
- 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html)
- 执行 `python recv_sms_notification.py` 等待短信转发器发送HTTP通知
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --keywords 健身 --lt phone --phone 13812345678`
- 执行 `python tools/recv_sms_notification.py` 等待短信转发器发送HTTP通知
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --lt phone`

备注:
- 小红书这边一个手机号一天只能发10条短信(悠着点),目前在发验证码时还未触发滑块验证,估计多了之后也会有~
Expand Down
Empty file added base/__init__.py
Empty file.
41 changes: 41 additions & 0 deletions base/base_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from abc import ABC, abstractmethod


class AbstractCrawler(ABC):
    """Interface that every media-platform crawler must implement."""

    @abstractmethod
    def init_config(self, **kwargs):
        """Store runtime configuration (command-line args, account pool, ...)."""
        ...

    @abstractmethod
    async def start(self):
        """Entry point: perform login and run the crawl."""
        ...

    @abstractmethod
    async def search_posts(self):
        """Search posts/notes for the configured keywords."""
        ...

    @abstractmethod
    async def get_comments(self, item_id: int):
        """Fetch the comments of a single post identified by *item_id*."""
        ...


class AbstractLogin(ABC):
    """Interface for a platform login flow supporting several login types."""

    @abstractmethod
    async def begin(self):
        """Start the login flow (dispatches to one of the concrete methods)."""
        ...

    @abstractmethod
    async def check_login_state(self):
        """Poll/verify whether the session is now logged in."""
        ...

    @abstractmethod
    async def login_by_qrcode(self):
        """Log in by displaying a QR code for the user to scan."""
        ...

    @abstractmethod
    async def login_by_mobile(self):
        """Log in with a phone number plus SMS verification code."""
        ...

    @abstractmethod
    async def login_by_cookies(self):
        """Log in by injecting previously saved cookies."""
        ...
130 changes: 130 additions & 0 deletions base/proxy_account_pool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import config


class PhonePool:
    """Pool of phone numbers, each handed out once until the pool is cleared.

    Numbers wait in ``phones`` (FIFO) and move to ``used_phones`` once
    fetched via :meth:`get_phone`.
    """

    def __init__(self):
        self.phones = []          # available numbers, FIFO order
        self.used_phones = set()  # numbers already handed out

    def add_phone(self, phone):
        """Add *phone* to the pool; return True if it was not present yet."""
        if phone not in self.phones:
            self.phones.append(phone)
            return True
        return False

    def remove_phone(self, phone):
        """Remove *phone* from the pool, whether available or already used.

        Bug fix: the original tested ``phone in self.used_phones`` and then
        called ``self.phones.remove(phone)`` — but a used number has already
        been popped from ``phones`` by :meth:`get_phone`, so that raised
        ValueError, and an available-but-unused number was never removed at
        all. Each collection is now handled independently.

        Returns True if the number was found anywhere, else False.
        """
        removed = False
        if phone in self.phones:
            self.phones.remove(phone)
            removed = True
        if phone in self.used_phones:
            self.used_phones.remove(phone)
            removed = True
        return removed

    def get_phone(self):
        """Pop the oldest available number and mark it used; None if empty."""
        if self.phones:
            left_phone = self.phones.pop(0)
            self.used_phones.add(left_phone)
            return left_phone
        return None

    def clear(self):
        """Drop every number, available and used."""
        self.phones = []
        self.used_phones = set()


class IPPool:
    """Pool of proxy IPs, each handed out once until the pool is cleared.

    IPs wait in ``ips`` (FIFO) and move to ``used_ips`` once fetched via
    :meth:`get_ip`.
    """

    def __init__(self):
        self.ips = []          # available proxies, FIFO order
        self.used_ips = set()  # proxies already handed out

    def add_ip(self, ip):
        """Add *ip* to the pool; return True if it was not present yet."""
        if ip not in self.ips:
            self.ips.append(ip)
            return True
        return False

    def remove_ip(self, ip):
        """Remove *ip* from the pool, whether available or already used.

        Bug fix: the original tested ``ip in self.used_ips`` and then called
        ``self.ips.remove(ip)`` — but a used IP has already been popped from
        ``ips`` by :meth:`get_ip`, so that raised ValueError, and an
        available-but-unused IP was never removed at all. Each collection is
        now handled independently.

        Returns True if the IP was found anywhere, else False.
        """
        removed = False
        if ip in self.ips:
            self.ips.remove(ip)
            removed = True
        if ip in self.used_ips:
            self.used_ips.remove(ip)
            removed = True
        return removed

    def get_ip(self):
        """Pop the oldest available IP and mark it used; None if empty."""
        if self.ips:
            left_ip = self.ips.pop(0)
            self.used_ips.add(left_ip)
            return left_ip
        return None

    def clear(self):
        """Drop every IP, available and used."""
        self.ips = []
        self.used_ips = set()


class AccountPool:
    """Pairs a :class:`PhonePool` with an :class:`IPPool` so that accounts
    are handed out as ``(phone, ip)`` tuples."""

    def __init__(self):
        self.phone_pool = PhonePool()
        self.ip_pool = IPPool()

    def add_account(self, phone, ip):
        """Add one (phone, ip) pair; True only if both were newly added."""
        if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
            return True
        return False

    def remove_account(self, phone, ip):
        """Remove one (phone, ip) pair; True only if both were removed."""
        if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip):
            return True
        return False

    def get_account(self):
        """Return the next (phone, ip) pair, reloading the pool when exhausted.

        Bug fix: the original recursed unconditionally after reloading, so an
        empty configuration (no phones or no proxies) caused infinite
        recursion (RecursionError). We now reload and retry at most once and
        simply return whatever is available (possibly ``(None, None)``).
        """
        phone = self.phone_pool.get_phone()
        ip = self.ip_pool.get_ip()
        if not phone or not ip:
            # Pool exhausted: refill from config and retry a single time.
            reload_account_pool(self)
            phone = self.phone_pool.get_phone()
            ip = self.ip_pool.get_ip()
        return phone, ip

    def clear_account(self):
        """Empty both underlying pools."""
        self.phone_pool.clear()
        self.ip_pool.clear()


def reload_account_pool(apo: AccountPool):
    """Clear *apo* and refill it from the configured phone/proxy lists.

    ``config.PHONE_LIST`` and ``config.IP_PROXY_LIST`` are zipped pairwise,
    so the lists are expected to be kept in matching order.
    """
    apo.clear_account()
    for account_phone, account_ip in zip(config.PHONE_LIST, config.IP_PROXY_LIST):
        apo.add_account(account_phone, account_ip)


def create_account_pool() -> AccountPool:
    """Build and return an AccountPool already filled from configuration."""
    new_pool = AccountPool()
    reload_account_pool(apo=new_pool)
    return new_pool


if __name__ == '__main__':
    import time

    # Demo: drain the pool, printing each (phone, ip) pair; reloading in
    # get_account() keeps this loop running as long as config has entries.
    demo_pool = create_account_pool()
    phone, ip_proxy = demo_pool.get_account()
    while phone:
        print(f"get phone:{phone}, ip proxy:{ip_proxy} from account pool")
        phone, ip_proxy = demo_pool.get_account()
        time.sleep(1)
23 changes: 0 additions & 23 deletions base_crawler.py

This file was deleted.

14 changes: 0 additions & 14 deletions config.py

This file was deleted.

2 changes: 2 additions & 0 deletions config/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .base_config import *
from .account_config import *
27 changes: 27 additions & 0 deletions config/account_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# account_config.py
# Account-pool configuration: phone numbers and their matching IP proxies.
# The two lists are zipped pairwise by base/proxy_account_pool.reload_account_pool,
# so keep them the same length and in matching order.
# All values below are placeholders — replace them with real accounts/proxies.

PHONE_LIST = [
    "13012345671",
    "13012345672",
    "13012345673",
    "13012345674",
    "13012345675",
    "13012345676",
    # ...
]

IP_PROXY_LIST = [
    "111.122.xx.xx1:8888",
    "111.122.xx.xx2:8888",
    "111.122.xx.xx3:8888",
    "111.122.xx.xx4:8888",
    "111.122.xx.xx5:8888",
    "111.122.xx.xx6:8888",
    # ...
]

# Proxy connection settings (shared by every proxy in IP_PROXY_LIST).
IP_PROXY_PROTOCOL = "http://"
IP_PROXY_USER = "xxxx"
IP_PROXY_PASSWORD = "xxxx"

19 changes: 19 additions & 0 deletions config/base_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Target platform: "xhs" (xiaohongshu) or "dy" (douyin).
PLATFORM = "xhs"
# Comma-separated search keywords (here: "fitness,travel" in Chinese).
KEYWORDS = "健身,旅游"
LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# xhs cookie format -> web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
COOKIES = ""

# redis config (used by the SMS-notification login flow)
REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
REDIS_DB_PWD = "123456"  # your redis password

# enable ip proxy (when True, crawlers draw proxies from the account pool)
ENABLE_IP_PROXY = False

# retry_interval: how long to sleep before retrying after a crawler error
RETRY_INTERVAL = 60 * 30  # 30 minutes

# playwright headless (run the browser without a visible window)
HEADLESS = True
33 changes: 24 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import argparse

import config
from tools import utils
from base import proxy_account_pool
from media_platform.douyin import DouYinCrawler
from media_platform.xhs import XiaoHongShuCrawler

Expand All @@ -19,24 +21,37 @@ def create_crawler(platform: str):


async def main():
utils.init_loging_config()
# define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)

# init account pool
account_pool = proxy_account_pool.create_account_pool()

args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform)
crawler.init_config(
keywords=args.keywords,
login_phone=args.phone,
login_type=args.lt,
cookie_str=args.cookies
command_args=args,
account_pool=account_pool
)
await crawler.start()

"""
# retry when exception ...
while True:
try:
await crawler.start()
except Exception as e:
logging.info(f"crawler start error: {e} ...")
await crawler.close()
# If you encounter an exception
# sleep for a period of time before retrying
# to avoid frequent requests that may result in the account being blocked.
await asyncio.sleep(config.RETRY_INTERVAL)
"""


if __name__ == '__main__':
try:
Expand Down
Loading

0 comments on commit b8093a2

Please sign in to comment.