-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
326 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#!/usr/bin/env python | ||
# encoding: utf-8 | ||
|
||
""" | ||
@version: v1.0 | ||
@author: xag | ||
@license: Apache Licence | ||
@contact: [email protected] | ||
@site: http://www.xingag.top | ||
@software: PyCharm | ||
@file: models.py | ||
@time: 12/15/18 23:08 | ||
@description:数据模型【使用mongoengine来简化操作】 | ||
""" | ||
|
||
from datetime import datetime | ||
|
||
from mongoengine import connect | ||
from mongoengine import DateTimeField | ||
from mongoengine import Document | ||
from mongoengine import IntField | ||
from mongoengine import StringField | ||
from mongoengine import URLField | ||
|
||
__author__ = 'xag'

# Connect to MongoDB.
# Plain connection (the database has no access control, so anyone can write
# to it and no username/password is needed):
# response = connect('admin', host='localhost', port=27017)

# Authenticated connection (the database has access control enabled, so the
# username and password must be supplied).
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from configuration or the environment instead.
response = connect('admin', host='localhost', port=27017,username='root', password='xag')
|
||
|
||
|
||
class Post(Document):
    """
    Article model: the fields stored for one crawled article.
    """
    title = StringField()  # title
    content_url = StringField()  # article link
    source_url = StringField()  # link to the original source
    digest = StringField()  # article summary
    cover = URLField(validation=None)  # cover image
    p_date = DateTimeField()  # push (publication) time
    author = StringField()  # author

    content = StringField()  # article body

    read_num = IntField(default=0)  # read count
    like_num = IntField(default=0)  # like count
    comment_num = IntField(default=0)  # comment count
    # Presumably the reward (tip) count, going by the field name; the original
    # comment repeated "like count", which looks like a copy-paste slip.
    reward_num = IntField(default=0)

    c_date = DateTimeField(default=datetime.now)  # time the record was created
    # Time of the last update. NOTE(review): only a creation-time default is
    # set here; nothing in this class refreshes the value on later saves.
    u_date = DateTimeField(default=datetime.now)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
#!/usr/bin/env python | ||
# encoding: utf-8 | ||
|
||
""" | ||
@version: v1.0 | ||
@author: xag | ||
@license: Apache Licence | ||
@contact: [email protected] | ||
@site: http://www.xingag.top | ||
@software: PyCharm | ||
@file: tools.py | ||
@time: 12/15/18 23:23 | ||
@description:工具类 | ||
""" | ||
|
||
import html | ||
from urllib.parse import urlsplit | ||
|
||
|
||
def sub_dict(data, keys):
    """
    Return the part of ``data`` whose keys are listed in ``keys``.

    String values are HTML-unescaped (``&amp;`` becomes ``&`` and so on).
    Values of any other type (numbers, ``None``, nested containers) are
    copied unchanged, because ``html.unescape`` only accepts strings and
    would raise ``TypeError`` on them.

    :param data: source dictionary
    :param keys: collection of keys to keep (list, tuple, set, ...)
    :return: a new dictionary with the selected keys, in ``data``'s order
    """
    return {
        key: html.unescape(value) if isinstance(value, str) else value
        for key, value in data.items()
        if key in keys
    }
|
||
|
||
def str_to_dict(s, join_symbol="\n", split_symbol=":"):
    """
    Parse a delimited "key/value" string into a dictionary.

    Example: for ``a=b&c=d`` use ``join_symbol="&"`` and ``split_symbol="="``.
    The defaults (newline and colon) suit a block of HTTP header lines.

    Each item is split on the first ``split_symbol`` only, so the value may
    itself contain the symbol. Keys and values are stripped of surrounding
    whitespace, and blank items are ignored. An item that does not contain
    ``split_symbol`` at all is kept as a key with an empty-string value
    instead of raising ``ValueError``.

    :param s: source string
    :param join_symbol: separator between items
    :param split_symbol: separator between a key and its value
    :return: dictionary of the parsed items (later duplicates overwrite earlier ones)
    """
    data = {}
    for item in s.split(join_symbol):
        item = item.strip()
        if not item:
            continue
        # partition never fails: without the separator, the value is "".
        key, _, value = item.partition(split_symbol)
        data[key.strip()] = value.strip()
    return data
|
||
|
||
def dic_to_str(source_dict):
    """
    Serialise a dictionary as ``key=value`` pairs joined by ``&``.

    This is the reverse of ``str_to_dict`` called with ``"&"`` and ``"="``.
    Values are converted with ``str``; nothing is URL-encoded.

    :param source_dict: e.g. {"a": 1, "b": 2}
    :return: e.g. "a=1&b=2" (an empty string for an empty dictionary)
    """
    return "&".join(
        "%s=%s" % (name, str(val)) for name, val in source_dict.items()
    )
|
||
|
||
|
||
|
||
def compound_dict(dict1, dict2):
    """
    Merge ``dict2`` into ``dict1``.

    ``dict1`` is updated in place, so on a key clash the value from
    ``dict2`` wins; ``dict2`` is left untouched.

    :param dict1: dictionary that receives the entries (mutated)
    :param dict2: entries to add
    :return: ``dict1`` itself, after the update
    """
    merged = dict1
    merged.update(dict2)
    return merged
|
||
|
||
def update_url_query_params(url, query_update_data):
    """
    Return ``url`` with its query parameters updated from ``query_update_data``.

    The URL is split into scheme, netloc, path, query and fragment; the query
    is parsed into a dictionary, updated, serialised again and put back into
    the URL. The other components are carried over from the parsed URL. The
    previous implementation concatenated the parts without the ``?`` and
    ``#`` delimiters, which produced an unusable URL, and returned nothing.

    Values are inserted as-is (no URL-encoding), and a key that occurs more
    than once in the query is collapsed to a single entry because the query
    is held in a dictionary.

    :param url: the URL to update
    :param query_update_data: mapping of parameters to add or overwrite;
        may be empty or None, in which case the query is left as parsed
    :return: the rebuilt URL (it is also printed, as before)
    """
    # urlsplit yields a named tuple: scheme, netloc, path, query, fragment.
    parse_result = urlsplit(url)

    # Query string -> dict, apply the updates, then back to "a=1&b=2" form.
    query_dict = str_to_dict(parse_result.query, "&", "=")
    if query_update_data:
        query_dict.update(query_update_data)
    query_dict_result = dic_to_str(query_dict)

    # Swap in the new query and let the standard library reassemble the URL,
    # so the "://", "?" and "#" delimiters are placed correctly.
    url_result = parse_result._replace(query=query_dict_result).geturl()

    print(url_result)
    return url_result
|
||
|
||
# 测试 | ||
# d = {"a": "1", "b": "2", "c": "3"} | ||
# print(sub_dict(d, ("a", "b"))) | ||
# print(sub_dict(d, ["a", "b"])) | ||
|
||
# url_more = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIxNzYxMTU0OQ==&f=json&offset={}&count=10&is_ok=1&scene=126&uin=777&key=777&pass_ticket=9hPJTQdf%2Bb2ggjH2MqJn9y481xiwLT4d1q2SFGi4BEuzA7Bbw4rSn2Hya1%2BLOexv&wxtoken=&appmsg_token=988_JDEZ3T3UkeiLmP4Gq6tztDPOHNaDfZPb5IDHDg~~&x5=0&f=json' | ||
# update_dict = {'__biz':'xag_biz','pass_ticket':'xag_pass_ticket','appmsg_token':23} | ||
# update_url_query_params(url_more, update_dict) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#!/usr/bin/env python | ||
# encoding: utf-8 | ||
|
||
""" | ||
@version: v1.0 | ||
@author: xag | ||
@license: Apache Licence | ||
@contact: [email protected] | ||
@site: http://www.xingag.top | ||
@software: PyCharm | ||
@file: wx_spider.py | ||
@time: 12/14/18 21:40 | ||
@description:爬取全部的文章并写入到 MongoDB 数据库中 | ||
""" | ||
|
||
import requests | ||
import re | ||
import html | ||
import json | ||
import logging | ||
import time | ||
from datetime import datetime | ||
|
||
from models import Post | ||
from tools import sub_dict | ||
|
||
|
||
class WeiXinSpider(object):
    """
    Crawl the article history of one WeChat official account and store every
    article in MongoDB through the ``Post`` model.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Pause between two consecutive page requests, in seconds.
    PAGE_DELAY = 2

    def __init__(self):
        # NOTE: WeChat's security measures make url_more and the Cookie expire
        # frequently; capture fresh values and replace them when requests
        # stop working.
        self.headers = {
            'Host': 'mp.weixin.qq.com',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16B92 MicroMessenger/6.7.4(0x1607042c) NetType/WIFI Language/zh_CN',
            'Accept-Language': 'zh-cn',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'devicetype=iOS12.1.2; lang=zh_CN; pass_ticket=KlcW/tVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz+XvxWoGhMPYqP; version=17000028; wap_sid2=CMOw8aYBEnBfM0UxZGdabGszY0lhZ0tCZlNhUmtNa0dYazd6dDJWeXFHZmZJczRJc2tIMUZZUXBJS3RVS1lIVXdTbmlkUkRpLVRZeDVlUjJ2Tk95U29acWRGRXVmcG14Y05NamtBU0VHb0hZMmZudGtzX2RBd0FBMLGQl+EFOA1AlU4=; wxuin=349984835; wxtokenkey=777; rewardsn=; pgv_pvid=2237276040; pac_uid=0_f82bd5abff9aa; tvfe_boss_uuid=05faefd1e90836f4',
            'Accept': '*/*',
            'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzIxNzYxMTU0OQ==&scene=126&bizpsid=0&sessionid=1545979942&subscene=0&devicetype=iOS12.1.2&version=17000028&lang=zh_CN&nettype=WIFI&a8scene=0&fontScale=100&pass_ticket=KlcW%2FtVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz%2BXvxWoGhMPYqP&wx_header=1'
        }

        # "More articles" URL; "{}" is filled with the page offset.
        # Replace appmsg_token and pass_ticket with your own values.
        self.url_more = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIxNzYxMTU0OQ==&f=json&offset={}&count=10&is_ok=1&scene=126&uin=777&key=777&pass_ticket=KlcW%2FtVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz%2BXvxWoGhMPYqP&wxtoken=&appmsg_token=989_OeyBFD%252FX7XluAq0e-7Y_WOs1crl4AXsor39LGA~~&x5=0&f=json'

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def spider_more(self, offset):
        """
        Crawl pages starting at ``offset`` until the server reports no more.

        Pages are fetched in a loop rather than by recursion, so a long
        history cannot exhaust the interpreter's recursion limit.

        :param offset: message index of the first page to request
        :return: None
        """
        next_offset = offset
        while True:
            next_offset = self._fetch_page(next_offset)
            if next_offset is None:
                break
            # Be polite: pause before requesting the next page.
            time.sleep(self.PAGE_DELAY)

    def _fetch_page(self, offset):
        """
        Request one page, store its articles and report where to go next.

        :param offset: message index of the page to request
        :return: the offset of the next page, or None when crawling should
            stop (last page reached, or the request was rejected)
        """
        current_request_url = self.url_more.format(offset)
        self.logger.info('当前请求的地址是:%s', current_request_url)

        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the original code relied on it; confirm whether
        # it is still required.
        response = requests.get(
            current_request_url,
            headers=self.headers,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        try:
            result = response.json()
        except ValueError:
            # The body was not JSON; handle it like a rejected request below.
            self.logger.warning('Response for offset %s is not valid JSON', offset, exc_info=True)
            result = {}

        if result.get("ret") != 0:
            self.logger.info('无法获取到更多内容,请更新cookie或其他请求头信息')
            return None

        msg_list = result.get('general_msg_list')

        # Store the page.
        self._save(msg_list)
        self.logger.info("获取到一页数据成功, data=%s", msg_list)

        if result.get('can_msg_continue') == 1:
            # More pages exist; the server tells us where the next one starts.
            return result.get('next_offset')

        # can_msg_continue is not 1: this was the last page of the history.
        print('爬取公号完成!')
        return None

    def _save(self, msg_list):
        """
        Parse one page of messages and insert every article it contains.

        :param msg_list: the page's message list as a JSON string
        :return: None
        """
        for msg_info, p_date in self._iter_articles(msg_list):
            self._insert(msg_info, p_date)

    def _iter_articles(self, msg_list):
        """
        Yield ``(article_info, p_date)`` for every article in ``msg_list``.

        For a multi-article push the main entry is yielded first, followed by
        every item of its ``multi_app_msg_item_list``. (The previous code
        stored only the list items and dropped the main entry, although its
        own comment said the second and third articles should be saved
        "as well".) Messages without ``app_msg_ext_info`` are logged and
        skipped.

        :param msg_list: the page's message list as a JSON string; None or an
            empty string yields nothing
        """
        if not msg_list:
            return

        # Drop the backslash in front of slashes so the links are usable.
        data = json.loads(msg_list.replace("\\/", "/"))

        for msg in data.get("list") or []:
            # Push time is shared by all articles of the same message.
            comm_info = msg.get('comm_msg_info') or {}
            p_date = comm_info.get('datetime')

            # Only article pushes carry this field.
            msg_info = msg.get("app_msg_ext_info")
            if not msg_info:
                self.logger.warning(u"此消息不是图文推送,data=%s", json.dumps(msg.get("comm_msg_info")))
                continue

            yield msg_info, p_date
            for multi_msg_item in msg_info.get("multi_app_msg_item_list") or []:
                yield multi_msg_item, p_date

    def _insert(self, msg_info, p_date):
        """
        Store one article in MongoDB.

        A failure to save is logged with its traceback and does not stop the
        crawl.

        :param msg_info: dictionary describing the article
        :param p_date: push time as a Unix timestamp; left unset when None
        :return: None
        """
        keys = ['title', 'author', 'content_url', 'digest', 'cover', 'source_url']

        # Keep only the fields the model knows about.
        data = sub_dict(msg_info, keys)
        post = Post(**data)

        if p_date is not None:
            post.p_date = datetime.fromtimestamp(p_date)

        self.logger.info('save data %s ', post.title)

        try:
            post.save()
        except Exception:
            # Best effort: record the failure and carry on with the next article.
            self.logger.error("保存失败 data=%s", post.to_json(), exc_info=True)
|
||
# ============================================================================================== | ||
|
||
|
||
if __name__ == '__main__':
    # Start crawling from the first page (offset 0).
    WeiXinSpider().spider_more(0)
    print('恭喜,爬取数据完成!')
Binary file not shown.