Skip to content

Commit

Permalink
爬取微信公众号文章1
Browse files Browse the repository at this point in the history
  • Loading branch information
xingag committed Dec 28, 2018
1 parent ea4603f commit c1f3a83
Show file tree
Hide file tree
Showing 6 changed files with 326 additions and 0 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added WeiXinArticle/.DS_Store
Binary file not shown.
56 changes: 56 additions & 0 deletions WeiXinArticle/WeiXin_v1.0/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: models.py
@time: 12/15/18 23:08
@description:数据模型【使用mongoengine来简化操作】
"""

from datetime import datetime

from mongoengine import connect
from mongoengine import DateTimeField
from mongoengine import Document
from mongoengine import IntField
from mongoengine import StringField
from mongoengine import URLField

__author__ = 'xag'

# Connect to MongoDB.
# Plain connection (database has no auth configured, so anyone may write;
# no username/password needed):
# response = connect('admin', host='localhost', port=27017)

# Authenticated connection (the database has auth enabled, so username and
# password are required).
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file instead.
response = connect('admin', host='localhost', port=27017,username='root', password='xag')



class Post(Document):
    """
    Article model (mongoengine Document).

    One instance represents a single WeChat official-account article.
    """
    title = StringField()  # article title
    content_url = StringField()  # link to the article page
    source_url = StringField()  # link to the original source article
    digest = StringField()  # article summary
    cover = URLField(validation=None)  # cover image URL (validation disabled)
    p_date = DateTimeField()  # push (publish) time
    author = StringField()  # author

    content = StringField()  # article body

    read_num = IntField(default=0)  # read count
    like_num = IntField(default=0)  # like count
    comment_num = IntField(default=0)  # comment count
    reward_num = IntField(default=0)  # reward count (original comment said "likes" — copy-paste slip)

    c_date = DateTimeField(default=datetime.now)  # record creation time
    u_date = DateTimeField(default=datetime.now)  # record last-update time
121 changes: 121 additions & 0 deletions WeiXinArticle/WeiXin_v1.0/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: tools.py
@time: 12/15/18 23:23
@description:工具类
"""

import html
from urllib.parse import urlsplit


def sub_dict(data, keys):
    """
    Extract a sub-dictionary containing only the wanted keys.

    HTML entities in the kept values are unescaped.
    :param data: source dictionary
    :param keys: iterable of keys to keep
    :return: dict of the selected keys mapped to their unescaped values
    """
    wanted = set(keys)
    return {key: html.unescape(value) for key, value in data.items() if key in wanted}


def str_to_dict(s, join_symbol="\n", split_symbol=":"):
"""
把参数字符串转换为一个字典
例如: a=b&c=d join_symbol是&, split_symbol是=
:param s: 原字符串
:param join_symbol: 连接符
:param split_symbol: 分隔符
:return: 字典
"""
# 通过join_symbol把字符串分为一个列表
s_list = s.split(join_symbol)

# 定义一个新的字典
data = dict()

for item in s_list:
item = item.strip()
if item:
# a = b 分成一个元组,第二个参数:分割次数
k, v = item.split(split_symbol, 1)

# 去除空格
data[k.strip()] = v.strip()
return data


def dic_to_str(source_dict):
    """
    Serialize a dict as "key=value" pairs joined by "&".

    Inverse operation of str_to_dict above.
    :param source_dict: e.g. {"a": 1, "b": 2}
    :return: e.g. "a=1&b=2"
    """
    return "&".join("%s=%s" % (key, value) for key, value in source_dict.items())




def compound_dict(dict1, dict2):
    """
    Merge two dicts into a new dict; on key collisions dict2 wins.

    Bug fix: the original called dict1.update(dict2), silently mutating the
    caller's first argument as a hidden side effect. This version leaves both
    inputs untouched and returns a fresh dict with the same contents.
    :param dict1: base dict (its values lose on key collisions)
    :param dict2: overriding dict
    :return: a new merged dict
    """
    merged = dict(dict1)
    merged.update(dict2)
    return merged


def update_url_query_params(url, query_update_data):
    """
    Merge new query parameters into a URL and rebuild it.

    Bug fixes versus the original:
    - the rebuilt URL dropped the "?" before the query string and the "#"
      before the fragment, producing an invalid URL;
    - the result was only printed, never returned — it is now returned as well.
    :param url: original URL
    :param query_update_data: dict of query keys to add/override (falsy -> unchanged)
    :return: the rebuilt URL string
    """
    # Split the URL into its five parts: scheme, netloc, path, query, fragment.
    parse_result = urlsplit(url)
    scheme = parse_result.scheme
    netloc = parse_result.netloc
    path = parse_result.path
    query = parse_result.query
    fragment = parse_result.fragment

    # Parse the query string into a dict, apply the updates, then serialize
    # it back with the same helpers (str_to_dict / dic_to_str are inverses).
    query_dict = str_to_dict(query, "&", "=")
    if query_update_data:
        query_dict.update(query_update_data)
    query_dict_result = dic_to_str(query_dict)

    # Reassemble, restoring the "?" and "#" separators the original dropped.
    url_result = "%s://%s%s" % (scheme, netloc, path)
    if query_dict_result:
        url_result += "?" + query_dict_result
    if fragment:
        url_result += "#" + fragment

    print(url_result)
    return url_result


# 测试
# d = {"a": "1", "b": "2", "c": "3"}
# print(sub_dict(d, ("a", "b")))
# print(sub_dict(d, ["a", "b"]))

# url_more = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIxNzYxMTU0OQ==&f=json&offset={}&count=10&is_ok=1&scene=126&uin=777&key=777&pass_ticket=9hPJTQdf%2Bb2ggjH2MqJn9y481xiwLT4d1q2SFGi4BEuzA7Bbw4rSn2Hya1%2BLOexv&wxtoken=&appmsg_token=988_JDEZ3T3UkeiLmP4Gq6tztDPOHNaDfZPb5IDHDg~~&x5=0&f=json'
# update_dict = {'__biz':'xag_biz','pass_ticket':'xag_pass_ticket','appmsg_token':23}
# update_url_query_params(url_more, update_dict)
149 changes: 149 additions & 0 deletions WeiXinArticle/WeiXin_v1.0/wx_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/usr/bin/env python
# encoding: utf-8

"""
@version: v1.0
@author: xag
@license: Apache Licence
@contact: [email protected]
@site: http://www.xingag.top
@software: PyCharm
@file: wx_spider.py
@time: 12/14/18 21:40
@description:爬取全部的文章并写入到 MongoDB 数据库中
"""

import requests
import re
import html
import json
import logging
import time
from datetime import datetime

from models import Post
from tools import sub_dict


class WeiXinSpider(object):
    """Crawl all historical articles of a WeChat official account into MongoDB."""

    def __init__(self):
        # NOTE: WeChat invalidates url_more and the Cookie frequently; re-capture
        # them from a fresh client session when requests start failing.
        self.headers = {
            'Host': 'mp.weixin.qq.com',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16B92 MicroMessenger/6.7.4(0x1607042c) NetType/WIFI Language/zh_CN',
            'Accept-Language': 'zh-cn',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'devicetype=iOS12.1.2; lang=zh_CN; pass_ticket=KlcW/tVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz+XvxWoGhMPYqP; version=17000028; wap_sid2=CMOw8aYBEnBfM0UxZGdabGszY0lhZ0tCZlNhUmtNa0dYazd6dDJWeXFHZmZJczRJc2tIMUZZUXBJS3RVS1lIVXdTbmlkUkRpLVRZeDVlUjJ2Tk95U29acWRGRXVmcG14Y05NamtBU0VHb0hZMmZudGtzX2RBd0FBMLGQl+EFOA1AlU4=; wxuin=349984835; wxtokenkey=777; rewardsn=; pgv_pvid=2237276040; pac_uid=0_f82bd5abff9aa; tvfe_boss_uuid=05faefd1e90836f4',
            'Accept': '*/*',
            'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzIxNzYxMTU0OQ==&scene=126&bizpsid=0&sessionid=1545979942&subscene=0&devicetype=iOS12.1.2&version=17000028&lang=zh_CN&nettype=WIFI&a8scene=0&fontScale=100&pass_ticket=KlcW%2FtVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz%2BXvxWoGhMPYqP&wx_header=1'
        }

        # "More articles" URL template; {} is the paging offset.
        # Replace appmsg_token and pass_ticket with your own values.
        self.url_more = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIxNzYxMTU0OQ==&f=json&offset={}&count=10&is_ok=1&scene=126&uin=777&key=777&pass_ticket=KlcW%2FtVyaNTxBr3kYaB0QC5zLbhDzo0nhEGF2JPrpjPwpJi4TGz%2BXvxWoGhMPYqP&wxtoken=&appmsg_token=989_OeyBFD%252FX7XluAq0e-7Y_WOs1crl4AXsor39LGA~~&x5=0&f=json'

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def spider_more(self, offset):
        """
        Fetch pages of articles starting at *offset* and keep paging until done.

        Bug fix: the original recursed once per page, so an account with many
        articles could exhaust Python's recursion limit; this version iterates.
        :param offset: message index to start from
        :return: None
        """
        next_offset = offset
        while True:
            current_request_url = self.url_more.format(next_offset)
            self.logger.info('当前请求的地址是:%s' % (current_request_url))

            # verify=False disables TLS certificate checks — kept to preserve the
            # original behaviour, but consider enabling verification.
            response = requests.get(current_request_url, headers=self.headers, verify=False)
            result = response.json()

            if result.get("ret") != 0:
                # Credentials expired (or request rejected) — stop crawling.
                self.logger.info('无法获取到更多内容,请更新cookie或其他请求头信息')
                return

            # Persist this page of data.
            msg_list = result.get('general_msg_list')
            self._save(msg_list)
            self.logger.info("获取到一页数据成功, data=%s" % (msg_list))

            # can_msg_continue == 1 means more pages remain; anything else means
            # we have reached the last page of the account's history.
            if result.get('can_msg_continue') != 1:
                print('爬取公号完成!')
                return

            # Continue with the next page via next_offset; sleep briefly to
            # avoid triggering anti-crawler limits.
            next_offset = result.get('next_offset')
            time.sleep(2)

    def _save(self, msg_list):
        """
        Parse one page's raw message-list JSON string and persist each article.
        :param msg_list: JSON string taken from the 'general_msg_list' field
        :return: None
        """
        # 1. Unescape the slashes so the embedded link URLs are usable.
        msg_list = msg_list.replace("\/", "/")
        data = json.loads(msg_list)

        # 2. Walk the list of pushed messages on this page.
        for msg in data.get("list"):
            # 3. Publish timestamp of the push (unix seconds).
            p_date = msg.get('comm_msg_info').get('datetime')

            # NOTE: pushes that are not articles do not carry this field.
            msg_info = msg.get("app_msg_ext_info")

            if msg_info:  # article push
                # A multi-article push stores the 2nd..nth articles separately.
                multi_msg_info = msg_info.get("multi_app_msg_item_list")

                # Insert every item of a multi-article push, or the single
                # article directly from app_msg_ext_info otherwise.
                if multi_msg_info:
                    for multi_msg_item in multi_msg_info:
                        self._insert(multi_msg_item, p_date)
                else:
                    self._insert(msg_info, p_date)
            else:
                # Not an article push — log the raw info and move on.
                self.logger.warning(u"此消息不是图文推送,data=%s" % json.dumps(msg.get("comm_msg_info")))

    def _insert(self, msg_info, p_date):
        """
        Build a Post model from one article item and save it to MongoDB.
        :param msg_info: dict describing a single article
        :param p_date: unix timestamp of the push
        :return: None
        """
        keys = ['title', 'author', 'content_url', 'digest', 'cover', 'source_url']

        # Keep only the useful fields and build the data model.
        data = sub_dict(msg_info, keys)
        post = Post(**data)

        # Convert the unix timestamp into a datetime for the p_date field.
        post["p_date"] = datetime.fromtimestamp(p_date)

        self.logger.info('save data %s ' % post.title)

        # Persist; log (but do not re-raise) failures so one bad record does
        # not abort the whole crawl.
        try:
            post.save()
        except Exception:
            self.logger.error("保存失败 data=%s" % post.to_json(), exc_info=True)

# ==============================================================================================


if __name__ == '__main__':
    spider = WeiXinSpider()
    # Start crawling from the first page (offset 0).
    spider.spider_more(0)
    print('恭喜,爬取数据完成!')
Binary file modified WeiXinProj/.DS_Store
Binary file not shown.

0 comments on commit c1f3a83

Please sign in to comment.