From 3b9f862944ac6f029ebcea01f38d720940144b68 Mon Sep 17 00:00:00 2001
From: dataabc
Date: Mon, 1 Nov 2021 19:01:13 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=83=A8=E5=88=86?=
 =?UTF-8?q?=E5=BE=AE=E5=8D=9A=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96=E7=9A=84?=
 =?UTF-8?q?=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #374
---
 setup.py                            | 2 +-
 weibo_spider/parser/album_parser.py | 9 +++++++--
 weibo_spider/parser/photo_parser.py | 8 ++++++--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 631503b2..e76d2d0b 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='weibo-spider',
-    version='0.2.7',
+    version='0.2.8',
     author='Chen Lei',
     author_email='chillychen1991@gmail.com',
     description='新浪微博爬虫,用python爬取新浪微博数据。',
diff --git a/weibo_spider/parser/album_parser.py b/weibo_spider/parser/album_parser.py
index c187b9df..546bb672 100644
--- a/weibo_spider/parser/album_parser.py
+++ b/weibo_spider/parser/album_parser.py
@@ -1,5 +1,5 @@
-from .util import handle_html
 from .parser import Parser
+from .util import handle_html
 
 
 class AlbumParser(Parser):
@@ -10,4 +10,9 @@ def __init__(self, cookie, album_url):
 
     def extract_pic_urls(self):
         #
-        return self.selector.xpath('//img[@class="c"]/@src')
\ No newline at end of file
+        pic_list = self.selector.xpath('//div[@class="c"]//img/@src')
+        for i, pic in enumerate(pic_list):
+            if "?" in pic:
+                pic = pic[:pic.index("?")]
+                pic_list[i] = pic
+        return pic_list
diff --git a/weibo_spider/parser/photo_parser.py b/weibo_spider/parser/photo_parser.py
index 33551c7e..236e76e2 100644
--- a/weibo_spider/parser/photo_parser.py
+++ b/weibo_spider/parser/photo_parser.py
@@ -1,5 +1,5 @@
-from .util import handle_html
 from .parser import Parser
+from .util import handle_html
 
 
 class PhotoParser(Parser):
@@ -7,9 +7,13 @@ def __init__(self, cookie, user_id):
         self.cookie = cookie
         self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008"
         self.selector = handle_html(self.cookie, self.url)
+        self.user_id = user_id
 
     def extract_avatar_album_url(self):
         # Finds the href attribute of the table td div element with text 头像相册, e.g.
         # 头像相册
         result = self.selector.xpath('//img[@alt="头像相册"]/../@href')
-        return "https://weibo.cn" + result[0]
+        if len(result) > 0:
+            return "https://weibo.cn" + result[0]
+        else:
+            return "https://weibo.cn/" + str(self.user_id) + "/avatar?rl=0"