Skip to content

Commit

Permalink
fix: 修复部分微博无法获取的问题
Browse files Browse the repository at this point in the history
  • Loading branch information
dataabc committed Nov 1, 2021
1 parent 16c7454 commit 3b9f862
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name='weibo-spider',
version='0.2.7',
version='0.2.8',
author='Chen Lei',
author_email='[email protected]',
description='新浪微博爬虫,用python爬取新浪微博数据。',
Expand Down
9 changes: 7 additions & 2 deletions weibo_spider/parser/album_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .util import handle_html
from .parser import Parser
from .util import handle_html


class AlbumParser(Parser):
Expand All @@ -10,4 +10,9 @@ def __init__(self, cookie, album_url):

def extract_pic_urls(self):
    """Return the album page's picture URLs with any query string removed.

    Collects the ``src`` attribute of every ``<img>`` found anywhere inside
    a ``<div class="c">`` of ``self.selector`` and truncates each value at
    its first ``?``. URLs without a ``?`` are returned unchanged.

    Returns:
        list[str]: one URL per matched image, in document order; an empty
        list when nothing matches.
    """
    # Example image markup recorded by the original author:
    # <img src="http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg" alt="" class="c">
    pic_urls = self.selector.xpath('//div[@class="c"]//img/@src')
    # Keep only the part before the first "?" so the bare image URL is
    # returned without its query parameters.
    return [pic.split("?", 1)[0] for pic in pic_urls]
8 changes: 6 additions & 2 deletions weibo_spider/parser/photo_parser.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from .util import handle_html
from .parser import Parser
from .util import handle_html


class PhotoParser(Parser):
    """Parser for a user's photo page on weibo.cn.

    On construction the page ``https://weibo.cn/<user_id>/photo?tf=6_008``
    is loaded through ``handle_html`` and the result is kept in
    ``self.selector`` for later XPath queries.
    """

    def __init__(self, cookie, user_id):
        """Store the credentials and load the user's photo page.

        Args:
            cookie: cookie value passed through to ``handle_html``.
            user_id: id of the user whose photo page is requested; it is
                converted with ``str()`` when the URLs are built.
        """
        self.cookie = cookie
        # Kept so extract_avatar_album_url() can build its fallback URL.
        self.user_id = user_id
        self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008"
        self.selector = handle_html(self.cookie, self.url)

    def extract_avatar_album_url(self):
        """Return the absolute URL of the user's avatar album.

        Looks for an ``<img>`` whose ``alt`` is "头像相册" and takes the
        ``href`` of its parent element, e.g.
        <a href="/album/166564740000001980768563?rl=1"><img width="80" height="80" src="https://tvax1.sinaimg.cn/crop.0.0.1080.1080.180/76102133ly8ga961tpte6j20u00u0q65.jpg?KID=imgbed,tva&amp;Expires=1629227741&amp;ssig=TEUDkMXcS1" alt="头像相册"></a>

        When the page has no such link, the URL
        ``https://weibo.cn/<user_id>/avatar?rl=0`` is returned instead of
        failing on an empty result.
        """
        result = self.selector.xpath('//img[@alt="头像相册"]/../@href')
        if result:
            # The matched href is site-relative, so prefix the host.
            return "https://weibo.cn" + result[0]
        return "https://weibo.cn/" + str(self.user_id) + "/avatar?rl=0"

0 comments on commit 3b9f862

Please sign in to comment.