fix: 修复部分微博无法获取的问题

Issue dataabc#374
MrDingDan · Nov 1, 2021 · 3b9f862
1 parent 16c7454
commit 3b9f862
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 5 deletions.
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='weibo-spider',
-    version='0.2.7',
+    version='0.2.8',
     author='Chen Lei',
     author_email='[email protected]',
     description='新浪微博爬虫，用python爬取新浪微博数据。',

diff --git a/weibo_spider/parser/album_parser.py b/weibo_spider/parser/album_parser.py
@@ -1,5 +1,5 @@
-from .util import handle_html
 from .parser import Parser
+from .util import handle_html
 
 
 class AlbumParser(Parser):
@@ -10,4 +10,9 @@ def __init__(self, cookie, album_url):
 
     def extract_pic_urls(self):
         # <img src="http://wx2.sinaimg.cn/wap180/76102133ly8fwr33wpn8fj20v90v9tbw.jpg" alt="" class="c">
-        return self.selector.xpath('//img[@class="c"]/@src')
+        pic_list = self.selector.xpath('//div[@class="c"]//img/@src')
+        for i, pic in enumerate(pic_list):
+            if "?" in pic:
+                pic = pic[:pic.index("?")]
+            pic_list[i] = pic
+        return pic_list
diff --git a/weibo_spider/parser/photo_parser.py b/weibo_spider/parser/photo_parser.py
@@ -1,15 +1,19 @@
-from .util import handle_html
 from .parser import Parser
+from .util import handle_html
 
 
 class PhotoParser(Parser):
     def __init__(self, cookie, user_id):
         self.cookie = cookie
         self.url = "https://weibo.cn/" + str(user_id) + "/photo?tf=6_008"
         self.selector = handle_html(self.cookie, self.url)
+        self.user_id = user_id
 
     def extract_avatar_album_url(self):
         # Finds the href attribute of the table td div element with text 头像相册, e.g.
         # <a href="/album/166564740000001980768563?rl=1"><img width="80" height="80" src="https://tvax1.sinaimg.cn/crop.0.0.1080.1080.180/76102133ly8ga961tpte6j20u00u0q65.jpg?KID=imgbed,tva&amp;Expires=1629227741&amp;ssig=TEUDkMXcS1" alt="头像相册"></a>
         result = self.selector.xpath('//img[@alt="头像相册"]/../@href')
-        return "https://weibo.cn" + result[0]
+        if len(result) > 0:
+            return "https://weibo.cn" + result[0]
+        else:
+            return "https://weibo.cn/" + str(self.user_id) + "/avatar?rl=0"