ADD: support fetching images to local storage; guestbook avatars and attachments are both downloaded locally
whusnoopy committed Jul 24, 2018
1 parent 11ab5c0 commit acb23cc
Showing 8 changed files with 54 additions and 207 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -103,6 +103,11 @@ venv.bak/
 # mypy
 .mypy_cache/
 
 .vscode
 
+# local db
+*.db
+*.db-journal
 
+# local images
+/static/img/
1 change: 0 additions & 1 deletion Pipfile
@@ -11,7 +11,6 @@ ipython = "*"
 requests = "*"
 peewee = "*"
 flask = "*"
-requests-html = "*"
 
 [requires]
 python_version = "3.7"
194 changes: 11 additions & 183 deletions Pipfile.lock

Some generated files are not rendered by default.

15 changes: 0 additions & 15 deletions crawl/crawler.py
@@ -3,7 +3,6 @@
 import re
 
 import requests
-import requests_html
 
 from config import crawl_config as config
 
@@ -18,7 +17,6 @@ class Crawler(object):
 
     def __init__(self):
         self.session = requests.session()
-        self.html_session = requests_html.HTMLSession()
 
     def get_url(self, url, params=dict()):
         resp = self.session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
@@ -28,18 +26,5 @@ def post_for_json(self, url, params=dict()):
         resp = self.session.post(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
         return resp
 
-    def get_html(self, url, params=dict()):
-        resp = self.html_session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
-        return resp
-
-    def get_index_page(self):
-        resp = self.get_url(f"http://www.renren.com/{config.UID}/profile")
-        index_content = resp.content.decode('utf8')
-
-        find_requestToken = re.findall(r"requestToken\s:\s'(-*\d+)'", index_content)
-        find_rtk = re.findall(r"_rtk\s:\s'(\w+)'", index_content)
-
-        print(find_requestToken, find_rtk)
-
 
 crawler = Crawler()
9 changes: 6 additions & 3 deletions crawl/gossip.py
@@ -8,6 +8,7 @@
 from models import Gossip
 
 from .crawler import crawler
+from .utils import get_image
 
 
 normal_pattern = re.compile(r'<span style="color:#\d*">(.*)</span>')
@@ -23,14 +24,16 @@ def load_gossip_page(page):
     r = json.loads(resp.text)
 
     for c in r['array']:
+        local_pic = get_image(c['tinyUrl'])
+
         gossip = {
             'id': c['id'],
             't': datetime.strptime(c['time'], "%Y-%m-%d %H:%M"),
             'guestId': c['guestId'],
             'guestName': c['guestName'],
-            'headPic': c['tinyUrl'],  # turns out this stores the avatar as it was at the time, so it can't go into the User table
-            'attachSnap': c.get('headUrl', ''),
-            'attachPic': c.get('largeUrl', ''),
+            'headPic': local_pic,  # turns out this stores the avatar as it was at the time, so it can't go into the User table
+            'attachSnap': get_image(c.get('headUrl', '')),
+            'attachPic': get_image(c.get('largeUrl', '')),
             'whisper': c['whisper'] == 'true',
             'wap': c['wap'] == 'true',
             'gift': c['giftImg'] if c['gift'] == 'true' else ''
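crawl/gossip.py now routes every remote image URL through get_image from crawl/utils.py, but that file falls in the truncated part of this diff, so the helper itself isn't shown. A minimal sketch of what such a helper might look like, assuming it downloads each image once into static/img/ (the directory newly added to .gitignore), names files by a hash of the source URL, and passes empty URLs through so c.get('headUrl', '') stays '':

# Hypothetical sketch of crawl/utils.py (not shown in this diff):
# fetch an image once and return a local path under static/img/.
import hashlib
import os

import requests

IMG_DIR = 'static/img'  # assumed location; matches the new /static/img/ entry in .gitignore


def get_image(url):
    # Empty URLs (e.g. a gossip entry with no attachment) pass through unchanged
    if not url:
        return ''
    os.makedirs(IMG_DIR, exist_ok=True)
    # Name the file by a hash of the URL so repeated crawls reuse the same copy
    name = hashlib.md5(url.encode('utf8')).hexdigest() + '.jpg'
    local_path = os.path.join(IMG_DIR, name)
    if not os.path.exists(local_path):
        resp = requests.get(url)
        with open(local_path, 'wb') as f:
            f.write(resp.content)
    return '/' + local_path

Under those assumptions, storing get_image's return value in the Gossip row (rather than the remote URL) keeps the guestbook rendering from local copies even after the original image host goes away.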