ADD: support fetching images to local storage; guestbook avatars and attachments are both downloaded locally
whusnoopy committed Jul 24, 2018
1 parent 11ab5c0 commit acb23cc
Showing 8 changed files with 54 additions and 207 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -103,6 +103,11 @@ venv.bak/
 # mypy
 .mypy_cache/
 
 .vscode
 
+# local db
+*.db
+*.db-journal
 
+# local images
+/static/img/
1 change: 0 additions & 1 deletion Pipfile
@@ -11,7 +11,6 @@ ipython = "*"
 requests = "*"
 peewee = "*"
 flask = "*"
-requests-html = "*"
 
 [requires]
 python_version = "3.7"
194 changes: 11 additions & 183 deletions Pipfile.lock

Some generated files are not rendered by default.

15 changes: 0 additions & 15 deletions crawl/crawler.py
@@ -3,7 +3,6 @@
 import re
 
 import requests
-import requests_html
 
 from config import crawl_config as config
 
@@ -18,7 +17,6 @@ class Crawler(object):
 
     def __init__(self):
         self.session = requests.session()
-        self.html_session = requests_html.HTMLSession()
 
     def get_url(self, url, params=dict()):
         resp = self.session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
@@ -28,18 +26,5 @@ def post_for_json(self, url, params=dict()):
         resp = self.session.post(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
         return resp
 
-    def get_html(self, url, params=dict()):
-        resp = self.html_session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES)
-        return resp
-
-    def get_index_page(self):
-        resp = self.get_url(f"http://www.renren.com/{config.UID}/profile")
-        index_content = resp.content.decode('utf8')
-
-        find_requestToken = re.findall(r"requestToken\s:\s'(-*\d+)'", index_content)
-        find_rtk = re.findall(r"_rtk\s:\s'(\w+)'", index_content)
-
-        print(find_requestToken, find_rtk)
-
 
 crawler = Crawler()
9 changes: 6 additions & 3 deletions crawl/gossip.py
@@ -8,6 +8,7 @@
 from models import Gossip
 
 from .crawler import crawler
+from .utils import get_image
 
 
 normal_pattern = re.compile(r'<span style="color:#\d*">(.*)</span>')
@@ -23,14 +24,16 @@ def load_gossip_page(page):
     r = json.loads(resp.text)
 
     for c in r['array']:
+        local_pic = get_image(c['tinyUrl'])
+
         gossip = {
             'id': c['id'],
             't': datetime.strptime(c['time'], "%Y-%m-%d %H:%M"),
             'guestId': c['guestId'],
             'guestName': c['guestName'],
-            'headPic': c['tinyUrl'],  # turns out this stores the avatar as it was at the time, so it can't go into the User table
-            'attachSnap': c.get('headUrl', ''),
-            'attachPic': c.get('largeUrl', ''),
+            'headPic': local_pic,  # turns out this stores the avatar as it was at the time, so it can't go into the User table
+            'attachSnap': get_image(c.get('headUrl', '')),
+            'attachPic': get_image(c.get('largeUrl', '')),
             'whisper': c['whisper'] == 'true',
             'wap': c['wap'] == 'true',
             'gift': c['giftImg'] if c['gift'] == 'true' else ''
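crawl/gossip.py now routes every remote image URL through get_image from crawl/utils.py, but that file falls in the truncated part of this diff, so the helper itself isn't shown. A minimal sketch of what such a helper might look like, assuming it downloads each image once into static/img/ (the directory newly added to .gitignore), names files by a hash of the source URL, and passes empty URLs through so c.get('headUrl', '') stays '':

# Hypothetical sketch of crawl/utils.py (not shown in this diff):
# fetch an image once and return a local path under static/img/.
import hashlib
import os

import requests

IMG_DIR = 'static/img'  # assumed location; matches the new /static/img/ entry in .gitignore


def get_image(url):
    # Empty URLs (e.g. a gossip entry with no attachment) pass through unchanged
    if not url:
        return ''
    os.makedirs(IMG_DIR, exist_ok=True)
    # Name the file by a hash of the URL so repeated crawls reuse the same copy
    name = hashlib.md5(url.encode('utf8')).hexdigest() + '.jpg'
    local_path = os.path.join(IMG_DIR, name)
    if not os.path.exists(local_path):
        resp = requests.get(url)
        with open(local_path, 'wb') as f:
            f.write(resp.content)
    return '/' + local_path

Under those assumptions, storing get_image's return value in the Gossip row (rather than the remote URL) keeps the guestbook rendering from local copies even after the original image host goes away.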