From acb23cc0eefcec6d5cef2fa8d9e84cc13421f632 Mon Sep 17 00:00:00 2001 From: Wen YE Date: Tue, 24 Jul 2018 21:27:52 +0800 Subject: [PATCH] =?UTF-8?q?ADD:=20=E6=94=AF=E6=8C=81=E6=8A=93=E5=8F=96?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E5=88=B0=E6=9C=AC=E5=9C=B0=EF=BC=8C=E7=95=99?= =?UTF-8?q?=E8=A8=80=E6=9D=BF=E5=A4=B4=E5=83=8F=E5=92=8C=E9=99=84=E4=BB=B6?= =?UTF-8?q?=E9=83=BD=E6=8A=93=E5=88=B0=E6=9C=AC=E5=9C=B0=E6=9D=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 5 + Pipfile | 1 - Pipfile.lock | 194 +++---------------------------------- crawl/crawler.py | 15 --- crawl/gossip.py | 9 +- crawl/utils.py | 27 ++++++ fetch.py | 7 +- templates/gossip_list.html | 3 +- 8 files changed, 54 insertions(+), 207 deletions(-) diff --git a/.gitignore b/.gitignore index e343ccc..4641788 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,11 @@ venv.bak/ # mypy .mypy_cache/ +.vscode + # local db *.db *.db-journal + +# local images +/static/img/ diff --git a/Pipfile b/Pipfile index 5cbef73..ebc8c75 100644 --- a/Pipfile +++ b/Pipfile @@ -11,7 +11,6 @@ ipython = "*" requests = "*" peewee = "*" flask = "*" -requests-html = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index ea2d90b..ab01817 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "191ad73dda1f4a92059c52a982e7b361c5547d5bdd8d34eb09b1663fbf7e9c35" + "sha256": "5f74733fae5836155b625dc73a97bb9874171f7679c2f930a0123d88f0767d52" }, "pipfile-spec": 6, "requires": { @@ -16,20 +16,6 @@ ] }, "default": { - "beautifulsoup4": { - "hashes": [ - "sha256:11a9a27b7d3bddc6d86f59fb76afb70e921a25ac2d6cc55b40d072bd68435a76", - "sha256:7015e76bf32f1f574636c4288399a6de66ce08fb7b2457f628a8d70c0fbabb11", - "sha256:808b6ac932dccb0a4126558f7dfdcf41710dd44a4ef497a0bb59a77f9f078e89" - ], - "version": "==4.6.0" - }, - "bs4": { - "hashes": [ - "sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a" - ], - "version": "==0.0.1" - }, "certifi": { "hashes": [ "sha256:13e698f54293db9f89122b0581843a782ad0934a4fe0172d2a980ba77fc61bb7", @@ -51,20 +37,6 @@ ], "version": "==6.7" }, - "cssselect": { - "hashes": [ - "sha256:066d8bc5229af09617e24b3ca4d52f1f9092d9e061931f4184cd572885c23204", - "sha256:3b5103e8789da9e936a68d993b70df732d06b8bb9a337a05ed4eb52c17ef7206" - ], - "markers": "python_version != '3.0.*' and python_version >= '2.7' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", - "version": "==1.0.3" - }, - "fake-useragent": { - "hashes": [ - "sha256:cc9b9ddcebc708b3deac846f5fccb16e37c02ee47435a4ec7132271dd96aec8c" - ], - "version": "==0.1.10" - }, "flask": { "hashes": [ "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48", @@ -93,49 +65,12 @@ ], "version": "==2.10" }, - "lxml": { - "hashes": [ - "sha256:0941f4313208c07734410414d8308812b044fd3fb98573454e3d3a0d2e201f3d", - "sha256:0b18890aa5730f9d847bc5469e8820f782d72af9985a15a7552109a86b01c113", - "sha256:21f427945f612ac75576632b1bb8c21233393c961f2da890d7be3927a4b6085f", - "sha256:24cf6f622a4d49851afcf63ac4f0f3419754d4e98a7a548ab48dd03c635d9bd3", - "sha256:2dc6705486b8abee1af9e2a3761e30a3cb19e8276f20ca7e137ee6611b93707c", - "sha256:2e43b2e5b7d2b9abe6e0301eef2c2c122ab45152b968910eae68bdee2c4cfae0", - "sha256:329a6d8b6d36f7d6f8b6c6a1db3b2c40f7e30a19d3caf62023c9d6a677c1b5e1", - "sha256:423cde55430a348bda6f1021faad7235c2a95a6bdb749e34824e5758f755817a", - "sha256:4651ea05939374cfb5fe87aab5271ed38c31ea47997e17ec3834b75b94bd9f15", - "sha256:4be3bbfb2968d7da6e5c2cd4104fc5ec1caf9c0794f6cae724da5a53b4d9f5a3", - "sha256:622f7e40faef13d232fb52003661f2764ce6cdef3edb0a59af7c1559e4cc36d1", - "sha256:664dfd4384d886b239ef0d7ee5cff2b463831079d250528b10e394a322f141f9", - "sha256:697c0f58ac637b11991a1bc92e07c34da4a72e2eda34d317d2c1c47e2f24c1b3", - "sha256:6ec908b4c8a4faa7fe1a0080768e2ce733f268b287dfefb723273fb34141475f", - "sha256:7ec3fe795582b75bb49bb1685ffc462dbe38d74312dac07ce386671a28b5316b", - "sha256:8c39babd923c431dcf1e5874c0f778d3a5c745a62c3a9b6bd755efd489ee8a1d", - "sha256:949ca5bc56d6cb73d956f4862ba06ad3c5d2808eac76304284f53ae0c8b2334a", - "sha256:9f0daddeefb0791a600e6195441910bdf01eac470be596b9467e6122b51239a6", - "sha256:a359893b01c30e949eae0e8a85671a593364c9f0b8162afe0cb97317af0953bf", - "sha256:ad5d5d8efed59e6b1d4c50c1eac59fb6ecec91b2073676af1e15fc4d43e9b6c5", - "sha256:bc1a36f95a6b3667c09b34995fc3a46a82e4cf0dc3e7ab281e4c77b15bd7af05", - "sha256:be37b3f55b6d7d923f43bf74c356fc1878eb36e28505f38e198cb432c19c7b1a", - "sha256:c45bca5e544eb75f7500ffd730df72922eb878a2f0213b0dc5a5f357ded3a85d", - "sha256:ccee7ebbb4735ebc341d347fca9ee09f2fa6c0580528c1414bc4e1d31372835c", - "sha256:dc62c0840b2fc7753550b40405532a3e125c0d3761f34af948873393aa688160", - "sha256:f7d9d5aa1c7e54167f1a3cba36b5c52c7c540f30952c9bd7d9302a1eda318424" - ], - "version": "==4.2.3" - }, "markupsafe": { "hashes": [ "sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665" ], "version": "==1.0" }, - "parse": { - "hashes": [ - "sha256:c3cdf6206f22aeebfa00e5b954fcfea13d1b2dc271c75806b6025b94fb490939" - ], - "version": "==1.8.4" - }, "peewee": { "hashes": [ "sha256:538d7b7f892e59ecbef6eeefb867fd37f20ac77e602b4ee94b2766836cfa24ba" @@ -143,28 +78,6 @@ "index": "pypi", "version": "==3.6.4" }, - "pyee": { - "hashes": [ - "sha256:47f8fa96d6dee61c82001831e1fbba55f3f808003a322d0e6653aa01c59f6b9e", - "sha256:4ec22817297b7024f89721cc34f790ee2767c5b5ca44284c565ee643abafbe32" - ], - "version": "==5.0.0" - }, - "pyppeteer": { - "hashes": [ - "sha256:c35cff5c3b4d3c391c10a85b1a56b9f39daf18c795b0d8f6e0b7eaeac1ad5be3" - ], - "markers": "python_version >= '3.4'", - "version": "==0.0.19" - }, - "pyquery": { - "hashes": [ - "sha256:07987c2ed2aed5cba29ff18af95e56e9eb04a2249f42ce47bddfb37f487229a3", - "sha256:4771db76bd14352eba006463656aef990a0147a0eeaf094725097acfa90442bf" - ], - "markers": "python_version != '3.0.*' and python_version >= '2.7' and python_version != '3.3.*' and python_version != '3.2.*' and python_version != '3.1.*'", - "version": "==1.4.0" - }, "requests": { "hashes": [ "sha256:63b52e3c866428a224f97cab011de738c36aec0185aa91cfacd418b5d58911d1", @@ -173,63 +86,14 @@ "index": "pypi", "version": "==2.19.1" }, - "requests-html": { - "hashes": [ - "sha256:34257d5249b20b8ed14573eba910f48032a61205e70d11ce8a3ef6abf8edc50b", - "sha256:9686f21c5753ba6c025c6ba223a8329c7b149a935a73055097faf8999eee85b1" - ], - "index": "pypi", - "version": "==0.9.0" - }, - "six": { - "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" - ], - "version": "==1.11.0" - }, "urllib3": { "hashes": [ "sha256:a68ac5e15e76e7e5dd2b8f94007233e01effe3e50e8daddf69acfd81cb686baf", "sha256:b5725a0bd4ba422ab0e66e89e030c806576753ea3ee08554382c14e685d117b5" ], - "markers": "python_version >= '2.6' and python_version != '3.3.*' and python_version != '3.1.*' and python_version != '3.2.*' and python_version < '4' and python_version != '3.0.*'", + "markers": "python_version != '3.1.*' and python_version >= '2.6' and python_version != '3.3.*' and python_version != '3.0.*' and python_version < '4' and python_version != '3.2.*'", "version": "==1.23" }, - "w3lib": { - "hashes": [ - "sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38", - "sha256:aaf7362464532b1036ab0092e2eee78e8fd7b56787baa9ed4967457b083d011b" - ], - "version": "==1.19.0" - }, - "websockets": { - "hashes": [ - "sha256:0e2f7d6567838369af074f0ef4d0b802d19fa1fee135d864acc656ceefa33136", - "sha256:2a16dac282b2fdae75178d0ed3d5b9bc3258dabfae50196cbb30578d84b6f6a6", - "sha256:5a1fa6072405648cb5b3688e9ed3b94be683ce4a4e5723e6f5d34859dee495c1", - "sha256:5c1f55a1274df9d6a37553fef8cff2958515438c58920897675c9bc70f5a0538", - "sha256:669d1e46f165e0ad152ed8197f7edead22854a6c90419f544e0f234cc9dac6c4", - "sha256:695e34c4dbea18d09ab2c258994a8bf6a09564e762655408241f6a14592d2908", - "sha256:6b2e03d69afa8d20253455e67b64de1a82ff8612db105113cccec35d3f8429f0", - "sha256:79ca7cdda7ad4e3663ea3c43bfa8637fc5d5604c7737f19a8964781abbd1148d", - "sha256:7fd2dd9a856f72e6ed06f82facfce01d119b88457cd4b47b7ae501e8e11eba9c", - "sha256:82c0354ac39379d836719a77ee360ef865377aa6fdead87909d50248d0f05f4d", - "sha256:8f3b956d11c5b301206382726210dc1d3bee1a9ccf7aadf895aaf31f71c3716c", - "sha256:91ec98640220ae05b34b79ee88abf27f97ef7c61cf525eec57ea8fcea9f7dddb", - "sha256:952be9540d83dba815569d5cb5f31708801e0bbfc3a8c5aef1890b57ed7e58bf", - "sha256:99ac266af38ba1b1fe13975aea01ac0e14bb5f3a3200d2c69f05385768b8568e", - "sha256:9fa122e7adb24232247f8a89f2d9070bf64b7869daf93ac5e19546b409e47e96", - "sha256:a0873eadc4b8ca93e2e848d490809e0123eea154aa44ecd0109c4d0171869584", - "sha256:cb998bd4d93af46b8b49ecf5a72c0a98e5cc6d57fdca6527ba78ad89d6606484", - "sha256:e02e57346f6a68523e3c43bbdf35dde5c440318d1f827208ae455f6a2ace446d", - "sha256:e79a5a896bcee7fff24a788d72e5c69f13e61369d055f28113e71945a7eb1559", - "sha256:ee55eb6bcf23ecc975e6b47c127c201b913598f38b6a300075f84eeef2d3baff", - "sha256:f1414e6cbcea8d22843e7eafdfdfae3dd1aba41d1945f6ca66e4806c07c4f454" - ], - "markers": "python_version >= '3.4'", - "version": "==6.0" - }, "werkzeug": { "hashes": [ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c", @@ -253,6 +117,14 @@ ], "version": "==0.1.0" }, + "colorama": { + "hashes": [ + "sha256:463f8483208e921368c9f306094eb6f725c6ca42b0f97e313cb5d5512459feda", + "sha256:48eb22f4f8461b1df5734a074b57042430fb06e1d61bd1e11b078c0fe6d7a1f1" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.3.9" + }, "decorator": { "hashes": [ "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", @@ -281,7 +153,7 @@ "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" ], - "markers": "python_version != '3.1.*' and python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*'", + "markers": "python_version >= '2.7' and python_version != '3.3.*' and python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.1.*'", "version": "==4.3.4" }, "jedi": { @@ -339,14 +211,6 @@ ], "version": "==0.3.1" }, - "pexpect": { - "hashes": [ - "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba", - "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b" - ], - "markers": "sys_platform != 'win32'", - "version": "==4.6.0" - }, "pickleshare": { "hashes": [ "sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b", @@ -362,13 +226,6 @@ ], "version": "==1.0.15" }, - "ptyprocess": { - "hashes": [ - "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", - "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" - ], - "version": "==0.6.0" - }, "pygments": { "hashes": [ "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", @@ -404,35 +261,6 @@ ], "version": "==4.3.2" }, - "typed-ast": { - "hashes": [ - "sha256:0948004fa228ae071054f5208840a1e88747a357ec1101c17217bfe99b299d58", - "sha256:10703d3cec8dcd9eef5a630a04056bbc898abc19bac5691612acba7d1325b66d", - "sha256:1f6c4bd0bdc0f14246fd41262df7dfc018d65bb05f6e16390b7ea26ca454a291", - "sha256:25d8feefe27eb0303b73545416b13d108c6067b846b543738a25ff304824ed9a", - "sha256:29464a177d56e4e055b5f7b629935af7f49c196be47528cc94e0a7bf83fbc2b9", - "sha256:2e214b72168ea0275efd6c884b114ab42e316de3ffa125b267e732ed2abda892", - "sha256:3e0d5e48e3a23e9a4d1a9f698e32a542a4a288c871d33ed8df1b092a40f3a0f9", - "sha256:519425deca5c2b2bdac49f77b2c5625781abbaf9a809d727d3a5596b30bb4ded", - "sha256:57fe287f0cdd9ceaf69e7b71a2e94a24b5d268b35df251a88fef5cc241bf73aa", - "sha256:668d0cec391d9aed1c6a388b0d5b97cd22e6073eaa5fbaa6d2946603b4871efe", - "sha256:68ba70684990f59497680ff90d18e756a47bf4863c604098f10de9716b2c0bdd", - "sha256:6de012d2b166fe7a4cdf505eee3aaa12192f7ba365beeefaca4ec10e31241a85", - "sha256:79b91ebe5a28d349b6d0d323023350133e927b4de5b651a8aa2db69c761420c6", - "sha256:8550177fa5d4c1f09b5e5f524411c44633c80ec69b24e0e98906dd761941ca46", - "sha256:898f818399cafcdb93cbbe15fc83a33d05f18e29fb498ddc09b0214cdfc7cd51", - "sha256:94b091dc0f19291adcb279a108f5d38de2430411068b219f41b343c03b28fb1f", - "sha256:a26863198902cda15ab4503991e8cf1ca874219e0118cbf07c126bce7c4db129", - "sha256:a8034021801bc0440f2e027c354b4eafd95891b573e12ff0418dec385c76785c", - "sha256:bc978ac17468fe868ee589c795d06777f75496b1ed576d308002c8a5756fb9ea", - "sha256:c05b41bc1deade9f90ddc5d988fe506208019ebba9f2578c622516fd201f5863", - "sha256:c9b060bd1e5a26ab6e8267fd46fc9e02b54eb15fffb16d112d4c7b1c12987559", - "sha256:edb04bdd45bfd76c8292c4d9654568efaedf76fe78eb246dde69bdb13b2dad87", - "sha256:f19f2a4f547505fe9072e15f6f4ae714af51b5a681a97f187971f50c283193b6" - ], - "markers": "python_version < '3.7' and implementation_name == 'cpython'", - "version": "==1.1.0" - }, "wcwidth": { "hashes": [ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", diff --git a/crawl/crawler.py b/crawl/crawler.py index 689ab88..dd24bd6 100644 --- a/crawl/crawler.py +++ b/crawl/crawler.py @@ -3,7 +3,6 @@ import re import requests -import requests_html from config import crawl_config as config @@ -18,7 +17,6 @@ class Crawler(object): def __init__(self): self.session = requests.session() - self.html_session = requests_html.HTMLSession() def get_url(self, url, params=dict()): resp = self.session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES) @@ -28,18 +26,5 @@ def post_for_json(self, url, params=dict()): resp = self.session.post(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES) return resp - def get_html(self, url, params=dict()): - resp = self.html_session.get(url, params=params, headers=Crawler.DEFAULT_HEADER, cookies=config.COOKIES) - return resp - - def get_index_page(self): - resp = self.get_url(f"http://www.renren.com/{config.UID}/profile") - index_content = resp.content.decode('utf8') - - find_requestToken = re.findall(r"requestToken\s:\s'(-*\d+)'", index_content) - find_rtk = re.findall(r"_rtk\s:\s'(\w+)'", index_content) - - print(find_requestToken, find_rtk) - crawler = Crawler() diff --git a/crawl/gossip.py b/crawl/gossip.py index 278a293..5ff3056 100644 --- a/crawl/gossip.py +++ b/crawl/gossip.py @@ -8,6 +8,7 @@ from models import Gossip from .crawler import crawler +from .utils import get_image normal_pattern = re.compile(r'(.*)') @@ -23,14 +24,16 @@ def load_gossip_page(page): r = json.loads(resp.text) for c in r['array']: + local_pic = get_image(c['tinyUrl']) + gossip = { 'id': c['id'], 't': datetime.strptime(c['time'], "%Y-%m-%d %H:%M"), 'guestId': c['guestId'], 'guestName': c['guestName'], - 'headPic': c['tinyUrl'], # 居然保存的是当时的头像,这里不能往 User 表里塞了 - 'attachSnap': c.get('headUrl', ''), - 'attachPic': c.get('largeUrl', ''), + 'headPic': local_pic, # 居然保存的是当时的头像,这里不能往 User 表里塞了 + 'attachSnap': get_image(c.get('headUrl', '')), + 'attachPic': get_image(c.get('largeUrl', '')), 'whisper': c['whisper'] == 'true', 'wap': c['wap'] == 'true', 'gift': c['giftImg'] if c['gift'] == 'true' else '' diff --git a/crawl/utils.py b/crawl/utils.py index eb7164d..6810395 100644 --- a/crawl/utils.py +++ b/crawl/utils.py @@ -2,6 +2,7 @@ from datetime import datetime import json +import os from config import crawl_config as config from models import User, Comment, Like @@ -9,6 +10,32 @@ from .crawler import crawler +def get_image(img_url): + if not img_url: + return '' + + path = img_url.split('/') + path[0] = 'static' + path[1] = 'img' + path[2] = path[2].replace('.', '_') + + filename = '/'.join(path) + filepath = '/'.join(path[:-1]) + + if os.path.exists(filename): + return f'/{filename}' + + if not os.path.exists(filepath): + os.makedirs(filepath) + + resp = crawler.get_url(img_url) + with open(filename, 'wb') as fp: + fp.write(resp.content) + + print(f' get {img_url} to {filename}') + return f'/{filename}' + + def get_comments(entry_id, entry_type): param = { "entryOwnerId": config.UID, diff --git a/fetch.py b/fetch.py index 5d09c59..b6a3355 100644 --- a/fetch.py +++ b/fetch.py @@ -3,13 +3,14 @@ from crawl import status as crawl_status from crawl import gossip as crawl_gossip from crawl import album as crawl_album +from crawl.utils import get_image from config import config from models import database, User, Comment, Like, Status, Album, Photo, Gossip with database: database.create_tables([User, Comment, Like, Status, Album, Photo, Gossip]) - status_count = crawl_status.get_status() - # gossip_count = crawl_gossip.get_gossip() + # status_count = crawl_status.get_status() + gossip_count = crawl_gossip.get_gossip() - album_count = crawl_album.get_albums() + # album_count = crawl_album.get_albums() diff --git a/templates/gossip_list.html b/templates/gossip_list.html index 21dd339..7dbd374 100644 --- a/templates/gossip_list.html +++ b/templates/gossip_list.html @@ -6,8 +6,7 @@