movies spider
Nyloner committed May 12, 2018
1 parent d15d568 commit 692814d
Showing 2 changed files with 287 additions and 0 deletions.
v.qq.com/movies.py (152 additions, 0 deletions)
from util import *
import time
from bs4 import BeautifulSoup
import json


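# Walk the Tencent Video movie listing (30 titles per page, offsets 0 to 4980)
# and append one JSON record per title to ./files/movie_list.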
def get_movie_list():
offset = 0
base_url = 'http://v.qq.com/x/list/movie?sort=19&offset={}'
while True:
try:
req = build_request(base_url.format(offset))
figures_list = BeautifulSoup(req.text, 'lxml').find(
'ul', {'class': 'figures_list'}).find_all('li', {'class': 'list_item'})
except Exception as e:
print('offset', offset, 'fail', e)
continue
f = open('./files/movie_list', 'a')
for item in figures_list:
movie = {}
movie['url'] = item.find('a').get('href')
movie['cid'] = item.find('a').get('data-float')
movie['title'] = item.find(
'div', {'class': 'figure_title_score'}).find('a').get_text()
try:
movie['figure_score'] = item.find(
'div', {'class': 'figure_score'}).get_text().replace('\n', '')
except:
movie['figure_score'] = ''
try:
movie['mark_v'] = item.find(
'i', {'class': 'mark_v'}).find('img').get('alt')
except:
movie['mark_v'] = ''
f.write(json.dumps(movie)+'\n')
f.close()
if offset == 4980:
break
print(current_time(), offset, 'OK')
offset += 30
time.sleep(1)


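# Appears to be a placeholder: float_vinfo2 is the per-cover info API (queried by cid),
# but nothing is fetched here and the function is not called in this commit.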
def get_float_info():
url = 'http://node.video.qq.com/x/api/float_vinfo2?cid=3ou2gtkskly3a99'


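# Parse a movie detail page: the figure_count and figure_num badges, the cover play
# count, Douban score, tags (area / year / '院线' theatrical flag) and director/actor text.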
def get_movie_info(url):
req = build_request(url)
try:
res_text = req.text.encode('iso-8859-1').decode('utf-8')
except:
res_text = req.text
soup = BeautifulSoup(res_text, 'lxml').find('body')
info = {}
current_item = soup.find('ul', {'class': 'figure_list'}).find(
'li', {'class': 'list_item'})
info['play_time'] = current_item.find(
'div', {'class': 'figure_count'}).find('span').get_text()
info['figure_num'] = current_item.find(
'div', {'class': 'figure_num'}).find('span').get_text()
info['mod_cover_playnum'] = soup.find(
'em', {'id': 'mod_cover_playnum'}).get_text()
try:
douban_score = soup.find('span', {'class': 'douban_score'})
info['douban_score'] = douban_score.get_text().replace('\n', '')
except:
info['douban_score'] = ''
video_tags = soup.find('div', {'class': 'video_tags'})
if '豆瓣高分' in str(video_tags):
info['douban_high_score'] = '是'
else:
info['douban_high_score'] = '否'
info['院线'] = '否'
info['tags'] = ''
video_tag_list = video_tags.find_all('a')
for tag in video_tag_list:
try:
href = tag.get('href')
except:
href = ''
if 'area=' in href:
info['area'] = tag.get_text()
elif 'year=' in href:
info['year'] = tag.get_text()
else:
tag_value = tag.get_text()
if '院线' in tag_value:
info['院线'] = '是'
else:
info['tags'] += tag_value+' '
director = soup.find('div', {'class': 'director'})
if director is None:
return info
director_text = director.get_text().replace('\xa0', '').replace('\n', '')
if '演员' in director_text:
info['actor_list'] = director_text.split('演员')[-1].replace(':', '')
info['director'] = director_text.split('演员')[0].replace('导演:', '')
else:
info['director'] = director_text
info['actor_list'] = ''
return info


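# For every record in ./files/movie_list, fetch its detail page, merge the list fields
# into the detail info and append a JSON line to ./files/result; failed lines go to ./files/fail.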
def crawl_movie_info():
for line in open('./files/movie_list', 'r'):
try:
item = json.loads(line)
info = get_movie_info(item['url'])
except Exception as e:
print(current_time(), 'fail', e)
f = open('./files/fail', 'a')
f.write(line)
f.close()
continue
for key in item:
info[key] = item[key]
f = open('./files/result', 'a')
f.write(json.dumps(info)+'\n')
f.close()
print(current_time(), info['title'], 'OK')
time.sleep(0.2)

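# Turn ./files/result into spreadsheet rows: derive 'area' from the first tag when it is
# missing, and expand 'mark_v' into three 是/否 columns (用券, VIP, 独播).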
def load_result():
keys = ['title', 'year', 'play_time', 'tags', 'figure_score', 'douban_score', 'figure_num',
'mod_cover_playnum', 'mark_v', '院线', 'director', 'actor_list', 'douban_high_score', 'area','url']
num = 1
for line in open('./files/result', 'r'):
movie = json.loads(line)
item = [num]
if 'area' not in movie:
movie['area']=movie['tags'].split(' ')[0]
movie['tags']=movie['tags'].replace(movie['area']+' ','')
for key in keys:
if key == 'mark_v':
values = ['否', '否', '否']
if 'VIP' in movie[key]:
values[1] = '是'
elif '用券' in movie[key]:
values[0] = '是'
elif '独播' in movie[key]:
values[2] = '是'
item += values
continue
try:
item.append(movie[key])
except:
item.append('')
yield item
num+=1

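# Runs on import/execution: export ./files/result to an Excel workbook.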
write_to_excel(load_result(),'腾讯电影.xlsx')
v.qq.com/util.py (135 additions, 0 deletions)
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv
import os


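# Desktop Chrome request headers with a randomised X-Forwarded-For address.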
def get_headers():
pc_headers = {
"X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}
return pc_headers


class NetWorkError(Exception):
pass


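# GET by default, POST when data or json_data is given; retries up to try_times
# and raises NetWorkError if every attempt fails.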
def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
if headers is None:
headers = get_headers()
for i in range(try_times):
try:
if data:
response = requests.post(
url, data=data, headers=headers, timeout=timeout)
elif json_data:
headers['Content-Type'] = 'application/json'
response = requests.post(
url, data=json.dumps(json_data), headers=headers, timeout=timeout)
else:
response = requests.get(url, headers=headers, timeout=timeout)
return response
except Exception as e:
continue
raise NetWorkError


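# Append rows to a fresh (write-only by default) workbook and save it;
# rows that openpyxl rejects are printed instead of written.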
def write_to_excel(lines, filename, write_only=True):
excel = openpyxl.Workbook(write_only=write_only)
sheet = excel.create_sheet()
for line in lines:
try:
sheet.append(line)
except:
print(line)
excel.save(filename)


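# Write rows to a UTF-8 CSV file with minimal quoting.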
def write_to_csv(lines, filename):
csvfile = open(filename, 'w', encoding='utf-8')
spamwriter = csv.writer(csvfile, delimiter=',',
quotechar='"', quoting=csv.QUOTE_MINIMAL)
for line in lines:
spamwriter.writerow(line)
csvfile.close()


def get_next_date(current_date='2017-01-01'):
current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
oneday = datetime.timedelta(days=1)
next_date = current_date+oneday
return str(next_date).split(' ')[0]


def current_time():
return time.strftime("%Y-%m-%d %H:%M:%S")


def load_txt(filename):
for line in open(filename, 'r'):
try:
item = json.loads(line)
except:
continue
yield item


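# Strip newline/tab/non-breaking-space characters (plus any extra words) via a regex.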
def sub_str(string, words=None, append=None):
if words is None:
words = ['\r', '\n', '\t', '\xa0']
if append is not None:
words += append
string = re.sub('|'.join(words), '', string)
return string


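# Abuyun dynamic HTTP tunnel proxy settings; the credentials below are left empty
# and need to be filled in before use.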
def get_proxies_abuyun():
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
    # Proxy tunnel authentication credentials
proxyUser = ''
proxyPass = ''

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
return proxies


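# Same as build_request but routed through the Abuyun proxy: up to 5 attempts,
# with a short random sleep when the error message mentions 429 (rate limiting).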
def build_proxy_request(url, data=None, headers=None, json_data=None):
if headers is None:
headers = get_headers()
for i in range(5):
try:
if data:
response = requests.post(
url, proxies=get_proxies_abuyun(), data=data, headers=headers, timeout=15)
elif json_data:
headers['Content-Type'] = 'application/json'
response = requests.post(
url, data=json.dumps(json_data), proxies=get_proxies_abuyun(), headers=headers, timeout=15)
else:
response = requests.get(
url, headers=headers, proxies=get_proxies_abuyun(), timeout=15)
return response
except Exception as e:
if '429' in str(e):
time.sleep(random.randint(0, 1000)/1000.0)
continue
raise NetWorkError
