Nyloner committed May 16, 2018
1 parent bf5313b commit f584da4
Showing 2 changed files with 262 additions and 0 deletions.
126 changes: 126 additions & 0 deletions www.mafengwo.cn/travellist.py
@@ -0,0 +1,126 @@
from util import *
from bs4 import BeautifulSoup
import time
import json
import re


def get_travellist(mddid):
    page = 1
    num = 0
    while True:
        data = {
            'mddid': mddid,
            'pageid': 'mdd_index',
            'sort': 2,
            'cost': 0,
            'days': 0,
            'month': 0,
            'tagid': 0,
            'page': page
        }
        try:
            req = build_request(
                'http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist', data=data)
            res_data = req.json()['list']
        except Exception as e:
            print(current_time(), mddid, page, 'fail', e)
            continue
        tn_list = BeautifulSoup(res_data, 'lxml').find_all(
            'div', {'class': 'tn-item'})
        if not tn_list:
            # an empty page means the destination's list is exhausted; stop paging
            break
        f = open('./files/{}_tn_list'.format(mddid), 'a')
        for item in tn_list:
            tn_wrapper = item.find('div', {'class': 'tn-wrapper'})
            des = tn_wrapper.find('dd').get_text()
            # skip entries whose description is too short to be a real travelogue
            if len(des) < 100:
                continue
            url_list = tn_wrapper.find('dt').find_all('a')
            url = url_list[-1].get('href')
            title = url_list[-1].get_text()
            user_name = tn_wrapper.find(
                'span', {'class': 'tn-user'}).get_text()
            f.write(json.dumps([title, user_name, url]) + '\n')
            num += 1
        f.close()
        print(current_time(), mddid, page, num, 'OK')
        page += 1
        time.sleep(0.5)


def get_user_info(iid):
    url = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={{"iid":"{}"}}'.format(
        iid)
    req = build_request(url)
    soup = BeautifulSoup(req.json()['data']['html'], 'lxml')
    per_name = soup.find('a', {'class': 'per_name'}).get_text().replace(
        '\n', '').replace(' ', '')
    try:
        # the hometown appears in parentheses after the user name
        from_city = re.findall(r'\((.*?)\)', per_name)[0]
    except Exception:
        from_city = ''
    try:
        p_time = soup.find('span', {'class': 'time'}).get_text()
    except Exception:
        p_time = ''
    return [per_name, from_city, p_time]


def get_travel_info(url):
    user_info = get_user_info(url.split('/')[-1].split('.')[0])
    req = build_request(url)
    soup = BeautifulSoup(req.text, 'lxml').find('div', {'class': 'main'})
    view_con = soup.find('div', {'class': 'view_con'})
    content = view_con.get_text()
    # note: 'tarvel_dir_list' is the class name used in the site's own markup
    travel_dir_list = view_con.find('div', {'class': 'tarvel_dir_list'})
    try:
        v_time = travel_dir_list.find(
            'li', {'class': 'time'}).get_text().split('/')[-1]
    except Exception:
        v_time = ''
    try:
        day = travel_dir_list.find(
            'li', {'class': 'day'}).get_text().split('/')[-1]
    except Exception:
        day = ''
    try:
        people = travel_dir_list.find(
            'li', {'class': 'people'}).get_text().split('/')[-1]
    except Exception:
        people = ''
    try:
        cost = travel_dir_list.find(
            'li', {'class': 'cost'}).get_text().split('/')[-1]
    except Exception:
        cost = ''
    return user_info + [v_time, day, people, cost, content]


def crawl_travel():
    mddid = '10065'
    # get_travellist(mddid)  # run once first to build ./files/<mddid>_tn_list
    succ_num = 0
    fail_num = 0
    for line in open('./files/{}_tn_list'.format(mddid), 'r'):
        try:
            travel = json.loads(line)
        except Exception:
            continue
        try:
            travel_info = get_travel_info('http://www.mafengwo.cn' + travel[-1])
        except Exception:
            f = open('./files/fail_{}_tn_list'.format(mddid), 'a')
            f.write(line)
            f.close()
            fail_num += 1
            time.sleep(1)
            continue
        succ_num += 1
        f = open('./files/result_{}_tn_list'.format(mddid), 'a')
        f.write(json.dumps(
            travel + ['http://www.mafengwo.cn' + travel[-1]] + travel_info) + '\n')
        f.close()
        print(current_time(), succ_num, fail_num)
        time.sleep(1)


crawl_travel()
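
Each line of ./files/result_<mddid>_tn_list is a flat JSON array: the three list-stage fields (title, user name, relative URL), the absolute URL, the three user fields from get_user_info, then the four trip facts and the full text from get_travel_info. A minimal reader sketch under that assumption (the key names are illustrative, not from the original):

import json

def load_results(mddid='10065'):
    # yield one dict per travelogue from the 12-element arrays written by crawl_travel()
    keys = ['title', 'user_name', 'rel_url', 'url', 'per_name', 'from_city',
            'publish_time', 'visit_time', 'days', 'people', 'cost', 'content']
    for line in open('./files/result_{}_tn_list'.format(mddid), 'r'):
        yield dict(zip(keys, json.loads(line)))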
136 changes: 136 additions & 0 deletions www.mafengwo.cn/util.py
@@ -0,0 +1,136 @@
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv
import os


def get_headers():
    pc_headers = {
        "X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    return pc_headers


class NetWorkError(Exception):
    pass


def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
    if headers is None:
        headers = get_headers()
    for i in range(try_times):
        try:
            if data:
                response = requests.post(
                    url, data=data, headers=headers, timeout=timeout)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), headers=headers, timeout=timeout)
            else:
                response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except Exception:
            continue
    raise NetWorkError


def write_to_excel(lines, filename, write_only=True):
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        try:
            sheet.append(line)
        except Exception:
            # skip rows openpyxl cannot serialize
            continue
    excel.save(filename)


def write_to_csv(lines, filename):
    # newline='' prevents the csv module from inserting blank rows on Windows
    csvfile = open(filename, 'w', encoding='utf-8', newline='')
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for line in lines:
        spamwriter.writerow(line)
    csvfile.close()


def get_next_date(current_date='2017-01-01'):
    current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    oneday = datetime.timedelta(days=1)
    next_date = current_date + oneday
    return str(next_date).split(' ')[0]


def current_time():
    return time.strftime("%Y-%m-%d %H:%M:%S")


def load_txt(filename):
    for line in open(filename, 'r'):
        try:
            item = json.loads(line)
        except Exception:
            continue
        yield item


def sub_str(string, words=None, append=None):
    if words is None:
        words = ['\r', '\n', '\t', '\xa0']
    if append is not None:
        words += append
    string = re.sub('|'.join(words), '', string)
    return string


def get_proxies_abuyun():
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel credentials (fill in your own account)
    proxyUser = ''
    proxyPass = ''

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return proxies


def build_proxy_request(url, data=None, headers=None, json_data=None):
    if headers is None:
        headers = get_headers()
    for i in range(5):
        try:
            if data:
                response = requests.post(
                    url, proxies=get_proxies_abuyun(), data=data, headers=headers, timeout=15)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), proxies=get_proxies_abuyun(), headers=headers, timeout=15)
            else:
                response = requests.get(
                    url, headers=headers, proxies=get_proxies_abuyun(), timeout=15)
            return response
        except Exception as e:
            # back off briefly when the proxy reports 429 (rate limit), then retry
            if '429' in str(e):
                time.sleep(random.randint(0, 1000) / 1000.0)
            continue
    raise NetWorkError
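
build_request dispatches on its arguments: a form POST when data is given, a JSON POST when json_data is given, otherwise a GET, retrying up to try_times before raising NetWorkError; build_proxy_request does the same through the Abuyun tunnel. A small usage sketch (httpbin.org is a stand-in endpoint, not part of the project):

if __name__ == '__main__':
    try:
        resp = build_request('http://httpbin.org/get')                       # plain GET
        resp = build_request('http://httpbin.org/post', data={'page': 1})    # form POST
        resp = build_request('http://httpbin.org/post', json_data={'a': 1})  # JSON POST
        print(current_time(), resp.status_code)
    except NetWorkError:
        print(current_time(), 'request failed after all retries')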
