Forked from queensun/Nyspider.
Showing 2 changed files with 262 additions and 0 deletions.
File 1 of 2, new (+126 lines): a Mafengwo travel-note crawler (filename not shown in this view).
from util import *
from bs4 import BeautifulSoup
import time
import json
import re

def get_travellist(mddid):
    page = 1
    num = 0
    while True:
        data = {
            'mddid': mddid,
            'pageid': 'mdd_index',
            'sort': 2,
            'cost': 0,
            'days': 0,
            'month': 0,
            'tagid': 0,
            'page': page
        }
        try:
            req = build_request(
                'http://www.mafengwo.cn/gonglve/ajax.php?act=get_travellist', data=data)
            res_data = req.json()['list']
        except Exception as e:
            print(current_time(), mddid, page, 'fail', e)
            time.sleep(1)  # back off briefly before retrying the same page
            continue
        tn_list = BeautifulSoup(res_data, 'lxml').find_all(
            'div', {'class': 'tn-item'})
        if not tn_list:
            break  # an empty page means the listing is exhausted; stop paging
        f = open('./files/{}_tn_list'.format(mddid), 'a')
        for item in tn_list:
            tn_wrapper = item.find('div', {'class': 'tn-wrapper'})
            des = tn_wrapper.find('dd').get_text()
            if len(des) < 100:  # skip notes with very short summaries
                continue
            url_list = tn_wrapper.find('dt').find_all('a')
            url = url_list[-1].get('href')
            title = url_list[-1].get_text()
            user_name = tn_wrapper.find(
                'span', {'class': 'tn-user'}).get_text()
            f.write(json.dumps([title, user_name, url]) + '\n')
            num += 1
        f.close()
        print(current_time(), mddid, page, num, 'OK')
        page += 1
        time.sleep(0.5)

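# Each accepted note is appended to ./files/<mddid>_tn_list as one JSON array
# per line; illustrative record (values are made up):
#   ["Seven days in Beijing", "some_user", "/i/1234567.html"]
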
def get_user_info(iid):
    url = 'http://pagelet.mafengwo.cn/note/pagelet/headOperateApi?params={{"iid":"{}"}}'.format(
        iid)
    req = build_request(url)
    # the pagelet endpoint wraps an HTML fragment inside a JSON envelope
    soup = BeautifulSoup(req.json()['data']['html'], 'lxml')
    per_name = soup.find('a', {'class': 'per_name'}).get_text().replace(
        '\n', '').replace(' ', '')
    try:
        # the author's city, when present, appears in parentheses after the name
        from_city = re.findall(r'\((.*?)\)', per_name)[0]
    except IndexError:
        from_city = ''
    try:
        p_time = soup.find('span', {'class': 'time'}).get_text()
    except AttributeError:
        p_time = ''
    return [per_name, from_city, p_time]

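# Return shape: [per_name, from_city, p_time]; from_city and p_time fall
# back to '' when the profile fragment lacks them.
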
def get_travel_info(url):
    # the note id is the numeric part of URLs like /i/1234567.html
    user_info = get_user_info(url.split('/')[-1].split('.')[0])
    req = build_request(url)
    soup = BeautifulSoup(req.text, 'lxml').find('div', {'class': 'main'})
    view_con = soup.find('div', {'class': 'view_con'})
    content = view_con.get_text()
    # 'tarvel_dir_list' is spelled this way in the site's own markup
    trip_summary = view_con.find('div', {'class': 'tarvel_dir_list'})

    def field(name):
        # each <li> reads like 'label/value'; keep the value after the '/'
        try:
            return trip_summary.find('li', {'class': name}).get_text().split('/')[-1]
        except AttributeError:
            return ''

    v_time = field('time')
    day = field('day')
    people = field('people')
    cost = field('cost')
    return user_info + [v_time, day, people, cost, content]

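# Row layout returned to the caller:
#   [per_name, from_city, p_time, v_time, day, people, cost, content]
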
def crawl_travel():
    mddid = '10065'  # destination (mdd) id to crawl
    # get_travellist(mddid)  # run once first to build ./files/<mddid>_tn_list
    succ_num = 0
    fail_num = 0
    for line in open('./files/{}_tn_list'.format(mddid), 'r'):
        try:
            travel = json.loads(line)
        except ValueError:
            continue
        try:
            travel_info = get_travel_info('http://www.mafengwo.cn' + travel[-1])
        except Exception:
            # keep failed lines in a separate file so they can be retried later
            f = open('./files/fail_{}_tn_list'.format(mddid), 'a')
            f.write(line)
            f.close()
            fail_num += 1
            time.sleep(1)
            continue
        succ_num += 1
        f = open('./files/result_{}_tn_list'.format(mddid), 'a')
        f.write(json.dumps(
            travel + ['http://www.mafengwo.cn' + travel[-1]] + travel_info) + '\n')
        f.close()
        print(current_time(), succ_num, fail_num)
        time.sleep(1)


if __name__ == '__main__':
    crawl_travel()
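A minimal first-run sketch, assuming the crawler above is executed from a directory that already contains util.py and a ./files output folder (neither the crawler's filename nor this setup appears in the commit):

import os
os.makedirs('./files', exist_ok=True)  # the crawler appends to ./files/...
get_travellist('10065')                # step 1: collect note titles and URLs
crawl_travel()                         # step 2: fetch author info and note bodies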
File 2 of 2, new (+136 lines): the util module imported by the crawler above (helpers for requests, retries, proxies, and file output).
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv

def get_headers():
    pc_headers = {
        # a random X-Forwarded-For makes successive requests look like
        # different clients to naive IP-based throttling
        "X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255),
                                            random.randint(0, 255), random.randint(0, 255)),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    return pc_headers

class NetWorkError(Exception):
    pass

def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
    if headers is None:
        headers = get_headers()
    for i in range(try_times):
        try:
            if data:
                # form-encoded POST
                response = requests.post(
                    url, data=data, headers=headers, timeout=timeout)
            elif json_data:
                # JSON POST
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), headers=headers, timeout=timeout)
            else:
                response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except Exception:
            continue  # retry up to try_times before giving up
    raise NetWorkError

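# e.g. build_request('http://www.mafengwo.cn')         -> GET
#      build_request(url, data={'page': 1})            -> form POST
#      build_request(url, json_data={'page': 1})       -> JSON POST
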
def write_to_excel(lines, filename, write_only=True):
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        try:
            sheet.append(line)
        except Exception:
            continue  # skip rows openpyxl cannot serialize
    excel.save(filename)

def write_to_csv(lines, filename):
    # newline='' prevents the csv module from writing blank rows on Windows
    csvfile = open(filename, 'w', encoding='utf-8', newline='')
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for line in lines:
        spamwriter.writerow(line)
    csvfile.close()

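# Illustrative use (hypothetical rows):
#   rows = [['title', 'user'], ['Seven days in Beijing', 'some_user']]
#   write_to_excel(rows, 'notes.xlsx')
#   write_to_csv(rows, 'notes.csv')
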
def get_next_date(current_date='2017-01-01'):
    current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    next_date = current_date + datetime.timedelta(days=1)
    return next_date.strftime('%Y-%m-%d')

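# get_next_date('2017-01-31') -> '2017-02-01'
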
def current_time():
    return time.strftime("%Y-%m-%d %H:%M:%S")

def load_txt(filename):
    # yield one parsed JSON record per line, skipping malformed lines
    for line in open(filename, 'r'):
        try:
            item = json.loads(line)
        except ValueError:
            continue
        yield item

def sub_str(string, words=None, append=None):
    # strip a set of literal substrings (whitespace controls by default)
    if words is None:
        words = ['\r', '\n', '\t', '\xa0']
    if append is not None:
        words += append
    # re.escape keeps the words literal even if they contain regex metacharacters
    return re.sub('|'.join(map(re.escape, words)), '', string)

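# sub_str('a\r\nb\xa0c') -> 'abc'
# sub_str('a-b', append=['-']) -> 'ab'
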
def get_proxies_abuyun():
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Abuyun proxy tunnel credentials (fill in your own account here)
    proxyUser = ''
    proxyPass = ''

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return proxies

def build_proxy_request(url, data=None, headers=None, json_data=None):
    if headers is None:
        headers = get_headers()
    for i in range(5):
        try:
            if data:
                response = requests.post(
                    url, proxies=get_proxies_abuyun(), data=data, headers=headers, timeout=15)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), proxies=get_proxies_abuyun(),
                    headers=headers, timeout=15)
            else:
                response = requests.get(
                    url, headers=headers, proxies=get_proxies_abuyun(), timeout=15)
            return response
        except Exception as e:
            # the dynamic tunnel rate-limits requests; on HTTP 429 sleep a
            # random sub-second interval before retrying
            if '429' in str(e):
                time.sleep(random.randint(0, 1000) / 1000.0)
            continue
    raise NetWorkError
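# Illustrative use (requires valid Abuyun credentials in get_proxies_abuyun;
# the note URL is made up):
#   resp = build_proxy_request('http://www.mafengwo.cn/i/1234567.html')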