Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Nyloner committed May 24, 2018
1 parent 7b35a31 commit b1c239b
Show file tree
Hide file tree
Showing 2 changed files with 205 additions and 0 deletions.
113 changes: 113 additions & 0 deletions www.youzy.cn/major_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from util import *
import json
from bs4 import BeautifulSoup


def get_bk_major_list():
    """Crawl the undergraduate (bk) major catalog into ./files/bk_major.

    Each output line is a JSON array:
    [category, sub-category, major name, detail URL].
    """
    req = build_request('https://www.youzy.cn/major/index/bk')
    table = BeautifulSoup(req.text, 'lxml').find(
        'div', {'class': 'bk-major-list'}).find_all('div', {'class': 'content'})
    result = []
    for item in table:
        # Top-level discipline category for this content section.
        level_1 = item.find(
            'div', {'class': 'major-title'}).find('div').get_text()
        major_num_list = item.find_all('div', {'class': 'major-num'})
        ul_list = item.find_all('ul')
        # Each 'major-num' heading is paired positionally with one <ul>.
        for num_div, ul in zip(major_num_list, ul_list):
            level_2 = num_div.get_text()
            for li in ul.find_all('li'):
                level_3 = li.get_text()
                url = 'https://www.youzy.cn'+li.find('a').get("href")
                result.append([level_1, level_2, level_3, url])
    # 'with' guarantees the handle is closed even on error; explicit UTF-8
    # avoids mojibake on platforms whose default encoding is not UTF-8.
    with open('./files/bk_major', 'w', encoding='utf-8') as f:
        for major in result:
            f.write(json.dumps(major, ensure_ascii=False)+'\n')


def get_zk_major_list():
    """Crawl the junior-college (zk) major catalog into ./files/zk_major.

    Each output line is a JSON array:
    [category, sub-category, major name, detail URL].
    """
    req = build_request('https://www.youzy.cn/major/index/zk')
    table = BeautifulSoup(req.text, 'lxml').find(
        'div', {'class': 'bk-major-list'}).find_all('div', {'class': 'content'})
    result = []
    for item in table:
        # Top-level discipline category for this content section.
        level_1 = item.find(
            'div', {'class': 'major-title'}).find('div').get_text()
        major_num_list = item.find_all('div', {'class': 'major-num'})
        ul_list = item.find_all('ul')
        # Each 'major-num' heading is paired positionally with one <ul>.
        for num_div, ul in zip(major_num_list, ul_list):
            level_2 = num_div.get_text()
            for li in ul.find_all('li'):
                level_3 = li.get_text()
                url = 'https://www.youzy.cn'+li.find('a').get("href")
                result.append([level_1, level_2, level_3, url])
    # 'with' guarantees the handle is closed even on error; explicit UTF-8
    # avoids mojibake on platforms whose default encoding is not UTF-8.
    with open('./files/zk_major', 'w', encoding='utf-8') as f:
        for major in result:
            f.write(json.dumps(major, ensure_ascii=False)+'\n')


def get_major_info(major_id):
    """Fetch the detail and job-prospect pages for one major.

    Args:
        major_id: the site's major id (last element of a catalog row).

    Returns:
        dict with 'major_con' (summary text), one entry per base-info
        section (keyed by its cleaned title), and optionally '就业方向'.
    """
    req = build_request(
        'https://www.youzy.cn/Majors/V3/Detail.aspx?majorId={}&mc='.format(major_id))
    detail = BeautifulSoup(req.text, 'lxml').find(
        'div', {'class': "major-detail"})
    result = {}
    major_con = detail.find('div', {'class': 'major-con'}).get_text()
    result['major_con'] = sub_str(major_con, append=[' '])
    base_info_list = detail.find(
        'div', {'class': 'base-info'}).find_all('div', {'class': 'mt20'})
    for item in base_info_list:
        key = item.find('p', {'class': 'title'}).get_text()
        # The first <p> is the title itself; the rest hold the section body.
        value = ''.join(p.get_text().replace(' ', '')
                        for p in item.find_all('p')[1:])
        result[sub_str(key, append=['•'])] = value

    req = build_request(
        'https://www.youzy.cn/Majors/V3/JobProspect.aspx?majorId={}&mc='.format(major_id))
    try:
        soup = BeautifulSoup(req.text, 'lxml').find(
            'div', {'class': 'job-prospect'}).find('div', {'class': 'mt30'})
        result['就业方向'] = soup.find_all(
            'p')[-1].get_text().replace('\r\n', '').replace(' ', '')
    except Exception:
        # The job-prospect section is optional; missing markup raises
        # AttributeError/IndexError above and the field is simply skipped.
        pass
    return result

def crawl_major_info():
    """Crawl detail info for every major listed in the bk and zk catalog files.

    Results are appended line-by-line to ./files/bk_result / ./files/zk_result
    so a crashed run can be resumed without losing completed records.
    """
    _crawl_major_file('./files/bk_major', './files/bk_result')
    _crawl_major_file('./files/zk_major', './files/zk_result')


def _crawl_major_file(source_path, result_path):
    """Fetch details for each catalog line in *source_path*, appending one
    JSON record per major to *result_path*."""
    with open(source_path, 'r', encoding='utf-8') as source:
        for line in source:
            major = json.loads(line)
            # The detail-page id/URL is the last element of the catalog row.
            info = get_major_info(major[-1])
            info['base'] = major
            # Open-append-close per record keeps completed work on disk even
            # if a later fetch fails mid-run.
            with open(result_path, 'a', encoding='utf-8') as out:
                out.write(json.dumps(info, ensure_ascii=False)+'\n')
            print(major, 'OK')

def load_major_info(filename):
    """Yield spreadsheet rows for the crawled major records in *filename*.

    The first row yielded is the header; each following row is the major's
    4-column catalog info followed by the detail fields in *keys* order,
    with '' substituted for fields the record lacks.
    """
    # NOTE: several keys carry a leading space — that is how the site's
    # section titles come out of sub_str(); keep them byte-identical.
    keys = ['major_con', ' 专业简介', ' 培养目标', ' 培养要求', ' 名人学者',
            ' 主干课程', ' 学科要求', ' 知识能力', '就业方向']
    yield ['', '', '', ''] + keys
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            major = json.loads(line)
            values = major['base']
            # dict.get with a default replaces the per-key try/except.
            values.extend(major.get(key, '') for key in keys)
            yield values

# Step 1 (run once, network-heavy): crawl every major's detail page into
# ./files/bk_result and ./files/zk_result.
#crawl_major_info()
# Step 2: export the crawled records to Excel workbooks.
write_to_excel(load_major_info('./files/bk_result'),'本科专业.xlsx')
write_to_excel(load_major_info('./files/zk_result'),'专科专业.xlsx')





92 changes: 92 additions & 0 deletions www.youzy.cn/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv
import os


def get_headers():
    """Build default desktop-browser request headers.

    The X-Forwarded-For header carries a random IPv4 address, a light
    measure against per-IP throttling.
    """
    spoofed_ip = '.'.join(str(random.randint(0, 255)) for _ in range(4))
    return {
        "X-Forwarded-For": spoofed_ip,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    }


class NetWorkError(Exception):
    """Raised by build_request() when every retry attempt has failed."""
    pass


def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
    """Perform an HTTP request with retries.

    Args:
        url: target URL.
        headers: optional header dict; random default headers when None.
        data: form payload — when given, the request is a POST.
        json_data: JSON payload — when given (and data is not), POST as JSON.
        timeout: per-attempt timeout in seconds.
        try_times: number of attempts before giving up.

    Returns:
        requests.Response from the first successful attempt.

    Raises:
        NetWorkError: when all attempts fail.
    """
    if headers is None:
        headers = get_headers()
    for _ in range(try_times):
        try:
            if data:
                response = requests.post(
                    url, data=data, headers=headers, timeout=timeout)
            elif json_data:
                # Work on a copy so the caller's headers dict is not mutated.
                json_headers = dict(headers)
                json_headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), headers=json_headers,
                    timeout=timeout)
            else:
                response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except Exception:
            # Any network/timeout error: fall through and retry.
            continue
    raise NetWorkError('request failed after {} attempts: {}'.format(try_times, url))


def write_to_excel(lines, filename, write_only=True):
    """Save an iterable of rows to an .xlsx workbook.

    Args:
        lines: iterable of row sequences (one sequence per sheet row).
        filename: destination path (overwritten).
        write_only: create the workbook in openpyxl's streaming mode,
            which keeps memory flat for large exports.
    """
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        try:
            sheet.append(line)
        except Exception:
            # Rows openpyxl cannot store (e.g. illegal characters or
            # unsupported types) are skipped rather than aborting the export.
            continue
    excel.save(filename)


def write_to_csv(lines, filename):
    """Write an iterable of rows to *filename* as UTF-8 CSV.

    Args:
        lines: iterable of row sequences.
        filename: destination path (overwritten).
    """
    # newline='' is required by the csv module so it controls line endings
    # itself (otherwise rows gain '\r\r\n' on Windows); 'with' guarantees
    # the handle is closed even if a row fails to serialize.
    with open(filename, 'w', encoding='utf-8', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for line in lines:
            spamwriter.writerow(line)


def get_next_date(current_date='2017-01-01'):
    """Return the day after *current_date* as a 'YYYY-MM-DD' string."""
    parsed = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    following = parsed + datetime.timedelta(days=1)
    return following.strftime('%Y-%m-%d')


def current_time():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S')


def load_txt(filename):
    """Yield one JSON-decoded object per line of *filename*.

    Lines that are not valid JSON are silently skipped.
    """
    # 'with' closes the handle even if the consumer abandons the generator.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                item = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError; skip malformed lines.
                continue
            yield item


def sub_str(string, words=None, append=None):
    """Remove every occurrence of the given literal substrings from *string*.

    Args:
        string: input text.
        words: list of literal substrings to strip; defaults to the control
            characters ['\\r', '\\n', '\\t', '\\xa0'].
        append: extra literal substrings to strip on top of *words*.

    Returns:
        The cleaned string.
    """
    if words is None:
        words = ['\r', '\n', '\t', '\xa0']
    # Build a fresh list so a caller-supplied *words* list is never mutated
    # (the old `words += append` changed it in place).
    targets = list(words)
    if append is not None:
        targets += append
    # re.escape treats each word literally; previously a word such as '.'
    # would have been interpreted as a regex metacharacter.
    pattern = '|'.join(re.escape(word) for word in targets)
    return re.sub(pattern, '', string)

0 comments on commit b1c239b

Please sign in to comment.