forked from queensun/Nyspider
Showing 2 changed files with 394 additions and 0 deletions.
@@ -0,0 +1,224 @@
from util import *
from bs4 import BeautifulSoup
import json
import threading
import random

URL = 'https://centricparts.centriccatalog.com/EcatMain.aspx'

# Example of a fully parameterised inquiry URL (not used below).
T_URL = 'https://centricparts.centriccatalog.com/Inquiry/AppResult.aspx?id=WEB_PADS&v=LD/MD&y=1988&m=16&mm=83&uid=ANR&sid=0'

# Shared pool of warmed-up sessions used by get_model_value().
session_pool = []
def get_products():
    # Options of the catalog (product line) dropdown on the EcatMain landing page.
    req = build_request(URL)
    products = parser_select('CatalogsDropdownlist', req.text)
    return products
def parser_select(name, html):
    # Collect [text, value] pairs from the <select> element with the given id,
    # skipping the "Select a ..." placeholder option.
    select_list = BeautifulSoup(html, 'lxml').find(
        'select', {'id': name}).find_all('option')
    result = []
    for option in select_list:
        value = option.get('value')
        if 'Select a' in value:
            continue
        key = option.get_text()
        result.append([key, value])
    return result
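# For illustration only (the option label here is hypothetical): parser_select('CatalogsDropdownlist', html)
# would return pairs such as [['Brake Pads', 'WEB_PADS'], ...] -- option text first, option value second.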
def get_vehicle_type():
    session = requests.session()
    session.get(URL)
    products = get_products()
    result = []
    for product, value in products:
        url = URL+'?id=%s' % (value)
        req = session.get(url)
        vehicle_list = parser_select('VehicleTypesDropDownList', req.text)
        for vehicle in vehicle_list:
            item = [[product, value], vehicle]
            result.append(item)
    f = open('./files/id_v', 'a')
    for line in result:
        f.write(json.dumps(line)+'\n')
    f.close()
def get_year_values():
    session = requests.session()
    session.get(URL)
    for line in open('./files/id_v', 'r'):
        item = json.loads(line)
        url = URL+'?id={}&v={}'.format(item[0][1], item[1][1])
        try:
            req = session.get(url)
            year_list = parser_select('YearsDropdownlist', req.text)
        except:
            print(item, 'fail')
            f = open('./files/id_v_fail', 'a')
            f.write(line)
            f.close()
            continue
        f = open('./files/id_v_year', 'a')
        for year_item in year_list:
            f.write(json.dumps(item+[year_item])+'\n')
        f.close()
        print(item, 'OK')
def get_make_values():
    session = requests.session()
    session.get(URL)
    for line in open('./files/id_v_year', 'r'):
        item = json.loads(line)
        url = URL+'?id={}&v={}&y={}'.format(item[0][1], item[1][1], item[2][1])
        try:
            req = session.get(url, timeout=20)
            make_list = parser_select('MakesDropdownlist', req.text)
        except:
            session = requests.session()
            session.get(URL, timeout=20)
            print(item, 'fail')
            f = open('./files/id_v_year_fail', 'a')
            f.write(line)
            f.close()
            continue
        f = open('./files/id_v_year_make', 'a')
        for make_item in make_list:
            f.write(json.dumps(item+[make_item])+'\n')
        f.close()
        print(item, 'OK')
def parser_table(html):
    # Flatten every row of the AppDataGrid results table into a list of cell texts.
    table = BeautifulSoup(html, 'lxml').find(
        'table', {'id': 'AppDataGrid'}).find_all('tr')
    result = []
    for tr in table:
        td_list = tr.find_all('td')
        line = []
        for td in td_list:
            line.append(td.get_text())
        result.append(line)
    return result
def get_inquiry_result(value_item):
    url = 'https://centricparts.centriccatalog.com/Inquiry/AppResult.aspx' + \
        '?id={}&v={}&y={}&m={}&mm={}'.format(
            value_item[0][1], value_item[1][1], value_item[2][1], value_item[3][1], value_item[4][1])
    session = requests.session()
    session.get(URL)
    req = session.get(url)
    result = parser_table(req.text)
    return result
def create_session_pool():
    global session_pool
    for i in range(40):
        session = requests.session()
        try:
            session.get(URL, timeout=10)
        except:
            continue
        print('create session', i+1, 'OK')
        session_pool.append(session)
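# Note: each pooled session has hit EcatMain.aspx once, presumably so it carries the
# ASP.NET session cookies the catalog pages expect; attempts that time out are skipped,
# so the pool can end up with fewer than 40 sessions.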
def get_model_value(value_item):
    # Query the Models dropdown for one [product, vehicle type, year, make] combination.
    url = URL + \
        '?id={}&v={}&y={}&m={}'.format(
            value_item[0][1], value_item[1][1], value_item[2][1], value_item[3][1])
    global session_pool
    if len(session_pool) == 0:
        create_session_pool()
    session = random.choice(session_pool)

    for i in range(3):
        try:
            req = session.get(url, timeout=20, headers=get_headers())
            model_list = parser_select('ModelsDropdownlist', req.text)
            result = []
            for model_item in model_list:
                result.append(value_item+[model_item])
            if len(result) == 0:
                raise NetWorkError
            return result
        except Exception as e:
            # On failure, retire this session and swap a fresh one into the pool.
            session_pool.remove(session)
            session = requests.session()
            session.get(URL, timeout=10, headers=get_headers())
            session_pool.append(session)
            continue
    raise NetWorkError
class ModelList(threading.Thread):
    # Worker thread: fetches the model list for one item; status/result are read after join().
    def __init__(self, item):
        super(ModelList, self).__init__()
        self.item = item
        self.daemon = True

    def run(self):
        self.status = False
        try:
            self.result = get_model_value(self.item)
            if len(self.result) != 0:
                self.status = True
        except Exception as e:
            return
def load_id_v_year_make_items():
    items = []
    for line in open('./files/id_v_year_make', 'r'):
        try:
            item = json.loads(line)
        except:
            f = open('./files/id_v_year_make_fail', 'a')
            f.write(line)
            f.close()
            continue
        items.append(item)
        if len(items) < 20:
            continue
        yield items
        items = []
    yield items
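# Yields batches of 20 parsed lines (plus one final, possibly smaller or empty, batch);
# lines that fail to parse as JSON are appended to ./files/id_v_year_make_fail instead.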
def crawl_models():
    result = []
    success_num = 0
    failed_num = 0
    for items in load_id_v_year_make_items():
        tasks = []
        for item in items:
            task = ModelList(item)
            tasks.append(task)
        for task in tasks:
            task.start()
        for task in tasks:
            task.join()
        for task in tasks:
            if task.status:
                f = open('./files/id_v_year_make_model', 'a')
                for line in task.result:
                    f.write(json.dumps(line)+'\n')
                f.close()
                success_num += 1
            else:
                f = open('./files/id_v_year_make_fail', 'a')
                f.write(json.dumps(task.item)+'\n')
                f.close()
                failed_num += 1
        print(current_time(), success_num, failed_num)
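# Pipeline note (inferred from the file names used above): get_vehicle_type() -> ./files/id_v,
# get_year_values() -> ./files/id_v_year, get_make_values() -> ./files/id_v_year_make.
# Only the final stage runs below; the earlier stages are presumably invoked manually first.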
create_session_pool()
crawl_models()
@@ -0,0 +1,170 @@
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv
import os
def get_headers():
    # Desktop Chrome headers with a randomized X-Forwarded-For to vary the apparent client IP.
    pc_headers = {
        "X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    return pc_headers
class NetWorkError(Exception):
    pass
def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
    # GET by default; switches to POST when form `data` or `json_data` is supplied.
    # Retries up to `try_times` times before raising NetWorkError.
    if headers is None:
        headers = get_headers()
    for i in range(try_times):
        try:
            if data:
                response = requests.post(
                    url, data=data, headers=headers, timeout=timeout)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), headers=headers, timeout=timeout)
            else:
                response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except:
            continue
    raise NetWorkError
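# Minimal usage sketch (the URLs are placeholders, not real endpoints):
#   resp = build_request('https://example.com/page')                      # plain GET with spoofed headers
#   resp = build_request('https://example.com/api', json_data={'q': 1})   # JSON POST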
def write_to_excel(lines, filename, write_only=True):
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        try:
            sheet.append(line)
        except:
            print(line)
    excel.save(filename)


def write_to_csv(lines, filename):
    csvfile = open(filename, 'w', encoding='utf-8')
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for line in lines:
        spamwriter.writerow(line)
    csvfile.close()
def get_next_date(current_date='2017-01-01'):
    current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    oneday = datetime.timedelta(days=1)
    next_date = current_date+oneday
    return str(next_date).split(' ')[0]
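# e.g. get_next_date('2017-12-31') returns '2018-01-01'; month and year rollover is handled by datetime.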
def current_time():
    return time.strftime("%Y-%m-%d %H:%M:%S")


def load_txt(filename):
    # Yield one JSON-decoded item per line, silently skipping lines that fail to parse.
    for line in open(filename, 'r'):
        try:
            item = json.loads(line)
        except:
            continue
        yield item
def sub_str(string, words=None, append=None):
    # Remove line breaks, tabs and non-breaking spaces (plus any extra `append` patterns).
    if words is None:
        words = ['\r', '\n', '\t', '\xa0']
    if append is not None:
        words += append
    string = re.sub('|'.join(words), '', string)
    return string
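# e.g. sub_str('a\tb\nc') returns 'abc'. The words are joined into a regex alternation,
# so any regex metacharacters passed via `append` should be escaped first.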
def get_proxies_abuyun():
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Abuyun proxy tunnel credentials (left blank; fill in before use)
    proxyUser = ''
    proxyPass = ''

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return proxies
def build_proxy_request(url, data=None, headers=None, json_data=None):
    # Same as build_request but routed through the Abuyun proxy; retries up to 5 times,
    # and if the exception message mentions 429 (rate limited), sleeps a random
    # sub-second interval before the next attempt.
    if headers is None:
        headers = get_headers()
    for i in range(5):
        try:
            if data:
                response = requests.post(
                    url, proxies=get_proxies_abuyun(), data=data, headers=headers, timeout=15)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), proxies=get_proxies_abuyun(), headers=headers, timeout=15)
            else:
                response = requests.get(
                    url, headers=headers, proxies=get_proxies_abuyun(), timeout=15)
            return response
        except Exception as e:
            if '429' in str(e):
                time.sleep(random.randint(0, 1000)/1000.0)
            continue
    raise NetWorkError
def create_cookie_pool(url, pool_size=50):
    # Warm up `pool_size` sessions against `url` and dump their cookies to ./cookie_pool.json.
    cookie_pool = []
    succ_num = 0
    while succ_num < pool_size:
        session = requests.session()
        try:
            session.get(url, headers=get_headers(), timeout=10)
            cookie = requests.utils.dict_from_cookiejar(session.cookies)
            cookie_pool.append(cookie)
        except:
            continue
        succ_num += 1
        print('Create', succ_num, 'OK')
    f = open('./cookie_pool.json', 'w')
    json.dump(cookie_pool, f)
    f.close()
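# Usage sketch (the URL is illustrative): create_cookie_pool('https://example.com', pool_size=10)
# writes ./cookie_pool.json, which load_cookie_pool() below reads back.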
def load_cookie_pool():
    if not os.path.exists('./cookie_pool.json'):
        print('Cookie pool file does not exist.')
        return []
    f = open('./cookie_pool.json', 'r')
    cookie_pool = json.load(f)
    return cookie_pool
def build_header_cookie(cookie_dict):
    # Serialize a cookie dict into a single "key=value; key=value" Cookie header string.
    values = []
    for key in cookie_dict:
        value = key+'='+cookie_dict[key]
        values.append(value)
    return '; '.join(values)
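# Putting the cookie helpers together -- a minimal sketch, assuming a cookie_pool.json
# already exists and picking one cookie set at random (only names defined in this file
# are used; URL_TO_FETCH is a placeholder for whatever page the caller wants):
#
#     headers = get_headers()
#     pool = load_cookie_pool()
#     if pool:
#         headers['Cookie'] = build_header_cookie(random.choice(pool))
#     response = requests.get(URL_TO_FETCH, headers=headers, timeout=15)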