add centricparts.centriccatalog.com
Nyloner committed May 1, 2018
1 parent 64a0af8 commit e8c9286
Showing 2 changed files with 394 additions and 0 deletions.
centricparts.centriccatalog.com/centricparts.py (224 additions, 0 deletions)
@@ -0,0 +1,224 @@
from util import *
from bs4 import BeautifulSoup
import json
import threading
import random

URL = 'https://centricparts.centriccatalog.com/EcatMain.aspx'

T_URL = 'https://centricparts.centriccatalog.com/Inquiry/AppResult.aspx?id=WEB_PADS&v=LD/MD&y=1988&m=16&mm=83&uid=ANR&sid=0'
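# T_URL looks like a sample AppResult query kept for reference; it is not referenced elsewhere in this file.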


# Shared pool of pre-warmed requests sessions, populated by create_session_pool().
session_pool = []


def get_products():
    req = build_request(URL)
    products = parser_select('CatalogsDropdownlist', req.text)
    return products


def parser_select(name, html):
    # Parse the <select> element with the given id and return its [text, value]
    # option pairs, skipping the "Select a ..." placeholder option.
    select_list = BeautifulSoup(html, 'lxml').find(
        'select', {'id': name}).find_all('option')
    result = []
    for option in select_list:
        value = option.get('value')
        if 'Select a' in value:
            continue
        key = option.get_text()
        result.append([key, value])
    return result


def get_vehicle_type():
    session = requests.session()
    session.get(URL)
    products = get_products()
    result = []
    for product, value in products:
        url = URL+'?id=%s' % (value)
        req = session.get(url)
        vehicle_list = parser_select('VehicleTypesDropDownList', req.text)
        for vehicle in vehicle_list:
            item = [[product, value], vehicle]
            result.append(item)
    f = open('./files/id_v', 'a')
    for line in result:
        f.write(json.dumps(line)+'\n')
    f.close()


def get_year_values():
    session = requests.session()
    session.get(URL)
    for line in open('./files/id_v', 'r'):
        item = json.loads(line)
        url = URL+'?id={}&v={}'.format(item[0][1], item[1][1])
        try:
            req = session.get(url)
            year_list = parser_select('YearsDropdownlist', req.text)
        except:
            print(item, 'fail')
            f = open('./files/id_v_fail', 'a')
            f.write(line)
            f.close()
            continue
        f = open('./files/id_v_year', 'a')
        for year_item in year_list:
            f.write(json.dumps(item+[year_item])+'\n')
        f.close()
        print(item, 'OK')


def get_make_values():
    session = requests.session()
    session.get(URL)
    for line in open('./files/id_v_year', 'r'):
        item = json.loads(line)
        url = URL+'?id={}&v={}&y={}'.format(item[0][1], item[1][1], item[2][1])
        try:
            req = session.get(url, timeout=20)
            make_list = parser_select('MakesDropdownlist', req.text)
        except:
            session = requests.session()
            session.get(URL, timeout=20)
            print(item, 'fail')
            f = open('./files/id_v_year_fail', 'a')
            f.write(line)
            f.close()
            continue
        f = open('./files/id_v_year_make', 'a')
        for make_item in make_list:
            f.write(json.dumps(item+[make_item])+'\n')
        f.close()
        print(item, 'OK')


def parser_table(html):
    table = BeautifulSoup(html, 'lxml').find(
        'table', {'id': 'AppDataGrid'}).find_all('tr')
    result = []
    for tr in table:
        td_list = tr.find_all('td')
        line = []
        for td in td_list:
            line.append(td.get_text())
        result.append(line)
    return result


def get_inquiry_result(value_item):
    url = 'https://centricparts.centriccatalog.com/Inquiry/AppResult.aspx' + \
        '?id={}&v={}&y={}&m={}&mm={}'.format(
            value_item[0][1], value_item[1][1], value_item[2][1], value_item[3][1], value_item[4][1])
    session = requests.session()
    session.get(URL)
    req = session.get(url)
    result = parser_table(req.text)
    return result


def create_session_pool():
    global session_pool
    for i in range(40):
        session = requests.session()
        try:
            session.get(URL, timeout=10)
        except:
            continue
        print('create session', i+1, 'OK')
        session_pool.append(session)


def get_model_value(value_item):
    # Fetch the model dropdown for one [product, vehicle, year, make] item,
    # retrying up to three times and swapping out the session on failure.
    url = URL + \
        '?id={}&v={}&y={}&m={}'.format(
            value_item[0][1], value_item[1][1], value_item[2][1], value_item[3][1])
    global session_pool
    if len(session_pool) == 0:
        create_session_pool()
    session = random.choice(session_pool)

    for i in range(3):
        try:
            req = session.get(url, timeout=20, headers=get_headers())
            model_list = parser_select('ModelsDropdownlist', req.text)
            result = []
            for model_item in model_list:
                result.append(value_item+[model_item])
            if len(result) == 0:
                raise NetWorkError
            return result
        except Exception as e:
            session_pool.remove(session)
            session = requests.session()
            session.get(URL, timeout=10, headers=get_headers())
            session_pool.append(session)
            continue
    raise NetWorkError


class ModelList(threading.Thread):
    def __init__(self, item):
        super(ModelList, self).__init__()
        self.item = item
        self.daemon = True

    def run(self):
        self.status = False
        try:
            self.result = get_model_value(self.item)
            if len(self.result) != 0:
                self.status = True
        except Exception as e:
            return


def load_id_v_year_make_items():
    # Yield items from ./files/id_v_year_make in batches of 20;
    # the final, possibly partial, batch is yielded as well.
    items = []
    for line in open('./files/id_v_year_make', 'r'):
        try:
            item = json.loads(line)
        except:
            f = open('./files/id_v_year_make_fail', 'a')
            f.write(line)
            f.close()
            continue
        items.append(item)
        if len(items) < 20:
            continue
        yield items
        items = []
    yield items


def crawl_models():
    # Crawl model lists batch by batch: one ModelList thread per item, 20 items at a time.
    result = []
    success_num = 0
    failed_num = 0
    for items in load_id_v_year_make_items():
        tasks = []
        for item in items:
            task = ModelList(item)
            tasks.append(task)
        for task in tasks:
            task.start()
        for task in tasks:
            task.join()
        for task in tasks:
            if task.status:
                f = open('./files/id_v_year_make_model', 'a')
                for line in task.result:
                    f.write(json.dumps(line)+'\n')
                f.close()
                success_num += 1
            else:
                f = open('./files/id_v_year_make_fail', 'a')
                f.write(json.dumps(task.item)+'\n')
                f.close()
                failed_num += 1
        print(current_time(), success_num, failed_num)


create_session_pool()
crawl_models()
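
When the script runs, only the session-pool setup and the final model-crawling stage execute. For orientation, here is a minimal sketch of the assumed end-to-end order, inferred from the intermediate files each stage reads and writes under ./files/; the staged run below is an assumption, not part of the commit:

# Hypothetical pipeline order (assumption), based on the files each stage consumes:
get_vehicle_type()     # writes ./files/id_v
get_year_values()      # reads id_v, writes ./files/id_v_year
get_make_values()      # reads id_v_year, writes ./files/id_v_year_make
create_session_pool()  # warm up sessions for the threaded stage
crawl_models()         # reads id_v_year_make, writes ./files/id_v_year_make_model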
centricparts.centriccatalog.com/util.py (170 additions, 0 deletions)
@@ -0,0 +1,170 @@
import requests
import time
import openpyxl
import random
import datetime
import json
import re
import csv
import os


def get_headers():
    pc_headers = {
        "X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    return pc_headers


class NetWorkError(Exception):
    pass


def build_request(url, headers=None, data=None, json_data=None, timeout=15, try_times=3):
    if headers is None:
        headers = get_headers()
    for i in range(try_times):
        try:
            if data:
                response = requests.post(
                    url, data=data, headers=headers, timeout=timeout)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), headers=headers, timeout=timeout)
            else:
                response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except:
            continue
    raise NetWorkError


def write_to_excel(lines, filename, write_only=True):
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        try:
            sheet.append(line)
        except:
            print(line)
    excel.save(filename)


def write_to_csv(lines, filename):
    csvfile = open(filename, 'w', encoding='utf-8')
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for line in lines:
        spamwriter.writerow(line)
    csvfile.close()


def get_next_date(current_date='2017-01-01'):
    current_date = datetime.datetime.strptime(current_date, '%Y-%m-%d')
    oneday = datetime.timedelta(days=1)
    next_date = current_date+oneday
    return str(next_date).split(' ')[0]


def current_time():
    return time.strftime("%Y-%m-%d %H:%M:%S")


def load_txt(filename):
    for line in open(filename, 'r'):
        try:
            item = json.loads(line)
        except:
            continue
        yield item


def sub_str(string, words=None, append=None):
    if words is None:
        words = ['\r', '\n', '\t', '\xa0']
    if append is not None:
        words += append
    string = re.sub('|'.join(words), '', string)
    return string


def get_proxies_abuyun():
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Proxy tunnel authentication credentials
    proxyUser = ''
    proxyPass = ''

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return proxies


def build_proxy_request(url, data=None, headers=None, json_data=None):
    if headers is None:
        headers = get_headers()
    for i in range(5):
        try:
            if data:
                response = requests.post(
                    url, proxies=get_proxies_abuyun(), data=data, headers=headers, timeout=15)
            elif json_data:
                headers['Content-Type'] = 'application/json'
                response = requests.post(
                    url, data=json.dumps(json_data), proxies=get_proxies_abuyun(), headers=headers, timeout=15)
            else:
                response = requests.get(
                    url, headers=headers, proxies=get_proxies_abuyun(), timeout=15)
            return response
        except Exception as e:
            if '429' in str(e):
                time.sleep(random.randint(0, 1000)/1000.0)
            continue
    raise NetWorkError


def create_cookie_pool(url, pool_size=50):
    cookie_pool = []
    succ_num = 0
    while succ_num < pool_size:
        session = requests.session()
        try:
            session.get(url, headers=get_headers(), timeout=10)
            cookie = requests.utils.dict_from_cookiejar(session.cookies)
            cookie_pool.append(cookie)
        except:
            continue
        succ_num += 1
        print('Create', succ_num, 'OK')
    f = open('./cookie_pool.json', 'w')
    json.dump(cookie_pool, f)
    f.close()


def load_cookie_pool():
    if not os.path.exists('./cookie_pool.json'):
        print('Cookie pool file does not exist.')
        return []
    f = open('./cookie_pool.json', 'r')
    cookie_pool = json.load(f)
    return cookie_pool


def build_header_cookie(cookie_dict):
    values = []
    for key in cookie_dict:
        value = key+'='+cookie_dict[key]
        values.append(value)
    return '; '.join(values)
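
For illustration, a minimal sketch of how the cookie-pool helpers could be combined with get_headers(); the target URL and pool size here are assumptions, not taken from the commit:

# Hypothetical usage sketch (assumption): attach a saved cookie to outgoing headers.
if __name__ == '__main__':
    create_cookie_pool('https://centricparts.centriccatalog.com/EcatMain.aspx', pool_size=5)
    cookies = load_cookie_pool()
    if cookies:
        headers = get_headers()
        headers['Cookie'] = build_header_cookie(random.choice(cookies))
        resp = build_request('https://centricparts.centriccatalog.com/EcatMain.aspx', headers=headers)
        print(resp.status_code)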
