Nyloner committed Jan 31, 2018
1 parent d967eb2 commit 096b4cb
Showing 1 changed file with 223 additions and 0 deletions.
www.adidas.com.cn/adidas.py
@@ -0,0 +1,223 @@
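# Scraper for adidas.com.cn: pages through the category listing JSON endpoints,
# fetches per-product colour and size/stock data, and writes the combined rows
# to an .xlsx file, crawling product detail pages in small batches of threads.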
import json
import re
import time
from bs4 import BeautifulSoup
import requests
import openpyxl
import random
import threading


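# Browser-like request headers with a randomized X-Forwarded-For value so
# successive requests present different apparent client IPs.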
def get_headers():
    pc_headers = {
        "X-Forwarded-For": '%s.%s.%s.%s' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    return pc_headers


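# requests-style proxies dict for the Abuyun dynamic HTTP proxy tunnel
# (http-dyn.abuyun.com:9020); the credentials below are left blank and must
# be filled in before the proxy will authenticate.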
def get_proxies_abuyun():
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Proxy tunnel authentication credentials
    proxyUser = ''
    proxyPass = ''

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    return proxies


class NetWorkError(Exception):
    pass


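# GET the given URL (through the Abuyun proxy by default), retrying up to
# 5 times before raising NetWorkError.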
def build_request(url, headers=None, proxies=None):
    if headers is None:
        headers = get_headers()
    if proxies is None:
        proxies = get_proxies_abuyun()
    for i in range(5):
        try:
            response = requests.get(
                url, headers=headers, proxies=proxies, timeout=15)
            return response
        except Exception:
            continue
    raise NetWorkError


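# Dump a list of rows into a single sheet of a new .xlsx workbook
# (write-only mode keeps memory usage low for large result sets).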
def write_to_excel(lines, filename, write_only=True):
    excel = openpyxl.Workbook(write_only=write_only)
    sheet = excel.create_sheet()
    for line in lines:
        sheet.append(line)
    excel.save(filename)


def current_time():
    now_time = time.strftime('%Y-%m-%d %H:%M:%S')
    return now_time


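# Page through the three category listing endpoints (the URL-encoded `p`
# parameters decode to the men's, women's and boys' product-type filters)
# until a page comes back without items, collecting title, subtitle, prices
# and product code for every listed item.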
def get_products():
    need_urls = ['https://www.adidas.com.cn/plp/list.json?ni=20&pf=25-40%2C25-60%2C14-2509&pr=-&fo=p25%2Cp14&pn={}&pageSize=120&p=%E7%94%B7%E5%AD%90-%E5%95%86%E5%93%81%E7%B1%BB%E5%9E%8B&isSaleTop=false',
                 'https://www.adidas.com.cn/plp/list.json?ni=75&pf=25-82%2C25-60%2C14-2509&pr=-&fo=p25%2Cp14&pn={}&pageSize=120&p=%E5%A5%B3%E5%AD%90-%E5%95%86%E5%93%81%E7%B1%BB%E5%9E%8B&isSaleTop=false',
                 'https://www.adidas.com.cn/plp/list.json?ni=120&pf=25-160%2C14-2509&pr=-&s=default_order&fo=p25%2Cp14&pn={}&pageSize=120&p=%E7%94%B7%E7%AB%A5-%E5%95%86%E5%93%81%E7%B1%BB%E5%9E%8B&isSaleTop=false']
    result = []
    for base_url in need_urls:
        page = 1
        while True:
            try:
                url = base_url.format(page) + '&_=' + \
                    str(int(time.time() * 1000))
                req = build_request(url)
                res = json.loads(req.text)
                return_obj = res['returnObject']
                if 'view' not in return_obj:
                    break
            except Exception as e:
                print(current_time(), '[get_products][request error]', url, e)
                continue
            try:
                items = return_obj['view']['items']
            except Exception:
                break
            for item in items:
                base_info = {
                    'title': item.get('t', '-'),
                    's_title': item.get('st', ''),
                    'original_price': item.get('lp', '-'),
                    'real_price': item.get('sp', '-'),
                    'code': item['c'],
                }
                result.append(base_info)
            print(current_time(), '[get_products]', 'Url', url, 'OK')
            page += 1
    return result


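# Fetch the inventory JSON for one item (up to 3 attempts) and return the
# parsed `skuStr` list; an empty list is returned if every attempt fails.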
def get_ava_sku(item_id):
    sku_str = "[]"
    for i in range(3):
        try:
            url = 'https://www.adidas.com.cn/productGetItemIvts/{}.json?_={}'.format(
                item_id, str(int(time.time() * 1000)))
            req = build_request(url)
            res_text = req.text
            data = json.loads(res_text)
            sku_str = data['skuStr']
            break
        except Exception:
            continue
    result = json.loads(sku_str)
    return result


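# Parse a product detail page for its item id, colour and size list, then
# join each size against the availability returned by get_ava_sku.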
def get_product_info(url):
    req = build_request(url)
    soup = BeautifulSoup(req.text, 'lxml')
    item_id = soup.find("input", {"id": 'itemId'}).get("value")
    color = soup.find("input", {'id': 'colorDisPaly'}).get('value')
    table = soup.find('div', {'class': 'overview product-size'}).find_all("li")
    product_size = []
    for li in table:
        display_size = li.get_text()
        size_id = li.get('ipi')
        product_size.append([size_id, display_size])
    ava_list = get_ava_sku(item_id)
    sku_info = []
    for item in product_size:
        for ava_sku in ava_list:
            if item[0] in ava_sku['properties']:
                sku_info.append([item[1], ava_sku['availableQty']])
                break
    return {
        'color': color,
        'sku_info': sku_info
    }


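# One worker thread per product page: fetches the detail/stock data and
# expands it into spreadsheet rows (one row per available size, or a single
# row with just the colour when no size data could be retrieved).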
class AdidasProduct(threading.Thread):
    def __init__(self, base_info):
        super(AdidasProduct, self).__init__()
        self.base_info = base_info
        self.pdp_url = self.base_info[-1]

    def run(self):
        try:
            self.product = get_product_info(self.pdp_url)
        except Exception as e:
            print(current_time(),
                  '[get_product_info][error]', self.pdp_url, e)
            self.product = {'color': '', 'sku_info': []}
        self.lines = []
        if len(self.product['sku_info']) == 0:
            self.lines.append(self.base_info + [self.product['color']])
        else:
            for sku_item in self.product['sku_info']:
                line = self.base_info + [self.product['color']] + sku_item
                self.lines.append(line)


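# Generator that turns the raw product dicts into spreadsheet rows (with the
# detail-page URL appended) and yields them in batches of at most five, so
# crawl() can run a small group of detail-page threads at a time.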
def load_products():
    products = get_products()
    keys = ['title', 's_title', 'original_price',
            'real_price', 'code']
    items = []
    for product in products:
        item = []
        for key in keys:
            value = product[key]
            item.append(value)
        item.append('https://www.adidas.com.cn/item/' + product['code'])
        items.append(item)
        if len(items) < 5:
            continue
        yield items
        items = []
    yield items


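# Main driver: for each batch of products, crawl the detail pages in parallel
# threads and re-save the accumulated rows to a timestamped workbook under
# files/ after every batch (the files/ directory must already exist).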
def crawl():
    result = []
    counter = 0
    for products in load_products():
        tasks = []
        for item in products:
            task = AdidasProduct(item)
            tasks.append(task)
        for task in tasks:
            task.start()
        for task in tasks:
            task.join()
        for task in tasks:
            result += task.lines
            counter += 1
            print(current_time(),
                  '[get_product_info][OK]', task.pdp_url, counter)
        write_to_excel(result, 'files/' +
                       current_time().replace(':', '_') + '_adidas' + '.xlsx')


crawl()
