-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawl_amazon_once.py
71 lines (56 loc) · 2.11 KB
/
crawl_amazon_once.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# -*- coding:utf-8 -*-
import time
import datetime
import traceback
import MySQLdb as mydb
from xlogging import log
from conf import use_proxy, DB_HOST, DB_USER, DB_PASSWD, DB_DB
from lib_crawl import do_crawl
from crawl_exception import RobotCheckError, OperationError
# Substring looked for in the message returned by do_crawl(); when it is
# present the script records a quantity of 1000 instead of the crawled value.
Max_Num_Msg = "exceeded the maximum number of items"

# Fetches every ASIN from `crawl_asin`, oldest row (lowest id) first.
# The leading/trailing newlines are kept so the statement text is unchanged.
SQL_ALL_ASIN = (
    "\n"
    "SELECT asin FROM `crawl_asin` WHERE `id` >=1 ORDER BY `id` ASC"
    "\n"
)

# Template for one statistics row; filled with %-formatting using
# (asin, quantity, date) by the crawl loop.
# NOTE(review): the leading NULL is presumably an auto-increment id column --
# confirm against the crawl_asin_statistic schema.
SQL_INSERT_STAT = (
    "\n"
    "INSERT into crawl_asin_statistic VALUES (NULL, '%s', '%s', '%s');"
    "\n"
)
# Configure the project logger before any crawling starts.
# NOTE(review): the rotation arguments presumably mean "rotate at midnight,
# keep 4 backups" -- confirm against xlogging. The module/log file names say
# "manual" although the page header names this script crawl_amazon_once.py;
# verify that sharing the log file is intended.
log.setConfig(
    module_name='crawl_amazon_manual',
    ro_rotateby=2,
    ro_when='midnight',
    ro_backupcount=4,
    logfile='crawl_amazon_manual.log'
)
def effective_quantity(q, msg, limit_marker):
    """Return the quantity to store for one crawl result.

    q            -- quantity reported by the crawl.
    msg          -- message text returned alongside it (may be empty/None).
    limit_marker -- substring that signals the "maximum number of items"
                    response.

    When ``limit_marker`` occurs anywhere in ``msg`` (including at the very
    start) the quantity is recorded as 1000; otherwise ``q`` is returned
    unchanged.
    """
    # Membership test instead of ``find(...) > 0`` so a match at index 0
    # is not silently ignored.
    if msg and limit_marker in msg:
        return 1000
    return q


def load_asin_list(curs):
    """Run SQL_ALL_ASIN on ``curs`` and return the ASINs as a list."""
    curs.execute(SQL_ALL_ASIN)
    return [row[0] for row in curs.fetchall()]


def crawl_one(asin):
    """Crawl a single ASIN and return the quantity to record.

    Returns -1 when do_crawl() raises RobotCheckError or OperationError
    (the failure is printed and logged). Any other exception propagates.
    """
    try:
        q, msg = do_crawl(asin, is_proxy=use_proxy)
    except (RobotCheckError, OperationError):
        failure = traceback.format_exc()
        print('Crawl %s faild: %s' % (asin, failure))
        log.critical("crawl %s failed: %s" % (asin, failure))
        return -1
    return effective_quantity(q, msg, Max_Num_Msg)


def main():
    """Crawl every ASIN once and store one statistics row per ASIN.

    Returns 0 on success and 1 when the ASIN list could not be loaded.
    The database connection is closed on every exit path.
    """
    conn = None
    try:
        # connect to db
        try:
            conn = mydb.connect(host=DB_HOST, user=DB_USER,
                                passwd=DB_PASSWD, db=DB_DB)
            curs = conn.cursor()
            asin_list = load_asin_list(curs)
        except Exception:
            log.critical('Get ALL Asin Failed: %s' % traceback.format_exc())
            print('Get Asin failed')
            # Nothing to crawl: stop here instead of falling through to the
            # loop with an undefined list / closed connection.
            return 1

        # do crawl
        for asin in asin_list:
            print('Start Crawl  %s' % asin)
            log.info('Start Crawl %s' % asin)
            q = crawl_one(asin)
            # NOTE(review): the statement is built with %-formatting; asin
            # comes from our own crawl_asin table. Switching to
            # curs.execute(sql, params) needs unquoted %s placeholders in
            # SQL_INSERT_STAT.
            sql = SQL_INSERT_STAT % (asin, int(q), str(datetime.date.today()))
            curs.execute(sql)
            # Commit per ASIN so finished rows survive a later failure.
            conn.commit()
            print('Finish Crawl  %s' % asin)
            log.info('Finish Crawl %s' % asin)
            time.sleep(0.5)
    finally:
        # conn is still None when mydb.connect() itself failed.
        if conn is not None:
            conn.close()
    return 0


if __name__ == '__main__':
    raise SystemExit(main())