@@ -28,32 +28,39 @@ def start_requests(self):
    def parse(self, response):

        num_pages = response.meta['num_pages']
-       page_num = response.meta['page_num'] + 1
+       page_num = response.meta['page_num']
+
+       ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()

        brands = response.xpath("//a[@class='c5']/span[1]/text()").extract()
        titles = response.xpath("//a[@class='c5']/span[2]/text()").extract()
-       ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()
-       # XPATH does not work entirely for prices, use beautifulsoup instead
-       # retail_prices = response.xpath("//span[@class='price bold sixteen c7']/text()").extract()
-       soup = BeautifulSoup(response.body, 'lxml')
-       prices = [float(price.get_text().strip().strip('$').replace(',', '')) \
-                 for price in soup.find_all('span', 'price')]
+
+       if len(brands) != len(titles):  # an element in brands is a new-release title, remove it
+           for i, brand in enumerate(brands):
+               if len(brand.split()) > 1:
+                   brands.pop(i)



+       # XPATH does not work entirely for prices, use beautifulsoup instead
+       # soup = BeautifulSoup(response.body, 'lxml')
+       # prices = [float(price.get_text().strip().strip('$').replace(',', '')) \
+       #           for price in soup.find_all('span', 'price')]
+
+
        for i in range(len(brands)):
            item = BhPhotoDigitalCameraItem()
            item['brand'] = brands[i].strip()
-           item['title'] = titles[i].strip()
-           item['bh_id'] = ids[i].strip()
+           item['title'] = titles[i].strip()

            # NOTE: Sometimes, the price field is not there, hopefully this only occurs when
            # the item is at the bottom of the page, otherwise the brands,titles,ids,prices
            # elements will be out of sync.
-           try:
-               item['retail_price'] = prices[i]
-           except IndexError as e:
-               print e
+           # item['bh_id'] = ids[i].strip()
+           # try:
+           #     item['retail_price'] = prices[i]
+           # except IndexError as e:
+           #     print e

            # item['retail_price'] = None

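One caveat on the new dedup loop above: calling brands.pop(i) while enumerating brands shifts the remaining elements left, so the item that slides into slot i is never inspected, and two adjacent multi-word entries would leave one behind. A minimal sketch of a filter-based rewrite, reusing this commit's one-word-brand heuristic (the sample list here is hypothetical, not real scrape output):

    # Hypothetical scrape result: two adjacent multi-word "new release"
    # titles mixed into the brand list.
    brands = ['Canon', 'Just Announced Bundle', 'New Release Kit', 'Nikon']

    # Building a new list inspects every element exactly once, so adjacent
    # multi-word entries cannot be skipped the way pop()-during-enumerate
    # skips them.
    brands = [brand for brand in brands if len(brand.split()) == 1]

    print(brands)  # ['Canon', 'Nikon']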
@@ -62,16 +69,20 @@ def parse(self, response):


        # when done processing items, move onto next page
-       if page_num < num_pages:
+       if page_num <= num_pages:
+           logging.debug('Scraping page {}'.format(page_num))
            next_url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/{}/N/4288586282?via=js'.format(page_num)
-           yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages': num_pages, 'page_num': page_num})
-
+           yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages': num_pages, 'page_num': page_num + 1})
+       else:
+           logging.debug('Should be done scraping..')
+           # raise CloseSpider('Done Crawling.')
+           yield


    def get_num_pages(self, response):
        logging.debug('Made it here!')
        num_pages = response.xpath("//p[@class='pageNuber']/text()").extract_first().strip().split()[-1]
-       page_num = 0  # start at page 1
-       yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages': num_pages, 'page_num': page_num})
+       page_num = 1  # start at page 1
+       yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages': num_pages, 'page_num': page_num + 1})
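One thing worth flagging in get_num_pages(): extract_first() returns a string, so the num_pages stashed in meta is a str, and the new page_num <= num_pages check in parse() compares an int against a string (always true under Python 2's mixed-type ordering, a TypeError on Python 3). A minimal sketch of the cast, assuming the page label ends with the total count the way the committed selector expects:

    # Hypothetical text of the "pageNuber" element.
    label = 'Page 1 of 53'

    # Cast the trailing token to int before storing it in meta, so the
    # page_num <= num_pages comparison in parse() is int vs. int.
    num_pages = int(label.strip().split()[-1])

    print(num_pages)  # 53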
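Looking ahead to re-enabling the commented-out price code from the first hunk: the float conversion from the removed list comprehension could live in a small helper that returns None for a missing price, instead of the try/except IndexError this commit parks in comments. A sketch only; parse_price is a hypothetical name, not part of the spider:

    def parse_price(text):
        # Same cleanup as the removed list comprehension: drop whitespace,
        # the leading '$', and thousands separators, then convert to float.
        if not text:
            return None
        return float(text.strip().strip('$').replace(',', ''))

    print(parse_price('$1,299.99'))  # 1299.99
    print(parse_price(None))         # None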