NathanZorndorf
diff --git a/‎.ipynb_checkpoints/Untitled-checkpoint.ipynb
-180 b/‎.ipynb_checkpoints/Untitled-checkpoint.ipynb
-180
diff --git a/‎Ebay Capstone Progress Journal.rtf
+76-4 b/‎Ebay Capstone Progress Journal.rtf
+76-4
diff --git a/‎bh_photo_scraper/bh_photo_scraper/pipelines.py
+2-2 b/‎bh_photo_scraper/bh_photo_scraper/pipelines.py
+2-2
diff --git a/‎bh_photo_scraper/bh_photo_scraper/pipelines.pyc
-75 Bytes b/‎bh_photo_scraper/bh_photo_scraper/pipelines.pyc
-75 Bytes
diff --git a/‎bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.py
+29-18 b/‎bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.py
+29-18
diff --git a/‎bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc
-108 Bytes b/‎bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc
-108 Bytes
@@ -6,13 +6,15 @@
 \red255\green83\blue8;\red133\green0\blue175;\red174\green0\blue240;\red255\green255\blue255;\red255\green39\blue18;
 \red63\green105\blue30;\red255\green255\blue51;\red179\green179\blue179;\red128\green128\blue128;\red255\green250\blue131;
 \red38\green38\blue38;\red255\green255\blue255;\red194\green229\blue166;\red192\green237\blue254;\red255\green252\blue171;
+\red255\green164\blue159;\red254\green187\blue100;\red194\green229\blue166;\red0\green0\blue0;\red255\green255\blue255;
 }
 {\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\csgenericrgb\c33333\c55686\c15686;\csgenericrgb\c0\c0\c0;
 \csgenericrgb\c100000\c32549\c3137;\csgenericrgb\c52157\c0\c68627;\csgenericrgb\c68235\c0\c94118;\csgenericrgb\c100000\c100000\c100000;\csgenericrgb\c100000\c15294\c7059;
 \csgenericrgb\c24706\c41176\c11765;\csgenericrgb\c100000\c100000\c20000;\csgray\c75407;\csgray\c57415;\csgenericrgb\c100000\c98039\c51373;
 \cssrgb\c20000\c20000\c20000;\cssrgb\c100000\c100000\c100000;\csgenericrgb\c76078\c89804\c65098;\csgenericrgb\c75294\c92941\c99608;\csgenericrgb\c100000\c98824\c67059;
+\csgenericrgb\c100000\c64314\c62353;\csgenericrgb\c99608\c73333\c39216;\csgenericrgb\c76078\c89804\c65098;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000;
 }
-\margl1440\margr1440\vieww18700\viewh8760\viewkind0
+\margl1440\margr1440\vieww19000\viewh9060\viewkind0
 \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
 
 \f0\fs24 \cf0 eBay Capstone Work Journal:\
@@ -772,9 +774,10 @@ Create table:\
 	"Kit"				BOOLEAN,\
 	"Has Lens" 			BOOLEAN,\
 	"Lens"				TEXT,\
-	"B&H Id" 			TEXT\
-)
-\f2 ;\
+	"B&H Id" 			TEXT,\
+	"Title"			TEXT\
+);
+\f2 \
 \
 # -*- coding: utf-8 -*-\
 \
@@ -800,6 +803,75 @@ class CameraRetailerScraperItem(scrapy.Item):\
 \
 \
 \
+4/17:\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\b\fs32 \cf0 Classification
+\b0\fs24 \
+
+\b\fs28 pd.read_pickle('./pickles/df_classification_count_vec.p')
+\b0\fs24  \
+\
+RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1) \
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 \ul \ulc0 Not Cross Val\ulnone \
+Baseline Accuracy: 0.847\
+Model accuracy: 0.903\
+\ul 3-Fold Cross Val:\ulnone \
+\
+Logistic Regression:\
+\ul Not\ulnone  \ul Cross Val:\ulnone \
+Baseline Accuracy: 0.8475 \
+Model accuracy: 0.8754\
+\ul 3-Fold\ulnone  \ul Cross Val:\ulnone \
+\cb20 Accuracy: 0.781099028892\cb1 \
+\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 Logistic Regression Interesting Important Features:\
+6th (u\'92fast', 1.87)\
+9th (u\'92box', 1.6658)\
+18th (u\'92gently used', 1.3718)\
+25th (u\'92good cosmetic', 1.0716)\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 \
+\
+Classification Ensemble (RF, LR, XG):\
+\ul KFold Cross Val:\ulnone \
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 [(0.94599636950383215, 0.89573459715639814),  \
+(0.94433519891090612, 0.88674868898749493),  \
+(0.94418393586446836, 0.89179104477611937)]\
+\cb21 Overfitting.\cb1 \
+Baseline Accuracy: 0.854 
+\f1 \
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\f2 \cf0 \cb22 Cross Validated Ensemble GMean Prediction Accuracy: 0.891\
+Increase in accuracy due to model: 0.036\cb1 \
+\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\b\fs32 \cf0 Regression\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\b0\fs24 \cf0 RandomForestRegressor\
+\pard\pardeftab720\sl320\partightenfactor0
+
+\f1\fs28 \cf23 \cb24 \expnd0\expndtw0\kerning0
+\outl0\strokewidth0 \strokec23 Average Cross Validated RFR Score: -41.1394388889\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+
+\f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \outl0\strokewidth0 \
+\
+\
+\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 \
+\
+\
+\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
+\cf0 \
+\
 \
 \
 \
 
@@ -69,9 +69,9 @@ def process_item(self, item, spider):
 
 		insert_statement = '''INSERT INTO {table_name} (%s) VALUES %s;'''.format(table_name=self.postgres_table)
 
-		keys = ['Brand','Title','Retail Price','B&H Id']
+		keys = ['Brand','Title']
 		keys = ['"{}"'.format(key) for key in keys] 
-		values = (item['brand'],item['title'],item['retail_price'],item['bh_id'])	
+		values = (item['brand'],item['title'])	
 
 		SQL = self.cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), values))
 
 
@@ -28,32 +28,39 @@ def start_requests(self):
     def parse(self, response):
 
         num_pages = response.meta['num_pages']
-        page_num = response.meta['page_num'] + 1
+        page_num = response.meta['page_num']
+
+        ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()
 
         brands = response.xpath("//a[@class='c5']/span[1]/text()").extract()
         titles = response.xpath("//a[@class='c5']/span[2]/text()").extract()        
-        ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()
-        # XPATH does not work entirely for prices, use beautifulsoup instead
-        # retail_prices = response.xpath("//span[@class='price bold sixteen c7']/text()").extract()
-        soup = BeautifulSoup(response.body, 'lxml')
-        prices = [float(price.get_text().strip().strip('$').replace(',','')) \
-                for price in soup.find_all('span','price')]
+
+        if len(brands) != len(titles): # an element in brands is a new-release title, remove it
+            for i,brand in enumerate(brands):
+                if len(brand.split()) > 1: 
+                    brands.pop(i)
 
 
 
+        # XPATH does not work entirely for prices, use beautifulsoup instead
+        # soup = BeautifulSoup(response.body, 'lxml')
+        # prices = [float(price.get_text().strip().strip('$').replace(',','')) \
+        #         for price in soup.find_all('span','price')]        
+
+
         for i in range(len(brands)):
             item = BhPhotoDigitalCameraItem()
             item['brand'] = brands[i].strip()
-            item['title'] = titles[i].strip()
-            item['bh_id'] = ids[i].strip()
+            item['title'] = titles[i].strip()            
 
             # NOTE: Sometimes, the price field is not there, hopefully this only occurs when 
             # the item is  at the bottom of the page, otherwise the brands,titles,ids,prices  
             # elements will be out of sync.
-            try:
-                item['retail_price'] = prices[i] 
-            except IndexError as e: 
-                print e
+            # item['bh_id'] = ids[i].strip()
+            # try:
+            #     item['retail_price'] = prices[i] 
+            # except IndexError as e: 
+            #     print e
                 # item['retail_price'] = None
 
 
@@ -62,16 +69,20 @@ def parse(self, response):
 
 
         # when done processing items, move onto next page 
-        if page_num < num_pages:
+        if page_num <= num_pages:
+            logging.debug('Scraping page {}'.format(page_num))
             next_url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/{}/N/4288586282?via=js'.format(page_num)
-            yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num})
-
+            yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num+1})
+        else:
+            logging.debug('Should be done scraping..')
+            # raise CloseSpider('Done Crawling.')
+            yield
 
 
     def get_num_pages(self, response):
         logging.debug('Made it here!')
         num_pages = response.xpath("//p[@class='pageNuber']/text()").extract_first().strip().split()[-1]
-        page_num = 0 # start at page 1
-        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num})
+        page_num = 1 # start at page 1
+        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num+1})