Skip to content

Commit a19a6b3

Browse files
Got classification ensemble with +3.5% accuracy. Working on regression models.
1 parent fc7cd6e commit a19a6b3

14 files changed

+40395
-5069
lines changed

.ipynb_checkpoints/Untitled-checkpoint.ipynb

-180
This file was deleted.

Ebay Capstone Progress Journal.rtf

+76-4
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,15 @@
66
\red255\green83\blue8;\red133\green0\blue175;\red174\green0\blue240;\red255\green255\blue255;\red255\green39\blue18;
77
\red63\green105\blue30;\red255\green255\blue51;\red179\green179\blue179;\red128\green128\blue128;\red255\green250\blue131;
88
\red38\green38\blue38;\red255\green255\blue255;\red194\green229\blue166;\red192\green237\blue254;\red255\green252\blue171;
9+
\red255\green164\blue159;\red254\green187\blue100;\red194\green229\blue166;\red0\green0\blue0;\red255\green255\blue255;
910
}
1011
{\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\csgenericrgb\c33333\c55686\c15686;\csgenericrgb\c0\c0\c0;
1112
\csgenericrgb\c100000\c32549\c3137;\csgenericrgb\c52157\c0\c68627;\csgenericrgb\c68235\c0\c94118;\csgenericrgb\c100000\c100000\c100000;\csgenericrgb\c100000\c15294\c7059;
1213
\csgenericrgb\c24706\c41176\c11765;\csgenericrgb\c100000\c100000\c20000;\csgray\c75407;\csgray\c57415;\csgenericrgb\c100000\c98039\c51373;
1314
\cssrgb\c20000\c20000\c20000;\cssrgb\c100000\c100000\c100000;\csgenericrgb\c76078\c89804\c65098;\csgenericrgb\c75294\c92941\c99608;\csgenericrgb\c100000\c98824\c67059;
15+
\csgenericrgb\c100000\c64314\c62353;\csgenericrgb\c99608\c73333\c39216;\csgenericrgb\c76078\c89804\c65098;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000;
1416
}
15-
\margl1440\margr1440\vieww18700\viewh8760\viewkind0
17+
\margl1440\margr1440\vieww19000\viewh9060\viewkind0
1618
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
1719

1820
\f0\fs24 \cf0 eBay Capstone Work Journal:\
@@ -772,9 +774,10 @@ Create table:\
772774
"Kit" BOOLEAN,\
773775
"Has Lens" BOOLEAN,\
774776
"Lens" TEXT,\
775-
"B&H Id" TEXT\
776-
)
777-
\f2 ;\
777+
"B&H Id" TEXT,\
778+
"Title" TEXT\
779+
);
780+
\f2 \
778781
\
779782
# -*- coding: utf-8 -*-\
780783
\
@@ -800,6 +803,75 @@ class CameraRetailerScraperItem(scrapy.Item):\
800803
\
801804
\
802805
\
806+
4/17:\
807+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
808+
809+
\b\fs32 \cf0 Classification
810+
\b0\fs24 \
811+
812+
\b\fs28 pd.read_pickle('./pickles/df_classification_count_vec.p')
813+
\b0\fs24 \
814+
\
815+
RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1) \
816+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
817+
\cf0 \ul \ulc0 Not Cross Val:\ulnone \
818+
Baseline Accuracy: 0.847\
819+
Model accuracy: 0.903\
820+
\ul 3-Fold Cross Val:\ulnone \
821+
\
822+
Logistic Regression:\
823+
\ul Not\ulnone \ul Cross Val:\ulnone \
824+
Baseline Accuracy: 0.8475 \
825+
Model accuracy: 0.8754\
826+
\ul 3-Fold\ulnone \ul Cross Val:\ulnone \
827+
\cb20 Accuracy: 0.781099028892\cb1 \
828+
\
829+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
830+
\cf0 Logistic Regression Interesting Important Features:\
831+
6th (u'fast', 1.87)\
832+
9th (u'box', 1.6658)\
833+
18th (u'gently used', 1.3718)\
834+
25th (u'good cosmetic', 1.0716)\
835+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
836+
\cf0 \
837+
\
838+
Classification Ensemble (RF, LR, XG):\
839+
\ul KFold Cross Val:\ulnone \
840+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
841+
\cf0 [(0.94599636950383215, 0.89573459715639814), \
842+
(0.94433519891090612, 0.88674868898749493), \
843+
(0.94418393586446836, 0.89179104477611937)]\
844+
\cb21 Overfitting.\cb1 \
845+
Baseline Accuracy: 0.854
846+
\f1 \
847+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
848+
849+
\f2 \cf0 \cb22 Cross Validated Ensemble GMean Prediction Accuracy: 0.891\
850+
Increase in accuracy due to model: 0.036\cb1 \
851+
\
852+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
853+
854+
\b\fs32 \cf0 Regression\
855+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
856+
857+
\b0\fs24 \cf0 RandomForestRegressor\
858+
\pard\pardeftab720\sl320\partightenfactor0
859+
860+
\f1\fs28 \cf23 \cb24 \expnd0\expndtw0\kerning0
861+
\outl0\strokewidth0 \strokec23 Average Cross Validated RFR Score: -41.1394388889\
862+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
863+
864+
\f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \outl0\strokewidth0 \
865+
\
866+
\
867+
\
868+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
869+
\cf0 \
870+
\
871+
\
872+
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
873+
\cf0 \
874+
\
803875
\
804876
\
805877
\

bh_photo_scraper/bh_photo_scraper/pipelines.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ def process_item(self, item, spider):
6969

7070
insert_statement = '''INSERT INTO {table_name} (%s) VALUES %s;'''.format(table_name=self.postgres_table)
7171

72-
keys = ['Brand','Title','Retail Price','B&H Id']
72+
keys = ['Brand','Title']
7373
keys = ['"{}"'.format(key) for key in keys]
74-
values = (item['brand'],item['title'],item['retail_price'],item['bh_id'])
74+
values = (item['brand'],item['title'])
7575

7676
SQL = self.cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), values))
7777

-75 Bytes
Binary file not shown.

bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.py

+29-18
Original file line numberDiff line numberDiff line change
@@ -28,32 +28,39 @@ def start_requests(self):
2828
def parse(self, response):
2929

3030
num_pages = response.meta['num_pages']
31-
page_num = response.meta['page_num'] + 1
31+
page_num = response.meta['page_num']
32+
33+
ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()
3234

3335
brands = response.xpath("//a[@class='c5']/span[1]/text()").extract()
3436
titles = response.xpath("//a[@class='c5']/span[2]/text()").extract()
35-
ids = response.xpath("//span[1]/span[@class='sku']/text()").extract()
36-
# XPATH does not work entirely for prices, use beautifulsoup instead
37-
# retail_prices = response.xpath("//span[@class='price bold sixteen c7']/text()").extract()
38-
soup = BeautifulSoup(response.body, 'lxml')
39-
prices = [float(price.get_text().strip().strip('$').replace(',','')) \
40-
for price in soup.find_all('span','price')]
37+
38+
if len(brands) != len(titles): # an element in brands is a new-release title, remove it
39+
for i,brand in enumerate(brands):
40+
if len(brand.split()) > 1:
41+
brands.pop(i)
4142

4243

4344

45+
# XPATH does not work entirely for prices, use beautifulsoup instead
46+
# soup = BeautifulSoup(response.body, 'lxml')
47+
# prices = [float(price.get_text().strip().strip('$').replace(',','')) \
48+
# for price in soup.find_all('span','price')]
49+
50+
4451
for i in range(len(brands)):
4552
item = BhPhotoDigitalCameraItem()
4653
item['brand'] = brands[i].strip()
47-
item['title'] = titles[i].strip()
48-
item['bh_id'] = ids[i].strip()
54+
item['title'] = titles[i].strip()
4955

5056
# NOTE: Sometimes, the price field is not there, hopefully this only occurs when
5157
# the item is at the bottom of the page, otherwise the brands,titles,ids,prices
5258
# elements will be out of sync.
53-
try:
54-
item['retail_price'] = prices[i]
55-
except IndexError as e:
56-
print e
59+
# item['bh_id'] = ids[i].strip()
60+
# try:
61+
# item['retail_price'] = prices[i]
62+
# except IndexError as e:
63+
# print e
5764
# item['retail_price'] = None
5865

5966

@@ -62,16 +69,20 @@ def parse(self, response):
6269

6370

6471
# when done processing items, move onto next page
65-
if page_num < num_pages:
72+
if page_num <= num_pages:
73+
logging.debug('Scraping page {}'.format(page_num))
6674
next_url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/{}/N/4288586282?via=js'.format(page_num)
67-
yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num})
68-
75+
yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num+1})
76+
else:
77+
logging.debug('Should be done scraping..')
78+
# raise CloseSpider('Done Crawling.')
79+
yield
6980

7081

7182
def get_num_pages(self, response):
7283
logging.debug('Made it here!')
7384
num_pages = response.xpath("//p[@class='pageNuber']/text()").extract_first().strip().split()[-1]
74-
page_num = 0 # start at page 1
75-
yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num})
85+
page_num = 1 # start at page 1
86+
yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num+1})
7687

7788

Binary file not shown.

0 commit comments

Comments
 (0)