
update class 10 notebook and script
justmarkham committed Sep 18, 2015
1 parent 3bb65f5 commit 0c3ec02
Showing 2 changed files with 1,368 additions and 194 deletions.
111 changes: 43 additions & 68 deletions code/10_linear_regression_nb.py
@@ -30,15 +30,15 @@
#
# We'll be working with a dataset from Capital Bikeshare that was used in a Kaggle competition ([data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data)).

# In[ ]:
# In[1]:

# read the data and set the datetime as the index
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
bikes = pd.read_csv(url, index_col='datetime', parse_dates=True)


# In[ ]:
# In[2]:

bikes.head()

@@ -49,15 +49,15 @@
# - What is the response variable (as defined by Kaggle)?
# - How many features are there?

# In[ ]:
# In[3]:

# "count" is a method, so it's best to name that column something else
bikes.rename(columns={'count':'total'}, inplace=True)


# ## Visualizing the data

# In[ ]:
# In[4]:

import seaborn as sns
import matplotlib.pyplot as plt
@@ -66,13 +66,13 @@
plt.rcParams['font.size'] = 14


# In[ ]:
# In[5]:

# Pandas scatter plot
bikes.plot(kind='scatter', x='temp', y='total', alpha=0.2)


# In[ ]:
# In[6]:

# Seaborn scatter plot with regression line
sns.lmplot(x='temp', y='total', data=bikes, aspect=1.5, scatter_kws={'alpha':0.2})
@@ -103,23 +103,23 @@

# ## Building a linear regression model

# In[ ]:
# In[7]:

# create X and y
feature_cols = ['temp']
X = bikes[feature_cols]
y = bikes.total


# In[ ]:
# In[8]:

# import, instantiate, fit
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X, y)


# In[ ]:
# In[9]:

# print the coefficients
print linreg.intercept_
@@ -143,13 +143,13 @@
#
# How many bike rentals would we predict if the temperature was 25 degrees Celsius?

# In[ ]:
# In[10]:

# manually calculate the prediction
linreg.intercept_ + linreg.coef_*25


# In[ ]:
# In[11]:

# use the predict method
linreg.predict(25)
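
# Side note: passing a bare scalar to predict relies on older scikit-learn behavior, where the
# input was reshaped automatically; recent releases expect a 2-D array. The same applies to the
# linreg.predict(77) call further down. A version-proof sketch of the same call:
linreg.predict([[25]])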
@@ -159,20 +159,20 @@
#
# Let's say that temperature was measured in Fahrenheit, rather than Celsius. How would that affect the model?

# In[ ]:
# In[12]:

# create a new column for Fahrenheit temperature
bikes['temp_F'] = bikes.temp * 1.8 + 32
bikes.head()


# In[ ]:
# In[13]:

# Seaborn scatter plot with regression line
sns.lmplot(x='temp_F', y='total', data=bikes, aspect=1.5, scatter_kws={'alpha':0.2})


# In[ ]:
# In[14]:

# create X and y
feature_cols = ['temp_F']
@@ -188,41 +188,41 @@
print linreg.coef_


# In[ ]:
# In[15]:

# convert 25 degrees Celsius to Fahrenheit
25 * 1.8 + 32


# In[ ]:
# In[16]:

# predict rentals for 77 degrees Fahrenheit
linreg.predict(77)


# **Conclusion:** The scale of the features is **irrelevant** for linear regression models. When changing the scale, we simply change our **interpretation** of the coefficients.
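#
# To make that concrete, write the Celsius model as $\hat{y} = \beta_0 + \beta_1 \text{temp}_C$. Since $\text{temp}_C = (\text{temp}_F - 32) / 1.8$, the same model expressed in Fahrenheit is
#
# $$\hat{y} = \left(\beta_0 - \frac{32\,\beta_1}{1.8}\right) + \frac{\beta_1}{1.8}\,\text{temp}_F$$
#
# so the Fahrenheit coefficient is simply the Celsius coefficient divided by 1.8 (compare the two printed coefficients above), and every prediction is unchanged.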

# In[ ]:
# In[17]:

# remove the temp_F column
bikes.drop('temp_F', axis=1, inplace=True)


# ## Visualizing the data (part 2)

# In[ ]:
# In[18]:

# explore more features
feature_cols = ['temp', 'season', 'weather', 'humidity']


# In[ ]:
# In[19]:

# multiple scatter plots in Seaborn
sns.pairplot(bikes, x_vars=feature_cols, y_vars='total', kind='reg')


# In[ ]:
# In[20]:

# multiple scatter plots in Pandas
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
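
# a sketch of one way the loop that fills these subplots could look (not necessarily the
# original code; it assumes the fig and axs objects created just above)
fig.set_size_inches(16, 3)
for index, feature in enumerate(feature_cols):
    bikes.plot(kind='scatter', x=feature, y='total', ax=axs[index], alpha=0.2)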
@@ -232,13 +232,13 @@

# Are you seeing anything that you did not expect?

# In[ ]:
# In[21]:

# cross-tabulation of season and month
pd.crosstab(bikes.season, bikes.index.month)


# In[ ]:
# In[22]:

# box plot of rentals, grouped by season
bikes.boxplot(column='total', by='season')
@@ -249,7 +249,7 @@
# - A line can't capture a non-linear relationship.
# - There are more rentals in winter than in spring (?)

# In[ ]:
# In[23]:

# line plot of rentals
bikes.total.plot()
@@ -259,13 +259,13 @@
#
# There are more rentals in the winter than the spring, but only because the system is experiencing **overall growth** and the winter months happen to come after the spring months.
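
# A quick sanity check (a sketch, assuming a reasonably recent pandas): aggregate rentals by
# month and look for the upward trend that explains the winter-vs-spring surprise
bikes.total.resample('M').sum().plot()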

# In[ ]:
# In[24]:

# correlation matrix (values range from -1 to 1)
bikes.corr()


# In[ ]:
# In[25]:

# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(bikes.corr())
@@ -275,13 +275,13 @@

# ## Adding more features to the model

# In[ ]:
# In[26]:

# create a list of features
feature_cols = ['temp', 'season', 'weather', 'humidity']


# In[ ]:
# In[27]:

# create X and y
X = bikes[feature_cols]
@@ -296,7 +296,7 @@
print linreg.coef_


# In[ ]:
# In[28]:

# pair the feature names with the coefficients
zip(feature_cols, linreg.coef_)
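
# Side note: under Python 3, zip returns a lazy iterator, so wrap it in list()
# to actually see the (feature, coefficient) pairs
list(zip(feature_cols, linreg.coef_))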
@@ -345,14 +345,14 @@
#
# $$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

# In[ ]:
# In[29]:

# example true and predicted response values
true = [10, 7, 5, 5]
pred = [8, 6, 5, 10]
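
# Worked out by hand for these four points (the errors are -2, -1, 0, and 5):
#
# - MAE = (2 + 1 + 0 + 5) / 4 = 2.0
# - MSE = (4 + 1 + 0 + 25) / 4 = 7.5
# - RMSE = $\sqrt{7.5} \approx 2.74$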


# In[ ]:
# In[30]:

# calculate these metrics by hand!
from sklearn import metrics
@@ -372,7 +372,7 @@
#
# Here's an additional example, to demonstrate how MSE/RMSE punish larger errors:

# In[ ]:
# In[31]:

# same true values as above
true = [10, 7, 5, 5]
@@ -390,7 +390,7 @@

# ## Comparing models with train/test split and RMSE

# In[ ]:
# In[32]:

from sklearn.cross_validation import train_test_split

@@ -405,23 +405,25 @@ def train_test_rmse(feature_cols):
return np.sqrt(metrics.mean_squared_error(y_test, y_pred))
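
# For reference, one plausible shape for the full train_test_rmse helper, written as a sketch
# (the helper name, and the use of random_state=123 and LinearRegression, are assumptions here)
import numpy as np
from sklearn.linear_model import LinearRegression

def train_test_rmse_sketch(feature_cols):
    # split the chosen features and the response, fit on the training set,
    # and report RMSE on the testing set
    X = bikes[feature_cols]
    y = bikes.total
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))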


# In[ ]:
# In[33]:

# compare different sets of features
print train_test_rmse(['temp', 'season', 'weather', 'humidity'])
print train_test_rmse(['temp', 'season', 'weather'])
print train_test_rmse(['temp', 'season', 'humidity'])


# In[ ]:
# In[34]:

# using these as features is not allowed! (casual and registered sum to total, the response we are trying to predict)
print train_test_rmse(['casual', 'registered'])


# ## Comparing testing RMSE with null RMSE
#
# Null RMSE is the RMSE that could be achieved by **always predicting the mean response value**. It is a benchmark against which you may want to measure your regression model.

# In[ ]:
# In[35]:

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
@@ -434,7 +436,7 @@ def train_test_rmse(feature_cols):
y_null


# In[ ]:
# In[36]:

# compute null RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_null))
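
# A closely related benchmark is scikit-learn's DummyRegressor (a sketch; note it predicts the
# mean of y_train, which may differ slightly from a mean computed on the testing set)
from sklearn.dummy import DummyRegressor

null_model = DummyRegressor(strategy='mean')
null_model.fit(X_train, y_train)
y_null_pred = null_model.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_null_pred))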
@@ -454,7 +456,7 @@ def train_test_rmse(feature_cols):
#
# For season, we can't simply leave the encoding as 1 = spring, 2 = summer, 3 = fall, and 4 = winter, because that would imply an **ordered relationship**. Instead, we create **multiple dummy variables:**

# In[ ]:
# In[37]:

# create dummy variables
season_dummies = pd.get_dummies(bikes.season, prefix='season')
@@ -467,7 +469,7 @@ def train_test_rmse(feature_cols):
#
# Why? Because three dummies capture all of the "information" about the season feature, and implicitly define spring (season 1) as the **baseline level:**

# In[ ]:
# In[38]:

# drop the first column
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)
@@ -480,7 +482,7 @@ def train_test_rmse(feature_cols):
#
# If that's confusing, think about why we only need one dummy variable for holiday, not two dummy variables (holiday_yes and holiday_no).
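#
# As an aside (a sketch, assuming pandas 0.18 or newer): get_dummies can create the dummies and drop the baseline column in a single step.

# equivalent shortcut to the two steps above (hypothetical variable name)
season_dummies_alt = pd.get_dummies(bikes.season, prefix='season', drop_first=True)
season_dummies_alt.head()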

# In[ ]:
# In[39]:

# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
bikes = pd.concat([bikes, season_dummies], axis=1)
@@ -489,7 +491,7 @@ def train_test_rmse(feature_cols):
bikes.sample(n=5, random_state=1)


# In[ ]:
# In[40]:

# include dummy variables for season in the model
feature_cols = ['temp', 'season_2', 'season_3', 'season_4', 'humidity']
@@ -512,7 +514,7 @@ def train_test_rmse(feature_cols):
#
# **Important:** Dummy encoding is relevant for all machine learning models, not just linear regression models.

# In[ ]:
# In[41]:

# compare original season variable with dummy variables
print train_test_rmse(['temp', 'season', 'humidity'])
@@ -529,33 +531,6 @@ def train_test_rmse(feature_cols):
#
# Then, try using each of the three features (on its own) with `train_test_rmse` to see which one performs the best!

# In[ ]:

# hour as a numeric feature
bikes['hour'] = bikes.index.hour


# In[ ]:

# hour as a categorical feature
hour_dummies = pd.get_dummies(bikes.hour, prefix='hour')
hour_dummies.drop(hour_dummies.columns[0], axis=1, inplace=True)
bikes = pd.concat([bikes, hour_dummies], axis=1)


# In[ ]:

# daytime as a categorical feature
bikes['daytime'] = ((bikes.hour > 6) & (bikes.hour < 21)).astype(int)


# In[ ]:

print train_test_rmse(['hour'])
print train_test_rmse(bikes.columns[bikes.columns.str.startswith('hour_')])
print train_test_rmse(['daytime'])


# ## Comparing linear regression with other models
#
# Advantages of linear regression:
