
update class 10 notebook and script
justmarkham committed Sep 18, 2015
1 parent 3bb65f5 commit 0c3ec02
Showing 2 changed files with 1,368 additions and 194 deletions.
111 changes: 43 additions & 68 deletions code/10_linear_regression_nb.py
@@ -30,15 +30,15 @@
#
# We'll be working with a dataset from Capital Bikeshare that was used in a Kaggle competition ([data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data)).

# In[ ]:
# In[1]:

# read the data and set the datetime as the index
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
bikes = pd.read_csv(url, index_col='datetime', parse_dates=True)


# In[ ]:
# In[2]:

bikes.head()

@@ -49,15 +49,15 @@
# - What is the response variable (as defined by Kaggle)?
# - How many features are there?

# In[ ]:
# In[3]:

# "count" is a method, so it's best to name that column something else
bikes.rename(columns={'count':'total'}, inplace=True)


# ## Visualizing the data

# In[ ]:
# In[4]:

import seaborn as sns
import matplotlib.pyplot as plt
@@ -66,13 +66,13 @@
plt.rcParams['font.size'] = 14


# In[ ]:
# In[5]:

# Pandas scatter plot
bikes.plot(kind='scatter', x='temp', y='total', alpha=0.2)


# In[ ]:
# In[6]:

# Seaborn scatter plot with regression line
sns.lmplot(x='temp', y='total', data=bikes, aspect=1.5, scatter_kws={'alpha':0.2})
@@ -103,23 +103,23 @@

# ## Building a linear regression model

# In[ ]:
# In[7]:

# create X and y
feature_cols = ['temp']
X = bikes[feature_cols]
y = bikes.total


# In[ ]:
# In[8]:

# import, instantiate, fit
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X, y)


# In[ ]:
# In[9]:

# print the coefficients
print linreg.intercept_
@@ -143,13 +143,13 @@
#
# How many bike rentals would we predict if the temperature was 25 degrees Celsius?

# In[ ]:
# In[10]:

# manually calculate the prediction
linreg.intercept_ + linreg.coef_*25


# In[ ]:
# In[11]:

# use the predict method
linreg.predict(25)
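
# Side note: passing a bare scalar to predict relies on older scikit-learn behavior, where the
# input was reshaped automatically; recent releases expect a 2-D array. The same applies to the
# linreg.predict(77) call further down. A version-proof sketch of the same call:
linreg.predict([[25]])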
@@ -159,20 +159,20 @@
#
# Let's say that temperature was measured in Fahrenheit, rather than Celsius. How would that affect the model?

# In[ ]:
# In[12]:

# create a new column for Fahrenheit temperature
bikes['temp_F'] = bikes.temp * 1.8 + 32
bikes.head()


# In[ ]:
# In[13]:

# Seaborn scatter plot with regression line
sns.lmplot(x='temp_F', y='total', data=bikes, aspect=1.5, scatter_kws={'alpha':0.2})


# In[ ]:
# In[14]:

# create X and y
feature_cols = ['temp_F']
@@ -188,41 +188,41 @@
print linreg.coef_


# In[ ]:
# In[15]:

# convert 25 degrees Celsius to Fahrenheit
25 * 1.8 + 32


# In[ ]:
# In[16]:

# predict rentals for 77 degrees Fahrenheit
linreg.predict(77)


# **Conclusion:** The scale of the features is **irrelevant** for linear regression models. When changing the scale, we simply change our **interpretation** of the coefficients.
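#
# To make that concrete, write the Celsius model as $\hat{y} = \beta_0 + \beta_1 \text{temp}_C$. Since $\text{temp}_C = (\text{temp}_F - 32) / 1.8$, the same model expressed in Fahrenheit is
#
# $$\hat{y} = \left(\beta_0 - \frac{32\,\beta_1}{1.8}\right) + \frac{\beta_1}{1.8}\,\text{temp}_F$$
#
# so the Fahrenheit coefficient is simply the Celsius coefficient divided by 1.8 (compare the two printed coefficients above), and every prediction is unchanged.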

# In[ ]:
# In[17]:

# remove the temp_F column
bikes.drop('temp_F', axis=1, inplace=True)


# ## Visualizing the data (part 2)

# In[ ]:
# In[18]:

# explore more features
feature_cols = ['temp', 'season', 'weather', 'humidity']


# In[ ]:
# In[19]:

# multiple scatter plots in Seaborn
sns.pairplot(bikes, x_vars=feature_cols, y_vars='total', kind='reg')


# In[ ]:
# In[20]:

# multiple scatter plots in Pandas
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
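
# a sketch of one way the loop that fills these subplots could look (not necessarily the
# original code; it assumes the fig and axs objects created just above)
fig.set_size_inches(16, 3)
for index, feature in enumerate(feature_cols):
    bikes.plot(kind='scatter', x=feature, y='total', ax=axs[index], alpha=0.2)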
@@ -232,13 +232,13 @@

# Are you seeing anything that you did not expect?

# In[ ]:
# In[21]:

# cross-tabulation of season and month
pd.crosstab(bikes.season, bikes.index.month)


# In[ ]:
# In[22]:

# box plot of rentals, grouped by season
bikes.boxplot(column='total', by='season')
@@ -249,7 +249,7 @@
# - A line can't capture a non-linear relationship.
# - There are more rentals in winter than in spring (?)

# In[ ]:
# In[23]:

# line plot of rentals
bikes.total.plot()
@@ -259,13 +259,13 @@
#
# There are more rentals in the winter than the spring, but only because the system is experiencing **overall growth** and the winter months happen to come after the spring months.
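
# A quick sanity check (a sketch, assuming a reasonably recent pandas): aggregate rentals by
# month and look for the upward trend that explains the winter-vs-spring surprise
bikes.total.resample('M').sum().plot()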

# In[ ]:
# In[24]:

# correlation matrix (values range from -1 to 1)
bikes.corr()


# In[ ]:
# In[25]:

# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(bikes.corr())
@@ -275,13 +275,13 @@

# ## Adding more features to the model

# In[ ]:
# In[26]:

# create a list of features
feature_cols = ['temp', 'season', 'weather', 'humidity']


# In[ ]:
# In[27]:

# create X and y
X = bikes[feature_cols]
@@ -296,7 +296,7 @@
print linreg.coef_


# In[ ]:
# In[28]:

# pair the feature names with the coefficients
zip(feature_cols, linreg.coef_)
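
# Side note: under Python 3, zip returns a lazy iterator, so wrap it in list()
# to actually see the (feature, coefficient) pairs
list(zip(feature_cols, linreg.coef_))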
@@ -345,14 +345,14 @@
#
# $$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$

# In[ ]:
# In[29]:

# example true and predicted response values
true = [10, 7, 5, 5]
pred = [8, 6, 5, 10]
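
# Worked out by hand for these four points (the errors are -2, -1, 0, and 5):
#
# - MAE = (2 + 1 + 0 + 5) / 4 = 2.0
# - MSE = (4 + 1 + 0 + 25) / 4 = 7.5
# - RMSE = $\sqrt{7.5} \approx 2.74$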


# In[ ]:
# In[30]:

# calculate these metrics by hand!
from sklearn import metrics
@@ -372,7 +372,7 @@
#
# Here's an additional example, to demonstrate how MSE/RMSE punish larger errors:

# In[ ]:
# In[31]:

# same true values as above
true = [10, 7, 5, 5]
@@ -390,7 +390,7 @@

# ## Comparing models with train/test split and RMSE

# In[ ]:
# In[32]:

from sklearn.cross_validation import train_test_split

@@ -405,23 +405,25 @@ def train_test_rmse(feature_cols):
return np.sqrt(metrics.mean_squared_error(y_test, y_pred))
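
# For reference, one plausible shape for the full train_test_rmse helper, written as a sketch
# (the helper name, and the use of random_state=123 and LinearRegression, are assumptions here)
import numpy as np
from sklearn.linear_model import LinearRegression

def train_test_rmse_sketch(feature_cols):
    # split the chosen features and the response, fit on the training set,
    # and report RMSE on the testing set
    X = bikes[feature_cols]
    y = bikes.total
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))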


# In[ ]:
# In[33]:

# compare different sets of features
print train_test_rmse(['temp', 'season', 'weather', 'humidity'])
print train_test_rmse(['temp', 'season', 'weather'])
print train_test_rmse(['temp', 'season', 'humidity'])


# In[ ]:
# In[34]:

# using these as features is not allowed! (casual and registered sum to total, the response we are trying to predict)
print train_test_rmse(['casual', 'registered'])


# ## Comparing testing RMSE with null RMSE
#
# Null RMSE is the RMSE that could be achieved by **always predicting the mean response value**. It is a benchmark against which you may want to measure your regression model.

# In[ ]:
# In[35]:

# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
@@ -434,7 +436,7 @@ def train_test_rmse(feature_cols):
y_null


# In[ ]:
# In[36]:

# compute null RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_null))
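
# A closely related benchmark is scikit-learn's DummyRegressor (a sketch; note it predicts the
# mean of y_train, which may differ slightly from a mean computed on the testing set)
from sklearn.dummy import DummyRegressor

null_model = DummyRegressor(strategy='mean')
null_model.fit(X_train, y_train)
y_null_pred = null_model.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, y_null_pred))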
@@ -454,7 +456,7 @@ def train_test_rmse(feature_cols):
#
# For season, we can't simply leave the encoding as 1 = spring, 2 = summer, 3 = fall, and 4 = winter, because that would imply an **ordered relationship**. Instead, we create **multiple dummy variables:**

# In[ ]:
# In[37]:

# create dummy variables
season_dummies = pd.get_dummies(bikes.season, prefix='season')
@@ -467,7 +469,7 @@ def train_test_rmse(feature_cols):
#
# Why? Because three dummies capture all of the "information" about the season feature, and implicitly define spring (season 1) as the **baseline level:**

# In[ ]:
# In[38]:

# drop the first column
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)
@@ -480,7 +482,7 @@ def train_test_rmse(feature_cols):
#
# If that's confusing, think about why we only need one dummy variable for holiday, not two dummy variables (holiday_yes and holiday_no).
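#
# As an aside (a sketch, assuming pandas 0.18 or newer): get_dummies can create the dummies and drop the baseline column in a single step.

# equivalent shortcut to the two steps above (hypothetical variable name)
season_dummies_alt = pd.get_dummies(bikes.season, prefix='season', drop_first=True)
season_dummies_alt.head()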

# In[ ]:
# In[39]:

# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
bikes = pd.concat([bikes, season_dummies], axis=1)
@@ -489,7 +491,7 @@ def train_test_rmse(feature_cols):
bikes.sample(n=5, random_state=1)


# In[ ]:
# In[40]:

# include dummy variables for season in the model
feature_cols = ['temp', 'season_2', 'season_3', 'season_4', 'humidity']
@@ -512,7 +514,7 @@ def train_test_rmse(feature_cols):
#
# **Important:** Dummy encoding is relevant for all machine learning models, not just linear regression models.

# In[ ]:
# In[41]:

# compare original season variable with dummy variables
print train_test_rmse(['temp', 'season', 'humidity'])
@@ -529,33 +531,6 @@ def train_test_rmse(feature_cols):
#
# Then, try using each of the three features (on its own) with `train_test_rmse` to see which one performs the best!

# In[ ]:

# hour as a numeric feature
bikes['hour'] = bikes.index.hour


# In[ ]:

# hour as a categorical feature
hour_dummies = pd.get_dummies(bikes.hour, prefix='hour')
hour_dummies.drop(hour_dummies.columns[0], axis=1, inplace=True)
bikes = pd.concat([bikes, hour_dummies], axis=1)


# In[ ]:

# daytime as a categorical feature
bikes['daytime'] = ((bikes.hour > 6) & (bikes.hour < 21)).astype(int)


# In[ ]:

print train_test_rmse(['hour'])
print train_test_rmse(bikes.columns[bikes.columns.str.startswith('hour_')])
print train_test_rmse(['daytime'])


# ## Comparing linear regression with other models
#
# Advantages of linear regression:
