From d7cc9d044b32ffbadaa5db1d11f28a99e0785a0d Mon Sep 17 00:00:00 2001
From: Kevin Markham <justmarkham@gmail.com>
Date: Fri, 9 Oct 2015 10:57:29 -0400
Subject: [PATCH] add materials for classes 17 and 18

---
 README.md                             |    5 +-
 code/17_bikeshare_exercise_nb.py      |   68 ++
 code/17_decision_trees_nb.py          |  432 +++++++++++
 code/18_ensembling_nb.py              |  491 ++++++++++++
 notebooks/17_bikeshare_exercise.ipynb |  212 ++++++
 notebooks/17_decision_trees.ipynb     |  898 ++++++++++++++++++++++
 notebooks/18_ensembling.ipynb         | 1013 +++++++++++++++++++++++++
 other/model_comparison.md             |   37 +
 8 files changed, 3154 insertions(+), 2 deletions(-)
 create mode 100644 code/17_bikeshare_exercise_nb.py
 create mode 100644 code/17_decision_trees_nb.py
 create mode 100644 code/18_ensembling_nb.py
 create mode 100644 notebooks/17_bikeshare_exercise.ipynb
 create mode 100644 notebooks/17_decision_trees.ipynb
 create mode 100644 notebooks/18_ensembling.ipynb

diff --git a/README.md b/README.md
index 9b9c39b..454154a 100644
--- a/README.md
+++ b/README.md
@@ -468,8 +468,6 @@ Tuesday | Thursday
 * These examples may help you to better understand the process of feature engineering: predicting the number of [passengers at a train station](https://medium.com/@chris_bour/french-largest-data-science-challenge-ever-organized-shows-the-unreasonable-effectiveness-of-open-8399705a20ef), identifying [fraudulent users of an online store](https://docs.google.com/presentation/d/1UdI5NY-mlHyseiRVbpTLyvbrHxY8RciHp5Vc-ZLrwmU/edit#slide=id.p), identifying [bots in an online auction](https://www.kaggle.com/c/facebook-recruiting-iv-human-or-bot/forums/t/14628/share-your-secret-sauce), predicting who will [subscribe to the next season of an orchestra](http://blog.kaggle.com/2015/01/05/kaggle-inclass-stanfords-getting-a-handel-on-data-science-winners-report/), and evaluating the [quality of e-commerce search engine results](http://blog.kaggle.com/2015/07/22/crowdflower-winners-interview-3rd-place-team-quartet/).
 * [Our perfect submission](https://www.kaggle.com/c/restaurant-revenue-prediction/forums/t/13950/our-perfect-submission) is a fun read about how great performance on the [public leaderboard](https://www.kaggle.com/c/restaurant-revenue-prediction/leaderboard/public) does not guarantee that a model will generalize to new data.
 
-<!--
-
 -----
 
 ### Class 17: Decision Trees
@@ -503,8 +501,11 @@ Tuesday | Thursday
 * [Not Even the People Who Write Algorithms Really Know How They Work](http://www.theatlantic.com/technology/archive/2015/09/not-even-the-people-who-write-algorithms-really-know-how-they-work/406099/) argues that the decreased interpretability of state-of-the-art machine learning models has a negative impact on society.
 * For an intuitive explanation of Random Forests, read Edwin Chen's answer to [How do random forests work in layman's terms?](http://www.quora.com/Random-Forests/How-do-random-forests-work-in-laymans-terms/answer/Edwin-Chen-1)
 * [Large Scale Decision Forests: Lessons Learned](http://blog.siftscience.com/blog/2015/large-scale-decision-forests-lessons-learned) is an excellent post from Sift Science about their custom implementation of Random Forests.
+* [Unboxing the Random Forest Classifier](http://nerds.airbnb.com/unboxing-the-random-forest-classifier/) describes a way to interpret the inner workings of Random Forests beyond just feature importances.
 * [Understanding Random Forests: From Theory to Practice](http://arxiv.org/pdf/1407.7502v3.pdf) is an in-depth academic analysis of Random Forests, including details of its implementation in scikit-learn.
 
+<!--
+
 -----
 
 ### Class 19: Advanced scikit-learn and Clustering
diff --git a/code/17_bikeshare_exercise_nb.py b/code/17_bikeshare_exercise_nb.py
new file mode 100644
index 0000000..3ea7d22
--- /dev/null
+++ b/code/17_bikeshare_exercise_nb.py
@@ -0,0 +1,68 @@
+# # Exercise with Capital Bikeshare data
+
+# ## Introduction
+# 
+# - Capital Bikeshare dataset from Kaggle: [data](https://github.com/justmarkham/DAT8/blob/master/data/bikeshare.csv), [data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data)
+# - Each observation represents the bikeshare rentals initiated during a given hour of a given day
+
+import pandas as pd
+import numpy as np
+from sklearn.cross_validation import cross_val_score
+from sklearn.linear_model import LinearRegression
+from sklearn.tree import DecisionTreeRegressor, export_graphviz
+
+
+# read the data, using the "datetime" column as the index and parsing it as dates
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'
+bikes = pd.read_csv(url, index_col='datetime', parse_dates=True)
+
+
+# "count" is also the name of a DataFrame method, so rename that column to "total"
+bikes.rename(columns={'count':'total'}, inplace=True)  # inplace=True modifies bikes directly
+
+
+# create "hour" as its own feature, taken from the hour of the datetime index
+bikes['hour'] = bikes.index.hour
+
+
+bikes.head()  # preview the first rows
+
+
+bikes.tail()  # preview the last rows
+
+
+# - **hour** ranges from 0 (midnight) through 23 (11pm)
+# - **workingday** is either 0 (weekend or holiday) or 1 (non-holiday weekday)
+
+# ## Task 1
+# 
+# Run these two `groupby` statements and figure out what they tell you about the data.
+
+bikes.groupby('workingday').total.mean()
+
+
+bikes.groupby('hour').total.mean()
+
+
+# ## Task 2
+# 
+# Run this plotting code, and make sure you understand the output. Then, split this plot into two separate plots conditioned on "workingday". (In other words, one plot should display the hourly trend for "workingday=0", and the other should display the hourly trend for "workingday=1".)
+
+bikes.groupby('hour').total.mean().plot()
+
+
+# ## Task 3
+# 
+# Fit a linear regression model to the entire dataset, using "total" as the response and "hour" and "workingday" as the only features. Then, print the coefficients and interpret them. What are the limitations of linear regression in this instance?
+
+# ## Task 4
+# 
+# Use 10-fold cross-validation to calculate the RMSE for the linear regression model.
+
+# ## Task 5
+# 
+# Use 10-fold cross-validation to evaluate a decision tree model with those same features (fit to any "max_depth" you choose).
+
+# ## Task 6
+# 
+# Fit a decision tree model to the entire dataset using "max_depth=3", and create a tree diagram using Graphviz. Then, figure out what each leaf represents. What did the decision tree learn that a linear regression model could not learn?
diff --git a/code/17_decision_trees_nb.py b/code/17_decision_trees_nb.py
new file mode 100644
index 0000000..566835e
--- /dev/null
+++ b/code/17_decision_trees_nb.py
@@ -0,0 +1,432 @@
+# # Decision Trees
+# 
+# *Adapted from Chapter 8 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/)*
+
+# Why are we learning about decision trees?
+# 
+# - Can be applied to both regression and classification problems
+# - Many useful properties
+# - Very popular
+# - Basis for more sophisticated models
+# - Have a different way of "thinking" than the other models we have studied
+
+# ## Lesson objectives
+# 
+# Students will be able to:
+# 
+# - Explain how a decision tree is created
+# - Build a decision tree model in scikit-learn
+# - Tune a decision tree model and explain how tuning impacts the model
+# - Interpret a tree diagram
+# - Describe the key differences between regression and classification trees
+# - Decide whether a decision tree is an appropriate model for a given problem
+
+# # Part 1: Regression trees
+# 
+# Major League Baseball player data from 1986-87:
+# 
+# - **Years** (x-axis): number of years playing in the major leagues
+# - **Hits** (y-axis): number of hits in the previous year
+# - **Salary** (color): low salary is blue/green, high salary is red/yellow
+
+# ![Salary data](images/salary_color.png)
+
+# Group exercise:
+# 
+# - The data above is our **training data**.
+# - We want to build a model that predicts the Salary of **future players** based on Years and Hits.
+# - We are going to "segment" the feature space into regions, and then use the **mean Salary in each region** as the predicted Salary for future players.
+# - Intuitively, you want to **maximize** the similarity (or "homogeneity") within a given region, and **minimize** the similarity between different regions.
+# 
+# Rules for segmenting:
+# 
+# - You can only use **straight lines**, drawn one at a time.
+# - Your line must either be **vertical or horizontal**.
+# - Your line **stops** when it hits an existing line.
+
+# ![Salary regions](images/salary_regions.png)
+
+# Above are the regions created by a computer:
+# 
+# - $R_1$: players with **less than 5 years** of experience, mean Salary of **\$166,000 **
+# - $R_2$: players with **5 or more years** of experience and **less than 118 hits**, mean Salary of **\$403,000 **
+# - $R_3$: players with **5 or more years** of experience and **118 hits or more**, mean Salary of **\$846,000 **
+# 
+# **Note:** Years and Hits are both integers, but the convention is to use the **midpoint** between adjacent values to label a split.
+# 
+# These regions are used to make predictions on **out-of-sample data**. Thus, there are only three possible predictions! (Is this different from how **linear regression** makes predictions?)
+# 
+# Below is the equivalent regression tree:
+
+# ![Salary tree](images/salary_tree.png)
+
+# The first split is **Years < 4.5**, thus that split goes at the top of the tree. When a splitting rule is **True**, you follow the left branch. When a splitting rule is **False**, you follow the right branch.
+# 
+# For players in the **left branch**, the mean Salary is \$166,000, thus you label it with that value. (Salary has been divided by 1000 and log-transformed to 5.11.)
+# 
+# For players in the **right branch**, there is a further split on **Hits < 117.5**, dividing players into two more Salary regions: \$403,000 (transformed to 6.00), and \$846,000 (transformed to 6.74).
+
+# ![Salary tree annotated](images/salary_tree_annotated.png)
+
+# **What does this tree tell you about your data?**
+# 
+# - Years is the most important factor determining Salary, with a lower number of Years corresponding to a lower Salary.
+# - For a player with a lower number of Years, Hits is not an important factor determining Salary.
+# - For a player with a higher number of Years, Hits is an important factor determining Salary, with a greater number of Hits corresponding to a higher Salary.
+# 
+# **Question:** What do you like and dislike about decision trees so far?
+
+# ## Building a regression tree by hand
+# 
+# Your **training data** is a tiny dataset of [used vehicle sale prices](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv). Your goal is to **predict price** for testing data.
+# 
+# 1. Read the data into a Pandas DataFrame.
+# 2. Explore the data by sorting, plotting, or split-apply-combine (aka `groupby`).
+# 3. Decide which feature is the most important predictor, and use that to create your first splitting rule.
+#     - Only binary splits are allowed.
+# 4. After making your first split, split your DataFrame into two parts, and then explore each part to figure out what other splits to make.
+# 5. Stop making splits once you are convinced that it strikes a good balance between underfitting and overfitting.
+#     - Your goal is to build a model that generalizes well.
+#     - You are allowed to split on the same variable multiple times!
+# 6. Draw your tree, labeling the leaves with the mean price for the observations in that region.
+#     - Make sure nothing is backwards: You follow the **left branch** if the rule is true, and the **right branch** if the rule is false.
+
+# ## How does a computer build a regression tree?
+# 
+# **Ideal approach:** Consider every possible partition of the feature space (computationally infeasible)
+# 
+# **"Good enough" approach:** recursive binary splitting
+# 
+# 1. Begin at the top of the tree.
+# 2. For **every feature**, examine **every possible cutpoint**, and choose the feature and cutpoint such that the resulting tree has the lowest possible mean squared error (MSE). Make that split.
+# 3. Examine the two resulting regions, and again make a **single split** (in one of the regions) to minimize the MSE.
+# 4. Keep repeating step 3 until a **stopping criterion** is met:
+#     - maximum tree depth (maximum number of splits required to arrive at a leaf)
+#     - minimum number of observations in a leaf
+
+# ### Demo: Choosing the ideal cutpoint for a given feature
+
+# load the vehicle training data into a DataFrame
+import pandas as pd
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv'
+train = pd.read_csv(url)
+
+
+# baseline: before splitting anything, predict the mean price of the entire dataset for every row
+train['prediction'] = train.price.mean()
+train
+
+
+# calculate the RMSE of those baseline predictions (square root of the mean squared error)
+from sklearn import metrics
+import numpy as np
+np.sqrt(metrics.mean_squared_error(train.price, train.prediction))
+
+
+# define a function that returns the training RMSE of a single split on miles at the given cutpoint (each side predicts its own mean price)
+def mileage_split(miles):
+    lower_mileage_price = train[train.miles < miles].price.mean()  # mean price of rows below the cutpoint
+    higher_mileage_price = train[train.miles >= miles].price.mean()  # mean price of rows at or above the cutpoint
+    train['prediction'] = np.where(train.miles < miles, lower_mileage_price, higher_mileage_price)  # side effect: overwrites the "prediction" column of the global train
+    return np.sqrt(metrics.mean_squared_error(train.price, train.prediction))
+
+
+# calculate RMSE for tree which splits on miles < 50000
+print 'RMSE:', mileage_split(50000)
+train  # display train: mileage_split has replaced its "prediction" column
+
+
+# calculate RMSE for tree which splits on miles < 100000
+print 'RMSE:', mileage_split(100000)
+train  # display train: "prediction" now reflects the 100000 cutpoint
+
+
+# check candidate mileage cutpoints from the minimum up to (but excluding) the maximum, in steps of 1000
+mileage_range = range(train.miles.min(), train.miles.max(), 1000)
+RMSE = [mileage_split(miles) for miles in mileage_range]  # each call also overwrites train['prediction']
+
+
+# import pyplot and set the default figure size and font size for the plots below
+import matplotlib.pyplot as plt
+plt.rcParams['figure.figsize'] = (6, 4)
+plt.rcParams['font.size'] = 14
+
+
+# plot mileage cutpoint (x-axis) versus RMSE (y-axis)
+plt.plot(mileage_range, RMSE)
+plt.xlabel('Mileage cutpoint')
+plt.ylabel('RMSE (lower is better)')
+
+
+# **Recap:** Before every split, this process is repeated for every feature, and the feature and cutpoint that produce the lowest MSE are chosen.
+
+# ## Building a regression tree in scikit-learn
+
+# encode car as 0 and truck as 1
+
+
+# define X and y
+
+
+# instantiate a DecisionTreeRegressor (with random_state=1)
+
+
+# use leave-one-out cross-validation (LOOCV) to estimate the RMSE for this model
+
+
+# ## What happens when we grow a tree too deep?
+# 
+# - Left: Regression tree for Salary **grown deeper**
+# - Right: Comparison of the **training, testing, and cross-validation errors** for trees with different numbers of leaves
+
+# ![Salary tree grown deep](images/salary_tree_deep.png)
+
+# The **training error** continues to go down as the tree size increases (due to overfitting), but the lowest **cross-validation error** occurs for a tree with 3 leaves.
+
+# ## Tuning a regression tree
+# 
+# Let's try to reduce the RMSE by tuning the **max_depth** parameter:
+
+# try different values of max_depth one-by-one (DecisionTreeRegressor, cross_val_score, X and y are not defined in this script as shown; the exercise steps above are expected to provide them)
+treereg = DecisionTreeRegressor(max_depth=1, random_state=1)
+scores = cross_val_score(treereg, X, y, cv=14, scoring='mean_squared_error')  # NOTE(review): cv=14 is presumably one fold per training row (LOOCV) -- confirm
+np.mean(np.sqrt(-scores))  # scores are negated before the square root; the result is the mean of the per-fold RMSEs
+
+
+# Or, we could write a loop to try a range of values:
+
+# list of max_depth values to try (1 through 7)
+max_depth_range = range(1, 8)
+
+# list to store the average RMSE for each value of max_depth
+RMSE_scores = []
+
+# use LOOCV with each value of max_depth (same cv=14 and scoring as the one-off example above)
+for depth in max_depth_range:
+    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)
+    MSE_scores = cross_val_score(treereg, X, y, cv=14, scoring='mean_squared_error')
+    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))  # negate, take per-fold RMSE, then average
+
+
+# plot max_depth (x-axis) versus RMSE (y-axis)
+plt.plot(max_depth_range, RMSE_scores)
+plt.xlabel('max_depth')
+plt.ylabel('RMSE (lower is better)')
+
+
+# max_depth=3 was best, so fit a tree using that parameter on all of X and y
+treereg = DecisionTreeRegressor(max_depth=3, random_state=1)
+treereg.fit(X, y)
+
+
+# "Gini importance" of each feature: the (normalized) total reduction of error brought by that feature
+pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})  # feature_cols is expected to come from the "define X and y" exercise step
+
+
+# ## Creating a tree diagram
+
+# write the fitted regression tree to a Graphviz .dot file, labeling splits with the feature names
+from sklearn.tree import export_graphviz
+export_graphviz(treereg, out_file='tree_vehicles.dot', feature_names=feature_cols)
+
+# At the command line, run this to convert the .dot file to PNG:
+#   dot -Tpng tree_vehicles.dot -o tree_vehicles.png
+
+
+# ![Tree for vehicle data](images/tree_vehicles.png)
+
+# Reading the internal nodes:
+# 
+# - **samples:** number of observations in that node before splitting
+# - **mse:** MSE calculated by comparing the actual response values in that node against the mean response value in that node
+# - **rule:** rule used to split that node (go left if true, go right if false)
+# 
+# Reading the leaves:
+# 
+# - **samples:** number of observations in that node
+# - **value:** mean response value in that node
+# - **mse:** MSE calculated by comparing the actual response values in that node against "value"
+
+# ## Making predictions for the testing data
+
+# read the testing data
+
+
+# **Question:** Using the tree diagram above, what predictions will the model make for each observation?
+
+# use fitted model to make predictions on testing data
+
+
+# calculate RMSE
+
+
+# calculate RMSE for your own tree!
+
+
+# # Part 2: Classification trees
+# 
+# **Example:** Predict whether Barack Obama or Hillary Clinton will win the Democratic primary in a particular county in 2008:
+
+# ![Obama-Clinton decision tree](images/obama_clinton_tree.jpg)
+
+# **Questions:**
+# 
+# - What are the observations? How many observations are there?
+# - What is the response variable?
+# - What are the features?
+# - What is the most predictive feature?
+# - Why does the tree split on high school graduation rate twice in a row?
+# - What is the class prediction for the following county: 15% African-American, 90% high school graduation rate, located in the South, high poverty, high population density?
+# - What is the predicted probability for that same county?
+
+# ## Comparing regression trees and classification trees
+# 
+# |regression trees|classification trees|
+# |---|---|
+# |predict a continuous response|predict a categorical response|
+# |predict using mean response of each leaf|predict using most commonly occurring class of each leaf|
+# |splits are chosen to minimize MSE|splits are chosen to minimize Gini index (discussed below)|
+
+# ## Splitting criteria for classification trees
+# 
+# Common options for the splitting criteria:
+# 
+# - **classification error rate:** fraction of training observations in a region that don't belong to the most common class
+# - **Gini index:** measure of total variance across classes in a region
+
+# ### Example of classification error rate
+# 
+# Pretend we are predicting whether someone buys an iPhone or an Android:
+# 
+# - At a particular node, there are **25 observations** (phone buyers), of whom **10 bought iPhones and 15 bought Androids**.
+# - Since the majority class is **Android**, that's our prediction for all 25 observations, and thus the classification error rate is **10/25 = 40%**.
+# 
+# Our goal in making splits is to **reduce the classification error rate**. Let's try splitting on gender:
+# 
+# - **Males:** 2 iPhones and 12 Androids, thus the predicted class is Android
+# - **Females:** 8 iPhones and 3 Androids, thus the predicted class is iPhone
+# - Classification error rate after this split would be **5/25 = 20%**
+# 
+# Compare that with a split on age:
+# 
+# - **30 or younger:** 4 iPhones and 8 Androids, thus the predicted class is Android
+# - **31 or older:** 6 iPhones and 7 Androids, thus the predicted class is Android
+# - Classification error rate after this split would be **10/25 = 40%**
+# 
+# The decision tree algorithm will try **every possible split across all features**, and choose the split that **reduces the error rate the most.**
+
+# ### Example of Gini index
+# 
+# Calculate the Gini index before making a split:
+# 
+# $$1 - \left(\frac {iPhone} {Total}\right)^2 - \left(\frac {Android} {Total}\right)^2 = 1 - \left(\frac {10} {25}\right)^2 - \left(\frac {15} {25}\right)^2 = 0.48$$
+# 
+# - The **maximum value** of the Gini index for a two-class problem is 0.5, and occurs when the classes are perfectly balanced in a node.
+# - The **minimum value** of the Gini index is 0, and occurs when there is only one class represented in a node.
+# - A node with a lower Gini index is said to be more "pure".
+# 
+# Evaluating the split on **gender** using Gini index:
+# 
+# $$\text{Males: } 1 - \left(\frac {2} {14}\right)^2 - \left(\frac {12} {14}\right)^2 = 0.24$$
+# $$\text{Females: } 1 - \left(\frac {8} {11}\right)^2 - \left(\frac {3} {11}\right)^2 = 0.40$$
+# $$\text{Weighted Average: } 0.24 \left(\frac {14} {25}\right) + 0.40 \left(\frac {11} {25}\right) = 0.31$$
+# 
+# Evaluating the split on **age** using Gini index:
+# 
+# $$\text{30 or younger: } 1 - \left(\frac {4} {12}\right)^2 - \left(\frac {8} {12}\right)^2 = 0.44$$
+# $$\text{31 or older: } 1 - \left(\frac {6} {13}\right)^2 - \left(\frac {7} {13}\right)^2 = 0.50$$
+# $$\text{Weighted Average: } 0.44 \left(\frac {12} {25}\right) + 0.50 \left(\frac {13} {25}\right) = 0.47$$
+# 
+# Again, the decision tree algorithm will try **every possible split**, and will choose the split that **reduces the Gini index (and thus increases the "node purity") the most.**
+
+# ### Comparing classification error rate and Gini index
+# 
+# - Gini index is generally preferred because it will make splits that **increase node purity**, even if that split does not change the classification error rate.
+# - Node purity is important because we're interested in the **class proportions** in each region, since that's how we calculate the **predicted probability** of each class.
+# - scikit-learn's default splitting criterion for classification trees is Gini index.
+# 
+# Note: There is another common splitting criterion called **cross-entropy**. It's numerically similar to Gini index, but slower to compute, thus it's not as popular as Gini index.
+
+# ## Building a classification tree in scikit-learn
+
+# We'll build a classification tree using the Titanic data:
+
+# read the Titanic data into a DataFrame
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv'
+titanic = pd.read_csv(url)
+
+# encode female as 0 and male as 1 (overwrites the original Sex column)
+titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})
+
+# fill in the missing values for age with the median age
+titanic.Age.fillna(titanic.Age.median(), inplace=True)
+
+# create a DataFrame of dummy variables for Embarked, then drop its first column so that level becomes the baseline
+embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked')
+embarked_dummies.drop(embarked_dummies.columns[0], axis=1, inplace=True)
+
+# concatenate the original DataFrame and the dummy DataFrame column-wise
+titanic = pd.concat([titanic, embarked_dummies], axis=1)
+
+# preview the first rows of the updated DataFrame
+titanic.head()
+
+
+# - **Survived:** 0=died, 1=survived (response variable)
+# - **Pclass:** 1=first class, 2=second class, 3=third class
+#     - What will happen if the tree splits on this feature?
+# - **Sex:** 0=female, 1=male
+# - **Age:** numeric value
+# - **Embarked:** C or Q or S
+
+# define X (feature matrix) and y (response): Survived is predicted from the five feature columns
+feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']
+X = titanic[feature_cols]
+y = titanic.Survived
+
+
+# fit a classification tree with max_depth=3 on all data (no train/test split)
+from sklearn.tree import DecisionTreeClassifier
+treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
+treeclf.fit(X, y)
+
+
+# write the fitted classification tree to a Graphviz .dot file (export_graphviz is imported earlier in this script)
+export_graphviz(treeclf, out_file='tree_titanic.dot', feature_names=feature_cols)
+
+# At the command line, run this to convert the .dot file to PNG:
+#   dot -Tpng tree_titanic.dot -o tree_titanic.png
+
+
+# ![Tree for Titanic data](images/tree_titanic.png)
+
+# Notice the split in the bottom right: the **same class** is predicted in both of its leaves. That split didn't affect the **classification error rate**, though it did increase the **node purity**, which is important because it increases the accuracy of our predicted probabilities.
+
+# tabulate the fitted tree's importance value for each feature, one row per feature
+pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})
+
+
+# # Part 3: Comparing decision trees with other models
+# 
+# **Advantages of decision trees:**
+# 
+# - Can be used for regression or classification
+# - Can be displayed graphically
+# - Highly interpretable
+# - Can be specified as a series of rules, and more closely approximate human decision-making than other models
+# - Prediction is fast
+# - Features don't need scaling
+# - Automatically learns feature interactions
+# - Tends to ignore irrelevant features
+# - Non-parametric (will outperform linear models if relationship between features and response is highly non-linear)
+
+# ![Trees versus linear models](images/tree_vs_linear.png)
+
+# **Disadvantages of decision trees:**
+# 
+# - Performance is (generally) not competitive with the best supervised learning methods
+# - Can easily overfit the training data (tuning is required)
+# - Small variations in the data can result in a completely different tree (high variance)
+# - Recursive binary splitting makes "locally optimal" decisions that may not result in a globally optimal tree
+# - Doesn't tend to work well if the classes are highly unbalanced
+# - Doesn't tend to work well with very small datasets
diff --git a/code/18_ensembling_nb.py b/code/18_ensembling_nb.py
new file mode 100644
index 0000000..5abe70b
--- /dev/null
+++ b/code/18_ensembling_nb.py
@@ -0,0 +1,491 @@
+# # Ensembling
+# 
+# *Adapted from Chapter 8 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/)*
+
+# Why are we learning about ensembling?
+# 
+# - Very popular method for improving the predictive performance of machine learning models
+# - Provides a foundation for understanding more sophisticated models
+
+# ## Lesson objectives
+# 
+# Students will be able to:
+# 
+# - Define ensembling and its requirements
+# - Identify the two basic methods of ensembling
+# - Decide whether manual ensembling is a useful approach for a given problem
+# - Explain bagging and how it can be applied to decision trees
+# - Explain how out-of-bag error and feature importances are calculated from bagged trees
+# - Explain the difference between bagged trees and Random Forests
+# - Build and tune a Random Forest model in scikit-learn
+# - Decide whether a decision tree or a Random Forest is a better model for a given problem
+
+# # Part 1: Introduction
+# 
+# Let's pretend that instead of building a single model to solve a classification problem, you created **five independent models**, and each model was correct about 70% of the time. If you combined these models into an "ensemble" and used their majority vote as a prediction, how often would the ensemble be correct?
+
+import numpy as np
+
+# set a seed for reproducibility
+np.random.seed(1234)
+
+# generate 1000 random numbers (between 0 and 1) for each model, representing 1000 observations
+mod1 = np.random.rand(1000)
+mod2 = np.random.rand(1000)
+mod3 = np.random.rand(1000)
+mod4 = np.random.rand(1000)
+mod5 = np.random.rand(1000)
+
+# each model independently predicts 1 (the "correct response") if its random number was greater than 0.3
+preds1 = np.where(mod1 > 0.3, 1, 0)
+preds2 = np.where(mod2 > 0.3, 1, 0)
+preds3 = np.where(mod3 > 0.3, 1, 0)
+preds4 = np.where(mod4 > 0.3, 1, 0)
+preds5 = np.where(mod5 > 0.3, 1, 0)
+
+# print the first 20 predictions from each model
+print preds1[:20]
+print preds2[:20]
+print preds3[:20]
+print preds4[:20]
+print preds5[:20]
+
+
+# average the predictions and then round to 0 or 1
+ensemble_preds = np.round((preds1 + preds2 + preds3 + preds4 + preds5)/5.0).astype(int)
+
+# print the ensemble's first 20 predictions
+print ensemble_preds[:20]
+
+
+# how accurate was each individual model?
+print preds1.mean()
+print preds2.mean()
+print preds3.mean()
+print preds4.mean()
+print preds5.mean()
+
+
+# how accurate was the ensemble?
+print ensemble_preds.mean()
+
+
+# **Note:** As you add more models to the voting process, the probability of error decreases, which is known as [Condorcet's Jury Theorem](http://en.wikipedia.org/wiki/Condorcet%27s_jury_theorem).
+
+# ## What is ensembling?
+# 
+# **Ensemble learning (or "ensembling")** is the process of combining several predictive models in order to produce a combined model that is more accurate than any individual model.
+# 
+# - **Regression:** take the average of the predictions
+# - **Classification:** take a vote and use the most common prediction, or take the average of the predicted probabilities
+# 
+# For ensembling to work well, the models must have the following characteristics:
+# 
+# - **Accurate:** they outperform the null model
+# - **Independent:** their predictions are generated using different processes
+# 
+# **The big idea:** If you have a collection of individually imperfect (and independent) models, the "one-off" mistakes made by each model are probably not going to be made by the rest of the models, and thus the mistakes will be discarded when averaging the models.
+# 
+# There are two basic **methods for ensembling:**
+# 
+# - Manually ensemble your individual models
+# - Use a model that ensembles for you
+
+# # Part 2: Manual ensembling
+# 
+# What makes a good manual ensemble?
+# 
+# - Different types of **models**
+# - Different combinations of **features**
+# - Different **tuning parameters**
+
+# ![Machine learning flowchart](images/crowdflower_ensembling.jpg)
+# 
+# *Machine learning flowchart created by the [winner](https://github.com/ChenglongChen/Kaggle_CrowdFlower) of Kaggle's [CrowdFlower competition](https://www.kaggle.com/c/crowdflower-search-relevance)*
+
+# ## Comparing manual ensembling with a single model approach
+# 
+# **Advantages of ensembling:**
+# 
+# - Increases predictive accuracy
+# - Easy to get started
+# 
+# **Disadvantages of ensembling:**
+# 
+# - Decreases interpretability
+# - Takes longer to train
+# - Takes longer to predict
+# - More complex to automate and maintain
+# - Small gains in accuracy may not be worth the added complexity
+
+# # Part 3: Bagging
+# 
+# The primary weakness of **decision trees** is that they don't tend to have the best predictive accuracy. This is partially due to **high variance**, meaning that different splits in the training data can lead to very different trees.
+# 
+# **Bagging** is a general purpose procedure for reducing the variance of a machine learning method, but is particularly useful for decision trees. Bagging is short for **bootstrap aggregation**, meaning the aggregation of bootstrap samples.
+# 
+# What is a **bootstrap sample**? A random sample with replacement:
+
+# set a seed for reproducibility
+np.random.seed(1)
+
+# create an array of 1 through 20
+nums = np.arange(1, 21)
+print nums
+
+# sample that array 20 times with replacement
+print np.random.choice(a=nums, size=20, replace=True)
+
+
+# **How does bagging work (for decision trees)?**
+# 
+# 1. Grow B trees using B bootstrap samples from the training data.
+# 2. Train each tree on its bootstrap sample and make predictions.
+# 3. Combine the predictions:
+#     - Average the predictions for **regression trees**
+#     - Take a majority vote for **classification trees**
+# 
+# Notes:
+# 
+# - **Each bootstrap sample** should be the same size as the original training set.
+# - **B** should be a large enough value that the error seems to have "stabilized".
+# - The trees are **grown deep** so that they have low bias/high variance.
+# 
+# Bagging increases predictive accuracy by **reducing the variance**, similar to how cross-validation reduces the variance associated with train/test split (for estimating out-of-sample error) by splitting many times and averaging the results.
+
+# ## Manually implementing bagged decision trees (with B=10)
+
+# read in and prepare the vehicle training data
+import pandas as pd
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv'
+train = pd.read_csv(url)
+train['vtype'] = train.vtype.map({'car':0, 'truck':1})
+train
+
+
+# set a seed for reproducibility
+np.random.seed(123)
+
+# create ten bootstrap samples (will be used to select rows from the DataFrame)
+samples = [np.random.choice(a=14, size=14, replace=True) for _ in range(1, 11)]
+samples
+
+
+# show the rows for the first decision tree
+train.iloc[samples[0], :]
+
+
+# read in and prepare the vehicle testing data
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_test.csv'
+test = pd.read_csv(url)
+test['vtype'] = test.vtype.map({'car':0, 'truck':1})
+test
+
+
+from sklearn.tree import DecisionTreeRegressor
+
+# grow each tree deep
+treereg = DecisionTreeRegressor(max_depth=None, random_state=123)
+
+# list for storing predicted price from each tree
+predictions = []
+
+# define testing data
+X_test = test.iloc[:, 1:]
+y_test = test.iloc[:, 0]
+
+# grow one tree for each bootstrap sample and make predictions on testing data
+for sample in samples:
+    X_train = train.iloc[sample, 1:]
+    y_train = train.iloc[sample, 0]
+    treereg.fit(X_train, y_train)
+    y_pred = treereg.predict(X_test)
+    predictions.append(y_pred)
+
+# convert predictions from list to NumPy array
+predictions = np.array(predictions)
+predictions
+
+
+# average predictions
+np.mean(predictions, axis=0)
+
+
+# calculate RMSE
+from sklearn import metrics
+y_pred = np.mean(predictions, axis=0)
+np.sqrt(metrics.mean_squared_error(y_test, y_pred))
+
+
+# ## Bagged decision trees in scikit-learn (with B=500)
+
+# define the training and testing sets
+X_train = train.iloc[:, 1:]
+y_train = train.iloc[:, 0]
+X_test = test.iloc[:, 1:]
+y_test = test.iloc[:, 0]
+
+
+# instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator"
+from sklearn.ensemble import BaggingRegressor
+bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1)
+
+
+# fit and predict
+bagreg.fit(X_train, y_train)
+y_pred = bagreg.predict(X_test)
+y_pred
+
+
+# calculate RMSE
+np.sqrt(metrics.mean_squared_error(y_test, y_pred))
+
+
+# ## Estimating out-of-sample error
+# 
+# For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**!
+# 
+# On average, each bagged tree uses about **two-thirds** of the observations. For each tree, the **remaining observations** are called "out-of-bag" observations.
+
+# show the first bootstrap sample
+samples[0]
+
+
+# show the "in-bag" observations for each sample
+for sample in samples:
+    print set(sample)
+
+
+# show the "out-of-bag" observations for each sample
+for sample in samples:
+    print sorted(set(range(14)) - set(sample))
+
+
+# How to calculate **"out-of-bag error":**
+# 
+# 1. For every observation in the training data, predict its response value using **only** the trees in which that observation was out-of-bag. Average those predictions (for regression) or take a majority vote (for classification).
+# 2. Compare all predictions to the actual response values in order to compute the out-of-bag error.
+# 
+# When B is sufficiently large, the **out-of-bag error** is an accurate estimate of **out-of-sample error**.
+
+# compute the out-of-bag R-squared score (not MSE, unfortunately!) for B=500
+bagreg.oob_score_
+
+
+# ## Estimating feature importance
+# 
+# Bagging increases **predictive accuracy**, but decreases **model interpretability** because it's no longer possible to visualize the tree to understand the importance of each feature.
+# 
+# However, we can still obtain an overall summary of **feature importance** from bagged models:
+# 
+# - **Bagged regression trees:** calculate the total amount that **MSE** is decreased due to splits over a given feature, averaged over all trees
+# - **Bagged classification trees:** calculate the total amount that **Gini index** is decreased due to splits over a given feature, averaged over all trees
+
+# # Part 4: Random Forests
+# 
+# Random Forests is a **slight variation of bagged trees** that has even better performance:
+# 
+# - Exactly like bagging, we create an ensemble of decision trees using bootstrapped samples of the training set.
+# - However, when building each tree, each time a split is considered, a **random sample of m features** is chosen as split candidates from the **full set of p features**. The split is only allowed to use **one of those m features**.
+#     - A new random sample of features is chosen for **every single tree at every single split**.
+#     - For **classification**, m is typically chosen to be the square root of p.
+#     - For **regression**, m is typically chosen to be somewhere between p/3 and p.
+# 
+# What's the point?
+# 
+# - Suppose there is **one very strong feature** in the data set. When using bagged trees, most of the trees will use that feature as the top split, resulting in an ensemble of similar trees that are **highly correlated**.
+# - Averaging highly correlated quantities does not significantly reduce variance (which is the entire goal of bagging).
+# - By randomly leaving out candidate features from each split, **Random Forests "decorrelates" the trees**, such that the averaging process can reduce the variance of the resulting model.
+
+# # Part 5: Building and tuning decision trees and Random Forests
+# 
+# - Major League Baseball player data from 1986-87: [data](https://github.com/justmarkham/DAT8/blob/master/data/hitters.csv), [data dictionary](https://cran.r-project.org/web/packages/ISLR/ISLR.pdf) (page 7)
+# - Each observation represents a player
+# - **Goal:** Predict player salary
+
+# ## Preparing the data
+
+# read in the data
+url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/hitters.csv'
+hitters = pd.read_csv(url)
+
+# remove rows with missing values
+hitters.dropna(inplace=True)
+
+
+hitters.head()
+
+
+# encode categorical variables as integers
+hitters['League'] = pd.factorize(hitters.League)[0]
+hitters['Division'] = pd.factorize(hitters.Division)[0]
+hitters['NewLeague'] = pd.factorize(hitters.NewLeague)[0]
+hitters.head()
+
+
+# allow plots to appear in the notebook
+import matplotlib.pyplot as plt
+
+
+# scatter plot of Years versus Hits colored by Salary
+hitters.plot(kind='scatter', x='Years', y='Hits', c='Salary', colormap='jet', xlim=(0, 25), ylim=(0, 250))
+
+
+# define features: exclude career statistics (which start with "C") and the response (Salary)
+feature_cols = hitters.columns[hitters.columns.str.startswith('C') == False].drop('Salary')
+feature_cols
+
+
+# define X and y
+X = hitters[feature_cols]
+y = hitters.Salary
+
+
+# ## Predicting salary with a decision tree
+# 
+# Find the best **max_depth** for a decision tree using cross-validation:
+
+# list of values to try for max_depth
+max_depth_range = range(1, 21)
+
+# list to store the average RMSE for each value of max_depth
+RMSE_scores = []
+
+# use 10-fold cross-validation with each value of max_depth
+from sklearn.cross_validation import cross_val_score
+for depth in max_depth_range:
+    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)
+    MSE_scores = cross_val_score(treereg, X, y, cv=10, scoring='mean_squared_error')
+    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))
+
+
+# plot max_depth (x-axis) versus RMSE (y-axis)
+plt.plot(max_depth_range, RMSE_scores)
+plt.xlabel('max_depth')
+plt.ylabel('RMSE (lower is better)')
+
+
+# show the best RMSE and the corresponding max_depth
+sorted(zip(RMSE_scores, max_depth_range))[0]
+
+
+# max_depth=2 was best, so fit a tree using that parameter
+treereg = DecisionTreeRegressor(max_depth=2, random_state=1)
+treereg.fit(X, y)
+
+
+# compute feature importances
+pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_}).sort('importance')
+
+
+# ## Predicting salary with a Random Forest
+
+from sklearn.ensemble import RandomForestRegressor
+rfreg = RandomForestRegressor()
+rfreg
+
+
+# ### Tuning n_estimators
+# 
+# One important tuning parameter is **n_estimators**, which is the number of trees that should be grown. It should be a large enough value that the error seems to have "stabilized".
+
+# list of values to try for n_estimators
+estimator_range = range(10, 310, 10)
+
+# list to store the average RMSE for each value of n_estimators
+RMSE_scores = []
+
+# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)
+for estimator in estimator_range:
+    rfreg = RandomForestRegressor(n_estimators=estimator, random_state=1)
+    MSE_scores = cross_val_score(rfreg, X, y, cv=5, scoring='mean_squared_error')
+    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))
+
+
+# plot n_estimators (x-axis) versus RMSE (y-axis)
+plt.plot(estimator_range, RMSE_scores)
+plt.xlabel('n_estimators')
+plt.ylabel('RMSE (lower is better)')
+
+
+# ### Tuning max_features
+# 
+# The other important tuning parameter is **max_features**, which is the number of features that should be considered at each split.
+
+# list of values to try for max_features
+feature_range = range(1, len(feature_cols)+1)
+
+# list to store the average RMSE for each value of max_features
+RMSE_scores = []
+
+# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)
+for feature in feature_range:
+    rfreg = RandomForestRegressor(n_estimators=150, max_features=feature, random_state=1)
+    MSE_scores = cross_val_score(rfreg, X, y, cv=10, scoring='mean_squared_error')
+    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))
+
+
+# plot max_features (x-axis) versus RMSE (y-axis)
+plt.plot(feature_range, RMSE_scores)
+plt.xlabel('max_features')
+plt.ylabel('RMSE (lower is better)')
+
+
+# show the best RMSE and the corresponding max_features
+sorted(zip(RMSE_scores, feature_range))[0]
+
+
+# ### Fitting a Random Forest with the best parameters
+
+# max_features=8 is best and n_estimators=150 is sufficiently large
+rfreg = RandomForestRegressor(n_estimators=150, max_features=8, oob_score=True, random_state=1)
+rfreg.fit(X, y)
+
+
+# compute feature importances
+pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort('importance')
+
+
+# compute the out-of-bag R-squared score
+rfreg.oob_score_
+
+
+# ### Reducing X to its most important features
+
+# check the shape of X
+X.shape
+
+
+# set a threshold for which features to include
+print rfreg.transform(X, threshold=0.1).shape
+print rfreg.transform(X, threshold='mean').shape
+print rfreg.transform(X, threshold='median').shape
+
+
+# create a new feature matrix that only includes important features
+X_important = rfreg.transform(X, threshold='mean')
+
+
+# check the RMSE for a Random Forest that only includes important features
+rfreg = RandomForestRegressor(n_estimators=150, max_features=3, random_state=1)
+scores = cross_val_score(rfreg, X_important, y, cv=10, scoring='mean_squared_error')
+np.mean(np.sqrt(-scores))
+
+
+# ## Comparing Random Forests with decision trees
+# 
+# **Advantages of Random Forests:**
+# 
+# - Performance is competitive with the best supervised learning methods
+# - Provides a more reliable estimate of feature importance
+# - Allows you to estimate out-of-sample error without using train/test split or cross-validation
+# 
+# **Disadvantages of Random Forests:**
+# 
+# - Less interpretable
+# - Slower to train
+# - Slower to predict
+
+# ![Machine learning flowchart](images/driver_ensembling.png)
+# 
+# *Machine learning flowchart created by the [second place finisher](http://blog.kaggle.com/2015/04/20/axa-winners-interview-learning-telematic-fingerprints-from-gps-data/) of Kaggle's [Driver Telematics competition](https://www.kaggle.com/c/axa-driver-telematics-analysis)*
diff --git a/notebooks/17_bikeshare_exercise.ipynb b/notebooks/17_bikeshare_exercise.ipynb
new file mode 100644
index 0000000..01f2bab
--- /dev/null
+++ b/notebooks/17_bikeshare_exercise.ipynb
@@ -0,0 +1,212 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Exercise with Capital Bikeshare data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "- Capital Bikeshare dataset from Kaggle: [data](https://github.com/justmarkham/DAT8/blob/master/data/bikeshare.csv), [data dictionary](https://www.kaggle.com/c/bike-sharing-demand/data)\n",
+    "- Each observation represents the bikeshare rentals initiated during a given hour of a given day"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.cross_validation import cross_val_score\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.tree import DecisionTreeRegressor, export_graphviz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# read the data and set \"datetime\" as the index\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/bikeshare.csv'\n",
+    "bikes = pd.read_csv(url, index_col='datetime', parse_dates=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# \"count\" is a method, so it's best to rename that column\n",
+    "bikes.rename(columns={'count':'total'}, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# create \"hour\" as its own feature\n",
+    "bikes['hour'] = bikes.index.hour"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "bikes.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "bikes.tail()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- **hour** ranges from 0 (midnight) through 23 (11pm)\n",
+    "- **workingday** is either 0 (weekend or holiday) or 1 (non-holiday weekday)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 1\n",
+    "\n",
+    "Run these two `groupby` statements and figure out what they tell you about the data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "bikes.groupby('workingday').total.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "bikes.groupby('hour').total.mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 2\n",
+    "\n",
+    "Run this plotting code, and make sure you understand the output. Then, split this plot into two separate plots conditioned on \"workingday\". (In other words, one plot should display the hourly trend for \"workingday=0\", and the other should display the hourly trend for \"workingday=1\".)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "bikes.groupby('hour').total.mean().plot()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 3\n",
+    "\n",
+    "Fit a linear regression model to the entire dataset, using \"total\" as the response and \"hour\" and \"workingday\" as the only features. Then, print the coefficients and interpret them. What are the limitations of linear regression in this instance?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 4\n",
+    "\n",
+    "Use 10-fold cross-validation to calculate the RMSE for the linear regression model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 5\n",
+    "\n",
+    "Use 10-fold cross-validation to evaluate a decision tree model with those same features (fit to any \"max_depth\" you choose)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Task 6\n",
+    "\n",
+    "Fit a decision tree model to the entire dataset using \"max_depth=3\", and create a tree diagram using Graphviz. Then, figure out what each leaf represents. What did the decision tree learn that a linear regression model could not learn?"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/notebooks/17_decision_trees.ipynb b/notebooks/17_decision_trees.ipynb
new file mode 100644
index 0000000..89abee1
--- /dev/null
+++ b/notebooks/17_decision_trees.ipynb
@@ -0,0 +1,898 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Decision Trees\n",
+    "\n",
+    "*Adapted from Chapter 8 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/)*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Why are we learning about decision trees?\n",
+    "\n",
+    "- Can be applied to both regression and classification problems\n",
+    "- Many useful properties\n",
+    "- Very popular\n",
+    "- Basis for more sophisticated models\n",
+    "- Have a different way of \"thinking\" than the other models we have studied"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lesson objectives\n",
+    "\n",
+    "Students will be able to:\n",
+    "\n",
+    "- Explain how a decision tree is created\n",
+    "- Build a decision tree model in scikit-learn\n",
+    "- Tune a decision tree model and explain how tuning impacts the model\n",
+    "- Interpret a tree diagram\n",
+    "- Describe the key differences between regression and classification trees\n",
+    "- Decide whether a decision tree is an appropriate model for a given problem"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 1: Regression trees\n",
+    "\n",
+    "Major League Baseball player data from 1986-87:\n",
+    "\n",
+    "- **Years** (x-axis): number of years playing in the major leagues\n",
+    "- **Hits** (y-axis): number of hits in the previous year\n",
+    "- **Salary** (color): low salary is blue/green, high salary is red/yellow"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Salary data](images/salary_color.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Group exercise:\n",
+    "\n",
+    "- The data above is our **training data**.\n",
+    "- We want to build a model that predicts the Salary of **future players** based on Years and Hits.\n",
+    "- We are going to \"segment\" the feature space into regions, and then use the **mean Salary in each region** as the predicted Salary for future players.\n",
+    "- Intuitively, you want to **maximize** the similarity (or \"homogeneity\") within a given region, and **minimize** the similarity between different regions.\n",
+    "\n",
+    "Rules for segmenting:\n",
+    "\n",
+    "- You can only use **straight lines**, drawn one at a time.\n",
+    "- Your line must either be **vertical or horizontal**.\n",
+    "- Your line **stops** when it hits an existing line."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Salary regions](images/salary_regions.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Above are the regions created by a computer:\n",
+    "\n",
+    "- $R_1$: players with **less than 5 years** of experience, mean Salary of **\\$166,000 **\n",
+    "- $R_2$: players with **5 or more years** of experience and **less than 118 hits**, mean Salary of **\\$403,000 **\n",
+    "- $R_3$: players with **5 or more years** of experience and **118 hits or more**, mean Salary of **\\$846,000 **\n",
+    "\n",
+    "**Note:** Years and Hits are both integers, but the convention is to use the **midpoint** between adjacent values to label a split.\n",
+    "\n",
+    "These regions are used to make predictions on **out-of-sample data**. Thus, there are only three possible predictions! (Is this different from how **linear regression** makes predictions?)\n",
+    "\n",
+    "Below is the equivalent regression tree:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Salary tree](images/salary_tree.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The first split is **Years < 4.5**, thus that split goes at the top of the tree. When a splitting rule is **True**, you follow the left branch. When a splitting rule is **False**, you follow the right branch.\n",
+    "\n",
+    "For players in the **left branch**, the mean Salary is \\$166,000, thus you label it with that value. (Salary has been divided by 1000 and log-transformed to 5.11.)\n",
+    "\n",
+    "For players in the **right branch**, there is a further split on **Hits < 117.5**, dividing players into two more Salary regions: \\$403,000 (transformed to 6.00), and \\$846,000 (transformed to 6.74)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Salary tree annotated](images/salary_tree_annotated.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**What does this tree tell you about your data?**\n",
+    "\n",
+    "- Years is the most important factor determining Salary, with a lower number of Years corresponding to a lower Salary.\n",
+    "- For a player with a lower number of Years, Hits is not an important factor determining Salary.\n",
+    "- For a player with a higher number of Years, Hits is an important factor determining Salary, with a greater number of Hits corresponding to a higher Salary.\n",
+    "\n",
+    "**Question:** What do you like and dislike about decision trees so far?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building a regression tree by hand\n",
+    "\n",
+    "Your **training data** is a tiny dataset of [used vehicle sale prices](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv). Your goal is to **predict price** for testing data.\n",
+    "\n",
+    "1. Read the data into a Pandas DataFrame.\n",
+    "2. Explore the data by sorting, plotting, or split-apply-combine (aka `groupby`).\n",
+    "3. Decide which feature is the most important predictor, and use that to create your first splitting rule.\n",
+    "    - Only binary splits are allowed.\n",
+    "4. After making your first split, split your DataFrame into two parts, and then explore each part to figure out what other splits to make.\n",
+    "5. Stop making splits once you are convinced that it strikes a good balance between underfitting and overfitting.\n",
+    "    - Your goal is to build a model that generalizes well.\n",
+    "    - You are allowed to split on the same variable multiple times!\n",
+    "6. Draw your tree, labeling the leaves with the mean price for the observations in that region.\n",
+    "    - Make sure nothing is backwards: You follow the **left branch** if the rule is true, and the **right branch** if the rule is false."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How does a computer build a regression tree?\n",
+    "\n",
+    "**Ideal approach:** Consider every possible partition of the feature space (computationally infeasible)\n",
+    "\n",
+    "**\"Good enough\" approach:** recursive binary splitting\n",
+    "\n",
+    "1. Begin at the top of the tree.\n",
+    "2. For **every feature**, examine **every possible cutpoint**, and choose the feature and cutpoint such that the resulting tree has the lowest possible mean squared error (MSE). Make that split.\n",
+    "3. Examine the two resulting regions, and again make a **single split** (in one of the regions) to minimize the MSE.\n",
+    "4. Keep repeating step 3 until a **stopping criterion** is met:\n",
+    "    - maximum tree depth (maximum number of splits required to arrive at a leaf)\n",
+    "    - minimum number of observations in a leaf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Demo: Choosing the ideal cutpoint for a given feature"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# vehicle data\n",
+    "import pandas as pd\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv'\n",
+    "train = pd.read_csv(url)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# before splitting anything, just predict the mean of the entire dataset\n",
+    "train['prediction'] = train.price.mean()\n",
+    "train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE for those predictions\n",
+    "from sklearn import metrics\n",
+    "import numpy as np\n",
+    "np.sqrt(metrics.mean_squared_error(train.price, train.prediction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define a function that calculates the RMSE for a given split of miles\n",
+    "def mileage_split(miles):\n",
+    "    lower_mileage_price = train[train.miles < miles].price.mean()\n",
+    "    higher_mileage_price = train[train.miles >= miles].price.mean()\n",
+    "    train['prediction'] = np.where(train.miles < miles, lower_mileage_price, higher_mileage_price)\n",
+    "    return np.sqrt(metrics.mean_squared_error(train.price, train.prediction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE for tree which splits on miles < 50000\n",
+    "print 'RMSE:', mileage_split(50000)\n",
+    "train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE for tree which splits on miles < 100000\n",
+    "print 'RMSE:', mileage_split(100000)\n",
+    "train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# check candidate mileage cutpoints (in steps of 1000) across the observed range of miles\n",
+    "mileage_range = range(train.miles.min(), train.miles.max(), 1000)\n",
+    "RMSE = [mileage_split(miles) for miles in mileage_range]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# allow plots to appear in the notebook\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt\n",
+    "plt.rcParams['figure.figsize'] = (6, 4)\n",
+    "plt.rcParams['font.size'] = 14"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# plot mileage cutpoint (x-axis) versus RMSE (y-axis)\n",
+    "plt.plot(mileage_range, RMSE)\n",
+    "plt.xlabel('Mileage cutpoint')\n",
+    "plt.ylabel('RMSE (lower is better)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Recap:** Before every split, this process is repeated for every feature, and the feature and cutpoint that produces the lowest MSE is chosen."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building a regression tree in scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# encode car as 0 and truck as 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define X and y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# instantiate a DecisionTreeRegressor (with random_state=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# use leave-one-out cross-validation (LOOCV) to estimate the RMSE for this model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## What happens when we grow a tree too deep?\n",
+    "\n",
+    "- Left: Regression tree for Salary **grown deeper**\n",
+    "- Right: Comparison of the **training, testing, and cross-validation errors** for trees with different numbers of leaves"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Salary tree grown deep](images/salary_tree_deep.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The **training error** continues to go down as the tree size increases (due to overfitting), but the lowest **cross-validation error** occurs for a tree with 3 leaves."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tuning a regression tree\n",
+    "\n",
+    "Let's try to reduce the RMSE by tuning the **max_depth** parameter:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# try different values one-by-one\n",
+    "treereg = DecisionTreeRegressor(max_depth=1, random_state=1)\n",
+    "scores = cross_val_score(treereg, X, y, cv=14, scoring='mean_squared_error')\n",
+    "np.mean(np.sqrt(-scores))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Or, we could write a loop to try a range of values:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# list of values to try\n",
+    "max_depth_range = range(1, 8)\n",
+    "\n",
+    "# list to store the average RMSE for each value of max_depth\n",
+    "RMSE_scores = []\n",
+    "\n",
+    "# use LOOCV with each value of max_depth\n",
+    "for depth in max_depth_range:\n",
+    "    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)\n",
+    "    MSE_scores = cross_val_score(treereg, X, y, cv=14, scoring='mean_squared_error')\n",
+    "    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# plot max_depth (x-axis) versus RMSE (y-axis)\n",
+    "plt.plot(max_depth_range, RMSE_scores)\n",
+    "plt.xlabel('max_depth')\n",
+    "plt.ylabel('RMSE (lower is better)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# max_depth=3 was best, so fit a tree using that parameter\n",
+    "treereg = DecisionTreeRegressor(max_depth=3, random_state=1)\n",
+    "treereg.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# \"Gini importance\" of each feature: the (normalized) total reduction of error brought by that feature\n",
+    "pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating a tree diagram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# create a Graphviz file\n",
+    "from sklearn.tree import export_graphviz\n",
+    "export_graphviz(treereg, out_file='tree_vehicles.dot', feature_names=feature_cols)\n",
+    "\n",
+    "# At the command line, run this to convert to PNG:\n",
+    "#   dot -Tpng tree_vehicles.dot -o tree_vehicles.png"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Tree for vehicle data](images/tree_vehicles.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Reading the internal nodes:\n",
+    "\n",
+    "- **samples:** number of observations in that node before splitting\n",
+    "- **mse:** MSE calculated by comparing the actual response values in that node against the mean response value in that node\n",
+    "- **rule:** rule used to split that node (go left if true, go right if false)\n",
+    "\n",
+    "Reading the leaves:\n",
+    "\n",
+    "- **samples:** number of observations in that node\n",
+    "- **value:** mean response value in that node\n",
+    "- **mse:** MSE calculated by comparing the actual response values in that node against \"value\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Making predictions for the testing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# read the testing data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Question:** Using the tree diagram above, what predictions will the model make for each observation?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# use fitted model to make predictions on testing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE for your own tree!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 2: Classification trees\n",
+    "\n",
+    "**Example:** Predict whether Barack Obama or Hillary Clinton will win the Democratic primary in a particular county in 2008:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Obama-Clinton decision tree](images/obama_clinton_tree.jpg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Questions:**\n",
+    "\n",
+    "- What are the observations? How many observations are there?\n",
+    "- What is the response variable?\n",
+    "- What are the features?\n",
+    "- What is the most predictive feature?\n",
+    "- Why does the tree split on high school graduation rate twice in a row?\n",
+    "- What is the class prediction for the following county: 15% African-American, 90% high school graduation rate, located in the South, high poverty, high population density?\n",
+    "- What is the predicted probability for that same county?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparing regression trees and classification trees\n",
+    "\n",
+    "|regression trees|classification trees|\n",
+    "|---|---|\n",
+    "|predict a continuous response|predict a categorical response|\n",
+    "|predict using mean response of each leaf|predict using most commonly occurring class of each leaf|\n",
+    "|splits are chosen to minimize MSE|splits are chosen to minimize Gini index (discussed below)|"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Splitting criteria for classification trees\n",
+    "\n",
+    "Common options for the splitting criteria:\n",
+    "\n",
+    "- **classification error rate:** fraction of training observations in a region that don't belong to the most common class\n",
+    "- **Gini index:** measure of total variance across classes in a region"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example of classification error rate\n",
+    "\n",
+    "Pretend we are predicting whether someone buys an iPhone or an Android:\n",
+    "\n",
+    "- At a particular node, there are **25 observations** (phone buyers), of whom **10 bought iPhones and 15 bought Androids**.\n",
+    "- Since the majority class is **Android**, that's our prediction for all 25 observations, and thus the classification error rate is **10/25 = 40%**.\n",
+    "\n",
+    "Our goal in making splits is to **reduce the classification error rate**. Let's try splitting on gender:\n",
+    "\n",
+    "- **Males:** 2 iPhones and 12 Androids, thus the predicted class is Android\n",
+    "- **Females:** 8 iPhones and 3 Androids, thus the predicted class is iPhone\n",
+    "- Classification error rate after this split would be **5/25 = 20%**\n",
+    "\n",
+    "Compare that with a split on age:\n",
+    "\n",
+    "- **30 or younger:** 4 iPhones and 8 Androids, thus the predicted class is Android\n",
+    "- **31 or older:** 6 iPhones and 7 Androids, thus the predicted class is Android\n",
+    "- Classification error rate after this split would be **10/25 = 40%**\n",
+    "\n",
+    "The decision tree algorithm will try **every possible split across all features**, and choose the split that **reduces the error rate the most.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Example of Gini index\n",
+    "\n",
+    "Calculate the Gini index before making a split:\n",
+    "\n",
+    "$$1 - \\left(\\frac {iPhone} {Total}\\right)^2 - \\left(\\frac {Android} {Total}\\right)^2 = 1 - \\left(\\frac {10} {25}\\right)^2 - \\left(\\frac {15} {25}\\right)^2 = 0.48$$\n",
+    "\n",
+    "- The **maximum value** of the Gini index is 0.5, and occurs when the classes are perfectly balanced in a node.\n",
+    "- The **minimum value** of the Gini index is 0, and occurs when there is only one class represented in a node.\n",
+    "- A node with a lower Gini index is said to be more \"pure\".\n",
+    "\n",
+    "Evaluating the split on **gender** using Gini index:\n",
+    "\n",
+    "$$\\text{Males: } 1 - \\left(\\frac {2} {14}\\right)^2 - \\left(\\frac {12} {14}\\right)^2 = 0.24$$\n",
+    "$$\\text{Females: } 1 - \\left(\\frac {8} {11}\\right)^2 - \\left(\\frac {3} {11}\\right)^2 = 0.40$$\n",
+    "$$\\text{Weighted Average: } 0.24 \\left(\\frac {14} {25}\\right) + 0.40 \\left(\\frac {11} {25}\\right) = 0.31$$\n",
+    "\n",
+    "Evaluating the split on **age** using Gini index:\n",
+    "\n",
+    "$$\\text{30 or younger: } 1 - \\left(\\frac {4} {12}\\right)^2 - \\left(\\frac {8} {12}\\right)^2 = 0.44$$\n",
+    "$$\\text{31 or older: } 1 - \\left(\\frac {6} {13}\\right)^2 - \\left(\\frac {7} {13}\\right)^2 = 0.50$$\n",
+    "$$\\text{Weighted Average: } 0.44 \\left(\\frac {12} {25}\\right) + 0.50 \\left(\\frac {13} {25}\\right) = 0.47$$\n",
+    "\n",
+    "Again, the decision tree algorithm will try **every possible split**, and will choose the split that **reduces the Gini index (and thus increases the \"node purity\") the most.**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Comparing classification error rate and Gini index\n",
+    "\n",
+    "- Gini index is generally preferred because it will make splits that **increase node purity**, even if that split does not change the classification error rate.\n",
+    "- Node purity is important because we're interested in the **class proportions** in each region, since that's how we calculate the **predicted probability** of each class.\n",
+    "- scikit-learn's default splitting criterion for classification trees is Gini index.\n",
+    "\n",
+    "Note: There is another common splitting criterion called **cross-entropy**. It's numerically similar to Gini index, but slower to compute, thus it's not as popular as Gini index."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Building a classification tree in scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll build a classification tree using the Titanic data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# read in the data\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/titanic.csv'\n",
+    "titanic = pd.read_csv(url)\n",
+    "\n",
+    "# encode female as 0 and male as 1\n",
+    "titanic['Sex'] = titanic.Sex.map({'female':0, 'male':1})\n",
+    "\n",
+    "# fill in the missing values for age with the median age\n",
+    "titanic.Age.fillna(titanic.Age.median(), inplace=True)\n",
+    "\n",
+    "# create a DataFrame of dummy variables for Embarked\n",
+    "embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked')\n",
+    "embarked_dummies.drop(embarked_dummies.columns[0], axis=1, inplace=True)\n",
+    "\n",
+    "# concatenate the original DataFrame and the dummy DataFrame\n",
+    "titanic = pd.concat([titanic, embarked_dummies], axis=1)\n",
+    "\n",
+    "# print the updated DataFrame\n",
+    "titanic.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- **Survived:** 0=died, 1=survived (response variable)\n",
+    "- **Pclass:** 1=first class, 2=second class, 3=third class\n",
+    "    - What will happen if the tree splits on this feature?\n",
+    "- **Sex:** 0=female, 1=male\n",
+    "- **Age:** numeric value\n",
+    "- **Embarked:** C or Q or S"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define X and y\n",
+    "feature_cols = ['Pclass', 'Sex', 'Age', 'Embarked_Q', 'Embarked_S']\n",
+    "X = titanic[feature_cols]\n",
+    "y = titanic.Survived"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# fit a classification tree with max_depth=3 on all data\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)\n",
+    "treeclf.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# create a Graphviz file\n",
+    "export_graphviz(treeclf, out_file='tree_titanic.dot', feature_names=feature_cols)\n",
+    "\n",
+    "# At the command line, run this to convert to PNG:\n",
+    "#   dot -Tpng tree_titanic.dot -o tree_titanic.png"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Tree for Titanic data](images/tree_titanic.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notice the split in the bottom right: the **same class** is predicted in both of its leaves. That split didn't affect the **classification error rate**, though it did increase the **node purity**, which is important because it increases the accuracy of our predicted probabilities."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# compute the feature importances\n",
+    "pd.DataFrame({'feature':feature_cols, 'importance':treeclf.feature_importances_})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 3: Comparing decision trees with other models\n",
+    "\n",
+    "**Advantages of decision trees:**\n",
+    "\n",
+    "- Can be used for regression or classification\n",
+    "- Can be displayed graphically\n",
+    "- Highly interpretable\n",
+    "- Can be specified as a series of rules, and more closely approximate human decision-making than other models\n",
+    "- Prediction is fast\n",
+    "- Features don't need scaling\n",
+    "- Automatically learns feature interactions\n",
+    "- Tends to ignore irrelevant features\n",
+    "- Non-parametric (will outperform linear models if relationship between features and response is highly non-linear)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Trees versus linear models](images/tree_vs_linear.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Disadvantages of decision trees:**\n",
+    "\n",
+    "- Performance is (generally) not competitive with the best supervised learning methods\n",
+    "- Can easily overfit the training data (tuning is required)\n",
+    "- Small variations in the data can result in a completely different tree (high variance)\n",
+    "- Recursive binary splitting makes \"locally optimal\" decisions that may not result in a globally optimal tree\n",
+    "- Doesn't tend to work well if the classes are highly unbalanced\n",
+    "- Doesn't tend to work well with very small datasets"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/notebooks/18_ensembling.ipynb b/notebooks/18_ensembling.ipynb
new file mode 100644
index 0000000..401e4fb
--- /dev/null
+++ b/notebooks/18_ensembling.ipynb
@@ -0,0 +1,1013 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Ensembling\n",
+    "\n",
+    "*Adapted from Chapter 8 of [An Introduction to Statistical Learning](http://www-bcf.usc.edu/~gareth/ISL/)*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Why are we learning about ensembling?\n",
+    "\n",
+    "- Very popular method for improving the predictive performance of machine learning models\n",
+    "- Provides a foundation for understanding more sophisticated models"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lesson objectives\n",
+    "\n",
+    "Students will be able to:\n",
+    "\n",
+    "- Define ensembling and its requirements\n",
+    "- Identify the two basic methods of ensembling\n",
+    "- Decide whether manual ensembling is a useful approach for a given problem\n",
+    "- Explain bagging and how it can be applied to decision trees\n",
+    "- Explain how out-of-bag error and feature importances are calculated from bagged trees\n",
+    "- Explain the difference between bagged trees and Random Forests\n",
+    "- Build and tune a Random Forest model in scikit-learn\n",
+    "- Decide whether a decision tree or a Random Forest is a better model for a given problem"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 1: Introduction\n",
+    "\n",
+    "Let's pretend that instead of building a single model to solve a classification problem, you created **five independent models**, and each model was correct about 70% of the time. If you combined these models into an \"ensemble\" and used their majority vote as a prediction, how often would the ensemble be correct?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# set a seed for reproducibility\n",
+    "np.random.seed(1234)\n",
+    "\n",
+    "# generate 1000 random numbers (between 0 and 1) for each model, representing 1000 observations\n",
+    "mod1 = np.random.rand(1000)\n",
+    "mod2 = np.random.rand(1000)\n",
+    "mod3 = np.random.rand(1000)\n",
+    "mod4 = np.random.rand(1000)\n",
+    "mod5 = np.random.rand(1000)\n",
+    "\n",
+    "# each model independently predicts 1 (the \"correct response\") if random number was greater than 0.3\n",
+    "preds1 = np.where(mod1 > 0.3, 1, 0)\n",
+    "preds2 = np.where(mod2 > 0.3, 1, 0)\n",
+    "preds3 = np.where(mod3 > 0.3, 1, 0)\n",
+    "preds4 = np.where(mod4 > 0.3, 1, 0)\n",
+    "preds5 = np.where(mod5 > 0.3, 1, 0)\n",
+    "\n",
+    "# print the first 20 predictions from each model\n",
+    "print preds1[:20]\n",
+    "print preds2[:20]\n",
+    "print preds3[:20]\n",
+    "print preds4[:20]\n",
+    "print preds5[:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# average the predictions and then round to 0 or 1\n",
+    "ensemble_preds = np.round((preds1 + preds2 + preds3 + preds4 + preds5)/5.0).astype(int)\n",
+    "\n",
+    "# print the ensemble's first 20 predictions\n",
+    "print ensemble_preds[:20]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# how accurate was each individual model?\n",
+    "print preds1.mean()\n",
+    "print preds2.mean()\n",
+    "print preds3.mean()\n",
+    "print preds4.mean()\n",
+    "print preds5.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# how accurate was the ensemble?\n",
+    "print ensemble_preds.mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Note:** As you add more models to the voting process, the probability of error decreases, which is known as [Condorcet's Jury Theorem](http://en.wikipedia.org/wiki/Condorcet%27s_jury_theorem)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## What is ensembling?\n",
+    "\n",
+    "**Ensemble learning (or \"ensembling\")** is the process of combining several predictive models in order to produce a combined model that is more accurate than any individual model.\n",
+    "\n",
+    "- **Regression:** take the average of the predictions\n",
+    "- **Classification:** take a vote and use the most common prediction, or take the average of the predicted probabilities\n",
+    "\n",
+    "For ensembling to work well, the models must have the following characteristics:\n",
+    "\n",
+    "- **Accurate:** they outperform the null model\n",
+    "- **Independent:** their predictions are generated using different processes\n",
+    "\n",
+    "**The big idea:** If you have a collection of individually imperfect (and independent) models, the \"one-off\" mistakes made by each model are probably not going to be made by the rest of the models, and thus the mistakes will be discarded when averaging the models.\n",
+    "\n",
+    "There are two basic **methods for ensembling:**\n",
+    "\n",
+    "- Manually ensemble your individual models\n",
+    "- Use a model that ensembles for you"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 2: Manual ensembling\n",
+    "\n",
+    "What makes a good manual ensemble?\n",
+    "\n",
+    "- Different types of **models**\n",
+    "- Different combinations of **features**\n",
+    "- Different **tuning parameters**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Machine learning flowchart](images/crowdflower_ensembling.jpg)\n",
+    "\n",
+    "*Machine learning flowchart created by the [winner](https://github.com/ChenglongChen/Kaggle_CrowdFlower) of Kaggle's [CrowdFlower competition](https://www.kaggle.com/c/crowdflower-search-relevance)*"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparing manual ensembling with a single model approach\n",
+    "\n",
+    "**Advantages of ensembling:**\n",
+    "\n",
+    "- Increases predictive accuracy\n",
+    "- Easy to get started\n",
+    "\n",
+    "**Disadvantages of ensembling:**\n",
+    "\n",
+    "- Decreases interpretability\n",
+    "- Takes longer to train\n",
+    "- Takes longer to predict\n",
+    "- More complex to automate and maintain\n",
+    "- Small gains in accuracy may not be worth the added complexity"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 3: Bagging\n",
+    "\n",
+    "The primary weakness of **decision trees** is that they don't tend to have the best predictive accuracy. This is partially due to **high variance**, meaning that different splits in the training data can lead to very different trees.\n",
+    "\n",
+    "**Bagging** is a general purpose procedure for reducing the variance of a machine learning method, but is particularly useful for decision trees. Bagging is short for **bootstrap aggregation**, meaning the aggregation of bootstrap samples.\n",
+    "\n",
+    "What is a **bootstrap sample**? A random sample with replacement:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# set a seed for reproducibility\n",
+    "np.random.seed(1)\n",
+    "\n",
+    "# create an array of 1 through 20\n",
+    "nums = np.arange(1, 21)\n",
+    "print nums\n",
+    "\n",
+    "# sample that array 20 times with replacement\n",
+    "print np.random.choice(a=nums, size=20, replace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**How does bagging work (for decision trees)?**\n",
+    "\n",
+    "1. Grow B trees using B bootstrap samples from the training data.\n",
+    "2. Train each tree on its bootstrap sample and make predictions.\n",
+    "3. Combine the predictions:\n",
+    "    - Average the predictions for **regression trees**\n",
+    "    - Take a majority vote for **classification trees**\n",
+    "\n",
+    "Notes:\n",
+    "\n",
+    "- **Each bootstrap sample** should be the same size as the original training set.\n",
+    "- **B** should be a large enough value that the error seems to have \"stabilized\".\n",
+    "- The trees are **grown deep** so that they have low bias/high variance.\n",
+    "\n",
+    "Bagging increases predictive accuracy by **reducing the variance**, similar to how cross-validation reduces the variance associated with train/test split (for estimating out-of-sample error) by splitting many times and averaging the results."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Manually implementing bagged decision trees (with B=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# read in and prepare the vehicle training data\n",
+    "import pandas as pd\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_train.csv'\n",
+    "train = pd.read_csv(url)\n",
+    "train['vtype'] = train.vtype.map({'car':0, 'truck':1})\n",
+    "train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# set a seed for reproducibility\n",
+    "np.random.seed(123)\n",
+    "\n",
+    "# create ten bootstrap samples (will be used to select rows from the DataFrame)\n",
+    "samples = [np.random.choice(a=14, size=14, replace=True) for _ in range(1, 11)]\n",
+    "samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the rows for the first decision tree\n",
+    "train.iloc[samples[0], :]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# read in and prepare the vehicle testing data\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/vehicles_test.csv'\n",
+    "test = pd.read_csv(url)\n",
+    "test['vtype'] = test.vtype.map({'car':0, 'truck':1})\n",
+    "test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "\n",
+    "# grow each tree deep\n",
+    "treereg = DecisionTreeRegressor(max_depth=None, random_state=123)\n",
+    "\n",
+    "# list for storing predicted price from each tree\n",
+    "predictions = []\n",
+    "\n",
+    "# define testing data\n",
+    "X_test = test.iloc[:, 1:]\n",
+    "y_test = test.iloc[:, 0]\n",
+    "\n",
+    "# grow one tree for each bootstrap sample and make predictions on testing data\n",
+    "for sample in samples:\n",
+    "    X_train = train.iloc[sample, 1:]\n",
+    "    y_train = train.iloc[sample, 0]\n",
+    "    treereg.fit(X_train, y_train)\n",
+    "    y_pred = treereg.predict(X_test)\n",
+    "    predictions.append(y_pred)\n",
+    "\n",
+    "# convert predictions from list to NumPy array\n",
+    "predictions = np.array(predictions)\n",
+    "predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# average predictions\n",
+    "np.mean(predictions, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE\n",
+    "from sklearn import metrics\n",
+    "y_pred = np.mean(predictions, axis=0)\n",
+    "np.sqrt(metrics.mean_squared_error(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Bagged decision trees in scikit-learn (with B=500)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define the training and testing sets\n",
+    "X_train = train.iloc[:, 1:]\n",
+    "y_train = train.iloc[:, 0]\n",
+    "X_test = test.iloc[:, 1:]\n",
+    "y_test = test.iloc[:, 0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# instruct BaggingRegressor to use DecisionTreeRegressor as the \"base estimator\"\n",
+    "from sklearn.ensemble import BaggingRegressor\n",
+    "bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# fit and predict\n",
+    "bagreg.fit(X_train, y_train)\n",
+    "y_pred = bagreg.predict(X_test)\n",
+    "y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# calculate RMSE\n",
+    "np.sqrt(metrics.mean_squared_error(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Estimating out-of-sample error\n",
+    "\n",
+    "For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**!\n",
+    "\n",
+    "On average, each bagged tree uses about **two-thirds** of the unique observations (because each bootstrap sample is drawn with replacement). For each tree, the **remaining observations** are called \"out-of-bag\" observations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the first bootstrap sample\n",
+    "samples[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the \"in-bag\" observations for each sample\n",
+    "for sample in samples:\n",
+    "    print set(sample)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the \"out-of-bag\" observations for each sample\n",
+    "for sample in samples:\n",
+    "    print sorted(set(range(14)) - set(sample))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How to calculate **\"out-of-bag error\":**\n",
+    "\n",
+    "1. For every observation in the training data, predict its response value using **only** the trees in which that observation was out-of-bag. Average those predictions (for regression) or take a majority vote (for classification).\n",
+    "2. Compare all predictions to the actual response values in order to compute the out-of-bag error.\n",
+    "\n",
+    "When B is sufficiently large, the **out-of-bag error** is an accurate estimate of **out-of-sample error**."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# compute the out-of-bag R-squared score (not MSE, unfortunately!) for B=500\n",
+    "bagreg.oob_score_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Estimating feature importance\n",
+    "\n",
+    "Bagging increases **predictive accuracy**, but decreases **model interpretability** because it's no longer possible to visualize the tree to understand the importance of each feature.\n",
+    "\n",
+    "However, we can still obtain an overall summary of **feature importance** from bagged models:\n",
+    "\n",
+    "- **Bagged regression trees:** calculate the total amount that **MSE** is decreased due to splits over a given feature, averaged over all trees\n",
+    "- **Bagged classification trees:** calculate the total amount that **Gini index** is decreased due to splits over a given feature, averaged over all trees"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 4: Random Forests\n",
+    "\n",
+    "Random Forests is a **slight variation of bagged trees** that has even better performance:\n",
+    "\n",
+    "- Exactly like bagging, we create an ensemble of decision trees using bootstrapped samples of the training set.\n",
+    "- However, when building each tree, each time a split is considered, a **random sample of m features** is chosen as split candidates from the **full set of p features**. The split is only allowed to use **one of those m features**.\n",
+    "    - A new random sample of features is chosen for **every single tree at every single split**.\n",
+    "    - For **classification**, m is typically chosen to be the square root of p.\n",
+    "    - For **regression**, m is typically chosen to be somewhere between p/3 and p.\n",
+    "\n",
+    "What's the point?\n",
+    "\n",
+    "- Suppose there is **one very strong feature** in the data set. When using bagged trees, most of the trees will use that feature as the top split, resulting in an ensemble of similar trees that are **highly correlated**.\n",
+    "- Averaging highly correlated quantities does not significantly reduce variance (which is the entire goal of bagging).\n",
+    "- By randomly leaving out candidate features from each split, **Random Forests \"decorrelates\" the trees**, such that the averaging process can reduce the variance of the resulting model."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 5: Building and tuning decision trees and Random Forests\n",
+    "\n",
+    "- Major League Baseball player data from 1986-87: [data](https://github.com/justmarkham/DAT8/blob/master/data/hitters.csv), [data dictionary](https://cran.r-project.org/web/packages/ISLR/ISLR.pdf) (page 7)\n",
+    "- Each observation represents a player\n",
+    "- **Goal:** Predict player salary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparing the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# read in the data\n",
+    "url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/hitters.csv'\n",
+    "hitters = pd.read_csv(url)\n",
+    "\n",
+    "# remove rows with missing values\n",
+    "hitters.dropna(inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "hitters.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# encode categorical variables as integers\n",
+    "hitters['League'] = pd.factorize(hitters.League)[0]\n",
+    "hitters['Division'] = pd.factorize(hitters.Division)[0]\n",
+    "hitters['NewLeague'] = pd.factorize(hitters.NewLeague)[0]\n",
+    "hitters.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# allow plots to appear in the notebook\n",
+    "%matplotlib inline\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# scatter plot of Years versus Hits colored by Salary\n",
+    "hitters.plot(kind='scatter', x='Years', y='Hits', c='Salary', colormap='jet', xlim=(0, 25), ylim=(0, 250))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define features: exclude career statistics (which start with \"C\") and the response (Salary)\n",
+    "feature_cols = hitters.columns[hitters.columns.str.startswith('C') == False].drop('Salary')\n",
+    "feature_cols"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# define X and y\n",
+    "X = hitters[feature_cols]\n",
+    "y = hitters.Salary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Predicting salary with a decision tree\n",
+    "\n",
+    "Find the best **max_depth** for a decision tree using cross-validation:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# list of values to try for max_depth\n",
+    "max_depth_range = range(1, 21)\n",
+    "\n",
+    "# list to store the average RMSE for each value of max_depth\n",
+    "RMSE_scores = []\n",
+    "\n",
+    "# use 10-fold cross-validation with each value of max_depth\n",
+    "from sklearn.cross_validation import cross_val_score\n",
+    "for depth in max_depth_range:\n",
+    "    treereg = DecisionTreeRegressor(max_depth=depth, random_state=1)\n",
+    "    MSE_scores = cross_val_score(treereg, X, y, cv=10, scoring='mean_squared_error')\n",
+    "    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# plot max_depth (x-axis) versus RMSE (y-axis)\n",
+    "plt.plot(max_depth_range, RMSE_scores)\n",
+    "plt.xlabel('max_depth')\n",
+    "plt.ylabel('RMSE (lower is better)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the best RMSE and the corresponding max_depth\n",
+    "sorted(zip(RMSE_scores, max_depth_range))[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# max_depth=2 was best, so fit a tree using that parameter\n",
+    "treereg = DecisionTreeRegressor(max_depth=2, random_state=1)\n",
+    "treereg.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# compute feature importances\n",
+    "pd.DataFrame({'feature':feature_cols, 'importance':treereg.feature_importances_}).sort('importance')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Predicting salary with a Random Forest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "rfreg = RandomForestRegressor()\n",
+    "rfreg"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Tuning n_estimators\n",
+    "\n",
+    "One important tuning parameter is **n_estimators**, which is the number of trees that should be grown. It should be a large enough value that the error seems to have \"stabilized\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# list of values to try for n_estimators\n",
+    "estimator_range = range(10, 310, 10)\n",
+    "\n",
+    "# list to store the average RMSE for each value of n_estimators\n",
+    "RMSE_scores = []\n",
+    "\n",
+    "# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)\n",
+    "for estimator in estimator_range:\n",
+    "    rfreg = RandomForestRegressor(n_estimators=estimator, random_state=1)\n",
+    "    MSE_scores = cross_val_score(rfreg, X, y, cv=5, scoring='mean_squared_error')\n",
+    "    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# plot n_estimators (x-axis) versus RMSE (y-axis)\n",
+    "plt.plot(estimator_range, RMSE_scores)\n",
+    "plt.xlabel('n_estimators')\n",
+    "plt.ylabel('RMSE (lower is better)')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Tuning max_features\n",
+    "\n",
+    "The other important tuning parameter is **max_features**, which is the number of features that should be considered at each split."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# list of values to try for max_features\n",
+    "feature_range = range(1, len(feature_cols)+1)\n",
+    "\n",
+    "# list to store the average RMSE for each value of max_features\n",
+    "RMSE_scores = []\n",
+    "\n",
+    "# use 10-fold cross-validation with each value of max_features (WARNING: SLOW!)\n",
+    "for feature in feature_range:\n",
+    "    rfreg = RandomForestRegressor(n_estimators=150, max_features=feature, random_state=1)\n",
+    "    MSE_scores = cross_val_score(rfreg, X, y, cv=10, scoring='mean_squared_error')\n",
+    "    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# plot max_features (x-axis) versus RMSE (y-axis)\n",
+    "plt.plot(feature_range, RMSE_scores)\n",
+    "plt.xlabel('max_features')\n",
+    "plt.ylabel('RMSE (lower is better)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# show the best RMSE and the corresponding max_features\n",
+    "sorted(zip(RMSE_scores, feature_range))[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fitting a Random Forest with the best parameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# max_features=8 is best and n_estimators=150 is sufficiently large\n",
+    "rfreg = RandomForestRegressor(n_estimators=150, max_features=8, oob_score=True, random_state=1)\n",
+    "rfreg.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# compute feature importances\n",
+    "pd.DataFrame({'feature':feature_cols, 'importance':rfreg.feature_importances_}).sort('importance')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# compute the out-of-bag R-squared score\n",
+    "rfreg.oob_score_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Reducing X to its most important features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# check the shape of X\n",
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# set a threshold for which features to include\n",
+    "print rfreg.transform(X, threshold=0.1).shape\n",
+    "print rfreg.transform(X, threshold='mean').shape\n",
+    "print rfreg.transform(X, threshold='median').shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# create a new feature matrix that only includes important features\n",
+    "X_important = rfreg.transform(X, threshold='mean')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# check the RMSE for a Random Forest that only includes important features\n",
+    "rfreg = RandomForestRegressor(n_estimators=150, max_features=3, random_state=1)\n",
+    "scores = cross_val_score(rfreg, X_important, y, cv=10, scoring='mean_squared_error')\n",
+    "np.mean(np.sqrt(-scores))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Comparing Random Forests with decision trees\n",
+    "\n",
+    "**Advantages of Random Forests:**\n",
+    "\n",
+    "- Performance is competitive with the best supervised learning methods\n",
+    "- Provides a more reliable estimate of feature importance\n",
+    "- Allows you to estimate out-of-sample error without using train/test split or cross-validation\n",
+    "\n",
+    "**Disadvantages of Random Forests:**\n",
+    "\n",
+    "- Less interpretable\n",
+    "- Slower to train\n",
+    "- Slower to predict"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![Machine learning flowchart](images/driver_ensembling.png)\n",
+    "\n",
+    "*Machine learning flowchart created by the [second place finisher](http://blog.kaggle.com/2015/04/20/axa-winners-interview-learning-telematic-fingerprints-from-gps-data/) of Kaggle's [Driver Telematics competition](https://www.kaggle.com/c/axa-driver-telematics-analysis)*"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/other/model_comparison.md b/other/model_comparison.md
index dcfd0ef..dce0684 100644
--- a/other/model_comparison.md
+++ b/other/model_comparison.md
@@ -68,3 +68,40 @@
 - Correlated features can be problematic (due to the independence assumption)
 - Can't handle negative features (with Multinomial Naive Bayes)
 - Has a higher "asymptotic error" than logistic regression
+
+## Decision Trees
+
+**Advantages:**
+
+- Can be used for regression or classification
+- Can be displayed graphically
+- Highly interpretable
+- Can be specified as a series of rules, and more closely approximate human decision-making than other models
+- Prediction is fast
+- Features don't need scaling
+- Automatically learns feature interactions
+- Tends to ignore irrelevant features
+- Non-parametric (will outperform linear models if relationship between features and response is highly non-linear)
+
+**Disadvantages:**
+
+- Performance is (generally) not competitive with the best supervised learning methods
+- Can easily overfit the training data (tuning is required)
+- Small variations in the data can result in a completely different tree (high variance)
+- Recursive binary splitting makes "locally optimal" decisions that may not result in a globally optimal tree
+- Doesn't tend to work well if the classes are highly unbalanced
+- Doesn't tend to work well with very small datasets
+
+## Random Forests
+
+**Advantages (compared to decision trees):**
+
+- Performance is competitive with the best supervised learning methods
+- Provides a more reliable estimate of feature importance
+- Allows you to estimate out-of-sample error without using train/test split or cross-validation
+
+**Disadvantages (compared to decision trees):**
+
+- Less interpretable
+- Slower to train
+- Slower to predict