diff --git a/src/conv.py b/src/conv.py
index 759bf86cf..52a606663 100644
--- a/src/conv.py
+++ b/src/conv.py
@@ -9,22 +9,37 @@
 
 
 """
+from collections import Counter
+
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import theano
+import theano.tensor as T
+
 import network3
 from network3 import sigmoid, tanh, ReLU, Network
 from network3 import ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer
+
 training_data, validation_data, test_data = network3.load_data_shared()
 mini_batch_size = 10
 
-def shallow():
-    for j in range(3):
+def shallow(n=3, epochs=60):
+    nets = []
+    for j in range(n):
         print "A shallow net with 100 hidden neurons"
         net = Network([
             FullyConnectedLayer(n_in=784, n_out=100),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(
+            training_data, epochs, mini_batch_size, 0.1,
+            validation_data, test_data)
+        nets.append(net)
+    return nets
 
-def basic_conv():
-    for j in range(3):
+def basic_conv(n=3, epochs=60):
+    for j in range(n):
         print "Conv + FC architecture"
         net = Network([
             ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
@@ -32,7 +47,9 @@ def basic_conv():
                           poolsize=(2, 2)),
             FullyConnectedLayer(n_in=20*12*12, n_out=100),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(
+            training_data, epochs, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
 def omit_FC():
     for j in range(3):
@@ -43,6 +60,7 @@ def omit_FC():
                           poolsize=(2, 2)),
             SoftmaxLayer(n_in=20*12*12, n_out=10)], mini_batch_size)
         net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
 def dbl_conv(activation_fn=sigmoid):
     for j in range(3):
@@ -59,8 +77,14 @@ def dbl_conv(activation_fn=sigmoid):
             FullyConnectedLayer(
                 n_in=40*4*4, n_out=100, activation_fn=activation_fn),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
+# The following experiment was eventually omitted from the chapter,
+# but I've left it in here, since it's an important negative result:
+# basic l2 regularization didn't help much. The reason (I believe) is
+# that using convolutional-pooling layers is already a pretty strong
+# regularizer.
def regularized_dbl_conv():
     for lmbda in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]:
         for j in range(3):
@@ -96,11 +120,15 @@ def dbl_conv_relu():
 
 #### Some subsequent functions may make use of the expanded MNIST
 #### data. That can be generated by running expand_mnist.py.
-def expanded_data():
+def expanded_data(n=100):
+    """n is the number of neurons in the fully-connected layer. We'll try
+    n=100, 300, and 1000.
+ + """ expanded_training_data, _, _ = network3.load_data_shared( "../data/mnist_expanded.pkl.gz") for j in range(3): - print "Training with expanded data, run num %s" % j + print "Training with expanded data, %s neurons in the FC layer, run num %s" % (n, j) net = Network([ ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), filter_shape=(20, 1, 5, 5), @@ -110,8 +138,160 @@ def expanded_data(): filter_shape=(40, 20, 5, 5), poolsize=(2, 2), activation_fn=ReLU), - FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU), - SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) - net.SGD(expanded_training_data, 20, mini_batch_size, 0.03, + FullyConnectedLayer(n_in=40*4*4, n_out=n, activation_fn=ReLU), + SoftmaxLayer(n_in=n, n_out=10)], mini_batch_size) + net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, + validation_data, test_data, lmbda=0.1) + return net + +def expanded_data_double_fc(n=100): + """n is the number of neurons in both fully-connected layers. We'll + try n=100, 300, and 1000. + + """ + expanded_training_data, _, _ = network3.load_data_shared( + "../data/mnist_expanded.pkl.gz") + for j in range(3): + print "Training with expanded data, %s neurons in two FC layers, run num %s" % (n, j) + net = Network([ + ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), + filter_shape=(20, 1, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), + filter_shape=(40, 20, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + FullyConnectedLayer(n_in=40*4*4, n_out=n, activation_fn=ReLU), + FullyConnectedLayer(n_in=n, n_out=n, activation_fn=ReLU), + SoftmaxLayer(n_in=n, n_out=10)], mini_batch_size) + net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1) + +def double_fc_dropout(p0, p1, p2, repetitions): + expanded_training_data, _, _ = network3.load_data_shared( + "../data/mnist_expanded.pkl.gz") + nets = [] + for j in range(repetitions): + print "\n\nTraining using a dropout network with parameters ",p0,p1,p2 + print "Training with expanded data, run num %s" % j + net = Network([ + ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), + filter_shape=(20, 1, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), + filter_shape=(40, 20, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + FullyConnectedLayer( + n_in=40*4*4, n_out=1000, activation_fn=ReLU, p_dropout=p0), + FullyConnectedLayer( + n_in=1000, n_out=1000, activation_fn=ReLU, p_dropout=p1), + SoftmaxLayer(n_in=1000, n_out=10, p_dropout=p2)], mini_batch_size) + net.SGD(expanded_training_data, 40, mini_batch_size, 0.03, + validation_data, test_data) + nets.append(net) + return nets + +def ensemble(nets): + """Takes as input a list of nets, and then computes the accuracy on + the test data when classifications are computed by taking a vote + amongst the nets. Returns a tuple containing a list of indices + for test data which is erroneously classified, and a list of the + corresponding erroneous predictions. + + Note that this is a quick-and-dirty kluge: it'd be more reusable + (and faster) to define a Theano function taking the vote. But + this works. 
+ + """ + test_x, test_y = test_data + for net in nets: + i = T.lscalar() # mini-batch index + net.test_mb_predictions = theano.function( + [i], net.layers[-1].y_out, + givens={ + net.x: + test_x[i*net.mini_batch_size: (i+1)*net.mini_batch_size] + }) + net.test_predictions = list(np.concatenate( + [net.test_mb_predictions(i) for i in xrange(1000)])) + all_test_predictions = zip(*[net.test_predictions for net in nets]) + def plurality(p): return Counter(p).most_common(1)[0][0] + plurality_test_predictions = [plurality(p) + for p in all_test_predictions] + test_y_eval = test_y.eval() + error_locations = [j for j in xrange(10000) + if plurality_test_predictions[j] != test_y_eval[j]] + erroneous_predictions = [plurality(all_test_predictions[j]) + for j in error_locations] + print "Accuracy is {:.2%}".format((1-len(error_locations)/10000.0)) + return error_locations, erroneous_predictions + +def plot_errors(error_locations, erroneous_predictions=None): + test_x, test_y = test_data[0].eval(), test_data[1].eval() + fig = plt.figure() + error_images = [np.array(test_x[i]).reshape(28, -1) for i in error_locations] + n = min(40, len(error_locations)) + for j in range(n): + ax = plt.subplot2grid((5, 8), (j/8, j % 8)) + ax.matshow(error_images[j], cmap = matplotlib.cm.binary) + ax.text(24, 5, test_y[error_locations[j]]) + if erroneous_predictions: + ax.text(24, 24, erroneous_predictions[j]) + plt.xticks(np.array([])) + plt.yticks(np.array([])) + plt.tight_layout() + return plt + +def plot_filters(net, layer, x, y): + + """Plot the filters for net after the (convolutional) layer number + layer. They are plotted in x by y format. So, for example, if we + have 20 filters after layer 0, then we can call show_filters(net, 0, 5, 4) to + get a 5 by 4 plot of all filters.""" + filters = net.layers[layer].w.eval() + fig = plt.figure() + for j in range(len(filters)): + ax = fig.add_subplot(y, x, j) + ax.matshow(filters[j][0], cmap = matplotlib.cm.binary) + plt.xticks(np.array([])) + plt.yticks(np.array([])) + plt.tight_layout() + return plt + + +#### Helper method to run all experiments in the book + +def run_experiments(): + + """Run the experiments described in the book. Note that the later + experiments require access to the expanded training data, which + can be generated by running expand_mnist.py. 
+ + """ + shallow() + basic_conv() + omit_FC() + dbl_conv(activation_fn=sigmoid) + # omitted, but still interesting: regularized_dbl_conv() + dbl_conv_relu() + expanded_data(n=100) + expanded_data(n=300) + expanded_data(n=1000) + expanded_data_double_fc(n=100) + expanded_data_double_fc(n=300) + expanded_data_double_fc(n=1000) + nets = double_fc_dropout(0.5, 0.5, 0.5, 5) + # plot the erroneous digits in the ensemble of nets just trained + error_locations, erroneous_predictions = ensemble(nets) + plt = plot_errors(error_locations, erroneous_predictions) + plt.savefig("ensemble_errors.png") + # plot the filters learned by the first of the nets just trained + plt = plot_filters(nets[0], 0, 5, 4) + plt.savefig("net_full_layer_0.png") + plt = plot_filters(nets[0], 1, 8, 5) + plt.savefig("net_full_layer_1.png") + diff --git a/src/network3.py b/src/network3.py index 89a70ce76..b43136e08 100644 --- a/src/network3.py +++ b/src/network3.py @@ -37,23 +37,27 @@ import theano.tensor as T from theano.tensor.nnet import conv from theano.tensor.nnet import softmax +from theano.tensor import shared_randomstreams from theano.tensor.signal import downsample # Activation functions for neurons def linear(z): return z -def ReLU(z): return T.maximum(0, z) +def ReLU(z): return T.maximum(0.0, z) from theano.tensor.nnet import sigmoid from theano.tensor import tanh #### Constants -GPU = False +GPU = True if GPU: print "Trying to run under a GPU. If this is not desired, then modify "+\ "network3.py\nto set the GPU flag to False." try: theano.config.device = 'gpu' except: pass # it's already set theano.config.floatX = 'float32' +else: + print "Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True." #### Load the MNIST data def load_data_shared(filename="../data/mnist.pkl.gz"): @@ -87,27 +91,34 @@ def __init__(self, layers, mini_batch_size): self.x = T.matrix("x") self.y = T.ivector("y") init_layer = self.layers[0] - init_layer.set_inpt(self.x, self.mini_batch_size) + init_layer.set_inpt(self.x, self.x, self.mini_batch_size) for j in xrange(1, len(self.layers)): prev_layer, layer = self.layers[j-1], self.layers[j] - layer.set_inpt(prev_layer.output, self.mini_batch_size) + layer.set_inpt( + prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) self.output = self.layers[-1].output + self.output_dropout = self.layers[-1].output_dropout def SGD(self, training_data, epochs, mini_batch_size, eta, - validation_data, test_data, lmbda=0.0): + validation_data=None, test_data=None, lmbda=0.0): """Train the network using mini-batch stochastic gradient descent.""" training_x, training_y = training_data - validation_x, validation_y = validation_data - test_x, test_y = test_data + if validation_data: + validation_x, validation_y = validation_data + if test_data: + test_x, test_y = test_data # compute number of minibatches for training, validation and testing num_training_batches = size(training_data)/mini_batch_size - num_validation_batches = size(validation_data)/mini_batch_size - num_test_batches = size(test_data)/mini_batch_size + if validation_data: + num_validation_batches = size(validation_data)/mini_batch_size + if test_data: + num_test_batches = size(test_data)/mini_batch_size # define the (regularized) cost function, symbolic gradients, and updates l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) - cost = self.log_likelihood()+0.5*lmbda*l2_norm_squared/num_training_batches + cost = self.layers[-1].cost(self)+\ + 
+               0.5*lmbda*l2_norm_squared/num_training_batches
         grads = T.grad(cost, self.params)
         updates = [(param, param-eta*grad)
                    for param, grad in zip(self.params, grads)]
@@ -123,32 +134,39 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                 self.y:
                 training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
             })
-        validate_mb_accuracy = theano.function(
-            [i], self.layers[-1].accuracy(self.y),
-            givens={
-                self.x:
-                validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
-                self.y:
-                validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
-            })
-        test_mb_accuracy = theano.function(
-            [i], self.layers[-1].accuracy(self.y),
-            givens={
-                self.x:
-                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
-                self.y:
-                test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
-            })
-
+        if validation_data:
+            validate_mb_accuracy = theano.function(
+                [i], self.layers[-1].accuracy(self.y),
+                givens={
+                    self.x:
+                    validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
+                    self.y:
+                    validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
+        if test_data:
+            test_mb_accuracy = theano.function(
+                [i], self.layers[-1].accuracy(self.y),
+                givens={
+                    self.x:
+                    test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
+                    self.y:
+                    test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
+            self.test_mb_predictions = theano.function(
+                [i], self.layers[-1].y_out,
+                givens={
+                    self.x:
+                    test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
         # Do the actual training
-        best_validation_accuracy = 0.0
+        if validation_data: best_validation_accuracy = 0.0
         for epoch in xrange(epochs):
             for minibatch_index in xrange(num_training_batches):
                 iteration = num_training_batches*epoch+minibatch_index
                 if iteration % 1000 == 0:
                     print("Training mini-batch number {0}".format(iteration))
                 cost_ij = train_mb(minibatch_index)
-                if (iteration+1) % num_training_batches == 0:
+                if validation_data and (iteration+1) % num_training_batches == 0:
                     validation_accuracy = np.mean(
                         [validate_mb_accuracy(j) for j in xrange(num_validation_batches)])
                     print("Epoch {0}: validation accuracy {1:.2%}".format(
@@ -157,20 +175,20 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                         print("This is the best validation accuracy to date.")
                         best_validation_accuracy = validation_accuracy
                         best_iteration = iteration
-                        test_accuracy = np.mean(
-                            [test_mb_accuracy(j) for j in xrange(num_test_batches)])
-                        print('The corresponding test accuracy is {0:.2%}'.format(
-                            test_accuracy))
+                        if test_data:
+                            test_accuracy = np.mean(
+                                [test_mb_accuracy(j) for j in xrange(num_test_batches)])
+                            print('The corresponding test accuracy is {0:.2%}'.format(test_accuracy))
         print("Finished training network.")
-        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(
-            best_validation_accuracy, best_iteration))
-        print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
+        if validation_data:
+            print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(best_validation_accuracy, best_iteration))
+        if test_data:
+            print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
 
     def log_likelihood(self):
         "Return the log-likelihood cost."
         return -T.mean(T.log(self.output)[T.arange(self.y.shape[0]), self.y])
-
 
 #### Define layer types
 
 class ConvPoolLayer():
@@ -215,7 +233,7 @@ def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
             borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape(self.image_shape)
         conv_out = conv.conv2d(
             input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
@@ -224,20 +242,22 @@ def set_inpt(self, inpt, mini_batch_size):
             input=conv_out, ds=self.poolsize, ignore_border=True)
         self.output = self.activation_fn(
             pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
-
+        self.output_dropout = self.output # no dropout in the convolutional layers
 
 class FullyConnectedLayer():
 
-    def __init__(self, n_in, n_out, activation_fn=sigmoid):
+    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
         self.n_in = n_in
         self.n_out = n_out
         self.activation_fn = activation_fn
+        self.p_dropout = p_dropout
         self.inpt = None
         self.output = None
         # Initialize weights and biases
         self.w = theano.shared(
             np.asarray(
-                np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
+                np.random.normal(
+                    loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                 dtype=theano.config.floatX),
             name='w', borrow=True)
         self.b = theano.shared(
@@ -246,17 +266,31 @@ def __init__(self, n_in, n_out, activation_fn=sigmoid):
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape((mini_batch_size, self.n_in))
-        self.output = self.activation_fn(T.dot(self.inpt, self.w) + self.b)
+        self.output = self.activation_fn(
+            (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
+        self.y_out = T.argmax(self.output, axis=1)
+        self.inpt_dropout = dropout_layer(
+            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
+        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)
+
+    def cost(self, net):
+        "Return the cross-entropy cost."
+        return T.nnet.binary_crossentropy(self.output, net.y).mean()
+
+    def accuracy(self, y):
+        "Return the accuracy for the mini-batch."
+        return T.mean(T.eq(y, self.y_out))
 
 class SoftmaxLayer():
 
-    def __init__(self, n_in, n_out):
+    def __init__(self, n_in, n_out, p_dropout=0.0):
         self.inpt = None
         self.output = None
         self.n_in = n_in
         self.n_out = n_out
+        self.p_dropout = p_dropout
         # Initialize weights and biases
         self.w = theano.shared(
             np.zeros((n_in, n_out), dtype=theano.config.floatX),
@@ -266,10 +300,17 @@ def __init__(self, n_in, n_out):
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape((mini_batch_size, self.n_in))
-        self.output = softmax(T.dot(self.inpt, self.w) + self.b)
+        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
         self.y_out = T.argmax(self.output, axis=1)
+        self.inpt_dropout = dropout_layer(
+            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
+        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)
+
+    def cost(self, net):
+        "Return the log-likelihood cost."
+        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])
 
     def accuracy(self, y):
         "Return the accuracy for the mini-batch."
@@ -281,3 +322,8 @@ def size(data):
     "Return the size of the dataset `data`."
     return data[0].get_value(borrow=True).shape[0]
 
+def dropout_layer(layer, p_dropout):
+    srng = shared_randomstreams.RandomStreams(
+        np.random.RandomState(0).randint(999999))
+    mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape)
+    return layer*T.cast(mask, theano.config.floatX)
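
The dropout convention in the patched FullyConnectedLayer and SoftmaxLayer is: on the dropout path (output_dropout, used for the training cost) inputs are masked by a binomial sample and not rescaled, while on the clean path (output, used for validation/test accuracy) the weighted input is scaled by (1-p_dropout), so the two paths agree in expectation. A minimal NumPy sketch of that convention, for illustration only (it is not part of the patch, and the shapes, the 0.5 dropout rate, and the 200-sample average are arbitrary choices):

import numpy as np

rng = np.random.RandomState(0)

def train_preactivation(x, w, b, p_dropout):
    # training path: drop inputs with probability p_dropout, no rescaling
    mask = rng.binomial(n=1, p=1 - p_dropout, size=x.shape)
    return np.dot(x * mask, w) + b

def test_preactivation(x, w, b, p_dropout):
    # test path: keep all inputs, scale the weighted input by (1 - p_dropout)
    return (1 - p_dropout) * np.dot(x, w) + b

x = rng.rand(10, 784)
w = rng.randn(784, 100) * np.sqrt(1.0 / 100)
b = np.zeros(100)
avg_train = np.mean([train_preactivation(x, w, b, 0.5) for _ in range(200)],
                    axis=0)
# deviation is small because E[mask] = 1 - p_dropout
print np.max(np.abs(avg_train - test_preactivation(x, w, b, 0.5)))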
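
ensemble() in conv.py combines the trained dropout nets by a plurality vote: the nets' per-image predictions are zipped together and Counter picks the most common label for each test image. A toy, self-contained illustration of just that voting step (the prediction lists below are made up for the example):

from collections import Counter

def plurality(votes):
    # most frequent label among the nets' votes for one image
    return Counter(votes).most_common(1)[0][0]

net_predictions = [          # one list of predicted digits per (hypothetical) net
    [7, 2, 1, 0, 4],
    [7, 2, 1, 0, 9],
    [7, 3, 1, 0, 4],
]
ensemble_predictions = [plurality(votes)
                        for votes in zip(*net_predictions)]
print ensemble_predictions   # [7, 2, 1, 0, 4]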
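
Assuming the working directory is src/ and the MNIST pickles live in ../data/ (the paths the code above expects), one possible way to drive the patched conv.py from a Python 2 session is sketched below; conv.run_experiments() reproduces everything in one go.

# first generate ../data/mnist_expanded.pkl.gz, as noted in the docstrings above:
#     python expand_mnist.py
import conv

nets = conv.double_fc_dropout(0.5, 0.5, 0.5, 5)     # five dropout nets
error_locations, erroneous_predictions = conv.ensemble(nets)
conv.plot_errors(error_locations, erroneous_predictions).savefig("ensemble_errors.png")
# or run the full set of experiments from the chapter:
# conv.run_experiments()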