diff --git a/src/conv.py b/src/conv.py
index 759bf86cf..52a606663 100644
--- a/src/conv.py
+++ b/src/conv.py
@@ -9,22 +9,37 @@
 
 
 """
+from collections import Counter
+
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import numpy as np
+import theano
+import theano.tensor as T
+
 import network3
 from network3 import sigmoid, tanh, ReLU, Network
 from network3 import ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer
+
 training_data, validation_data, test_data = network3.load_data_shared()
 mini_batch_size = 10
 
-def shallow():
-    for j in range(3):
+def shallow(n=3, epochs=60):
+    nets = []
+    for j in range(n):
         print "A shallow net with 100 hidden neurons"
         net = Network([
             FullyConnectedLayer(n_in=784, n_out=100),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(
+            training_data, epochs, mini_batch_size, 0.1,
+            validation_data, test_data)
+        nets.append(net)
+    return nets
 
-def basic_conv():
-    for j in range(3):
+def basic_conv(n=3, epochs=60):
+    for j in range(n):
         print "Conv + FC architecture"
         net = Network([
             ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
@@ -32,7 +47,9 @@ def basic_conv():
                           poolsize=(2, 2)),
             FullyConnectedLayer(n_in=20*12*12, n_out=100),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(
+            training_data, epochs, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
 def omit_FC():
     for j in range(3):
@@ -43,6 +60,7 @@ def omit_FC():
                           poolsize=(2, 2)),
             SoftmaxLayer(n_in=20*12*12, n_out=10)], mini_batch_size)
         net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
 def dbl_conv(activation_fn=sigmoid):
     for j in range(3):
@@ -59,8 +77,14 @@ def dbl_conv(activation_fn=sigmoid):
             FullyConnectedLayer(
                 n_in=40*4*4, n_out=100, activation_fn=activation_fn),
             SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
-        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+        net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
+    return net
 
+# The following experiment was eventually omitted from the chapter,
+# but I've left it in here, since it's an important negative result:
+# basic l2 regularization didn't help much. The reason (I believe) is
+# that using convolutional-pooling layers is already a pretty strong
+# regularizer.
def regularized_dbl_conv():
     for lmbda in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]:
         for j in range(3):
@@ -96,11 +120,15 @@ def dbl_conv_relu():
 
 #### Some subsequent functions may make use of the expanded MNIST
 #### data. That can be generated by running expand_mnist.py.
-def expanded_data():
+def expanded_data(n=100):
+    """n is the number of neurons in the fully-connected layer. We'll try
+    n=100, 300, and 1000.
+ + """ expanded_training_data, _, _ = network3.load_data_shared( "../data/mnist_expanded.pkl.gz") for j in range(3): - print "Training with expanded data, run num %s" % j + print "Training with expanded data, %s neurons in the FC layer, run num %s" % (n, j) net = Network([ ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), filter_shape=(20, 1, 5, 5), @@ -110,8 +138,160 @@ def expanded_data(): filter_shape=(40, 20, 5, 5), poolsize=(2, 2), activation_fn=ReLU), - FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU), - SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size) - net.SGD(expanded_training_data, 20, mini_batch_size, 0.03, + FullyConnectedLayer(n_in=40*4*4, n_out=n, activation_fn=ReLU), + SoftmaxLayer(n_in=n, n_out=10)], mini_batch_size) + net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, + validation_data, test_data, lmbda=0.1) + return net + +def expanded_data_double_fc(n=100): + """n is the number of neurons in both fully-connected layers. We'll + try n=100, 300, and 1000. + + """ + expanded_training_data, _, _ = network3.load_data_shared( + "../data/mnist_expanded.pkl.gz") + for j in range(3): + print "Training with expanded data, %s neurons in two FC layers, run num %s" % (n, j) + net = Network([ + ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), + filter_shape=(20, 1, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), + filter_shape=(40, 20, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + FullyConnectedLayer(n_in=40*4*4, n_out=n, activation_fn=ReLU), + FullyConnectedLayer(n_in=n, n_out=n, activation_fn=ReLU), + SoftmaxLayer(n_in=n, n_out=10)], mini_batch_size) + net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1) + +def double_fc_dropout(p0, p1, p2, repetitions): + expanded_training_data, _, _ = network3.load_data_shared( + "../data/mnist_expanded.pkl.gz") + nets = [] + for j in range(repetitions): + print "\n\nTraining using a dropout network with parameters ",p0,p1,p2 + print "Training with expanded data, run num %s" % j + net = Network([ + ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), + filter_shape=(20, 1, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), + filter_shape=(40, 20, 5, 5), + poolsize=(2, 2), + activation_fn=ReLU), + FullyConnectedLayer( + n_in=40*4*4, n_out=1000, activation_fn=ReLU, p_dropout=p0), + FullyConnectedLayer( + n_in=1000, n_out=1000, activation_fn=ReLU, p_dropout=p1), + SoftmaxLayer(n_in=1000, n_out=10, p_dropout=p2)], mini_batch_size) + net.SGD(expanded_training_data, 40, mini_batch_size, 0.03, + validation_data, test_data) + nets.append(net) + return nets + +def ensemble(nets): + """Takes as input a list of nets, and then computes the accuracy on + the test data when classifications are computed by taking a vote + amongst the nets. Returns a tuple containing a list of indices + for test data which is erroneously classified, and a list of the + corresponding erroneous predictions. + + Note that this is a quick-and-dirty kluge: it'd be more reusable + (and faster) to define a Theano function taking the vote. But + this works. 
+ + """ + test_x, test_y = test_data + for net in nets: + i = T.lscalar() # mini-batch index + net.test_mb_predictions = theano.function( + [i], net.layers[-1].y_out, + givens={ + net.x: + test_x[i*net.mini_batch_size: (i+1)*net.mini_batch_size] + }) + net.test_predictions = list(np.concatenate( + [net.test_mb_predictions(i) for i in xrange(1000)])) + all_test_predictions = zip(*[net.test_predictions for net in nets]) + def plurality(p): return Counter(p).most_common(1)[0][0] + plurality_test_predictions = [plurality(p) + for p in all_test_predictions] + test_y_eval = test_y.eval() + error_locations = [j for j in xrange(10000) + if plurality_test_predictions[j] != test_y_eval[j]] + erroneous_predictions = [plurality(all_test_predictions[j]) + for j in error_locations] + print "Accuracy is {:.2%}".format((1-len(error_locations)/10000.0)) + return error_locations, erroneous_predictions + +def plot_errors(error_locations, erroneous_predictions=None): + test_x, test_y = test_data[0].eval(), test_data[1].eval() + fig = plt.figure() + error_images = [np.array(test_x[i]).reshape(28, -1) for i in error_locations] + n = min(40, len(error_locations)) + for j in range(n): + ax = plt.subplot2grid((5, 8), (j/8, j % 8)) + ax.matshow(error_images[j], cmap = matplotlib.cm.binary) + ax.text(24, 5, test_y[error_locations[j]]) + if erroneous_predictions: + ax.text(24, 24, erroneous_predictions[j]) + plt.xticks(np.array([])) + plt.yticks(np.array([])) + plt.tight_layout() + return plt + +def plot_filters(net, layer, x, y): + + """Plot the filters for net after the (convolutional) layer number + layer. They are plotted in x by y format. So, for example, if we + have 20 filters after layer 0, then we can call show_filters(net, 0, 5, 4) to + get a 5 by 4 plot of all filters.""" + filters = net.layers[layer].w.eval() + fig = plt.figure() + for j in range(len(filters)): + ax = fig.add_subplot(y, x, j) + ax.matshow(filters[j][0], cmap = matplotlib.cm.binary) + plt.xticks(np.array([])) + plt.yticks(np.array([])) + plt.tight_layout() + return plt + + +#### Helper method to run all experiments in the book + +def run_experiments(): + + """Run the experiments described in the book. Note that the later + experiments require access to the expanded training data, which + can be generated by running expand_mnist.py. 
+ + """ + shallow() + basic_conv() + omit_FC() + dbl_conv(activation_fn=sigmoid) + # omitted, but still interesting: regularized_dbl_conv() + dbl_conv_relu() + expanded_data(n=100) + expanded_data(n=300) + expanded_data(n=1000) + expanded_data_double_fc(n=100) + expanded_data_double_fc(n=300) + expanded_data_double_fc(n=1000) + nets = double_fc_dropout(0.5, 0.5, 0.5, 5) + # plot the erroneous digits in the ensemble of nets just trained + error_locations, erroneous_predictions = ensemble(nets) + plt = plot_errors(error_locations, erroneous_predictions) + plt.savefig("ensemble_errors.png") + # plot the filters learned by the first of the nets just trained + plt = plot_filters(nets[0], 0, 5, 4) + plt.savefig("net_full_layer_0.png") + plt = plot_filters(nets[0], 1, 8, 5) + plt.savefig("net_full_layer_1.png") + diff --git a/src/network3.py b/src/network3.py index 89a70ce76..b43136e08 100644 --- a/src/network3.py +++ b/src/network3.py @@ -37,23 +37,27 @@ import theano.tensor as T from theano.tensor.nnet import conv from theano.tensor.nnet import softmax +from theano.tensor import shared_randomstreams from theano.tensor.signal import downsample # Activation functions for neurons def linear(z): return z -def ReLU(z): return T.maximum(0, z) +def ReLU(z): return T.maximum(0.0, z) from theano.tensor.nnet import sigmoid from theano.tensor import tanh #### Constants -GPU = False +GPU = True if GPU: print "Trying to run under a GPU. If this is not desired, then modify "+\ "network3.py\nto set the GPU flag to False." try: theano.config.device = 'gpu' except: pass # it's already set theano.config.floatX = 'float32' +else: + print "Running with a CPU. If this is not desired, then the modify "+\ + "network3.py to set\nthe GPU flag to True." #### Load the MNIST data def load_data_shared(filename="../data/mnist.pkl.gz"): @@ -87,27 +91,34 @@ def __init__(self, layers, mini_batch_size): self.x = T.matrix("x") self.y = T.ivector("y") init_layer = self.layers[0] - init_layer.set_inpt(self.x, self.mini_batch_size) + init_layer.set_inpt(self.x, self.x, self.mini_batch_size) for j in xrange(1, len(self.layers)): prev_layer, layer = self.layers[j-1], self.layers[j] - layer.set_inpt(prev_layer.output, self.mini_batch_size) + layer.set_inpt( + prev_layer.output, prev_layer.output_dropout, self.mini_batch_size) self.output = self.layers[-1].output + self.output_dropout = self.layers[-1].output_dropout def SGD(self, training_data, epochs, mini_batch_size, eta, - validation_data, test_data, lmbda=0.0): + validation_data=None, test_data=None, lmbda=0.0): """Train the network using mini-batch stochastic gradient descent.""" training_x, training_y = training_data - validation_x, validation_y = validation_data - test_x, test_y = test_data + if validation_data: + validation_x, validation_y = validation_data + if test_data: + test_x, test_y = test_data # compute number of minibatches for training, validation and testing num_training_batches = size(training_data)/mini_batch_size - num_validation_batches = size(validation_data)/mini_batch_size - num_test_batches = size(test_data)/mini_batch_size + if validation_data: + num_validation_batches = size(validation_data)/mini_batch_size + if test_data: + num_test_batches = size(test_data)/mini_batch_size # define the (regularized) cost function, symbolic gradients, and updates l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers]) - cost = self.log_likelihood()+0.5*lmbda*l2_norm_squared/num_training_batches + cost = self.layers[-1].cost(self)+\ + 
+               0.5*lmbda*l2_norm_squared/num_training_batches
         grads = T.grad(cost, self.params)
         updates = [(param, param-eta*grad)
                    for param, grad in zip(self.params, grads)]
@@ -123,32 +134,39 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                 self.y:
                 training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
             })
-        validate_mb_accuracy = theano.function(
-            [i], self.layers[-1].accuracy(self.y),
-            givens={
-                self.x:
-                validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
-                self.y:
-                validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
-            })
-        test_mb_accuracy = theano.function(
-            [i], self.layers[-1].accuracy(self.y),
-            givens={
-                self.x:
-                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
-                self.y:
-                test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
-            })
-
+        if validation_data:
+            validate_mb_accuracy = theano.function(
+                [i], self.layers[-1].accuracy(self.y),
+                givens={
+                    self.x:
+                    validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
+                    self.y:
+                    validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
+        if test_data:
+            test_mb_accuracy = theano.function(
+                [i], self.layers[-1].accuracy(self.y),
+                givens={
+                    self.x:
+                    test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
+                    self.y:
+                    test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
+            self.test_mb_predictions = theano.function(
+                [i], self.layers[-1].y_out,
+                givens={
+                    self.x:
+                    test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
+                })
         # Do the actual training
-        best_validation_accuracy = 0.0
+        if validation_data: best_validation_accuracy = 0.0
         for epoch in xrange(epochs):
             for minibatch_index in xrange(num_training_batches):
                 iteration = num_training_batches*epoch+minibatch_index
                 if iteration % 1000 == 0:
                     print("Training mini-batch number {0}".format(iteration))
                 cost_ij = train_mb(minibatch_index)
-                if (iteration+1) % num_training_batches == 0:
+                if validation_data and (iteration+1) % num_training_batches == 0:
                     validation_accuracy = np.mean(
                         [validate_mb_accuracy(j) for j in xrange(num_validation_batches)])
                     print("Epoch {0}: validation accuracy {1:.2%}".format(
@@ -157,20 +175,20 @@ def SGD(self, training_data, epochs, mini_batch_size, eta,
                         print("This is the best validation accuracy to date.")
                         best_validation_accuracy = validation_accuracy
                         best_iteration = iteration
-                        test_accuracy = np.mean(
-                            [test_mb_accuracy(j) for j in xrange(num_test_batches)])
-                        print('The corresponding test accuracy is {0:.2%}'.format(
-                            test_accuracy))
+                        if test_data:
+                            test_accuracy = np.mean(
+                                [test_mb_accuracy(j) for j in xrange(num_test_batches)])
+                            print('The corresponding test accuracy is {0:.2%}'.format(test_accuracy))
         print("Finished training network.")
-        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(
-            best_validation_accuracy, best_iteration))
-        print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
+        if validation_data:
+            print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(best_validation_accuracy, best_iteration))
+        if test_data:
+            print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
 
     def log_likelihood(self):
         "Return the log-likelihood cost."
         return -T.mean(T.log(self.output)[T.arange(self.y.shape[0]), self.y])
-
 
 #### Define layer types
 
 class ConvPoolLayer():
@@ -215,7 +233,7 @@ def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
             borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape(self.image_shape)
         conv_out = conv.conv2d(
             input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
@@ -224,20 +242,22 @@ def set_inpt(self, inpt, mini_batch_size):
             input=conv_out, ds=self.poolsize, ignore_border=True)
         self.output = self.activation_fn(
             pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
-
+        self.output_dropout = self.output # no dropout in the convolutional layers
 
 class FullyConnectedLayer():
 
-    def __init__(self, n_in, n_out, activation_fn=sigmoid):
+    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
         self.n_in = n_in
         self.n_out = n_out
         self.activation_fn = activation_fn
+        self.p_dropout = p_dropout
         self.inpt = None
         self.output = None
         # Initialize weights and biases
         self.w = theano.shared(
             np.asarray(
-                np.random.normal(loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
+                np.random.normal(
+                    loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                 dtype=theano.config.floatX),
             name='w', borrow=True)
         self.b = theano.shared(
@@ -246,17 +266,31 @@ def __init__(self, n_in, n_out, activation_fn=sigmoid):
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape((mini_batch_size, self.n_in))
-        self.output = self.activation_fn(T.dot(self.inpt, self.w) + self.b)
+        self.output = self.activation_fn(
+            (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
+        self.y_out = T.argmax(self.output, axis=1)
+        self.inpt_dropout = dropout_layer(
+            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
+        self.output_dropout = self.activation_fn(T.dot(self.inpt_dropout, self.w) + self.b)
+
+    def cost(self, net):
+        "Return the cross-entropy cost."
+        return T.nnet.binary_crossentropy(self.output, net.y).mean()
+
+    def accuracy(self, y):
+        "Return the accuracy for the mini-batch."
+        return T.mean(T.eq(y, self.y_out))
 
 class SoftmaxLayer():
 
-    def __init__(self, n_in, n_out):
+    def __init__(self, n_in, n_out, p_dropout=0.0):
         self.inpt = None
         self.output = None
         self.n_in = n_in
         self.n_out = n_out
+        self.p_dropout = p_dropout
         # Initialize weights and biases
         self.w = theano.shared(
             np.zeros((n_in, n_out), dtype=theano.config.floatX),
@@ -266,10 +300,17 @@ def __init__(self, n_in, n_out):
             name='b', borrow=True)
         self.params = [self.w, self.b]
 
-    def set_inpt(self, inpt, mini_batch_size):
+    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
         self.inpt = inpt.reshape((mini_batch_size, self.n_in))
-        self.output = softmax(T.dot(self.inpt, self.w) + self.b)
+        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
         self.y_out = T.argmax(self.output, axis=1)
+        self.inpt_dropout = dropout_layer(
+            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
+        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)
+
+    def cost(self, net):
+        "Return the log-likelihood cost."
+        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])
 
     def accuracy(self, y):
         "Return the accuracy for the mini-batch."
@@ -281,3 +322,8 @@ def size(data):
     "Return the size of the dataset `data`."
     return data[0].get_value(borrow=True).shape[0]
 
+def dropout_layer(layer, p_dropout):
+    srng = shared_randomstreams.RandomStreams(
+        np.random.RandomState(0).randint(999999))
+    mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape)
+    return layer*T.cast(mask, theano.config.floatX)
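
The dropout convention in the patched FullyConnectedLayer and SoftmaxLayer is: on the dropout path (output_dropout, used for the training cost) inputs are masked by a binomial sample and not rescaled, while on the clean path (output, used for validation/test accuracy) the weighted input is scaled by (1-p_dropout), so the two paths agree in expectation. A minimal NumPy sketch of that convention, for illustration only (it is not part of the patch, and the shapes, the 0.5 dropout rate, and the 200-sample average are arbitrary choices):

import numpy as np

rng = np.random.RandomState(0)

def train_preactivation(x, w, b, p_dropout):
    # training path: drop inputs with probability p_dropout, no rescaling
    mask = rng.binomial(n=1, p=1 - p_dropout, size=x.shape)
    return np.dot(x * mask, w) + b

def test_preactivation(x, w, b, p_dropout):
    # test path: keep all inputs, scale the weighted input by (1 - p_dropout)
    return (1 - p_dropout) * np.dot(x, w) + b

x = rng.rand(10, 784)
w = rng.randn(784, 100) * np.sqrt(1.0 / 100)
b = np.zeros(100)
avg_train = np.mean([train_preactivation(x, w, b, 0.5) for _ in range(200)],
                    axis=0)
# deviation is small because E[mask] = 1 - p_dropout
print np.max(np.abs(avg_train - test_preactivation(x, w, b, 0.5)))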
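
ensemble() in conv.py combines the trained dropout nets by a plurality vote: the nets' per-image predictions are zipped together and Counter picks the most common label for each test image. A toy, self-contained illustration of just that voting step (the prediction lists below are made up for the example):

from collections import Counter

def plurality(votes):
    # most frequent label among the nets' votes for one image
    return Counter(votes).most_common(1)[0][0]

net_predictions = [          # one list of predicted digits per (hypothetical) net
    [7, 2, 1, 0, 4],
    [7, 2, 1, 0, 9],
    [7, 3, 1, 0, 4],
]
ensemble_predictions = [plurality(votes)
                        for votes in zip(*net_predictions)]
print ensemble_predictions   # [7, 2, 1, 0, 4]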
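
Assuming the working directory is src/ and the MNIST pickles live in ../data/ (the paths the code above expects), one possible way to drive the patched conv.py from a Python 2 session is sketched below; conv.run_experiments() reproduces everything in one go.

# first generate ../data/mnist_expanded.pkl.gz, as noted in the docstrings above:
#     python expand_mnist.py
import conv

nets = conv.double_fc_dropout(0.5, 0.5, 0.5, 5)     # five dropout nets
error_locations, erroneous_predictions = conv.ensemble(nets)
conv.plot_errors(error_locations, erroneous_predictions).savefig("ensemble_errors.png")
# or run the full set of experiments from the chapter:
# conv.run_experiments()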