From 4a08fa44bfed1b0270babcb537940f2c891e158d Mon Sep 17 00:00:00 2001
From: udlbook <110402648+udlbook@users.noreply.github.com>
Date: Fri, 29 Sep 2023 12:45:08 +0100
Subject: [PATCH] Delete Notebooks/Chap_11/11_1_Shattered_Gradients.ipynb

---
 .../Chap_11/11_1_Shattered_Gradients.ipynb | 392 ------------------
 1 file changed, 392 deletions(-)
 delete mode 100644 Notebooks/Chap_11/11_1_Shattered_Gradients.ipynb

diff --git a/Notebooks/Chap_11/11_1_Shattered_Gradients.ipynb b/Notebooks/Chap_11/11_1_Shattered_Gradients.ipynb
deleted file mode 100644
index 6dc40f22..00000000
--- a/Notebooks/Chap_11/11_1_Shattered_Gradients.ipynb
+++ /dev/null
@@ -1,392 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "authorship_tag": "ABX9TyMrF4rB2hTKq7XzLuYsURdL",
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    }
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
-      },
-      "source": [
-        "\"Open"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# **Notebook 11.1: Shattered gradients**\n",
-        "\n",
-        "This notebook investigates the phenomenon of shattered gradients as discussed in section 11.1.1. It replicates some of the experiments in [Balduzzi et al. (2017)](https://arxiv.org/abs/1702.08591).\n",
-        "\n",
-        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
-        "\n",
-        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
-      ],
-      "metadata": {
-        "id": "pOZ6Djz0dhoy"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import numpy as np\n",
-        "import matplotlib.pyplot as plt"
-      ],
-      "metadata": {
-        "id": "iaFyNGhU21VJ"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "First let's define a neural network. We'll initialize both the weights and biases randomly with Glorot initialization (He initialization without the factor of two)."
-      ],
-      "metadata": {
-        "id": "YcNlAxnE3XXn"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# K is the number of hidden layers, D is the number of hidden units in each layer\n",
-        "def init_params(K, D):\n",
-        "  # Set seed so we always get the same random numbers\n",
-        "  np.random.seed(1)\n",
-        "\n",
-        "  # Input layer\n",
-        "  D_i = 1\n",
-        "  # Output layer\n",
-        "  D_o = 1\n",
-        "\n",
-        "  # Glorot initialization\n",
-        "  sigma_sq_omega = 1.0/D\n",
-        "\n",
-        "  # Make empty lists\n",
-        "  all_weights = [None] * (K+1)\n",
-        "  all_biases = [None] * (K+1)\n",
-        "\n",
-        "  # Create parameters for input and output layers\n",
-        "  all_weights[0] = np.random.normal(size=(D, D_i)) * np.sqrt(sigma_sq_omega)\n",
-        "  all_weights[-1] = np.random.normal(size=(D_o, D)) * np.sqrt(sigma_sq_omega)\n",
-        "  all_biases[0] = np.random.normal(size=(D,1)) * np.sqrt(sigma_sq_omega)\n",
-        "  all_biases[-1] = np.random.normal(size=(D_o,1)) * np.sqrt(sigma_sq_omega)\n",
-        "\n",
-        "  # Create intermediate layers\n",
-        "  for layer in range(1,K):\n",
-        "    all_weights[layer] = np.random.normal(size=(D,D)) * np.sqrt(sigma_sq_omega)\n",
-        "    all_biases[layer] = np.random.normal(size=(D,1)) * np.sqrt(sigma_sq_omega)\n",
-        "\n",
-        "  return all_weights, all_biases"
-      ],
-      "metadata": {
-        "id": "kr-q7hc23Bn9"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "The next two functions define the forward pass of the network."
-      ],
-      "metadata": {
-        "id": "kwcn5z7-dq_1"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Define the Rectified Linear Unit (ReLU) function\n",
-        "def ReLU(preactivation):\n",
-        "  activation = preactivation.clip(0.0)\n",
-        "  return activation\n",
-        "\n",
-        "def forward_pass(net_input, all_weights, all_biases):\n",
-        "\n",
-        "  # Retrieve number of layers\n",
-        "  K = len(all_weights) - 1\n",
-        "\n",
-        "  # We'll store the pre-activations at each layer in a list \"all_f\"\n",
-        "  # and the activations in a second list \"all_h\"\n",
-        "  all_f = [None] * (K+1)\n",
-        "  all_h = [None] * (K+1)\n",
-        "\n",
-        "  # For convenience, we'll set\n",
-        "  # all_h[0] to be the input, and all_f[K] will be the output\n",
-        "  all_h[0] = net_input\n",
-        "\n",
-        "  # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n",
-        "  for layer in range(K):\n",
-        "    # Update preactivations and activations at this layer according to eqn 7.5\n",
-        "    all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n",
-        "    all_h[layer+1] = ReLU(all_f[layer])\n",
-        "\n",
-        "  # Compute the output from the last hidden layer\n",
-        "  all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n",
-        "\n",
-        "  # Retrieve the output\n",
-        "  net_output = all_f[K]\n",
-        "\n",
-        "  return net_output, all_f, all_h"
-      ],
-      "metadata": {
-        "id": "_2w-Tr7G3sYq"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "The next two functions compute the gradient of the output with respect to the input using the backpropagation algorithm."
-      ],
-      "metadata": {
-        "id": "aM2l7QafeC8T"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# We'll need the indicator function\n",
-        "def indicator_function(x):\n",
-        "  x_in = np.array(x)\n",
-        "  x_in[x_in>=0] = 1\n",
-        "  x_in[x_in<0] = 0\n",
-        "  return x_in\n",
-        "\n",
-        "# Main backward pass routine\n",
-        "def calc_input_output_gradient(x_in, all_weights, all_biases):\n",
-        "\n",
-        "  # Run the forward pass\n",
-        "  y, all_f, all_h = forward_pass(x_in, all_weights, all_biases)\n",
-        "\n",
-        "  # Retrieve the number of layers (don't rely on a global K)\n",
-        "  K = len(all_weights) - 1\n",
-        "\n",
-        "  # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n",
-        "  all_dl_dweights = [None] * (K+1)\n",
-        "  all_dl_dbiases = [None] * (K+1)\n",
-        "  # And we'll store the derivatives of the loss with respect to the activations and preactivations in lists\n",
-        "  all_dl_df = [None] * (K+1)\n",
-        "  all_dl_dh = [None] * (K+1)\n",
-        "  # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[K] is the net output\n",
-        "\n",
-        "  # We treat the network output itself as the \"loss\", so its derivative with respect to the final preactivation is one\n",
-        "  all_dl_df[K] = np.ones_like(all_f[K])\n",
-        "\n",
-        "  # Now work backwards through the network\n",
-        "  for layer in range(K,-1,-1):\n",
-        "    all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n",
-        "    all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n",
-        "\n",
-        "    all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n",
-        "\n",
-        "    if layer > 0:\n",
-        "      all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n",
-        "\n",
-        "\n",
-        "  return all_dl_dh[0], y"
-      ],
-      "metadata": {
-        "id": "DwR3eGMgV8bl"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Double-check that we have the gradient correct by comparing it against finite differences."
-      ],
-      "metadata": {
-        "id": "Ar_VmraReSWe"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "D = 200; K = 3\n",
-        "# Initialize parameters\n",
-        "all_weights, all_biases = init_params(K,D)\n",
-        "\n",
-        "x = np.ones((1,1))\n",
-        "dydx,y = calc_input_output_gradient(x, all_weights, all_biases)\n",
-        "\n",
-        "# Offset for finite-difference calculation\n",
-        "delta = 0.00000001\n",
-        "x1 = x\n",
-        "y1,*_ = forward_pass(x1, all_weights, all_biases)\n",
-        "x2 = x+delta\n",
-        "y2,*_ = forward_pass(x2, all_weights, all_biases)\n",
-        "# Finite difference calculation\n",
-        "dydx_fd = (y2-y1)/delta\n",
-        "\n",
-        "print(\"Gradient calculation=%f, Finite difference gradient=%f\"%(dydx,dydx_fd))\n"
-      ],
-      "metadata": {
-        "id": "KJpQPVd36Haq"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Helper function that computes the derivatives for a 1D array of input values and plots them."
-      ],
-      "metadata": {
-        "id": "YC-LAYRKtbxp"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "def plot_derivatives(K, D):\n",
-        "\n",
-        "  # Initialize parameters\n",
-        "  all_weights, all_biases = init_params(K,D)\n",
-        "\n",
-        "  x_in = np.arange(-2,2, 4.0/256.0)\n",
-        "  x_in = np.resize(x_in, (1,len(x_in)))\n",
-        "  dydx,y = calc_input_output_gradient(x_in, all_weights, all_biases)\n",
-        "\n",
-        "  fig,ax = plt.subplots()\n",
-        "  ax.plot(np.squeeze(x_in), np.squeeze(dydx), 'b-')\n",
-        "  ax.set_xlim(-2,2)\n",
-        "  ax.set_xlabel('Input, $x$')\n",
-        "  ax.set_ylabel('Gradient, $dy/dx$')\n",
-        "  ax.set_title('Number of hidden layers = %d'%(K))\n",
-        "  plt.show()"
-      ],
-      "metadata": {
-        "id": "uJr5eDe648jF"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Build a model with one hidden layer of 200 neurons and plot the derivatives\n",
-        "D = 200; K = 1\n",
-        "plot_derivatives(K,D)\n",
-        "\n",
-        "# TODO -- Interpret this result\n",
-        "# Why does the plot have some flat regions?\n",
-        "\n",
-        "# TODO -- Add code to plot the derivatives for models with 24 and 50 hidden layers\n",
-        "# with 200 neurons per layer\n",
-        "\n",
-        "# TODO -- Why does this graph not have visible flat regions?\n",
-        "\n",
-        "# TODO -- Why does the magnitude of the gradients decrease as we increase the number\n",
-        "# of hidden layers?\n",
-        "\n",
-        "# TODO -- Do you find this a convincing replication of the experiment in the original paper? (I don't)\n",
-        "# Can you help me find why I have failed to replicate this result? udlbookmail@gmail.com"
-      ],
-      "metadata": {
-        "id": "56gTMTCb49KO"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Now let's look at the autocorrelation function of the gradient."
-      ],
-      "metadata": {
-        "id": "f_0zjQbxuROQ"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "def autocorr(dydx):\n",
-        "  # TODO -- compute the autocorrelation function\n",
-        "  # Use the numpy function \"correlate\" with the mode set to \"same\"\n",
-        "  # Replace this line:\n",
-        "  ac = np.ones((256,1))\n",
-        "\n",
-        "  return ac"
-      ],
-      "metadata": {
-        "id": "ggnO8hfoRN1e"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Helper function that plots the autocorrelation function, normalized so that the correlation is one at an offset of zero."
-      ],
-      "metadata": {
-        "id": "EctWSV1RuddK"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "def plot_autocorr(K, D):\n",
-        "\n",
-        "  # Initialize parameters\n",
-        "  all_weights, all_biases = init_params(K,D)\n",
-        "\n",
-        "  x_in = np.arange(-2.0,2.0, 4.0/256)\n",
-        "  x_in = np.resize(x_in, (1,len(x_in)))\n",
-        "  dydx,y = calc_input_output_gradient(x_in, all_weights, all_biases)\n",
-        "  ac = autocorr(np.squeeze(dydx))\n",
-        "  ac = ac / ac[128]\n",
-        "\n",
-        "  y = ac[128:]\n",
-        "  x = np.squeeze(x_in)[128:]\n",
-        "  fig,ax = plt.subplots()\n",
-        "  ax.plot(x,y, 'b-')\n",
-        "  ax.set_xlim([0,2])\n",
-        "  ax.set_xlabel('Distance')\n",
-        "  ax.set_ylabel('Autocorrelation')\n",
-        "  ax.set_title('Number of hidden layers = %d'%(K))\n",
-        "  plt.show()\n"
-      ],
-      "metadata": {
-        "id": "2LKlZ9u_WQXN"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# Plot the autocorrelation functions\n",
-        "D = 200; K = 1\n",
-        "plot_autocorr(K,D)\n",
-        "D = 200; K = 50\n",
-        "plot_autocorr(K,D)\n",
-        "\n",
-        "# TODO -- Do you find this a convincing replication of the experiment in the original paper? (I don't)\n",
-        "# Can you help me find why I have failed to replicate this result?"
-      ],
-      "metadata": {
-        "id": "RD9JTdjNWw6p"
-      },
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}
\ No newline at end of file
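
Note on the deleted notebook's `autocorr` stub: it asks for `np.correlate` with the mode set to "same". A minimal sketch of one possible completion follows; mean-centering the gradient signal before correlating is an assumption on my part, since the notebook does not specify it.

import numpy as np

def autocorr(dydx):
  # Mean-center the 1D gradient signal (assumption -- the notebook does not specify this step)
  signal = dydx - np.mean(dydx)
  # Correlate the signal with itself; mode "same" returns an output the same length
  # as the input, with the zero-offset term at the center (index len(signal)//2)
  ac = np.correlate(signal, signal, mode="same")
  return ac

With the 256-sample input used in `plot_autocorr`, the zero-offset term lands at index 128, which is consistent with the `ac = ac / ac[128]` normalization in that helper.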