Created using Colaboratory
udlbook committed Nov 14, 2022
1 parent bd3b226 commit 22fd25f
Showing 1 changed file with 351 additions and 0 deletions.
351 changes: 351 additions & 0 deletions CM20315_Gradients_III.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Gradients_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Initialization\n",
"\n",
"In this practical, we'll investigate the what happens to the activations and the forward pass if we don't initialize the parameters sensibly."
],
"metadata": {
"id": "L6chybAVFJW2"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LdIDglk1FFcG"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"source": [
"First let's define a neural network. We'll just choose the weights and biaes randomly for now"
],
"metadata": {
"id": "nnUoI0m6GyjC"
}
},
{
"cell_type": "code",
"source": [
"def init_params(K, D, sigma_sq_omega):\n",
" # Set seed so we always get the same random numbers\n",
" np.random.seed(0)\n",
"\n",
" # Input layer\n",
" D_i = 1\n",
" # Output layer \n",
" D_o = 1\n",
"\n",
" # Make empty lists \n",
" all_weights = [None] * (K+1)\n",
" all_biases = [None] * (K+1)\n",
"\n",
" # Create input and output layers\n",
" all_weights[0] = np.random.normal(size=(D, D_i))*np.sqrt(sigma_sq_omega)\n",
" all_weights[-1] = np.random.normal(size=(D_o, D)) * np.sqrt(sigma_sq_omega)\n",
" all_biases[0] = np.zeros((D,1))\n",
" all_biases[-1]= np.zeros((D_o,1))\n",
"\n",
" # Create intermediate layers\n",
" for layer in range(1,K):\n",
" all_weights[layer] = np.random.normal(size=(D,D))*np.sqrt(sigma_sq_omega)\n",
" all_biases[layer] = np.zeros((D,1)) \n",
"\n",
" return all_weights, all_biases"
],
"metadata": {
"id": "WVM4Tc_jGI0Q"
},
"execution_count": null,
"outputs": []
},
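{
"cell_type": "code",
"source": [
"# Optional check (not part of the original practical): print the weight matrix shapes for a\n",
"# small example network so the D_i -> D -> ... -> D -> D_o structure is visible.\n",
"# The names example_weights and example_biases are just local to this check.\n",
"example_weights, example_biases = init_params(3, 4, 1.0)\n",
"for layer in range(len(example_weights)):\n",
"  print(\"Layer %d, weight shape =\"%(layer), example_weights[layer].shape)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},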
{
"cell_type": "code",
"source": [
"# Define the Rectified Linear Unit (ReLU) function\n",
"def ReLU(preactivation):\n",
" activation = preactivation.clip(0.0)\n",
" return activation"
],
"metadata": {
"id": "jZh-7bPXIDq4"
},
"execution_count": null,
"outputs": []
},
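{
"cell_type": "code",
"source": [
"# Quick sanity check (not part of the original practical): ReLU should zero out the\n",
"# negative entries and leave the positive ones unchanged.\n",
"print(ReLU(np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},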
{
"cell_type": "code",
"source": [
"def compute_network_output(net_input, all_weights, all_biases):\n",
"\n",
" # Retrieve number of layers\n",
" K = len(all_weights) -1\n",
"\n",
" # We'll store the pre-activations at each layer in a list \"all_f\"\n",
" # and the activations in a second list[all_h]. \n",
" all_f = [None] * (K+1)\n",
" all_h = [None] * (K+1)\n",
"\n",
" #For convenience, we'll set \n",
" # all_h[0] to be the input, and all_f[K] will be the output\n",
" all_h[0] = net_input\n",
"\n",
" # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n",
" for layer in range(K):\n",
" # Update preactivations and activations at this layer according to eqn 7.5\n",
" all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n",
" all_h[layer+1] = ReLU(all_f[layer])\n",
"\n",
" # Compute the output from the last hidden layer\n",
" all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n",
"\n",
" # Retrieve the output\n",
" net_output = all_f[K]\n",
"\n",
" return net_output, all_f, all_h"
],
"metadata": {
"id": "LgquJUJvJPaN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's investigate how this the size of the outputs vary as we change the initialization variance:\n"
],
"metadata": {
"id": "bIUrcXnOqChl"
}
},
{
"cell_type": "code",
"source": [
"# Number of layers\n",
"K = 5\n",
"# Number of neurons per layer\n",
"D = 8\n",
" # Input layer\n",
"D_i = 1\n",
"# Output layer \n",
"D_o = 1\n",
"# Set variance of initial weights to 1\n",
"sigma_sq_omega = 1.0\n",
"# Initialize parameters\n",
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n",
"\n",
"n_data = 1000\n",
"data_in = np.random.normal(size=(1,n_data))\n",
"net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n",
"\n",
"for layer in range(K):\n",
" print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))"
],
"metadata": {
"id": "A55z3rKBqO7M"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n",
"# and the 1000 training examples\n",
"\n",
"# TO DO \n",
"# Change this to 50 layers with 80 hidden units per layer\n",
"\n",
"# TO DO \n",
"# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation explode"
],
"metadata": {
"id": "VL_SO4tar3DC"
},
"execution_count": null,
"outputs": []
},
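{
"cell_type": "markdown",
"source": [
"Aside (a rough heuristic, not part of the original exercise): if the biases are zero and the weights are drawn from a normal distribution with variance $\\sigma^2_{\\Omega}$, then each ReLU layer scales the variance of the hidden units by roughly $D\\sigma^2_{\\Omega}/2$, where $D$ is the number of hidden units feeding into the layer. Choosing $\\sigma^2_{\\Omega}$ so that this factor equals one (i.e., $\\sigma^2_{\\Omega} = 2/D$, known as He initialization) keeps the magnitudes in the forward pass roughly constant. You can use this to sanity-check your answer to the TO DO above."
],
"metadata": {}
},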
{
"cell_type": "markdown",
"source": [
"Now let's define a loss function. We'll just use the least squaures loss function. We'll also write a function to compute dloss_doutput\n"
],
"metadata": {
"id": "SxVTKp3IcoBF"
}
},
{
"cell_type": "code",
"source": [
"def least_squares_loss(net_output, y):\n",
" return np.sum((net_output-y) * (net_output-y))\n",
"\n",
"def d_loss_d_output(net_output, y):\n",
" return 2*(net_output -y); "
],
"metadata": {
"id": "6XqWSYWJdhQR"
},
"execution_count": null,
"outputs": []
},
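{
"cell_type": "code",
"source": [
"# Optional sanity check (not part of the original practical): compare d_loss_d_output\n",
"# to a finite-difference approximation of the least squares loss at an arbitrary test point.\n",
"test_output = np.array([[0.7]])\n",
"test_y = np.array([[-0.3]])\n",
"epsilon = 1e-5\n",
"finite_diff = (least_squares_loss(test_output + epsilon, test_y) - least_squares_loss(test_output - epsilon, test_y)) / (2 * epsilon)\n",
"print(\"Analytic derivative = %3.3f, finite difference = %3.3f\"%(d_loss_d_output(test_output, test_y).item(), finite_diff))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},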
{
"cell_type": "markdown",
"source": [
"Here's the code for the backward pass"
],
"metadata": {
"id": "98WmyqFYWA-0"
}
},
{
"cell_type": "code",
"source": [
"# We'll need the indicator function\n",
"def indicator_function(x):\n",
" x_in = np.array(x)\n",
" x_in[x_in>=0] = 1\n",
" x_in[x_in<0] = 0\n",
" return x_in\n",
"\n",
"# Main backward pass routine\n",
"def backward_pass(all_weights, all_biases, all_f, all_h, y):\n",
" # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n",
" all_dl_dweights = [None] * (K+1)\n",
" all_dl_dbiases = [None] * (K+1)\n",
" # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n",
" all_dl_df = [None] * (K+1)\n",
" all_dl_dh = [None] * (K+1)\n",
" # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n",
"\n",
" # Compute derivatives of net output with respect to loss\n",
" all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))\n",
"\n",
" # Now work backwards through the network\n",
" for layer in range(K,-1,-1):\n",
" # Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n",
" all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n",
" # Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n",
" all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n",
"\n",
" # Calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n",
" all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n",
" # Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n",
" if layer > 0:\n",
" all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n",
"\n",
" return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df"
],
"metadata": {
"id": "LJng7WpRPLMz"
},
"execution_count": null,
"outputs": []
},
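{
"cell_type": "code",
"source": [
"# Optional gradient check (not part of the original practical): compare the derivative of the\n",
"# loss with respect to a single weight from backward_pass to a finite-difference estimate on a\n",
"# tiny network. The layer/row/col indices below are arbitrary choices for this check.\n",
"K = 3\n",
"D = 4\n",
"all_weights, all_biases = init_params(K, D, 1.0)\n",
"data_in = np.random.normal(size=(1,1))\n",
"y = np.zeros((1,1))\n",
"net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n",
"all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n",
"# Perturb one weight up and down and measure the change in the loss\n",
"layer = 1\n",
"row = 0\n",
"col = 0\n",
"epsilon = 1e-5\n",
"all_weights[layer][row, col] += epsilon\n",
"net_output_plus, _, _ = compute_network_output(data_in, all_weights, all_biases)\n",
"all_weights[layer][row, col] -= 2 * epsilon\n",
"net_output_minus, _, _ = compute_network_output(data_in, all_weights, all_biases)\n",
"all_weights[layer][row, col] += epsilon\n",
"finite_diff = (least_squares_loss(net_output_plus, y) - least_squares_loss(net_output_minus, y)) / (2 * epsilon)\n",
"print(\"Backprop derivative = %3.3f, finite difference = %3.3f\"%(all_dl_dweights[layer][row, col], finite_diff))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},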
{
"cell_type": "markdown",
"source": [
"Now let's look at what happens to the magnitude of the gradients on the way back."
],
"metadata": {
"id": "phFnbthqwhFi"
}
},
{
"cell_type": "code",
"source": [
"# Number of layers\n",
"K = 5\n",
"# Number of neurons per layer\n",
"D = 8\n",
" # Input layer\n",
"D_i = 1\n",
"# Output layer \n",
"D_o = 1\n",
"# Set variance of initial weights to 1\n",
"sigma_sq_omega = 1.0\n",
"# Initialize parameters\n",
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n",
"\n",
"# For simplicity we'll just consider the gradients of the weights and biases between the first and last hidden layer\n",
"n_data = 100\n",
"aggregate_dl_df = [None] * (K+1)\n",
"for layer in range(1,K):\n",
" # These 3D arrays will store the gradients for every data point\n",
" aggregate_dl_df[layer] = np.zeros((D,n_data))\n",
"\n",
"\n",
"# We'll have to compute the derivatives of the parameters for each data point separately\n",
"for c_data in range(n_data):\n",
" data_in = np.random.normal(size=(1,1))\n",
" y = np.zeros((1,1))\n",
" net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n",
" all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n",
" for layer in range(1,K):\n",
" aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n",
"\n",
"for layer in range(1,K):\n",
" print(\"Layer %d, std of dl_dh = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n"
],
"metadata": {
"id": "9A9MHc4sQvbp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n",
"# and the 1000 training examples\n",
"\n",
"# TO DO \n",
"# Change this to 50 layers with 80 hidden units per layer\n",
"\n",
"# TO DO \n",
"# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding"
],
"metadata": {
"id": "gtokc0VX0839"
},
"execution_count": null,
"outputs": []
}
]
}
