Showing 1 changed file with 351 additions and 0 deletions.
@@ -0,0 +1,351 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"provenance": [], | ||
"collapsed_sections": [], | ||
"authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J", | ||
"include_colab_link": true | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Gradients_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Initialization\n", | ||
"\n", | ||
"In this practical, we'll investigate the what happens to the activations and the forward pass if we don't initialize the parameters sensibly." | ||
], | ||
"metadata": { | ||
"id": "L6chybAVFJW2" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "LdIDglk1FFcG" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import numpy as np\n", | ||
"import matplotlib.pyplot as plt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"First let's define a neural network. We'll just choose the weights and biaes randomly for now" | ||
], | ||
"metadata": { | ||
"id": "nnUoI0m6GyjC" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"def init_params(K, D, sigma_sq_omega):\n", | ||
" # Set seed so we always get the same random numbers\n", | ||
" np.random.seed(0)\n", | ||
"\n", | ||
" # Input layer\n", | ||
" D_i = 1\n", | ||
" # Output layer \n", | ||
" D_o = 1\n", | ||
"\n", | ||
" # Make empty lists \n", | ||
" all_weights = [None] * (K+1)\n", | ||
" all_biases = [None] * (K+1)\n", | ||
"\n", | ||
" # Create input and output layers\n", | ||
" all_weights[0] = np.random.normal(size=(D, D_i))*np.sqrt(sigma_sq_omega)\n", | ||
" all_weights[-1] = np.random.normal(size=(D_o, D)) * np.sqrt(sigma_sq_omega)\n", | ||
" all_biases[0] = np.zeros((D,1))\n", | ||
" all_biases[-1]= np.zeros((D_o,1))\n", | ||
"\n", | ||
" # Create intermediate layers\n", | ||
" for layer in range(1,K):\n", | ||
" all_weights[layer] = np.random.normal(size=(D,D))*np.sqrt(sigma_sq_omega)\n", | ||
" all_biases[layer] = np.zeros((D,1)) \n", | ||
"\n", | ||
" return all_weights, all_biases" | ||
], | ||
"metadata": { | ||
"id": "WVM4Tc_jGI0Q" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
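{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"As a quick sanity check, we can call `init_params` and confirm that the lists and matrices have the sizes we expect (a minimal sketch; feel free to skip it)." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Sanity check (sketch): there should be K+1 weight matrices and K+1 bias vectors\n", | ||
"test_weights, test_biases = init_params(5, 8, 1.0)\n", | ||
"print(len(test_weights), len(test_biases))\n", | ||
"# The first weights map the 1D input to the D hidden units; the last map back to the 1D output\n", | ||
"print(test_weights[0].shape, test_weights[-1].shape)" | ||
], | ||
"metadata": {}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||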
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Define the Rectified Linear Unit (ReLU) function\n", | ||
"def ReLU(preactivation):\n", | ||
" activation = preactivation.clip(0.0)\n", | ||
" return activation" | ||
], | ||
"metadata": { | ||
"id": "jZh-7bPXIDq4" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"def compute_network_output(net_input, all_weights, all_biases):\n", | ||
"\n", | ||
" # Retrieve number of layers\n", | ||
" K = len(all_weights) -1\n", | ||
"\n", | ||
" # We'll store the pre-activations at each layer in a list \"all_f\"\n", | ||
" # and the activations in a second list[all_h]. \n", | ||
" all_f = [None] * (K+1)\n", | ||
" all_h = [None] * (K+1)\n", | ||
"\n", | ||
" #For convenience, we'll set \n", | ||
" # all_h[0] to be the input, and all_f[K] will be the output\n", | ||
" all_h[0] = net_input\n", | ||
"\n", | ||
" # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n", | ||
" for layer in range(K):\n", | ||
" # Update preactivations and activations at this layer according to eqn 7.5\n", | ||
" all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n", | ||
" all_h[layer+1] = ReLU(all_f[layer])\n", | ||
"\n", | ||
" # Compute the output from the last hidden layer\n", | ||
" all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n", | ||
"\n", | ||
" # Retrieve the output\n", | ||
" net_output = all_f[K]\n", | ||
"\n", | ||
" return net_output, all_f, all_h" | ||
], | ||
"metadata": { | ||
"id": "LgquJUJvJPaN" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Now let's investigate how this the size of the outputs vary as we change the initialization variance:\n" | ||
], | ||
"metadata": { | ||
"id": "bIUrcXnOqChl" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Number of layers\n", | ||
"K = 5\n", | ||
"# Number of neurons per layer\n", | ||
"D = 8\n", | ||
" # Input layer\n", | ||
"D_i = 1\n", | ||
"# Output layer \n", | ||
"D_o = 1\n", | ||
"# Set variance of initial weights to 1\n", | ||
"sigma_sq_omega = 1.0\n", | ||
"# Initialize parameters\n", | ||
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", | ||
"\n", | ||
"n_data = 1000\n", | ||
"data_in = np.random.normal(size=(1,n_data))\n", | ||
"net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", | ||
"\n", | ||
"for layer in range(K):\n", | ||
" print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))" | ||
], | ||
"metadata": { | ||
"id": "A55z3rKBqO7M" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
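{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"A rough sketch of why this happens: with ReLU activations and weights drawn from a normal distribution with variance $\\sigma^2_{\\omega}$, the variance of the hidden units is multiplied by roughly $D \\sigma^2_{\\omega}/2$ at each layer (the factor of one half arises because the ReLU clips about half of the preactivations to zero). When this factor is greater than one the activations grow exponentially with depth, and when it is less than one they shrink towards zero." | ||
], | ||
"metadata": {} | ||
}, | ||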
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n", | ||
"# and the 1000 training examples\n", | ||
"\n", | ||
"# TO DO \n", | ||
"# Change this to 50 layers with 80 hidden units per layer\n", | ||
"\n", | ||
"# TO DO \n", | ||
"# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation explode" | ||
], | ||
"metadata": { | ||
"id": "VL_SO4tar3DC" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Now let's define a loss function. We'll just use the least squaures loss function. We'll also write a function to compute dloss_doutput\n" | ||
], | ||
"metadata": { | ||
"id": "SxVTKp3IcoBF" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"def least_squares_loss(net_output, y):\n", | ||
" return np.sum((net_output-y) * (net_output-y))\n", | ||
"\n", | ||
"def d_loss_d_output(net_output, y):\n", | ||
" return 2*(net_output -y); " | ||
], | ||
"metadata": { | ||
"id": "6XqWSYWJdhQR" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
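{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"As a tiny worked example: if the network output is 3.0 and the target is 1.0, the least squares loss is $(3-1)^2 = 4$ and the derivative of the loss with respect to the output is $2(3-1) = 4$. The cell below (a minimal sketch) just confirms this." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Worked example (sketch): check the loss and its derivative for a single output/target pair\n", | ||
"example_output = np.array([[3.0]])\n", | ||
"example_y = np.array([[1.0]])\n", | ||
"print(least_squares_loss(example_output, example_y))\n", | ||
"print(d_loss_d_output(example_output, example_y))" | ||
], | ||
"metadata": {}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||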
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Here's the code for the backward pass" | ||
], | ||
"metadata": { | ||
"id": "98WmyqFYWA-0" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# We'll need the indicator function\n", | ||
"def indicator_function(x):\n", | ||
" x_in = np.array(x)\n", | ||
" x_in[x_in>=0] = 1\n", | ||
" x_in[x_in<0] = 0\n", | ||
" return x_in\n", | ||
"\n", | ||
"# Main backward pass routine\n", | ||
"def backward_pass(all_weights, all_biases, all_f, all_h, y):\n", | ||
" # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n", | ||
" all_dl_dweights = [None] * (K+1)\n", | ||
" all_dl_dbiases = [None] * (K+1)\n", | ||
" # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n", | ||
" all_dl_df = [None] * (K+1)\n", | ||
" all_dl_dh = [None] * (K+1)\n", | ||
" # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n", | ||
"\n", | ||
" # Compute derivatives of net output with respect to loss\n", | ||
" all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))\n", | ||
"\n", | ||
" # Now work backwards through the network\n", | ||
" for layer in range(K,-1,-1):\n", | ||
" # Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n", | ||
" all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n", | ||
" # Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n", | ||
" all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n", | ||
"\n", | ||
" # Calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n", | ||
" all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n", | ||
" # Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n", | ||
" if layer > 0:\n", | ||
" all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n", | ||
"\n", | ||
" return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df" | ||
], | ||
"metadata": { | ||
"id": "LJng7WpRPLMz" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
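{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"One way to gain confidence in the backward pass is a finite-difference check (a minimal sketch): perturb a single weight by a small amount, recompute the loss, and compare the numerical slope with the analytic derivative. The two numbers should agree closely." | ||
], | ||
"metadata": {} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Finite-difference check of the backward pass (sketch); reuses the network defined above\n", | ||
"check_in = np.random.normal(size=(1,1))\n", | ||
"check_y = np.zeros((1,1))\n", | ||
"net_output, all_f, all_h = compute_network_output(check_in, all_weights, all_biases)\n", | ||
"all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, check_y)\n", | ||
"\n", | ||
"# Perturb one weight slightly and measure the resulting change in the loss\n", | ||
"layer, row, col = 1, 0, 0\n", | ||
"epsilon = 1e-6\n", | ||
"perturbed_weights = [w.copy() for w in all_weights]\n", | ||
"perturbed_weights[layer][row, col] += epsilon\n", | ||
"perturbed_output, _, _ = compute_network_output(check_in, perturbed_weights, all_biases)\n", | ||
"numerical = (least_squares_loss(perturbed_output, check_y) - least_squares_loss(net_output, check_y)) / epsilon\n", | ||
"analytic = all_dl_dweights[layer][row, col]\n", | ||
"print(\"Numerical derivative: %3.3f, Analytic derivative: %3.3f\"%(numerical, analytic))" | ||
], | ||
"metadata": {}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||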
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Now let's look at what happens to the magnitude of the gradients on the way back." | ||
], | ||
"metadata": { | ||
"id": "phFnbthqwhFi" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# Number of layers\n", | ||
"K = 5\n", | ||
"# Number of neurons per layer\n", | ||
"D = 8\n", | ||
" # Input layer\n", | ||
"D_i = 1\n", | ||
"# Output layer \n", | ||
"D_o = 1\n", | ||
"# Set variance of initial weights to 1\n", | ||
"sigma_sq_omega = 1.0\n", | ||
"# Initialize parameters\n", | ||
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", | ||
"\n", | ||
"# For simplicity we'll just consider the gradients of the weights and biases between the first and last hidden layer\n", | ||
"n_data = 100\n", | ||
"aggregate_dl_df = [None] * (K+1)\n", | ||
"for layer in range(1,K):\n", | ||
" # These 3D arrays will store the gradients for every data point\n", | ||
" aggregate_dl_df[layer] = np.zeros((D,n_data))\n", | ||
"\n", | ||
"\n", | ||
"# We'll have to compute the derivatives of the parameters for each data point separately\n", | ||
"for c_data in range(n_data):\n", | ||
" data_in = np.random.normal(size=(1,1))\n", | ||
" y = np.zeros((1,1))\n", | ||
" net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", | ||
" all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n", | ||
" for layer in range(1,K):\n", | ||
" aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n", | ||
"\n", | ||
"for layer in range(1,K):\n", | ||
" print(\"Layer %d, std of dl_dh = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n" | ||
], | ||
"metadata": { | ||
"id": "9A9MHc4sQvbp" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n", | ||
"# and the 1000 training examples\n", | ||
"\n", | ||
"# TO DO \n", | ||
"# Change this to 50 layers with 80 hidden units per layer\n", | ||
"\n", | ||
"# TO DO \n", | ||
"# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding" | ||
], | ||
"metadata": { | ||
"id": "gtokc0VX0839" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
} | ||
] | ||
} |