mlpn.py
import numpy as np

STUDENT = {'name': 'Dorin Keshales'}

def classifier_output(x, params):
    # YOUR CODE HERE.
    global input_lst
    input_lst = [x]

    # Linear operation on the first two params.
    result = np.dot(x, params[0]) + params[1]

    it = iter(params[2:])
    rest_params = list(zip(it, it))

    for W, b in rest_params:
        # Saving a copy of the hidden layer input - before the tanh.
        input_lst.append(result.copy())
        # Using tanh as the activation function.
        result = np.tanh(result)
        # Saving a copy of the hidden layer output - after the tanh.
        input_lst.append(result.copy())
        # Linear operation on the current two params.
        result = np.dot(result, W) + b

    # Shifting by the max for numerical stability, then applying the softmax.
    result -= result.max()
    probs = np.exp(result) / np.sum(np.exp(result))
    return probs
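
# A note on the stability shift above: softmax is invariant to subtracting a
# constant from every logit,
#     softmax(z)_i = exp(z_i - c) / sum_j exp(z_j - c)    for any constant c,
# so using c = max(z) prevents overflow in np.exp without changing the output.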

def predict(x, params):
    return np.argmax(classifier_output(x, params))
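
# A hypothetical usage sketch (illustration only; the layer sizes here are
# assumptions, not values from the assignment):
#
#     params = create_classifier([4, 8, 3])   # defined further down in this file
#     x = np.random.randn(4)
#     probs = classifier_output(x, params)    # shape (3,), non-negative, sums to 1
#     label = predict(x, params)              # index of the highest-probability class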

def loss_and_gradients(x, y, params):
    """
    params: a list as created by create_classifier(...)

    returns:
        loss, [gW1, gb1, gW2, gb2, ...]

    loss: scalar
    gW1: matrix, gradients of W1
    gb1: vector, gradients of b1
    gW2: matrix, gradients of W2
    gb2: vector, gradients of b2
    ...

    (of course, if we request a linear classifier (ie, params is of length 2),
    you should not have gW2 and gb2.)
    """
    # YOUR CODE HERE.
    gradients = []

    # Calculating the loss.
    y_hat = classifier_output(x, params)
    loss = -np.log(y_hat[y])

    # Turning the correct label (= y) into a one-hot vector.
    label = y
    y = np.zeros(len(y_hat))
    y[label] = 1

    i = iter(params)
    params_pairs = list(zip(i, i))

    # Using the inputs to each layer, which I saved earlier in the classifier_output function.
    inputs = input_lst.copy()
    error = y_hat - y

    # Looping over the (W, b) pairs from the last layer back to the first.
    for W, b in reversed(params_pairs):
        # Popping the output of the last tanh (or the input x for the first layer).
        input_layer = inputs.pop()
        gb = error
        gW = np.outer(input_layer, gb)
        gradients = [gW, gb] + gradients
        # If there are hidden layers left to back-propagate through
        if len(inputs) != 0:
            # Popping the input to the tanh.
            input_layer = inputs.pop()
            # Propagating the error through the tanh derivative to the previous layer.
            error = np.dot(W, error) * (1 - np.square(np.tanh(input_layer)))
    return loss, gradients
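
# A finite-difference sanity check for loss_and_gradients. This helper is an
# illustrative addition (it is not part of the original assignment interface):
# it perturbs each parameter entry by a small eps and compares the numeric
# gradient against the analytic one computed above.
def numeric_gradient_check(x, y, params, eps=1e-5, tol=1e-4):
    _, grads = loss_and_gradients(x, y, params)
    for p, g in zip(params, grads):
        it = np.nditer(p, flags=['multi_index'])
        while not it.finished:
            ix = it.multi_index
            old = p[ix]
            p[ix] = old + eps
            loss_plus, _ = loss_and_gradients(x, y, params)
            p[ix] = old - eps
            loss_minus, _ = loss_and_gradients(x, y, params)
            p[ix] = old  # restore the original value
            numeric = (loss_plus - loss_minus) / (2 * eps)
            assert abs(numeric - g[ix]) < tol, (ix, numeric, g[ix])
            it.iternext()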

# Xavier (Glorot) uniform initialization for the weight matrices and the bias vectors.
def my_random(size1, size2=None):
    # Treating a missing second dimension (a bias vector) as fan-out 1.
    t = 1 if size2 is None else size2
    eps = np.sqrt(6.0 / (size1 + t))
    shape = (size1, size2) if size2 is not None else size1
    return np.random.uniform(-eps, eps, shape)

def create_classifier(dims):
    """
    returns the parameters for a multi-layer perceptron with an arbitrary number
    of hidden layers.

    dims is a list of length at least 2, where the first item is the input
    dimension, the last item is the output dimension, and the ones in between
    are the hidden layers.

    For example, for:
        dims = [300, 20, 30, 40, 5]
    we will have an input of 300 dimensions, a hidden layer of 20 dimensions, passed
    to a layer of 30 dimensions, passed to a layer of 40 dimensions, and finally
    an output of 5 dimensions.

    Assume a tanh activation function between all the layers.

    return:
    a flat list of parameters where the first two elements are the W and b from input
    to first layer, then the second two are the matrix and vector from first to
    second layer, and so on.
    """
    params = []
    for dim_in, dim_out in zip(dims, dims[1:]):
        params.append(my_random(dim_in, dim_out))
        params.append(my_random(dim_out))
    return params
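
# A small end-to-end sketch (an illustrative addition, not part of the original
# assignment): build a hypothetical 4 -> 8 -> 3 network, run one forward pass,
# verify the gradients numerically, and take a single SGD step.
if __name__ == '__main__':
    np.random.seed(0)
    dims = [4, 8, 3]                 # hypothetical layer sizes
    params = create_classifier(dims)

    x = np.random.randn(dims[0])     # a random input vector
    y = 1                            # an arbitrary correct label

    probs = classifier_output(x, params)
    print('probabilities sum to 1:', np.isclose(probs.sum(), 1.0))
    print('prediction before the update:', predict(x, params))

    loss, grads = loss_and_gradients(x, y, params)
    print('loss:', loss)

    numeric_gradient_check(x, y, params)  # passes silently if the gradients match

    # One vanilla SGD step; the learning rate is an assumed value for the demo.
    learning_rate = 0.1
    params = [p - learning_rate * g for p, g in zip(params, grads)]
    print('loss after one SGD step:', loss_and_gradients(x, y, params)[0])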