optimizers.py
"""
This module implements gradient-based optimizers.
"""
import autograd.numpy as np


def l2(x):
    """
    Hacky l2-norm computation, used for tracking update magnitudes.
    """
    return np.sqrt(np.sum(np.multiply(x, x)))


class RMSProp(object):
    """
    This class stores RMSProp state and generates RMSProp updates.
    """

    def __init__(self, W, learning_rate=10e-5, decay=0.95, blend=0.95):
        """
        This is the Alex Graves RMSProp variant from
        "Generating Sequences with Recurrent Neural Networks".
        It scales parameter updates by a running estimate of the variance
        of the gradient rather than just a running estimate of its magnitude.

        decay governs how fast the momentum term falls off.
        blend governs the extent to which the current gradient is taken
        into account when updating our running estimate of the variance.
        """
        self.lr = learning_rate
        self.d = decay
        self.b = blend
        self.ns = {}  # running mean of the squared gradient
        self.gs = {}  # running mean of the gradient, which will later be squared
        self.ms = {}  # momentum-style accumulated update
        self.qs = {}  # update norm over param norm - ideally this stays around 10e-3
        for k, v in W.items():  # W.iteritems() under Python 2
            self.ns[k] = np.zeros_like(v)
            self.gs[k] = np.zeros_like(v)
            self.ms[k] = np.zeros_like(v)
            self.qs[k] = self.lr
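
    # For reference, the update implemented in update_weights below
    # (summarizing the docstring above and Graves, 2013):
    #   n_k <- blend * n_k + (1 - blend) * grad^2
    #   g_k <- blend * g_k + (1 - blend) * grad
    #   m_k <- decay * m_k - lr * grad / sqrt(n_k - g_k^2 + eps)
    #   param <- param + m_k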

    def update_weights(self, params, dparams):
        """
        params: the parameters of the model as they currently exist.
        dparams: the gradient of the cost w.r.t. the parameters of the model.
        We update params in place based on dparams.
        """
        for k in params.keys():
            p = params[k]
            d = np.clip(dparams[k], -10, 10)  # clip gradients to keep updates bounded
            # Running estimates of the mean squared gradient and the mean gradient.
            self.ns[k] = self.b * self.ns[k] + (1 - self.b) * (d * d)
            self.gs[k] = self.b * self.gs[k] + (1 - self.b) * d
            n = self.ns[k]
            g = self.gs[k]
            # Scale the step by the running estimate of the gradient's variance (n - g^2).
            self.ms[k] = self.d * self.ms[k] - self.lr * (d / np.sqrt(n - g * g + 1e-8))
            # Track the ratio of update norm to parameter norm.
            self.qs[k] = self.b * self.qs[k] + (1 - self.b) * (l2(self.ms[k]) / l2(p))
            p += self.ms[k]
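

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the parameter
    # names and shapes below are made up for illustration, and the "gradients"
    # are random arrays rather than real autograd output, just to show the
    # update_weights call pattern.
    np.random.seed(0)
    W = {
        "Wxh": 0.01 * np.random.randn(8, 4),
        "Why": 0.01 * np.random.randn(4, 8),
    }
    opt = RMSProp(W)
    for step in range(3):
        dW = {k: np.random.randn(*v.shape) for k, v in W.items()}
        opt.update_weights(W, dW)
        # qs tracks update-norm over param-norm per parameter.
        print(step, {k: float(v) for k, v in opt.qs.items()})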