Merge pull request keras-team#457 from kashif/adam
Updated adam solver to v8 of paper
fchollet committed Jul 31, 2015
2 parents 3bf5340 + 9c7c52d commit 54dc647
Showing 2 changed files with 13 additions and 24 deletions.
9 changes: 4 additions & 5 deletions docs/sources/optimizers.md
@@ -103,16 +103,15 @@ __Arguments__:
 ## Adam
 
 ```python
-keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-8)
+keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
 ```
 
-Adam optimizer, proposed by Kingma and Lei Ba in [Adam: A Method For Stochastic Optimization](http://arxiv.org/pdf/1412.6980v4.pdf). Default parameters are those suggested in the paper. The parameter "lambda" from the paper has been renamed kappa, for syntactic reasons.
+Adam optimizer, proposed by Kingma and Lei Ba in [Adam: A Method For Stochastic Optimization](http://arxiv.org/pdf/1412.6980v8.pdf). Default parameters are those suggested in the paper.
 
 __Arguments__:
 
-- __lr__: float >= 0. Learning rate.
+- __lr__: float >= 0. Learning rate.
 - __beta_1__, __beta_2__: floats, 0 < beta < 1. Generally close to 1.
 - __epsilon__: float >= 0. Fuzz factor.
-- __kappa__: float 0 < kappa < 1. Lambda parameter in the original paper.
 
----
+---
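For readers of the docs change above, a minimal usage sketch of the updated constructor follows: it compiles a small model with the new, kappa-free Adam signature and the defaults suggested in v8 of the paper. The Sequential/Dense layer calls are assumptions about the Keras API of this era, included only to make the snippet self-contained.

```python
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import Adam

# Illustrative model; Dense(input_dim, output_dim) is assumed to be the
# layer signature of the Keras version this commit targets.
model = Sequential()
model.add(Dense(784, 64))
model.add(Activation('relu'))
model.add(Dense(64, 10))
model.add(Activation('softmax'))

# Updated signature: no `kappa` argument; defaults follow v8 of the paper.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model.compile(loss='categorical_crossentropy', optimizer=adam)
```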
28 changes: 9 additions & 19 deletions keras/optimizers.py
@@ -166,13 +166,11 @@ def get_config(self):
 
 class Adam(Optimizer):
     '''
-        Reference: http://arxiv.org/abs/1412.6980
+        Reference: http://arxiv.org/abs/1412.6980v8
-        Default parameters follow those provided in the original paper
-        lambda is renamed kappa.
+        Default parameters follow those provided in the original paper.
     '''
-    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-8, *args, **kwargs):
+    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, *args, **kwargs):
         super(Adam, self).__init__(**kwargs)
         self.__dict__.update(locals())
         self.iterations = shared_scalar(0)
@@ -181,23 +179,16 @@ def get_updates(self, params, constraints, loss):
         grads = self.get_gradients(loss, params)
         self.updates = [(self.iterations, self.iterations+1.)]
 
-        i = self.iterations
-        beta_1_t = self.beta_1 * (self.kappa**i)
-
-        # the update below seems missing from the paper, but is obviously required
-        beta_2_t = self.beta_2 * (self.kappa**i)
+        t = self.iterations + 1
+        lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t)
 
         for p, g, c in zip(params, grads, constraints):
             m = theano.shared(p.get_value() * 0.) # zero init of moment
             v = theano.shared(p.get_value() * 0.) # zero init of velocity
 
-            m_t = (beta_1_t * m) + (1 - beta_1_t) * g
-            v_t = (beta_2_t * v) + (1 - beta_2_t) * (g**2)
-
-            m_b_t = m_t / (1 - beta_1_t)
-            v_b_t = v_t / (1 - beta_2_t)
-
-            p_t = p - self.lr * m_b_t / (T.sqrt(v_b_t) + self.epsilon)
+            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
+            v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g**2)
+            p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon)
 
             self.updates.append((m, m_t))
             self.updates.append((v, v_t))
@@ -209,8 +200,7 @@ def get_config(self):
                 "lr": self.lr,
                 "beta_1": self.beta_1,
                 "beta_2": self.beta_2,
-                "epsilon": self.epsilon,
-                "kappa": self.kappa}
+                "epsilon": self.epsilon}
 
 # aliases
 sgd = SGD
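To make the new update rule concrete, here is a standalone NumPy sketch of the computation that get_updates now performs: the two bias corrections are folded into a per-step learning rate lr_t, as in v8 of the paper. The helper name and the toy objective are illustrative, not part of the Keras code.

```python
import numpy as np

def adam_step(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
    """One Adam step with the bias correction folded into lr_t (t starts at 1)."""
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    m = beta_1 * m + (1 - beta_1) * g          # biased first-moment estimate
    v = beta_2 * v + (1 - beta_2) * g ** 2     # biased second-moment estimate
    p = p - lr_t * m / (np.sqrt(v) + epsilon)  # parameter update
    return p, m, v

# Toy check: minimize f(p) = p**2, whose gradient is 2 * p.
p, m, v = 5.0, 0.0, 0.0
for t in range(1, 10001):
    p, m, v = adam_step(p, 2.0 * p, m, v, t)
print(p)  # settles near 0
```

Folding the corrections into lr_t removes the explicit division of the moment estimates, which is what the deleted m_b_t/v_b_t lines used to do.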
