diff --git a/docs/sources/optimizers.md b/docs/sources/optimizers.md
index c6868a69991..35e2269048a 100644
--- a/docs/sources/optimizers.md
+++ b/docs/sources/optimizers.md
@@ -103,16 +103,15 @@ __Arguments__:
 ## Adam
 
 ```python
-keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-8)
+keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
 ```
 
-Adam optimizer, proposed by Kingma and Lei Ba in [Adam: A Method For Stochastic Optimization](http://arxiv.org/pdf/1412.6980v4.pdf). Default parameters are those suggested in the paper. The parameter "lambda" from the paper has been renamed kappa, for syntactic reasons.
+Adam optimizer, proposed by Kingma and Lei Ba in [Adam: A Method For Stochastic Optimization](http://arxiv.org/pdf/1412.6980v8.pdf). Default parameters are those suggested in the paper.
 
 __Arguments__:
 
-- __lr__: float >= 0. Learning rate. 
+- __lr__: float >= 0. Learning rate.
 - __beta_1__, __beta_2__: floats, 0 < beta < 1. Generally close to 1.
 - __epsilon__: float >= 0. Fuzz factor.
-- __kappa__: float 0 < kappa < 1. Lambda parameter in the original paper.
 
----
\ No newline at end of file
+---
diff --git a/keras/optimizers.py b/keras/optimizers.py
index ee245db2667..c52ad719418 100644
--- a/keras/optimizers.py
+++ b/keras/optimizers.py
@@ -166,13 +166,11 @@ def get_config(self):
 
 class Adam(Optimizer):
     '''
-        Reference: http://arxiv.org/abs/1412.6980
+        Reference: http://arxiv.org/abs/1412.6980v8
 
-        Default parameters follow those provided in the original paper
-
-        lambda is renamed kappa.
+        Default parameters follow those provided in the original paper.
     '''
-    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, kappa=1-1e-8, *args, **kwargs):
+    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, *args, **kwargs):
         super(Adam, self).__init__(**kwargs)
         self.__dict__.update(locals())
         self.iterations = shared_scalar(0)
@@ -181,23 +179,16 @@ def get_updates(self, params, constraints, loss):
         grads = self.get_gradients(loss, params)
         self.updates = [(self.iterations, self.iterations+1.)]
 
-        i = self.iterations
-        beta_1_t = self.beta_1 * (self.kappa**i)
-
-        # the update below seems missing from the paper, but is obviously required
-        beta_2_t = self.beta_2 * (self.kappa**i)
+        t = self.iterations + 1
+        lr_t = self.lr * T.sqrt(1-self.beta_2**t)/(1-self.beta_1**t)
 
         for p, g, c in zip(params, grads, constraints):
             m = theano.shared(p.get_value() * 0.)  # zero init of moment
             v = theano.shared(p.get_value() * 0.)  # zero init of velocity
 
-            m_t = (beta_1_t * m) + (1 - beta_1_t) * g
-            v_t = (beta_2_t * v) + (1 - beta_2_t) * (g**2)
-
-            m_b_t = m_t / (1 - beta_1_t)
-            v_b_t = v_t / (1 - beta_2_t)
-
-            p_t = p - self.lr * m_b_t / (T.sqrt(v_b_t) + self.epsilon)
+            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
+            v_t = (self.beta_2 * v) + (1 - self.beta_2) * (g**2)
+            p_t = p - lr_t * m_t / (T.sqrt(v_t) + self.epsilon)
 
             self.updates.append((m, m_t))
             self.updates.append((v, v_t))
@@ -209,8 +200,7 @@ def get_config(self):
                 "lr": self.lr,
                 "beta_1": self.beta_1,
                 "beta_2": self.beta_2,
-                "epsilon": self.epsilon,
-                "kappa": self.kappa}
+                "epsilon": self.epsilon}
 
 # aliases
 sgd = SGD
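
For reviewers, here is a minimal standalone sketch of the update rule the patched `get_updates` now computes: the bias corrections for the two moment estimates are folded into a per-step learning rate `lr_t`, as in the "efficient version" described in Section 2 of the Adam paper. The NumPy function below (`adam_step` is an illustrative name, not part of the Keras API) mirrors the `+` lines above rather than the Theano code itself.

```python
import numpy as np

def adam_step(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
    """One Adam update for parameter p given gradient g.

    m, v are the running first/second moment estimates (start at 0),
    t is the 1-based timestep. Returns the updated (p, m, v).
    """
    # Fold the bias corrections for m and v into the step size,
    # exactly as the patch does with lr_t:
    lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    m = beta_1 * m + (1 - beta_1) * g          # biased first moment estimate
    v = beta_2 * v + (1 - beta_2) * g ** 2     # biased second moment estimate
    p = p - lr_t * m / (np.sqrt(v) + epsilon)  # parameter update
    return p, m, v

# Toy usage: minimize f(p) = p**2 starting from p = 3.
p, m, v = 3.0, 0.0, 0.0
for t in range(1, 501):
    g = 2 * p                      # gradient of p**2
    p, m, v = adam_step(p, g, m, v, t, lr=0.1)
print(p)  # close to the minimum at 0
```

One nuance of this formulation: epsilon is added to sqrt(v_t) rather than to the bias-corrected sqrt of v_hat_t as in Algorithm 1 of the paper, which effectively scales epsilon by sqrt(1 - beta_2**t); with the default epsilon=1e-8 the difference is negligible.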