Posted to commits@mxnet.apache.org by ha...@apache.org on 2019/02/15 01:08:07 UTC
[incubator-mxnet] branch master updated: In-place updates for Nadam, Adadelta, Adamax and SGLD (#13960)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new a4e249b In-place updates for Nadam, Adadelta, Adamax and SGLD (#13960)
a4e249b is described below
commit a4e249bee2db7b931ecdd6fed05a94e742e7c3c5
Author: Anirudh <an...@gmail.com>
AuthorDate: Thu Feb 14 17:07:43 2019 -0800
In-place updates for Nadam, Adadelta, Adamax and SGLD (#13960)
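
    The change replaces single-expression state updates, which build the whole
    right-hand side in temporary NDArrays before copying it into the state
    array, with element-wise updates applied directly to the existing arrays.
    A minimal sketch of the pattern, with illustrative values for beta1, m_t
    and grad (the names mirror the diff below, but the snippet itself is not
    part of the commit):

    import mxnet as mx

    beta1 = 0.9                              # illustrative decay coefficient
    m_t = mx.nd.array([0.5, -0.1, 0.2])      # illustrative optimizer state
    grad = mx.nd.array([0.1, -0.2, 0.3])     # illustrative gradient

    # Single-expression form: the full right-hand side is materialized in
    # temporary NDArrays and then copied into m_t.
    # m_t[:] = beta1 * m_t + (1. - beta1) * grad

    # In-place form: scale the state in place, then accumulate in place,
    # which avoids building the combined temporary on every update step.
    m_t[:] *= beta1
    m_t[:] += (1. - beta1) * grad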
---
python/mxnet/optimizer/optimizer.py | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index a986f27..def2c95 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -1091,8 +1091,9 @@ class SGLD(Optimizer):
         grad = grad * self.rescale_grad
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
-        weight[:] += - lr/2 * (grad + wd * weight) + normal(0, math.sqrt(lr), shape=weight.shape,
-                                                            dtype=weight.dtype, ctx=weight.context)
+        weight[:] += - lr/2 * (grad + wd * weight)
+        weight[:] += normal(0, math.sqrt(lr), shape=weight.shape,
+                            dtype=weight.dtype, ctx=weight.context)
@@ -1372,9 +1373,11 @@ class AdaDelta(Optimizer):
         acc_g, acc_delta = state
 
         # update g, delta
-        acc_g[:] = self.rho * acc_g + (1. - self.rho) * grad * grad
+        acc_g[:] *= self.rho
+        acc_g[:] += (1. - self.rho) * grad * grad
         current_delta = sqrt(acc_delta + self.epsilon) / sqrt(acc_g + self.epsilon) * grad
-        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta * current_delta
+        acc_delta[:] *= self.rho
+        acc_delta[:] += (1. - self.rho) * current_delta * current_delta
 
         # update weight
         weight[:] -= current_delta + wd * weight
@@ -1507,7 +1510,8 @@ class Adamax(Optimizer):
 
         # update m_t and u_t
         m_t, u_t = state
-        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
+        m_t[:] *= self.beta1
+        m_t[:] += (1. - self.beta1) * grad
         u_t[:] = maximum(self.beta2 * u_t, NDabs(grad))
 
         # update weight
@@ -1570,8 +1574,10 @@ class Nadam(Optimizer):
 
         # update m_t and v_t
         m_t, v_t = state
-        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
-        v_t[:] = self.beta2 * v_t + (1. - self.beta2) * grad * grad
+        m_t[:] *= self.beta1
+        m_t[:] += (1. - self.beta1) * grad
+        v_t[:] *= self.beta2
+        v_t[:] += (1. - self.beta2) * grad * grad
         grad_prime = grad / (1. - self.m_schedule)
         m_t_prime = m_t / (1. - m_schedule_next)
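
The rewritten updates are algebraically identical to the expressions they
replace; only the evaluation order and the number of temporaries change. A
quick equivalence check for the Nadam moment updates, using illustrative
values that are not part of the commit, could look like this:

    import mxnet as mx

    beta1, beta2 = 0.9, 0.999                # illustrative Nadam coefficients
    grad = mx.nd.array([0.1, -0.2, 0.3])     # illustrative gradient
    m0 = mx.nd.array([0.5, -0.1, 0.2])       # illustrative first-moment state
    v0 = mx.nd.array([0.04, 0.02, 0.09])     # illustrative second-moment state
    m_ref, v_ref = m0.copy(), v0.copy()
    m_new, v_new = m0.copy(), v0.copy()

    # original single-expression updates
    m_ref[:] = beta1 * m_ref + (1. - beta1) * grad
    v_ref[:] = beta2 * v_ref + (1. - beta2) * grad * grad

    # in-place updates from this commit
    m_new[:] *= beta1
    m_new[:] += (1. - beta1) * grad
    v_new[:] *= beta2
    v_new[:] += (1. - beta2) * grad * grad

    assert abs((m_ref - m_new).asnumpy()).max() < 1e-6
    assert abs((v_ref - v_new).asnumpy()).max() < 1e-6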