You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by lx...@apache.org on 2017/07/07 15:58:25 UTC
[12/50] [abbrv] incubator-mxnet-test git commit: add Adamax & Nadam
(#6784)
add Adamax & Nadam (#6784)
* add Adamax & Nadam
* change abs to NDabs
* update nadam
* Update optimizer.py
Project: http://git-wip-us.apache.org/repos/asf/incubator-mxnet-test/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-mxnet-test/commit/8d3aa29a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-mxnet-test/tree/8d3aa29a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-mxnet-test/diff/8d3aa29a
Branch: refs/heads/master
Commit: 8d3aa29a18d109772d5f81b877d88a1d727c2a2d
Parents: 4b1dd22
Author: CNevd <CN...@users.noreply.github.com>
Authored: Thu Jun 29 06:56:20 2017 +0800
Committer: Eric Junyuan Xie <pi...@users.noreply.github.com>
Committed: Wed Jun 28 15:56:20 2017 -0700
----------------------------------------------------------------------
python/mxnet/optimizer.py | 121 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 119 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-mxnet-test/blob/8d3aa29a/python/mxnet/optimizer.py
----------------------------------------------------------------------
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 3d31800..57fadf4 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -4,7 +4,7 @@ import pickle
import logging
import warnings
import numpy
-from .ndarray import NDArray, zeros, clip, sqrt, sign, array
+from .ndarray import (NDArray, zeros, clip, sqrt, sign, array, maximum, abs as NDabs)
from .ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update,
mp_sgd_update, mp_sgd_mom_update)
from .random import normal
@@ -777,7 +777,124 @@ class Ftrl(Optimizer):
# update weight
weight[:] = (sign(dn) * self.lamda1 - dn) / \
- ((self.beta + sqrt(n)) / lr + wd) * (NDArray.abs(dn) > self.lamda1)
+ ((self.beta + sqrt(n)) / lr + wd) * (NDabs(dn) > self.lamda1)
+
@register
class Adamax(Optimizer):
    """The AdaMax optimizer.

    A variant of Adam based on the infinity norm, described at
    http://arxiv.org/abs/1412.6980 Section 7.

    This optimizer accepts the following parameters in addition to those accepted
    by :class:`.Optimizer`.

    Parameters
    ----------
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
    beta2 : float, optional
        Exponential decay rate for the exponentially weighted infinity norm.
    """
    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs):
        super(Adamax, self).__init__(learning_rate=learning_rate, **kwargs)
        self.beta1 = beta1
        self.beta2 = beta2

    def create_state(self, index, weight):
        # Two zero-initialized buffers: the first moment m_t and the
        # exponentially weighted infinity norm u_t.
        first_moment = zeros(weight.shape, weight.context, dtype=weight.dtype)
        inf_norm = zeros(weight.shape, weight.context, dtype=weight.dtype)
        return (first_moment, inf_norm)

    def update(self, index, weight, grad, state):
        assert isinstance(weight, NDArray)
        assert isinstance(grad, NDArray)
        lr = self._get_lr(index)
        wd = self._get_wd(index)
        self._update_count(index)
        t = self._index_update_count[index]

        # Fold the first-moment bias correction into the step size.
        lr /= 1. - self.beta1 ** t

        # Rescale the gradient, apply weight decay, then clip if requested.
        grad = self.rescale_grad * grad + wd * weight
        if self.clip_gradient is not None:
            grad = clip(grad, -self.clip_gradient, self.clip_gradient)

        # m_t: biased first moment; u_t: elementwise max of the decayed
        # previous norm and the current |grad| (infinity-norm update).
        m_t, u_t = state
        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
        u_t[:] = maximum(self.beta2 * u_t, NDabs(grad))

        # Parameter step: w -= lr * m_t / u_t.
        weight[:] -= lr * m_t / u_t
+
@register
class Nadam(Optimizer):
    """The Nesterov Adam optimizer.

    Much like Adam is essentially RMSprop with momentum, Nadam is Adam
    with Nesterov momentum, available at
    http://cs229.stanford.edu/proj2015/054_report.pdf.

    This optimizer accepts the following parameters in addition to those accepted
    by :class:`.Optimizer`.

    Parameters
    ----------
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
    epsilon : float, optional
        Small value to avoid division by 0.
    schedule_decay : float, optional
        Exponential decay rate for the momentum schedule
    """
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 schedule_decay=0.004, **kwargs):
        super(Nadam, self).__init__(learning_rate=learning_rate, **kwargs)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.schedule_decay = schedule_decay
        # Running product of the momentum schedule (prod of momentum_t).
        self.m_schedule = 1.

    def create_state(self, index, weight):
        return (zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
                zeros(weight.shape, weight.context, dtype=weight.dtype))  # variance

    def update(self, index, weight, grad, state):
        assert isinstance(weight, NDArray)
        assert isinstance(grad, NDArray)
        lr = self._get_lr(index)
        wd = self._get_wd(index)
        self._update_count(index)

        t = self._index_update_count[index]

        # Preprocess grad: rescale, add weight decay, then clip.
        # BUGFIX: the previous `grad *= self.rescale_grad + wd * weight`
        # multiplied grad elementwise by (rescale_grad + wd * weight) due to
        # operator precedence, and mutated the caller's grad in place.
        # Use the same out-of-place form as the other optimizers in this file.
        grad = grad * self.rescale_grad + wd * weight
        if self.clip_gradient is not None:
            grad = clip(grad, -self.clip_gradient, self.clip_gradient)

        # Warming momentum schedule (Dozat's mu_t / mu_{t+1}).
        momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay)))
        momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay)))
        self.m_schedule = self.m_schedule * momentum_t
        m_schedule_next = self.m_schedule * momentum_t_1

        # Update biased first (m_t) and second (v_t) moment estimates.
        m_t, v_t = state
        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
        v_t[:] = self.beta2 * v_t + (1. - self.beta2) * grad * grad

        # Bias-corrected estimates, with the Nesterov look-ahead blended in.
        grad_prime = grad / (1. - self.m_schedule)
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t_prime = v_t / (1. - pow(self.beta2, t))
        m_t_bar = (1. - momentum_t) * grad_prime + momentum_t_1 * m_t_prime

        # Parameter step: w -= lr * m_bar / (sqrt(v') + eps).
        weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon)
@register
class Test(Optimizer):