Posted to commits@mxnet.apache.org by ha...@apache.org on 2018/08/06 16:57:00 UTC

[incubator-mxnet] branch master updated: Doc fix for a few optimizers (#12034)

This is an automated email from the ASF dual-hosted git repository.

haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 70efd32  Doc fix for a few optimizers (#12034)
70efd32 is described below

commit 70efd32a49e05fd0e7224496259ddacdaf61aa2a
Author: Haibin Lin <li...@gmail.com>
AuthorDate: Mon Aug 6 09:56:47 2018 -0700

    Doc fix for a few optimizers (#12034)
    
    * Update optimizer.py
    
    * Update optimizer.py
---
 python/mxnet/optimizer.py | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 7e69cf6..ab7dadb 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -449,7 +449,7 @@ class SGD(Optimizer):
     **lazy updates** are applied by::
 
         for row in grad.indices:
-            rescaled_grad[row] = lr * rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]
+            rescaled_grad[row] = lr * (rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row])
             state[row] = momentum[row] * state[row] + rescaled_grad[row]
             weight[row] = weight[row] - state[row]
 
@@ -462,7 +462,7 @@ class SGD(Optimizer):
 
     Otherwise, **standard updates** are applied by::
 
-        rescaled_grad = lr * rescale_grad * clip(grad, clip_gradient) + wd * weight
+        rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight)
         state = momentum * state + rescaled_grad
         weight = weight - state
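
For reference, the standard SGD-with-momentum update shown above transcribes into a minimal NumPy sketch. The function name, signature, and default values below are illustrative only, not MXNet's actual kernel::

    import numpy as np

    def sgd_momentum_step(weight, grad, state, lr=0.01, momentum=0.9,
                          wd=0.0, rescale_grad=1.0, clip_gradient=None):
        # clip the raw gradient first, then rescale and add weight decay,
        # following the docstring formula above
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        rescaled_grad = lr * (rescale_grad * grad + wd * weight)
        state[:] = momentum * state + rescaled_grad   # momentum buffer
        weight[:] = weight - state                    # in-place weight update
        return weight, state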
 
@@ -616,6 +616,14 @@ class FTML(Optimizer):
     *FTML - Follow the Moving Leader in Deep Learning*,
     available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf.
 
+    Denote time step by t. The optimizer updates the weight by::
+
+        rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        v = beta2 * v + (1 - beta2) * square(rescaled_grad)
+        d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon)
+        z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
+        weight = - z / d_t
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
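
A literal NumPy transcription of the FTML update above might look like the sketch below; the state layout, names, and default hyperparameters are assumptions for illustration, not MXNet's implementation::

    import numpy as np

    def ftml_step(weight, grad, state, t, lr=0.0025, beta1=0.6, beta2=0.999,
                  epsilon=1e-8, wd=0.0, rescale_grad=1.0, clip_gradient=None):
        v, z, d_prev = state          # second moment, z term, previous d_t
        grad = grad * rescale_grad + wd * weight
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        v[:] = beta2 * v + (1 - beta2) * np.square(grad)
        d_t = (1 - beta1 ** t) / lr * (np.sqrt(v / (1 - beta2 ** t)) + epsilon)
        z[:] = beta1 * z + (1 - beta1) * grad - (d_t - beta1 * d_prev) * weight
        weight[:] = -z / d_t
        d_prev[:] = d_t               # keep d_t for the next step (t is 1-based)
        return weight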
 
@@ -1080,6 +1088,13 @@ class AdaGrad(Optimizer):
     Methods for Online Learning and Stochastic Optimization*, and available at
     http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
 
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad, clip_gradient)
+        history += square(grad)
+        div = grad / sqrt(history + float_stable_eps)
+        weight += (div + weight * wd) * -lr
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
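
The AdaGrad rule above maps almost one-to-one onto NumPy; a minimal sketch, with illustrative names and defaults, could be::

    import numpy as np

    def adagrad_step(weight, grad, history, lr=0.01, wd=0.0, rescale_grad=1.0,
                     clip_gradient=None, float_stable_eps=1e-7):
        grad = grad * rescale_grad
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        history += np.square(grad)                        # accumulate squared gradients
        div = grad / np.sqrt(history + float_stable_eps)
        weight += (div + weight * wd) * -lr               # in-place weight update
        return weight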
 
@@ -1207,6 +1222,14 @@ class AdaDelta(Optimizer):
     This class implements AdaDelta, an optimizer described in  *ADADELTA: An adaptive
     learning rate method*, available at https://arxiv.org/abs/1212.5701.
 
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad * grad
+        delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * delta * delta
+        weight -= (delta + wd * weight)
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
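
For illustration, the AdaDelta pseudocode above transcribes directly into a NumPy sketch that mirrors the docstring as written, including where the weight-decay terms appear; the names and defaults are assumptions::

    import numpy as np

    def adadelta_step(weight, grad, acc_grad, acc_delta, rho=0.9, epsilon=1e-5,
                      wd=0.0, rescale_grad=1.0, clip_gradient=None):
        grad = grad * rescale_grad + wd * weight
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        acc_grad[:] = rho * acc_grad + (1. - rho) * grad * grad
        delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_grad + epsilon) * grad
        acc_delta[:] = rho * acc_delta + (1. - rho) * delta * delta
        weight -= delta + wd * weight                     # in-place weight update
        return weight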
 
@@ -1333,6 +1356,13 @@ class Adamax(Optimizer):
     It is a variant of Adam based on the infinity norm
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
+    The optimizer updates the weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        m = beta1 * m + (1 - beta1) * grad
+        u = maximum(beta2 * u, abs(grad))
+        weight -= lr / (1 - beta1**t) * m / u
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
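
Likewise, a minimal NumPy sketch of the Adamax update above (illustrative names and defaults, not MXNet's kernel) could be::

    import numpy as np

    def adamax_step(weight, grad, m, u, t, lr=0.002, beta1=0.9, beta2=0.999,
                    wd=0.0, rescale_grad=1.0, clip_gradient=None):
        grad = grad * rescale_grad + wd * weight
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        m[:] = beta1 * m + (1 - beta1) * grad             # first moment estimate
        u[:] = np.maximum(beta2 * u, np.abs(grad))        # infinity-norm term
        weight -= lr / (1 - beta1 ** t) * m / u           # t is the 1-based step count
        return weight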