Posted to commits@singa.apache.org by ji...@apache.org on 2016/02/24 10:25:46 UTC

[2/2] incubator-singa git commit: SINGA-145 New SGD based optimization Updaters: AdaDelta, Adam, AdamMax

SINGA-145 New SGD based optimization Updaters: AdaDelta, Adam, AdamMax

New updaters: AdaDelta, Adam, AdamMax.
To implement AdamMax, add two new element-wise Tensor operators in cxxnet_op.h, i.e. op::abs and op::max.
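
The new operators follow the cxxnet_op.h convention of small functor structs whose static Map() is applied element-wise through the F<op>(...) expression templates, as used further down in src/utils/updater.cc (F<op::abs>(grad), F<op::max>(update, grad)). A rough standalone illustration of that pattern, with no mshadow dependency; ApplyUnary/ApplyBinary are hypothetical helpers, not part of SINGA or mshadow:

#include <cstddef>
#include <vector>

// Each operator is a struct whose static Map() is applied element-wise.
struct AbsOp {
  static float Map(float a) { return a < 0 ? -a : a; }
};
struct MaxOp {
  static float Map(float a, float b) { return a > b ? a : b; }
};

// Hypothetical stand-ins for mshadow's F<op>(...) expression templates.
template <typename Op>
void ApplyUnary(std::vector<float>* x) {
  for (float& v : *x) v = Op::Map(v);
}

template <typename Op>
void ApplyBinary(const std::vector<float>& a, const std::vector<float>& b,
                 std::vector<float>* out) {
  for (std::size_t i = 0; i < out->size(); ++i)
    (*out)[i] = Op::Map(a[i], b[i]);
}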


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e32e70cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e32e70cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e32e70cc

Branch: refs/heads/master
Commit: e32e70ccdd16582c633f12e3e6702636139a6078
Parents: 8d4953a
Author: ijingo <ij...@gmail.com>
Authored: Fri Feb 19 16:10:21 2016 +0800
Committer: jinyangturbo <pk...@gmail.com>
Committed: Wed Feb 24 01:21:23 2016 -0800

----------------------------------------------------------------------
 examples/cifar10/job.conf     | 57 +++++++++++++++++-------
 include/mshadow/cxxnet_op.h   | 11 +++++
 include/singa/utils/param.h   |  3 +-
 include/singa/utils/updater.h | 29 +++++++++++--
 src/driver.cc                 |  4 ++
 src/proto/job.proto           | 23 ++++++++++
 src/utils/param.cc            |  1 +
 src/utils/updater.cc          | 88 ++++++++++++++++++++++++++++++--------
 8 files changed, 178 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index d20b452..22b2bb2 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,7 +1,7 @@
 name: "cifar10-convnet"
-train_steps: 1000
+train_steps: 5000
 test_steps: 100
-test_freq: 200
+test_freq: 500
 #validate_steps: 100
 #validate_freq: 300
 disp_freq: 50
@@ -9,20 +9,47 @@ disp_freq: 50
 train_one_batch {
   alg: kBP
 }
+#updater{
+#  type: kSGD
+#  weight_decay:0.004
+#  momentum:0.9
+#  learning_rate {
+#    type: kFixedStep
+#    fixedstep_conf:{
+#      step:0
+#      step:60000
+#      step:65000
+#      step_lr:0.001
+#      step_lr:0.0001
+#      step_lr:0.00001
+#    }
+#  }
+#}
+#updater{
+#  type: kAdaDelta
+#  weight_decay:0.004
+#  delta: 0.000001
+#  learning_rate {
+#    type: kFixed
+#    base_lr:1
+#  }
+#}
+#updater{
+#  type: kAdamMax
+#  weight_decay:0.004
+#  delta: 0.00000001
+#  learning_rate {
+#    type: kFixed
+#    base_lr:0.0001
+#  }
+#}
 updater{
-  type: kSGD
+  type: kAdamMax
   weight_decay:0.004
-  momentum:0.9
+  delta: 0.00000001
   learning_rate {
-    type: kFixedStep
-    fixedstep_conf:{
-      step:0
-      step:60000
-      step:65000
-      step_lr:0.001
-      step_lr:0.0001
-      step_lr:0.00001
-    }
+    type: kFixed
+    base_lr:0.002
   }
 }
 neuralnet {
@@ -273,7 +300,7 @@ neuralnet {
 cluster {
   nworker_groups: 1
   nserver_groups: 1
-  nworkers_per_group: 1
-  nworkers_per_procs: 1
+  nworkers_per_group: 4
+  nworkers_per_procs: 4
   workspace: "examples/cifar10"
 }

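The commented-out kSGD block keeps the original fixed-step schedule around for reference. Presumably kFixedStep pairs each step boundary with a step_lr value and holds that rate until the next boundary is reached; a minimal sketch under that assumption (FixedStepLr is a hypothetical helper, not SINGA's LRGenerator API):

#include <cstddef>
#include <vector>

// Piecewise-constant learning-rate lookup, assuming kFixedStep semantics:
// rates[i] applies from boundaries[i] until the next boundary.
float FixedStepLr(int step, const std::vector<int>& boundaries,
                  const std::vector<float>& rates) {
  float lr = rates.empty() ? 0.0f : rates[0];
  for (std::size_t i = 0; i < boundaries.size(); ++i)
    if (step >= boundaries[i]) lr = rates[i];
  return lr;
}

// With boundaries {0, 60000, 65000} and rates {0.001, 0.0001, 0.00001} as in
// the config above, FixedStepLr(30000, ...) yields 0.001f and
// FixedStepLr(62000, ...) yields 0.0001f.
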
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
index 930caf2..1422070 100644
--- a/include/mshadow/cxxnet_op.h
+++ b/include/mshadow/cxxnet_op.h
@@ -86,6 +86,12 @@ namespace mshadow {
             }
         };
 
+        struct abs{
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a < 0 ? -a : a;
+            }
+        };
+
     }; //namespace op
 
 }; //namespace mshadow
@@ -110,6 +116,11 @@ namespace mshadow {
                 return sqrt(a+b);
             }
         };
+        struct max {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a > b ? a : b;
+            }
+        };
     }; // namespace op
 }; // namespace mshadow
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/param.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/param.h b/include/singa/utils/param.h
index 9930710..fcaaeb7 100644
--- a/include/singa/utils/param.h
+++ b/include/singa/utils/param.h
@@ -219,6 +219,7 @@ class Param {
   inline float* mutable_cpu_data() { return data_.mutable_cpu_data(); }
   inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
   inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
+  inline float* mutable_cpu_update() { return update_.mutable_cpu_data(); }
   /**
    * @return slice start ID
    */
@@ -355,7 +356,7 @@ class Param {
   std::vector<bool> pending_update_;
   int num_pending_requests_ = 0;
   // data, gradient, history gradient of this parameter
-  Blob<float> data_, grad_, history_;
+  Blob<float> data_, grad_, history_, update_;
   ParamProto proto_;
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/updater.h b/include/singa/utils/updater.h
index 7fec78c..b14f72b 100644
--- a/include/singa/utils/updater.h
+++ b/include/singa/utils/updater.h
@@ -125,19 +125,40 @@ class RMSPropUpdater : public Updater {
 
  protected:
   float rho_;
+  float delta_;
 };
 
-/*
 class AdaDeltaUpdater : public Updater {
  public:
-  virtual void Update(int step, Param* param, float grad_scale);
+  void Init(const UpdaterProto &proto) override;
+  void Update(int step, Param* param, float grad_scale) override;
 
  protected:
   float rho_;
   float delta_;
-  float weight_decay_;
 };
-*/
+
+class AdamUpdater : public Updater {
+  public:
+   void Init(const UpdaterProto &proto) override;
+   void Update(int step, Param* param, float grad_scale) override;
+
+  protected:
+   float beta1_;
+   float beta2_;
+   float delta_;
+};
+
+class AdamMaxUpdater : public Updater {
+  public:
+   void Init(const UpdaterProto &proto) override;
+   void Update(int step, Param* param, float grad_scale) override;
+
+  protected:
+   float beta1_;
+   float beta2_;
+   float delta_;
+};
 
 }  // namespace singa
 

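Read together with the definitions further down in src/utils/updater.cc, the two new classes implement the following per-element updates. A minimal scalar sketch (weight decay, grad_scale and the per-param lr/wd scale factors omitted); history_ holds the first moment and the new update_ blob holds the second moment (Adam) or the exponentially weighted infinity norm (AdamMax):

#include <algorithm>
#include <cmath>

struct MomentState { float m = 0.f, u = 0.f; };  // history_, update_

void AdamStep(float& x, float g, MomentState& s,
              float lr, float beta1, float beta2, float delta) {
  s.m = beta1 * s.m + (1 - beta1) * g;        // first moment  (history_)
  s.u = beta2 * s.u + (1 - beta2) * g * g;    // second moment (update_)
  x -= lr * s.m / std::sqrt(s.u + delta);     // op::sqrtop(a, b) == sqrt(a + b)
}

void AdamMaxStep(float& x, float g, MomentState& s,
                 float lr, float beta1, float beta2, float delta) {
  s.m = beta1 * s.m + (1 - beta1) * g;                 // first moment (history_)
  s.u = std::max(beta2 * s.u, std::fabs(g)) + delta;   // infinity norm (update_)
  x -= lr * s.m / s.u;
}
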
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 1e4929f..6163865 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -128,6 +128,10 @@ void Driver::Init(int argc, char **argv) {
   RegisterUpdater<AdaGradUpdater>(kAdaGrad);
   RegisterUpdater<NesterovUpdater>(kNesterov);
   RegisterUpdater<RMSPropUpdater>(kRMSProp);
+  RegisterUpdater<AdaDeltaUpdater>(kAdaDelta);
+  RegisterUpdater<AdamUpdater>(kAdam);
+  RegisterUpdater<AdamMaxUpdater>(kAdamMax);
+
   RegisterUpdater<SGDUpdater>(kSGD);
 
   // register learning rate change methods

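The registration calls above presumably bind each UpdaterType enum value to a factory for the corresponding class, so the type named in job.conf's updater block can be instantiated at runtime. A generic sketch of that enum-to-factory pattern; UpdaterRegistry and UpdaterBase are illustrative names, not SINGA's actual Driver/Factory API:

#include <functional>
#include <map>
#include <memory>

// Illustrative base class standing in for singa::Updater.
struct UpdaterBase { virtual ~UpdaterBase() = default; };

class UpdaterRegistry {
 public:
  // Map an integer type id (e.g. an UpdaterType enum value) to a creator.
  template <typename T>
  void Register(int type) {
    creators_[type] = [] { return std::unique_ptr<UpdaterBase>(new T()); };
  }
  std::unique_ptr<UpdaterBase> Create(int type) const {
    auto it = creators_.find(type);
    if (it == creators_.end()) return nullptr;
    return it->second();
  }
 private:
  std::map<int, std::function<std::unique_ptr<UpdaterBase>()>> creators_;
};

// Under this reading, Driver::Init above roughly amounts to
// registry.Register<AdamMaxUpdater>(kAdamMax) and so on for each type.
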
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index da52ea9..7bc0ea3 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -138,6 +138,12 @@ message UpdaterProto {
 
   // configuration for RMSProp algorithm
   optional RMSPropProto rmsprop_conf = 3;
+  // configuration for AdaDelta algorithm
+  optional AdaDeltaProto adadelta_conf = 4;
+  // configuration for Adam algorithm
+  optional AdamProto adam_conf = 5;
+  // configuration for AdamMax algorithm
+  optional AdamMaxProto adammax_conf = 6;
 
   // learning rate generator
   optional LRGenProto learning_rate = 11;
@@ -561,6 +567,17 @@ message RMSPropProto {
  // history=history*rho_+(1-rho_)*(grad*grad_scale);
   required float rho = 1;
 }
+message AdaDeltaProto {
+  required float rho = 1 [default = 0.9];
+}
+message AdamProto {
+  required float beta1 = 1 [default = 0.9];
+  required float beta2 = 2 [default = 0.999];
+}
+message AdamMaxProto {
+  required float beta1 = 1 [default = 0.9];
+  required float beta2 = 2 [default = 0.999];
+}
 
 message FixedStepProto {
   repeated int32 step = 28;
@@ -713,6 +730,12 @@ enum UpdaterType {
   kRMSProp = 3;
   // Nesterov first optimal gradient method
   kNesterov = 4;
+  // AdaDelta
+  kAdaDelta = 5;
+  // Adam
+  kAdam = 6;
+  // AdamMax
+  kAdamMax = 7;
   // For user defined updater
   kUserUpdater = 105;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 95396bc..158c777 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -154,6 +154,7 @@ void Param::Setup(const vector<int>& shape) {
   data_.Reshape(shape);
   grad_.Reshape(shape);
   history_.Reshape(shape);
+  update_.Reshape(shape);
 }
 
 void Param::InitValues() {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 1b3e26c..3f45d9e 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -181,6 +181,7 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
 void RMSPropUpdater::Init(const UpdaterProto& proto) {
   Updater::Init(proto);
   rho_ = proto.rmsprop_conf().rho();
+  delta_ = proto.delta();
 }
 
 void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
@@ -198,14 +199,13 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
   if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
     grad += data * wd;
   history = history * rho_ + (1 - rho_) * F<square>(grad);
-  data -= lr * grad / (F<sqrtop>(history, proto_.delta()));
+  data -= lr * grad / F<sqrtop>(history, delta_);
 }
-/***********************AdaDelta******************************
+/***********************AdaDelta******************************/
 void AdaDeltaUpdater::Init(const UpdaterProto& proto){
   Updater::Init(proto);
-  delta_=proto.delta();
-  rho_=proto.rho();
-  weight_decay_=proto.weight_decay();
+  delta_ = proto.delta();
+  rho_=proto.adadelta_conf().rho();
 }
 
 void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
@@ -215,19 +215,71 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
   TensorContainer<cpu, 1> tmp(s);
-  float wd=weight_decay_*param->wd_scale();
-  if(wd>0){ // L2 regularization
-    grad+=data*wd;
-  }
-  if(step==0){
-    history=0;
-    update=0;
-  }
-  history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
-  tmp=grad*F<op::sqrtop>(update, delta_)/F<op::sqrtop>(history, delta_);
-  update=rho_*update+(1-rho_)*F<op::square>(tmp);
-  data-=tmp;
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * rho_ + (1 - rho_) * F<op::square>(grad);
+  tmp = grad * F<op::sqrtop>(update, delta_) / F<op::sqrtop>(history, delta_);
+  update = rho_ * update + (1 - rho_) * F<op::square>(tmp);
+  if (lr != 1.f)
+    data -= lr * tmp;
+  else 
+    data -= tmp;
+}
+
+/***********************Adam******************************/
+void AdamUpdater::Init(const UpdaterProto &proto) {
+  Updater::Init(proto);
+  beta1_=proto.adam_conf().beta1();
+  beta2_=proto.adam_conf().beta2();
+  delta_ = proto.delta();
+}
+
+void AdamUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * beta1_ + (1 - beta1_) * grad;
+  update = update * beta2_ + (1 - beta2_) * F<op::square>(grad);
+  data -= lr * history / F<op::sqrtop>(update, delta_);
+}
+
+/***********************AdamMax******************************/
+void AdamMaxUpdater::Init(const UpdaterProto &proto) {
+  Updater::Init(proto);
+  beta1_=proto.adammax_conf().beta1();
+  beta2_=proto.adammax_conf().beta2();
+  delta_=proto.delta();
+}
+
+void AdamMaxUpdater::Update(int step, Param* param, float grad_scale) {
+  Shape<1> s=Shape1(param->size());
+  Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+  Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+  Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+  Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+  float wd = weight_decay_*param->wd_scale();
+  float lr = lr_gen_->Get(step) * param->lr_scale();
+  if (grad_scale != 1.f)
+    grad *= grad_scale;
+  if (wd > 0)  //  L2 regularization, should be done after timing grad_scale
+    grad += data * wd;
+  history = history * beta1_ + (1 - beta1_) * grad;
+  update = update * beta2_;
+  grad = F<op::abs>(grad);
+  update = F<op::max>(update, grad) + delta_;
+  data -= lr * history / update;
 }
-*/
 
 }  // namespace singa
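
For reference: kAdam and kAdamMax above match Adam and its infinity-norm variant AdaMax from Kingma & Ba, "Adam: A Method for Stochastic Optimization" (ICLR 2015), except that neither implementation applies the paper's bias-correction factors 1 - beta1^t and 1 - beta2^t (in the paper, AdaMax only needs the correction on the first moment). A minimal scalar sketch of the bias-corrected Adam step, shown for comparison only, not what the committed code does:

#include <cmath>

// Bias-corrected Adam step for one parameter element (t counts from 1).
void AdamStepBiasCorrected(float& x, float g, float& m, float& v, int t,
                           float lr, float beta1, float beta2, float eps) {
  m = beta1 * m + (1 - beta1) * g;
  v = beta2 * v + (1 - beta2) * g * g;
  float m_hat = m / (1.0f - std::pow(beta1, t));
  float v_hat = v / (1.0f - std::pow(beta2, t));
  x -= lr * m_hat / (std::sqrt(v_hat) + eps);
}

With beta1 = 0.9 and beta2 = 0.999, the missing corrections mainly affect the early steps, when the raw moment estimates are still biased toward zero.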