Posted to commits@singa.apache.org by ji...@apache.org on 2016/02/24 10:25:46 UTC
[2/2] incubator-singa git commit: SINGA-145 New SGD based optimization Updaters: AdaDelta, Adam, AdamMax
SINGA-145 New SGD based optimization Updaters: AdaDelta, Adam, AdamMax
New Updaters: AdaDelta, Adam, AdamMax.
To implement AdamMax, add two new operators for Tensor in cxxnet_op.h, i.e., op::abs and op::max.
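
For reference, the AdamMax update implemented in src/utils/updater.cc below can be written element-wise as the following standalone sketch (plain C++; the function name AdamMaxStep and the raw-pointer interface are illustrative, not part of the commit):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Illustrative sketch of the per-element AdamMax update performed by
// AdamMaxUpdater::Update below: "history" holds the first moment,
// "update" the exponentially weighted infinity norm.
void AdamMaxStep(float* data, const float* grad, float* history,
                 float* update, size_t n,
                 float beta1, float beta2, float delta, float lr) {
  for (size_t i = 0; i < n; ++i) {
    history[i] = history[i] * beta1 + (1 - beta1) * grad[i];
    float g = std::fabs(grad[i]);                       // needs op::abs on tensors
    update[i] = std::max(update[i] * beta2, g) + delta; // needs op::max on tensors
    data[i] -= lr * history[i] / update[i];
  }
}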
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e32e70cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e32e70cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e32e70cc
Branch: refs/heads/master
Commit: e32e70ccdd16582c633f12e3e6702636139a6078
Parents: 8d4953a
Author: ijingo <ij...@gmail.com>
Authored: Fri Feb 19 16:10:21 2016 +0800
Committer: jinyangturbo <pk...@gmail.com>
Committed: Wed Feb 24 01:21:23 2016 -0800
----------------------------------------------------------------------
examples/cifar10/job.conf | 57 +++++++++++++++++-------
include/mshadow/cxxnet_op.h | 11 +++++
include/singa/utils/param.h | 3 +-
include/singa/utils/updater.h | 29 +++++++++++--
src/driver.cc | 4 ++
src/proto/job.proto | 23 ++++++++++
src/utils/param.cc | 1 +
src/utils/updater.cc | 88 ++++++++++++++++++++++++++++++--------
8 files changed, 178 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index d20b452..22b2bb2 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -1,7 +1,7 @@
name: "cifar10-convnet"
-train_steps: 1000
+train_steps: 5000
test_steps: 100
-test_freq: 200
+test_freq: 500
#validate_steps: 100
#validate_freq: 300
disp_freq: 50
@@ -9,20 +9,47 @@ disp_freq: 50
train_one_batch {
alg: kBP
}
+#updater{
+# type: kSGD
+# weight_decay:0.004
+# momentum:0.9
+# learning_rate {
+# type: kFixedStep
+# fixedstep_conf:{
+# step:0
+# step:60000
+# step:65000
+# step_lr:0.001
+# step_lr:0.0001
+# step_lr:0.00001
+# }
+# }
+#}
+#updater{
+# type: kAdaDelta
+# weight_decay:0.004
+# delta: 0.000001
+# learning_rate {
+# type: kFixed
+# base_lr:1
+# }
+#}
+#updater{
+# type: kAdamMax
+# weight_decay:0.004
+# delta: 0.00000001
+# learning_rate {
+# type: kFixed
+# base_lr:0.0001
+# }
+#}
updater{
- type: kSGD
+ type: kAdamMax
weight_decay:0.004
- momentum:0.9
+ delta: 0.00000001
learning_rate {
- type: kFixedStep
- fixedstep_conf:{
- step:0
- step:60000
- step:65000
- step_lr:0.001
- step_lr:0.0001
- step_lr:0.00001
- }
+ type: kFixed
+ base_lr:0.002
}
}
neuralnet {
@@ -273,7 +300,7 @@ neuralnet {
cluster {
nworker_groups: 1
nserver_groups: 1
- nworkers_per_group: 1
- nworkers_per_procs: 1
+ nworkers_per_group: 4
+ nworkers_per_procs: 4
workspace: "examples/cifar10"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
index 930caf2..1422070 100644
--- a/include/mshadow/cxxnet_op.h
+++ b/include/mshadow/cxxnet_op.h
@@ -86,6 +86,12 @@ namespace mshadow {
}
};
+ struct abs{
+ MSHADOW_XINLINE static real_t Map(real_t a) {
+ return a < 0 ? -a : a;
+ }
+ };
+
}; //namespace op
}; //namespace mshadow
@@ -110,6 +116,11 @@ namespace mshadow {
return sqrt(a+b);
}
};
+ struct max {
+ MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+ return a > b ? a : b;
+ }
+ };
}; // namespace op
}; // namespace mshadow
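
Both operators follow mshadow's stateless functor convention: a struct with a static Map function that the F<> expression template applies element-wise over a tensor expression, as the updater code further below does with F<op::abs>(grad) and F<op::max>(update, grad). A standalone sketch of that pattern, using plain std::vector instead of mshadow tensors (the Apply helpers are hypothetical stand-ins for F<>):

#include <cstddef>
#include <vector>

struct abs_op { static float Map(float a) { return a < 0 ? -a : a; } };
struct max_op { static float Map(float a, float b) { return a > b ? a : b; } };

// Stand-in for F<Op>(x): apply a unary Map functor element-wise.
template <typename Op>
std::vector<float> Apply(const std::vector<float>& a) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) out[i] = Op::Map(a[i]);
  return out;
}

// Stand-in for F<Op>(x, y): apply a binary Map functor element-wise.
template <typename Op>
std::vector<float> Apply(const std::vector<float>& a,
                         const std::vector<float>& b) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) out[i] = Op::Map(a[i], b[i]);
  return out;
}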
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/param.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/param.h b/include/singa/utils/param.h
index 9930710..fcaaeb7 100644
--- a/include/singa/utils/param.h
+++ b/include/singa/utils/param.h
@@ -219,6 +219,7 @@ class Param {
inline float* mutable_cpu_data() { return data_.mutable_cpu_data(); }
inline float* mutable_cpu_grad() { return grad_.mutable_cpu_data(); }
inline float* mutable_cpu_history() { return history_.mutable_cpu_data(); }
+ inline float* mutable_cpu_update() { return update_.mutable_cpu_data(); }
/**
* @return slice start ID
*/
@@ -355,7 +356,7 @@ class Param {
std::vector<bool> pending_update_;
int num_pending_requests_ = 0;
// data, gradient, history gradient of this parameter
- Blob<float> data_, grad_, history_;
+ Blob<float> data_, grad_, history_, update_;
ParamProto proto_;
};
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/include/singa/utils/updater.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/updater.h b/include/singa/utils/updater.h
index 7fec78c..b14f72b 100644
--- a/include/singa/utils/updater.h
+++ b/include/singa/utils/updater.h
@@ -125,19 +125,40 @@ class RMSPropUpdater : public Updater {
protected:
float rho_;
+ float delta_;
};
-/*
class AdaDeltaUpdater : public Updater {
public:
- virtual void Update(int step, Param* param, float grad_scale);
+ void Init(const UpdaterProto &proto) override;
+ void Update(int step, Param* param, float grad_scale) override;
protected:
float rho_;
float delta_;
- float weight_decay_;
};
-*/
+
+class AdamUpdater : public Updater {
+ public:
+ void Init(const UpdaterProto &proto) override;
+ void Update(int step, Param* param, float grad_scale) override;
+
+ protected:
+ float beta1_;
+ float beta2_;
+ float delta_;
+};
+
+class AdamMaxUpdater : public Updater {
+ public:
+ void Init(const UpdaterProto &proto) override;
+ void Update(int step, Param* param, float grad_scale) override;
+
+ protected:
+ float beta1_;
+ float beta2_;
+ float delta_;
+};
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 1e4929f..6163865 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -128,6 +128,10 @@ void Driver::Init(int argc, char **argv) {
RegisterUpdater<AdaGradUpdater>(kAdaGrad);
RegisterUpdater<NesterovUpdater>(kNesterov);
RegisterUpdater<RMSPropUpdater>(kRMSProp);
+ RegisterUpdater<AdaDeltaUpdater>(kAdaDelta);
+ RegisterUpdater<AdamUpdater>(kAdam);
+ RegisterUpdater<AdamMaxUpdater>(kAdamMax);
+
RegisterUpdater<SGDUpdater>(kSGD);
// register learning rate change methods
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index da52ea9..7bc0ea3 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -138,6 +138,12 @@ message UpdaterProto {
// configuration for RMSProp algorithm
optional RMSPropProto rmsprop_conf = 3;
+ // configuration for AdaDelta algorithm
+ optional AdaDeltaProto adadelta_conf = 4;
+ // configuration for Adam algorithm
+ optional AdamProto adam_conf = 5;
+ // configuration for AdamMax algorithm
+ optional AdamMaxProto adammax_conf = 6;
// learning rate generator
optional LRGenProto learning_rate = 11;
@@ -561,6 +567,17 @@ message RMSPropProto {
// history=history*rho_+(1-rho_)*(grad*grad_scale);
required float rho = 1;
}
+message AdaDeltaProto {
+ required float rho = 1 [default = 0.9];
+}
+message AdamProto {
+ required float beta1 = 1 [default = 0.9];
+ required float beta2 = 2 [default = 0.999];
+}
+message AdamMaxProto {
+ required float beta1 = 1 [default = 0.9];
+ required float beta2 = 2 [default = 0.999];
+}
message FixedStepProto {
repeated int32 step = 28;
@@ -713,6 +730,12 @@ enum UpdaterType {
kRMSProp = 3;
// Nesterov first optimal gradient method
kNesterov = 4;
+ // AdaDelta
+ kAdaDelta = 5;
+ // Adam
+ kAdam = 6;
+ // AdamMax
+ kAdamMax = 7;
// For user defined updater
kUserUpdater = 105;
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 95396bc..158c777 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -154,6 +154,7 @@ void Param::Setup(const vector<int>& shape) {
data_.Reshape(shape);
grad_.Reshape(shape);
history_.Reshape(shape);
+ update_.Reshape(shape);
}
void Param::InitValues() {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e32e70cc/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index 1b3e26c..3f45d9e 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -181,6 +181,7 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
void RMSPropUpdater::Init(const UpdaterProto& proto) {
Updater::Init(proto);
rho_ = proto.rmsprop_conf().rho();
+ delta_ = proto.delta();
}
void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
@@ -198,14 +199,13 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale) {
if (wd > 0) // L2 regularization, should be done after timing grad_scale
grad += data * wd;
history = history * rho_ + (1 - rho_) * F<square>(grad);
- data -= lr * grad / (F<sqrtop>(history, proto_.delta()));
+ data -= lr * grad / F<sqrtop>(history, delta_);
}
-/***********************AdaDelta******************************
+/***********************AdaDelta******************************/
void AdaDeltaUpdater::Init(const UpdaterProto& proto){
Updater::Init(proto);
- delta_=proto.delta();
- rho_=proto.rho();
- weight_decay_=proto.weight_decay();
+ delta_ = proto.delta();
+ rho_=proto.adadelta_conf().rho();
}
void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
@@ -215,19 +215,71 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
TensorContainer<cpu, 1> tmp(s);
- float wd=weight_decay_*param->wd_scale();
- if(wd>0){ // L2 regularization
- grad+=data*wd;
- }
- if(step==0){
- history=0;
- update=0;
- }
- history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
- tmp=grad*F<op::sqrtop>(update, delta_)/F<op::sqrtop>(history, delta_);
- update=rho_*update+(1-rho_)*F<op::square>(tmp);
- data-=tmp;
+ float wd = weight_decay_*param->wd_scale();
+ float lr = lr_gen_->Get(step) * param->lr_scale();
+ if (grad_scale != 1.f)
+ grad *= grad_scale;
+ if (wd > 0) // L2 regularization, should be done after timing grad_scale
+ grad += data * wd;
+ history = history * rho_ + (1 - rho_) * F<op::square>(grad);
+ tmp = grad * F<op::sqrtop>(update, delta_) / F<op::sqrtop>(history, delta_);
+ update = rho_ * update + (1 - rho_) * F<op::square>(tmp);
+ if (lr != 1.f)
+ data -= lr * tmp;
+ else
+ data -= tmp;
+}
+
+/***********************Adam******************************/
+void AdamUpdater::Init(const UpdaterProto &proto) {
+ Updater::Init(proto);
+ beta1_=proto.adam_conf().beta1();
+ beta2_=proto.adam_conf().beta2();
+ delta_ = proto.delta();
+}
+
+void AdamUpdater::Update(int step, Param* param, float grad_scale) {
+ Shape<1> s=Shape1(param->size());
+ Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+ Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+ Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+ Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+ float wd = weight_decay_*param->wd_scale();
+ float lr = lr_gen_->Get(step) * param->lr_scale();
+ if (grad_scale != 1.f)
+ grad *= grad_scale;
+ if (wd > 0) // L2 regularization, should be done after timing grad_scale
+ grad += data * wd;
+ history = history * beta1_ + (1 - beta1_) * grad;
+ update = update * beta2_ + (1 - beta2_) * F<op::square>(grad);
+ data -= lr * history / F<op::sqrtop>(update, delta_);
+}
+
+/***********************AdamMax******************************/
+void AdamMaxUpdater::Init(const UpdaterProto &proto) {
+ Updater::Init(proto);
+ beta1_=proto.adammax_conf().beta1();
+ beta2_=proto.adammax_conf().beta2();
+ delta_=proto.delta();
+}
+
+void AdamMaxUpdater::Update(int step, Param* param, float grad_scale) {
+ Shape<1> s=Shape1(param->size());
+ Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
+ Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
+ Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
+ Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
+ float wd = weight_decay_*param->wd_scale();
+ float lr = lr_gen_->Get(step) * param->lr_scale();
+ if (grad_scale != 1.f)
+ grad *= grad_scale;
+ if (wd > 0) // L2 regularization, should be done after timing grad_scale
+ grad += data * wd;
+ history = history * beta1_ + (1 - beta1_) * grad;
+ update = update * beta2_;
+ grad = F<op::abs>(grad);
+ update = F<op::max>(update, grad) + delta_;
+ data -= lr * history / update;
}
-*/
} // namespace singa
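
For comparison with the AdamMax sketch near the top of this message, the AdamUpdater::Update loop above corresponds element-wise to the following standalone sketch (plain C++; AdamStep and its raw-pointer interface are illustrative, not part of the commit). As in the code above, delta enters through op::sqrtop, so the divisor is sqrt(update + delta), and no bias-correction terms are applied:

#include <cmath>
#include <cstddef>

// Illustrative sketch of the per-element Adam update in AdamUpdater::Update:
// "history" holds the first moment, "update" the second moment.
void AdamStep(float* data, const float* grad, float* history,
              float* update, size_t n,
              float beta1, float beta2, float delta, float lr) {
  for (size_t i = 0; i < n; ++i) {
    history[i] = history[i] * beta1 + (1 - beta1) * grad[i];
    update[i] = update[i] * beta2 + (1 - beta2) * grad[i] * grad[i];
    data[i] -= lr * history[i] / std::sqrt(update[i] + delta);
  }
}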