Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/15 10:11:16 UTC
[03/12] incubator-singa git commit: SINGA-58 Fix fan-in dimension of weight matrix
SINGA-58 Fix fan-in dimension of weight matrix
Use the number of columns of a weight matrix as its fan-in.
Layers that have a weight matrix should follow this convention.
Otherwise, there would be errors when the weight matrix is initialized based on fan-in.
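As a rough illustration of the convention (a standalone sketch, not code from this commit; FanIn and the example shapes are made up):

#include <cassert>
#include <vector>

// Fan-in of a 2-D weight matrix is its number of columns, i.e. shape[1].
int FanIn(const std::vector<int>& shape) {
  assert(shape.size() == 2);
  return shape[1];
}

int main() {
  // An inner-product layer with vdim inputs and hdim outputs stores its
  // weight as {hdim, vdim}, so the fan-in is vdim.
  std::vector<int> weight_shape{500, 784};  // hdim = 500, vdim = 784
  assert(FanIn(weight_shape) == 784);
  return 0;
}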
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fcd377ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fcd377ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fcd377ae
Branch: refs/heads/master
Commit: fcd377aed543f5a44deeb3145551b107e6cc2324
Parents: a584da6
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sat Aug 15 11:37:59 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Aug 15 14:59:11 2015 +0800
----------------------------------------------------------------------
examples/cifar10/job.conf | 16 ++++++++--------
examples/mnist/conv.conf | 16 ++++++++--------
include/utils/param.h | 8 ++++----
src/neuralnet/layer.cc | 8 ++++----
src/proto/job.proto | 8 ++++----
src/utils/param.cc | 10 ++++------
src/utils/updater.cc | 18 +++++++++---------
7 files changed, 41 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index 89afca9..fdf6167 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -65,12 +65,12 @@ neuralnet {
name: "w1"
init_method:kGaussian
std:0.0001
- learning_rate_multiplier:1.0
+ lr_scale:1.0
}
param {
name: "b1"
init_method: kConstant
- learning_rate_multiplier:2.0
+ lr_scale:2.0
value:0
}
}
@@ -115,12 +115,12 @@ neuralnet {
name: "w2"
init_method:kGaussian
std:0.01
- learning_rate_multiplier:1.0
+ lr_scale:1.0
}
param {
name: "b2"
init_method: kConstant
- learning_rate_multiplier:2.0
+ lr_scale:2.0
value:0
}
}
@@ -197,14 +197,14 @@ neuralnet {
name: "w4"
init_method:kGaussian
std:0.01
- learning_rate_multiplier:1.0
- weight_decay_multiplier:250
+ lr_scale:1.0
+ wd_scale:250
}
param {
name: "b4"
init_method: kConstant
- learning_rate_multiplier:2.0
- weight_decay_multiplier:0
+ lr_scale:2.0
+ wd_scale:0
value:0
}
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
index ba6f6a7..3509a36 100644
--- a/examples/mnist/conv.conf
+++ b/examples/mnist/conv.conf
@@ -63,12 +63,12 @@ neuralnet {
param{
name: "w1"
init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ lr_scale:1.0
}
param{
name: "b1"
init_method: kConstant
- learning_rate_multiplier:2.0
+ lr_scale:2.0
value:0
}
}
@@ -94,12 +94,12 @@ neuralnet {
param{
name: "w2"
init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ lr_scale:1.0
}
param{
name: "b2"
init_method: kConstant
- learning_rate_multiplier:2.0
+ lr_scale:2.0
value:0
}
}
@@ -123,12 +123,12 @@ neuralnet {
param{
name: "w3"
init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1.0
+ lr_scale:1.0
}
param{
name: "b3"
init_method: kConstant
- learning_rate_multiplier:2.0
+ lr_scale:2.0
value:0
}
}
@@ -150,12 +150,12 @@ neuralnet {
param {
name: "w4"
init_method:kUniformSqrtFanIn
- learning_rate_multiplier:1
+ lr_scale:1
}
param {
name: "b4"
init_method: kConstant
- learning_rate_multiplier:2
+ lr_scale:2
value:0
}
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index 8fabe71..2eb66db 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -52,14 +52,14 @@ class Param {
/**
* Scale the learning rate when updating parameters in the Param object
*/
- float learning_rate_multiplier() {
- return proto_.learning_rate_multiplier();
+ float lr_scale() {
+ return proto_.lr_scale();
}
/**
* Scale the weight decay when updating parameters in the Param object
*/
- float weight_decay_multiplier() {
- return proto_.weight_decay_multiplier();
+ float wd_scale() {
+ return proto_.wd_scale();
}
/**
* Parameter name used for Param re-use in other model or sharing between
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index c1fce00..810d0b4 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -341,7 +341,7 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
Factory<Param>* factory=Singleton<Factory<Param>>::Instance();
weight_ = factory->Create("Param");
bias_ = factory->Create("Param");
- weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
+ weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
bias_->Setup(proto.param(1), vector<int>{hdim_});
}
@@ -350,7 +350,7 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
auto src = Tensor2(srclayers_[0]->mutable_data(this));
auto weight = Tensor2(weight_->mutable_data());
auto bias = Tensor1(bias_->mutable_data());
- data=dot(src, weight);
+ data=dot(src, weight.T());
// repmat: repeat bias vector into batchsize rows
data+=repmat(bias, batchsize_);
}
@@ -363,10 +363,10 @@ void InnerProductLayer::ComputeGradient(Phase phas) {
auto gbias = Tensor1(bias_->mutable_grad());
gbias=sum_rows(grad);
- gweight=dot(src.T(), grad);
+ gweight=dot(grad.T(), src);
if(srclayers_[0]->mutable_grad(this)!=nullptr){
auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this));
- gsrc=dot(grad, weight.T());
+ gsrc=dot(grad, weight);
}
}
/*****************************************************************************
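For readers following the shape bookkeeping in the hunks above, here is a minimal plain-C++ sketch of the same forward pass (a hypothetical helper, not the mshadow code in the diff): with the weight now stored as hdim x vdim, the output is dot(src, weight.T()).

#include <cassert>
#include <vector>

// Sketch only: src is batchsize x vdim, weight is hdim x vdim (fan-in =
// vdim columns), so the forward pass computes out = dot(src, weight.T()),
// a batchsize x hdim matrix. Likewise, gweight = dot(grad.T(), src) has
// the weight's own shape hdim x vdim.
std::vector<float> Forward(const std::vector<float>& src,
                           const std::vector<float>& weight,
                           const std::vector<float>& bias,
                           int batchsize, int vdim, int hdim) {
  std::vector<float> out(batchsize * hdim);
  for (int b = 0; b < batchsize; ++b)
    for (int h = 0; h < hdim; ++h) {
      float s = bias[h];
      for (int v = 0; v < vdim; ++v)
        s += src[b * vdim + v] * weight[h * vdim + v];
      out[b * hdim + h] = s;
    }
  return out;
}

int main() {
  // one sample, two inputs, one output: y = 2*1 + 3*1 + 0 = 5
  std::vector<float> out = Forward({2.f, 3.f}, {1.f, 1.f}, {0.f}, 1, 2, 1);
  assert(out[0] == 5.f);
  return 0;
}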
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index dbbfc61..fe8dc21 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -213,10 +213,10 @@ message ParamProto {
// for gaussian sampling
optional float mean = 8 [default = 0];
optional float std = 9 [default = 1];
- // multiplied on the global learning rate.
- optional float learning_rate_multiplier = 15 [default = 1];
- // multiplied on the global weight decay.
- optional float weight_decay_multiplier = 16 [default = 1];
+ // scale factor, multiplied on the global learning rate.
+ optional float lr_scale = 15 [default = 1];
+ // scale factor, multiplied on the global weight decay.
+ optional float wd_scale = 16 [default = 1];
// name of the owner param from which this param shares the values
optional string share_from = 60;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 2655877..7adea7c 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -53,11 +53,9 @@ void Param::InitValues(int version) {
break;
case InitMethod::kUniformSqrtFanIn:
random->SampleUniform(data, proto_.low(), proto_.high());
- // only valid for param matrix with dim 1 for fan in
- LOG(ERROR) << "init fan in";
+ // only valid for param matrix with num of cols as fan in
CHECK_EQ(data_->shape().size(), 2);
data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
- LOG(ERROR) << "end fan in";
break;
case InitMethod::kUniformSqrtFanInOut:
random->SampleUniform(data, proto_.low(), proto_.high());
@@ -96,7 +94,7 @@ Msg* Param::GenPutMsg(bool copy, int idx) {
void *p = ptr;
if (copy) p = nullptr;
msg->AddFormatFrame("iffp", slice_size_[idx],
- learning_rate_multiplier(), weight_decay_multiplier(), p);
+ lr_scale(), wd_scale(), p);
if (copy) {
msg->AddFrame(ptr, slice_size_[idx] * sizeof(float));
}
@@ -146,8 +144,8 @@ Msg* Param::HandlePutMsg(Msg** msg, bool reserve) {
float* ptr;
(*msg)->ParseFormatFrame("iffp", &size, &lr, &wc, &ptr);
ParamProto proto;
- proto.set_learning_rate_multiplier(lr);
- proto.set_weight_decay_multiplier(wc);
+ proto.set_lr_scale(lr);
+ proto.set_wd_scale(wc);
vector<int> shape{size};
Setup(proto, shape);
if (ptr == nullptr) {
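A self-contained sketch of the kUniformSqrtFanIn scaling above (a hypothetical helper; the real code uses SINGA's sampler): after drawing from U(low, high), every entry is scaled by value / sqrt(fan_in / 3), where fan_in is the number of columns.

#include <cmath>
#include <random>
#include <vector>

// Sketch only: mirrors the scaling in Param::InitValues for a rows x cols
// weight matrix, where cols is the fan-in.
std::vector<float> UniformSqrtFanIn(int rows, int cols, float low, float high,
                                    float value, unsigned seed) {
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(low, high);
  float scale = value / std::sqrt(cols / 3.0f);
  std::vector<float> data(rows * cols);
  for (float& x : data)
    x = dist(gen) * scale;
  return data;
}

int main() {
  std::vector<float> w = UniformSqrtFanIn(500, 784, -1.f, 1.f, 1.f, 42);
  return w.empty();  // just exercises the sketch
}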
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index c038ca7..7bca6dc 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -68,8 +68,8 @@ void SGDUpdater::Update(int step, Param* param, float grad_scale) {
Shape<1> s = Shape1(param->size());
Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
- float lr = GetLearningRate(step)*param->learning_rate_multiplier();
- float wd = weight_decay_*param->weight_decay_multiplier();
+ float lr = GetLearningRate(step)*param->lr_scale();
+ float wd = weight_decay_*param->wd_scale();
if (grad_scale != 1.f)
grad *= grad_scale;
if (wd > 0) { // L2 regularization, should be done after timing grad_scale
@@ -99,8 +99,8 @@ void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
TensorContainer<cpu, 1> tmp(s);
- float lr = GetLearningRate(step)*param->learning_rate_multiplier();
- float wd = weight_decay_*param->weight_decay_multiplier();
+ float lr = GetLearningRate(step)*param->lr_scale();
+ float wd = weight_decay_*param->wd_scale();
if (grad_scale != 1.f)
grad *= grad_scale;
if (wd > 0) { // L2 regularization, should be done after timing grad_scale
@@ -125,8 +125,8 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
- float lr = GetLearningRate(step)*param->learning_rate_multiplier();
- float wd = weight_decay_*param->weight_decay_multiplier();
+ float lr = GetLearningRate(step)*param->lr_scale();
+ float wd = weight_decay_*param->wd_scale();
if (grad_scale != 1.f)
grad *= grad_scale;
if (wd > 0) { // L2 regularization, should be done after timing grad_scale
@@ -152,8 +152,8 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
- float lr=GetLearningRate(step)*param->learning_rate_multiplier();
- float wd=weight_decay_*param->weight_decay_multiplier();
+ float lr=GetLearningRate(step)*param->lr_scale();
+ float wd=weight_decay_*param->wd_scale();
if(wd>0){ // L2 regularization
grad+=data*wd;
}
@@ -175,7 +175,7 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
TensorContainer<cpu, 1> tmp(s);
- float wd=weight_decay_*param->weight_decay_multiplier();
+ float wd=weight_decay_*param->wd_scale();
if(wd>0){ // L2 regularization
grad+=data*wd;
}
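Every updater in this file follows the same pattern; as a final sketch (a hypothetical standalone function, not the diff's mshadow code), the per-Param scales simply multiply the global settings before the step:

#include <vector>

// Sketch only: lr_scale and wd_scale from the Param scale the global
// learning rate and weight decay, e.g. lr_scale:2.0 for the bias params
// and wd_scale:0 for b4 in the configs above.
void SgdStep(std::vector<float>& data, std::vector<float>& grad,
             float global_lr, float lr_scale,
             float global_wd, float wd_scale) {
  float lr = global_lr * lr_scale;
  float wd = global_wd * wd_scale;
  for (size_t i = 0; i < data.size(); ++i) {
    if (wd > 0)
      grad[i] += wd * data[i];  // L2 regularization
    data[i] -= lr * grad[i];
  }
}

int main() {
  std::vector<float> data{1.f}, grad{0.5f};
  SgdStep(data, grad, 0.01f, 2.0f, 0.004f, 0.f);  // bias-like param: no decay
  return 0;
}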