Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/15 10:11:16 UTC

[03/12] incubator-singa git commit: SINGA-58 Fix fan-in dimension of weight matrix

SINGA-58 Fix fan-in dimension of weight matrix

Use the number of columns of a weight matrix as its fan-in.
Layers that have a weight matrix should follow this assumption.
Otherwise, there will be errors when the weight matrix is initialized based on fan-in.
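
For illustration, a minimal sketch of the convention, assuming a row-major
(rows x cols) weight buffer; the function and variable names here are
hypothetical and only mirror the kUniformSqrtFanIn branch changed in
src/utils/param.cc below, not the actual SINGA API:

#include <cassert>
#include <cmath>
#include <random>
#include <vector>

// Fill a (rows x cols) weight with Uniform(low, high) samples scaled by
// value / sqrt(fan_in / 3), where fan_in is taken from the number of columns.
void UniformSqrtFanIn(std::vector<float>* w, int rows, int cols,
                      float low, float high, float value) {
  assert(static_cast<int>(w->size()) == rows * cols);
  const float fan_in = static_cast<float>(cols);  // cols == fan-in by convention
  const float scale = value / std::sqrt(fan_in / 3.0f);
  std::mt19937 gen(0);
  std::uniform_real_distribution<float> dist(low, high);
  for (float& x : *w)
    x = dist(gen) * scale;
}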


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fcd377ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fcd377ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fcd377ae

Branch: refs/heads/master
Commit: fcd377aed543f5a44deeb3145551b107e6cc2324
Parents: a584da6
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sat Aug 15 11:37:59 2015 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sat Aug 15 14:59:11 2015 +0800

----------------------------------------------------------------------
 examples/cifar10/job.conf | 16 ++++++++--------
 examples/mnist/conv.conf  | 16 ++++++++--------
 include/utils/param.h     |  8 ++++----
 src/neuralnet/layer.cc    |  8 ++++----
 src/proto/job.proto       |  8 ++++----
 src/utils/param.cc        | 10 ++++------
 src/utils/updater.cc      | 18 +++++++++---------
 7 files changed, 41 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index 89afca9..fdf6167 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -65,12 +65,12 @@ neuralnet {
       name: "w1"
       init_method:kGaussian
       std:0.0001
-      learning_rate_multiplier:1.0
+      lr_scale:1.0
     }
     param {
       name: "b1"
       init_method: kConstant
-      learning_rate_multiplier:2.0
+      lr_scale:2.0
       value:0
     }
   }
@@ -115,12 +115,12 @@ neuralnet {
       name: "w2"
       init_method:kGaussian
       std:0.01
-      learning_rate_multiplier:1.0
+      lr_scale:1.0
     }
     param {
       name: "b2"
       init_method: kConstant
-      learning_rate_multiplier:2.0
+      lr_scale:2.0
       value:0
     }
   }
@@ -197,14 +197,14 @@ neuralnet {
       name: "w4"
       init_method:kGaussian
       std:0.01
-      learning_rate_multiplier:1.0
-      weight_decay_multiplier:250
+      lr_scale:1.0
+      wd_scale:250
     }
     param {
       name: "b4"
       init_method: kConstant
-      learning_rate_multiplier:2.0
-      weight_decay_multiplier:0
+      lr_scale:2.0
+      wd_scale:0
       value:0
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
index ba6f6a7..3509a36 100644
--- a/examples/mnist/conv.conf
+++ b/examples/mnist/conv.conf
@@ -63,12 +63,12 @@ neuralnet {
     param{
         name: "w1"
         init_method:kUniformSqrtFanIn
-        learning_rate_multiplier:1.0
+        lr_scale:1.0
       }
     param{
         name: "b1"
         init_method: kConstant
-        learning_rate_multiplier:2.0
+        lr_scale:2.0
         value:0
       }
   }
@@ -94,12 +94,12 @@ neuralnet {
     param{
         name: "w2"
         init_method:kUniformSqrtFanIn
-        learning_rate_multiplier:1.0
+        lr_scale:1.0
       }
     param{
         name: "b2"
         init_method: kConstant
-        learning_rate_multiplier:2.0
+        lr_scale:2.0
         value:0
       }
   }
@@ -123,12 +123,12 @@ neuralnet {
     param{
         name: "w3"
         init_method:kUniformSqrtFanIn
-        learning_rate_multiplier:1.0
+        lr_scale:1.0
       }
     param{
         name: "b3"
         init_method: kConstant
-        learning_rate_multiplier:2.0
+        lr_scale:2.0
         value:0
     }
 
@@ -150,12 +150,12 @@ neuralnet {
     param {
       name: "w4"
       init_method:kUniformSqrtFanIn
-      learning_rate_multiplier:1
+      lr_scale:1
     }
     param {
       name: "b4"
       init_method: kConstant
-      learning_rate_multiplier:2
+      lr_scale:2
       value:0
     }
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index 8fabe71..2eb66db 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -52,14 +52,14 @@ class Param {
   /**
    * Scale the learning rate when updating parameters in the Param object
    */
-  float learning_rate_multiplier() {
-    return proto_.learning_rate_multiplier();
+  float lr_scale() {
+    return proto_.lr_scale();
   }
   /**
    * Scale the weight decay when updating parameters in the Param object
    */
-  float weight_decay_multiplier() {
-    return proto_.weight_decay_multiplier();
+  float wd_scale() {
+    return proto_.wd_scale();
   }
   /**
    * Parameter name used for Param re-use in other model or sharing between

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index c1fce00..810d0b4 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -341,7 +341,7 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
   Factory<Param>* factory=Singleton<Factory<Param>>::Instance();
   weight_ = factory->Create("Param");
   bias_ = factory->Create("Param");
-  weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
+  weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
   bias_->Setup(proto.param(1), vector<int>{hdim_});
 }
 
@@ -350,7 +350,7 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
   auto src = Tensor2(srclayers_[0]->mutable_data(this));
   auto weight = Tensor2(weight_->mutable_data());
   auto bias = Tensor1(bias_->mutable_data());
-  data=dot(src, weight);
+  data=dot(src, weight.T());
   // repmat: repeat bias vector into batchsize rows
   data+=repmat(bias, batchsize_);
 }
@@ -363,10 +363,10 @@ void InnerProductLayer::ComputeGradient(Phase phas) {
   auto gbias = Tensor1(bias_->mutable_grad());
 
   gbias=sum_rows(grad);
-  gweight=dot(src.T(), grad);
+  gweight=dot(grad.T(), src);
   if(srclayers_[0]->mutable_grad(this)!=nullptr){
     auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this));
-    gsrc=dot(grad, weight.T());
+    gsrc=dot(grad, weight);
   }
 }
 /*****************************************************************************
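
For context on the layer.cc hunks above: with the weight now stored as an
(hdim x vdim) matrix, the forward pass multiplies by the transposed weight,
so each output unit reads vdim inputs and the fan-in equals the number of
columns. A minimal dimension sketch with plain loops (bias omitted), assuming
row-major storage; the actual layer uses the mshadow calls shown in the diff:

// data(batch x hdim) = src(batch x vdim) * W^T, with W stored as (hdim x vdim).
void InnerProductForward(const float* src, const float* W, float* data,
                         int batch, int vdim, int hdim) {
  for (int b = 0; b < batch; ++b) {
    for (int h = 0; h < hdim; ++h) {
      float sum = 0.f;
      for (int v = 0; v < vdim; ++v)
        sum += src[b * vdim + v] * W[h * vdim + v];  // row h holds vdim (fan-in) weights
      data[b * hdim + h] = sum;
    }
  }
}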

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index dbbfc61..fe8dc21 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -213,10 +213,10 @@ message ParamProto {
   // for gaussian sampling
   optional float mean = 8 [default = 0];
   optional float std = 9 [default = 1];
-  // multiplied on the global learning rate.
-  optional float learning_rate_multiplier = 15 [default = 1];
-  // multiplied on the global weight decay.
-  optional float weight_decay_multiplier = 16 [default = 1];
+  // scale factor, multiplied on the global learning rate.
+  optional float lr_scale = 15 [default = 1];
+  // scale factor, multiplied on the global weight decay.
+  optional float wd_scale = 16 [default = 1];
 
   // name of the owner param from which this param shares the values
   optional string share_from = 60;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 2655877..7adea7c 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -53,11 +53,9 @@ void Param::InitValues(int version) {
     break;
   case InitMethod::kUniformSqrtFanIn:
     random->SampleUniform(data, proto_.low(), proto_.high());
-    // only valid for param matrix with dim 1 for fan in
-    LOG(ERROR) << "init fan in";
+    // only valid for param matrix with num of cols as fan in
     CHECK_EQ(data_->shape().size(), 2);
     data *= proto_.value() / sqrt(data_->shape().at(1) / 3.0f);
-    LOG(ERROR) << "end fan in";
     break;
   case InitMethod::kUniformSqrtFanInOut:
     random->SampleUniform(data, proto_.low(), proto_.high());
@@ -96,7 +94,7 @@ Msg* Param::GenPutMsg(bool copy, int idx) {
   void *p = ptr;
   if (copy) p = nullptr;
   msg->AddFormatFrame("iffp", slice_size_[idx],
-      learning_rate_multiplier(), weight_decay_multiplier(), p);
+      lr_scale(), wd_scale(), p);
   if (copy) {
     msg->AddFrame(ptr, slice_size_[idx] * sizeof(float));
   }
@@ -146,8 +144,8 @@ Msg* Param::HandlePutMsg(Msg** msg, bool reserve) {
   float* ptr;
   (*msg)->ParseFormatFrame("iffp", &size, &lr, &wc, &ptr);
   ParamProto proto;
-  proto.set_learning_rate_multiplier(lr);
-  proto.set_weight_decay_multiplier(wc);
+  proto.set_lr_scale(lr);
+  proto.set_wd_scale(wc);
   vector<int> shape{size};
   Setup(proto, shape);
   if (ptr == nullptr) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fcd377ae/src/utils/updater.cc
----------------------------------------------------------------------
diff --git a/src/utils/updater.cc b/src/utils/updater.cc
index c038ca7..7bca6dc 100644
--- a/src/utils/updater.cc
+++ b/src/utils/updater.cc
@@ -68,8 +68,8 @@ void SGDUpdater::Update(int step, Param* param, float grad_scale) {
   Shape<1> s = Shape1(param->size());
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
-  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd = weight_decay_*param->weight_decay_multiplier();
+  float lr = GetLearningRate(step)*param->lr_scale();
+  float wd = weight_decay_*param->wd_scale();
   if (grad_scale != 1.f)
     grad *= grad_scale;
   if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
@@ -99,8 +99,8 @@ void NesterovUpdater::Update(int step, Param* param, float grad_scale) {
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   TensorContainer<cpu, 1> tmp(s);
-  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd = weight_decay_*param->weight_decay_multiplier();
+  float lr = GetLearningRate(step)*param->lr_scale();
+  float wd = weight_decay_*param->wd_scale();
   if (grad_scale != 1.f)
     grad *= grad_scale;
   if (wd > 0) {  // L2 regularization, should be done after timing grad_scale
@@ -125,8 +125,8 @@ void AdaGradUpdater::Update(int step, Param* param, float grad_scale) {
   Tensor<cpu, 1> data(param->mutable_cpu_data(), s);
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
-  float lr = GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd = weight_decay_*param->weight_decay_multiplier();
+  float lr = GetLearningRate(step)*param->lr_scale();
+  float wd = weight_decay_*param->wd_scale();
   if (grad_scale != 1.f)
     grad *= grad_scale;
   if (wd > 0) {  //  L2 regularization, should be done after timing grad_scale
@@ -152,8 +152,8 @@ void RMSPropUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s);
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   history=history*rho_+(1-rho_)*F<op::square>(grad*grad_scale);
-  float lr=GetLearningRate(step)*param->learning_rate_multiplier();
-  float wd=weight_decay_*param->weight_decay_multiplier();
+  float lr=GetLearningRate(step)*param->lr_scale();
+  float wd=weight_decay_*param->wd_scale();
   if(wd>0){ // L2 regularization
     grad+=data*wd;
   }
@@ -175,7 +175,7 @@ void AdaDeltaUpdater::Update(int step, Param* param, float grad_scale){
   Tensor<cpu, 1> history(param->mutable_cpu_history(), s);
   Tensor<cpu, 1> update(param->mutable_cpu_update(), s);
   TensorContainer<cpu, 1> tmp(s);
-  float wd=weight_decay_*param->weight_decay_multiplier();
+  float wd=weight_decay_*param->wd_scale();
   if(wd>0){ // L2 regularization
     grad+=data*wd;
   }