Posted to commits@singa.apache.org by ka...@apache.org on 2016/06/27 14:11:50 UTC

[3/6] incubator-singa git commit: SINGA-204 Support the training of feed-forward neural nets

SINGA-204 Support the training of feed-forward neural nets

Implement the AlexNet model for CIFAR-10, following https://code.google.com/p/cuda-convnet/.
However, the test accuracy is low at 0.72 (it should be 0.82).
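
For context, here is a minimal sketch of the training flow this patch introduces. It mirrors the updated examples/cifar10/alexnet.cc below: CreateNet() is the helper defined in that example, train_x/train_y/test_x/test_y are the mean-subtracted CIFAR-10 tensors built in its Train() function, and the header paths marked "assumed" are not part of this diff.

    #include <memory>
    #include "singa/model/feed_forward_net.h"
    #include "singa/model/optimizer.h"
    #include "singa/model/loss.h"
    #include "singa/model/metric.h"      // assumed to declare Accuracy
    #include "singa/core/device.h"       // assumed to declare CudaGPU

    void TrainSketch(singa::Tensor train_x, singa::Tensor train_y,
                     singa::Tensor test_x, singa::Tensor test_y) {
      auto net = singa::CreateNet();      // layer stack defined in alexnet.cc
      singa::SGD sgd;
      singa::OptimizerConf opt_conf;
      opt_conf.set_momentum(0.9);
      opt_conf.mutable_regularizer()->set_coefficient(0.004);  // global weight decay
      sgd.Setup(opt_conf);
      // Step decay used by the example: 0.001 -> 0.0001 -> 0.00001.
      sgd.SetLearningRateGenerator([](int epoch) {
        if (epoch <= 120) return 0.001f;
        if (epoch <= 130) return 0.0001f;
        return 0.00001f;
      });
      singa::SoftmaxCrossEntropy loss;
      singa::Accuracy acc;
      net.Compile(true, &sgd, &loss, &acc);  // registers params, fills them via their initializers
      auto cuda = std::make_shared<singa::CudaGPU>();
      net.ToDevice(cuda);
      train_x.ToDevice(cuda); train_y.ToDevice(cuda);
      test_x.ToDevice(cuda);  test_y.ToDevice(cuda);
      net.Train(100, 140, train_x, train_y, test_x, test_y);  // batchsize 100, 140 epochs
    }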


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/71eb059c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/71eb059c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/71eb059c

Branch: refs/heads/dev
Commit: 71eb059cd13ea41e74195c7c115f927aaf143490
Parents: cf1d841
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Mon Jun 27 01:21:59 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 27 15:29:05 2016 +0800

----------------------------------------------------------------------
 examples/cifar10/alexnet.cc             | 123 ++++++++++++++++++---------
 examples/cifar10/cifar10.h              |   3 +-
 examples/cifar10/make.sh                |   2 +-
 include/singa/core/tensor.h             |   1 +
 include/singa/model/feed_forward_net.h  |   2 +-
 include/singa/model/initializer.h       |  26 +++++-
 include/singa/model/loss.h              |  12 ++-
 include/singa/model/metric.h            |   2 +-
 include/singa/model/optimizer.h         |  18 ++--
 include/singa/utils/string.h            |  11 +++
 src/core/tensor/math_kernel.cu          |  22 +++++
 src/core/tensor/math_kernel.h           |   2 +
 src/core/tensor/tensor.cc               |  69 ++++++++++-----
 src/core/tensor/tensor_math.h           |   5 ++
 src/core/tensor/tensor_math_cpp.h       |  14 +++
 src/core/tensor/tensor_math_cuda.h      |   9 ++
 src/model/feed_forward_net.cc           | 104 ++++++++++++++--------
 src/model/layer/cudnn_convolution.cc    |   5 +-
 src/model/layer/cudnn_dropout.cc        |   2 +-
 src/model/layer/dense.cc                |   4 +-
 src/model/loss/mse.cc                   |   5 +-
 src/model/loss/softmax_cross_entropy.cc |  11 ++-
 src/model/metric/accuracy.cc            |   1 +
 src/model/optimizer/optimizer.cc        |  30 ++++++-
 src/model/optimizer/sgd.cc              |   1 +
 src/proto/model.proto                   |   5 ++
 test/singa/test_cross_entropy.cc        |   8 +-
 test/singa/test_dense.cc                |   2 +-
 test/singa/test_mse.cc                  |   8 +-
 test/singa/test_tensor_math.cc          |   4 +-
 30 files changed, 370 insertions(+), 141 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/examples/cifar10/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.cc b/examples/cifar10/alexnet.cc
index 45d8571..d6541a3 100644
--- a/examples/cifar10/alexnet.cc
+++ b/examples/cifar10/alexnet.cc
@@ -28,12 +28,13 @@
 #include "../../src/model/layer/cudnn_convolution.h"
 #include "../../src/model/layer/cudnn_activation.h"
 #include "../../src/model/layer/cudnn_pooling.h"
+#include "../../src/model/layer/cudnn_lrn.h"
 #include "../../src/model/layer/dense.h"
 #include "../../src/model/layer/flatten.h"
 namespace singa {
 
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
-                      int pad) {
+                      int pad, float std) {
   LayerConf conf;
   conf.set_name(name);
   conf.set_type("CudnnConvolution");
@@ -42,13 +43,23 @@ LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
   conv->add_kernel_size(kernel);
   conv->add_stride(stride);
   conv->add_pad(pad);
+  conv->set_bias_term(true);
 
-  FillerConf *weight = conv->mutable_weight_filler();
-  weight->set_type("Xavier");
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+//  bspec->set_decay_mult(0);
   return conf;
 }
 
-LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride, int pad) {
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
   LayerConf conf;
   conf.set_name(name);
   conf.set_type("CudnnPooling");
@@ -56,8 +67,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride, int
   pool->set_kernel_size(kernel);
   pool->set_stride(stride);
   pool->set_pad(pad);
-  if (!max_pool)
-    pool->set_pool(PoolingConf_PoolMethod_AVE);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
   return conf;
 }
 
@@ -68,21 +78,38 @@ LayerConf GenReLUConf(string name) {
   return conf;
 }
 
-LayerConf GenDenseConf(string name, int num_output) {
+LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
   LayerConf conf;
   conf.set_name(name);
   conf.set_type("Dense");
   DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
-  FillerConf *weight = dense->mutable_weight_filler();
-  weight->set_type("Xavier");
+  FillerConf *bias = dense->mutable_bias_filler();
+
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+
   return conf;
 }
 
-LayerConf GenSoftmaxConf(string name) {
+LayerConf GenLRNConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnSoftmax");
+  conf.set_type("CudnnLRN");
+  LRNConf *lrn = conf.mutable_lrn_conf();
+  lrn->set_local_size(3);
+  lrn->set_alpha(5e-05);
+  lrn->set_beta(0.75);
   return conf;
 }
 
@@ -92,25 +119,25 @@ LayerConf GenFlattenConf(string name) {
   conf.set_type("Flatten");
   return conf;
 }
-FeedForwardNet CreateNet(Optimizer* opt, Loss<Tensor>* loss, Metric<Tensor>* metric) {
+
+FeedForwardNet CreateNet() {
   FeedForwardNet net;
   Shape s{3, 32, 32};
 
-  net.Add(new CudnnConvolution(), GenConvConf("conv1", 32, 5, 1, 2), &s);
+  net.Add(new CudnnConvolution(), GenConvConf("conv1", 32, 5, 1, 2, 0.0001),
+          &s);
   net.Add(new CudnnActivation(), GenReLUConf("relu1"));
-  net.Add(new CudnnPooling, GenPoolingConf("pool1", true, 3, 2, 0));
-  net.Add(new CudnnConvolution(), GenConvConf("conv2", 32, 5, 1, 2));
+  net.Add(new CudnnPooling(), GenPoolingConf("pool1", true, 3, 2, 1));
+  net.Add(new CudnnLRN(), GenLRNConf("lrn1"));
+  net.Add(new CudnnConvolution(), GenConvConf("conv2", 32, 5, 1, 2, 0.01));
   net.Add(new CudnnActivation(), GenReLUConf("relu2"));
-  net.Add(new CudnnPooling(), GenPoolingConf("pool2", true, 3, 2, 0));
-  net.Add(new CudnnConvolution, GenConvConf("conv3", 64, 5, 1, 2));
+  net.Add(new CudnnPooling(), GenPoolingConf("pool2", false, 3, 2, 1));
+  net.Add(new CudnnLRN(), GenLRNConf("lrn2"));
+  net.Add(new CudnnConvolution, GenConvConf("conv3", 64, 5, 1, 2, 0.01));
   net.Add(new CudnnActivation(), GenReLUConf("relu3"));
-  net.Add(new CudnnConvolution(), GenConvConf("pool3", true, 3, 2, 0));
+  net.Add(new CudnnPooling(), GenPoolingConf("pool3", false, 3, 2, 1));
   net.Add(new Flatten(), GenFlattenConf("flat"));
-  net.Add(new Dense(), GenDenseConf("ip1", 10));
-  OptimizerConf opt_conf;
-  opt_conf.set_momentum(0.9);
-  opt->Setup(opt_conf);
-  net.Compile(true, opt, loss, metric);
+  net.Add(new Dense(), GenDenseConf("ip", 10, 0.01, 250));
   return net;
 }
 
@@ -120,50 +147,62 @@ void Train(float lr, int num_epoch, string data_dir) {
   {
     auto train = data.ReadTrainData();
     size_t nsamples = train.first.shape(0);
-    auto matx = Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
+    auto matx =
+        Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
     const auto mean = Average(matx, 0);
     SubRow(mean, &matx);
     train_x = Reshape(matx, train.first.shape());
     train_y = train.second;
     auto test = data.ReadTestData();
     nsamples = test.first.shape(0);
-    auto maty = Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
+    auto maty =
+        Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
     SubRow(mean, &maty);
     test_x = Reshape(maty, test.first.shape());
     test_y = test.second;
   }
-  LOG(ERROR) << "creating net";
+  LOG(INFO) << "Training samples = " << train_y.shape(0)
+            << " Test samples =" << test_y.shape(0);
+  auto net = CreateNet();
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.004);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator([lr](int step) {
+    if (step <= 120)
+      return 0.001;
+    else if (step <= 130)
+      return 0.0001;
+    else if (step <= 140)
+      return 0.00001;
+  });
   SoftmaxCrossEntropy loss;
   Accuracy acc;
-  SGD sgd;
-  sgd.SetLearningRateGenerator([lr](int step) {return lr;});
-  auto net = CreateNet(&sgd, &loss, &acc);
-
+  net.Compile(true, &sgd, &loss, &acc);
   auto cuda = std::make_shared<CudaGPU>();
   net.ToDevice(cuda);
 
   train_x.ToDevice(cuda);
   train_y.ToDevice(cuda);
-  net.Train(50, num_epoch, train_x, train_y); // test_x, test_y);
+  test_x.ToDevice(cuda);
+  test_y.ToDevice(cuda);
+  net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
 }
-
-
 }
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   singa::InitChannel(nullptr);
   int pos = singa::ArgPos(argc, argv, "-epoch");
-  int nEpoch = 5;
-  if (pos != -1)
-    nEpoch = atoi(argv[pos + 1]);
+  int nEpoch = 140;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
   pos = singa::ArgPos(argc, argv, "-lr");
-  float lr = 0.01;
-  if (pos != -1)
-    lr = atof(argv[pos + 1]);
+  float lr = 0.001;
+  if (pos != -1) lr = atof(argv[pos + 1]);
   pos = singa::ArgPos(argc, argv, "-data");
   string data = "cifar-10-batches-bin";
-  if (pos != -1)
-    data = argv[pos + 1];
+  if (pos != -1) data = argv[pos + 1];
 
   LOG(INFO) << "Start training";
   singa::Train(lr, nEpoch, data);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/examples/cifar10/cifar10.h
----------------------------------------------------------------------
diff --git a/examples/cifar10/cifar10.h b/examples/cifar10/cifar10.h
index 261c048..7f10153 100644
--- a/examples/cifar10/cifar10.h
+++ b/examples/cifar10/cifar10.h
@@ -40,11 +40,12 @@ class Cifar10 {
   const std::pair<Tensor, Tensor> ReadFile(string file, bool shuffle = false);
 
   void ReadImage(std::ifstream* file, int* label, char* buffer);
+
  private:
   const size_t kImageSize = 32;
   const size_t kImageVol = 3072;
   const size_t kBatchSize = 10000;
-  const size_t kTrainFiles = 1;
+  const size_t kTrainFiles = 5;
 
   string dir_path_;
   bool normalize_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/examples/cifar10/make.sh
----------------------------------------------------------------------
diff --git a/examples/cifar10/make.sh b/examples/cifar10/make.sh
index 17e4b39..5a41612 100755
--- a/examples/cifar10/make.sh
+++ b/examples/cifar10/make.sh
@@ -1 +1 @@
-g++ -g --std=c++11 alexnet.cc -o alexnet -I../../include -I../../build/include -I/home/wangwei/local/cudnn4/include -I/home/wangwei/local/include -I/usr/local/cuda/include/ -I../../lib/cnmem/include -L../../build/lib/ -lsinga_core -lsinga_model  -lsinga_utils -lcudart -lcublas -lcurand -lcudnn -L/usr/local/cuda/lib64 -L/home/wangwei/local/cudnn4/lib64 ../../build/lib/libproto.a -lprotobuf
+g++ -g --std=c++11 alexnet.cc -o alexnet -I../../include -I../../build/include -I/home/wangwei/local/cudnn5/include -I/home/wangwei/local/include -I/usr/local/cuda/include/ -I../../lib/cnmem/include -L../../build/lib/ -lsinga_core -lsinga_model  -lsinga_utils -lcudart -lcublas -lcurand -lcudnn -L/home/wangwei/local/cudnn5/lib64 -L/usr/local/cuda/lib64 ../../build/lib/libproto.a -lprotobuf

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 3b496d9..18aa7ef 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -180,6 +180,7 @@ class Tensor {
   template <typename SType>
   Tensor &operator/=(const SType x);
 
+  float L1() const;
   float L2() const;
 
  protected:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/model/feed_forward_net.h
----------------------------------------------------------------------
diff --git a/include/singa/model/feed_forward_net.h b/include/singa/model/feed_forward_net.h
index 9beeb7a..1ca417c 100644
--- a/include/singa/model/feed_forward_net.h
+++ b/include/singa/model/feed_forward_net.h
@@ -72,7 +72,7 @@ class FeedForwardNet {
   void Train(size_t batchsize, int nb_epoch, const Tensor& x, const Tensor& y,
              const Tensor& val_x, const Tensor& val_y);
   /// Train the neural net over one batch of training data.
-  const std::pair<float, float> TrainOnBatch(const Tensor& x, const Tensor& y);
+  const std::pair<float, float> TrainOnBatch(int epoch, const Tensor& x, const Tensor& y);
 
   /// Evaluate the neural net with given data.
   /// Returns one tensor for loss values and one tensor for metric values;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/model/initializer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/initializer.h b/include/singa/model/initializer.h
index 302fc97..7024f70 100644
--- a/include/singa/model/initializer.h
+++ b/include/singa/model/initializer.h
@@ -21,8 +21,8 @@
 #include <string>
 #include "singa/core/tensor.h"
 #include "singa/proto/model.pb.h"
+#include "singa/utils/string.h"
 namespace singa {
-namespace init {
 /// Base class for initializing parameter values.
 using InitializerConf = FillerConf;
 class Initializer {
@@ -40,6 +40,7 @@ class Initializer {
   virtual void Fill(Tensor* t) = 0;
 };
 
+namespace init {
 class Constant : public Initializer {
 public:
   Constant() = default;
@@ -76,7 +77,7 @@ public:
   void Fill(Tensor* t) override { singa::Gaussian(mean_, std_, t); }
 
  private:
-  float mean_ = 0, std_ = 0.01;
+  float mean_ = 0, std_ = 1;
 };
 
 /// Ref: [Bengio and Glorot 2010] Understanding the difficulty of training deep
@@ -86,6 +87,7 @@ public:
   void Fill(Tensor* t) override {
     CHECK_EQ(t->nDim(), 2u);
     float scale = sqrt(6.0f / (t->shape(0) + t->shape(1)));
+    LOG(INFO) << "xavier scale " << scale;
     singa::Uniform(-scale, scale, t);
   }
 };
@@ -100,6 +102,26 @@ class MSRA : public Initializer {
     singa::Gaussian(0.0f, std, t);
   }
 };
+
 }  // namespace init
+
+std::shared_ptr<Initializer> CreateInitializer(const InitializerConf& conf) {
+  std::shared_ptr<Initializer> init;
+  if (ToLowerCase(conf.type()) == "constant") {
+    init = std::make_shared<init::Constant>();
+  } else if (ToLowerCase(conf.type()) == "uniform") {
+    init = std::make_shared<init::Uniform>();
+  } else if (ToLowerCase(conf.type()) == "gaussian") {
+    init = std::make_shared<init::Gaussian>();
+  } else if (ToLowerCase(conf.type()) == "xavier") {
+    init = std::make_shared<init::Xavier>();
+  } else if (ToLowerCase(conf.type()) == "msra") {
+    init = std::make_shared<init::MSRA>();
+  } else {
+    LOG(FATAL) << "Unknown initialization type: " << conf.type();
+  }
+  init->Setup(conf);
+  return init;
+}
 }  // namespace singa
 #endif  // SINGA_MODEL_INITIALIZER_H_
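
The CreateInitializer() factory added above maps the (case-insensitive) FillerConf type string to a concrete initializer and calls Setup() on it; this is how Compile() now fills parameter tensors. A minimal usage sketch, relying only on what the headers in this diff declare:

    #include "singa/core/tensor.h"
    #include "singa/model/initializer.h"

    singa::Tensor MakeGaussianWeight() {
      singa::FillerConf conf;
      conf.set_type("Gaussian");
      conf.set_std(0.01f);                    // mean is left at its default
      singa::Tensor w(singa::Shape{64, 10});  // 2-D parameter tensor, float by default
      auto init = singa::CreateInitializer(conf);
      init->Fill(&w);                         // fills w from N(0, 0.01^2)
      return w;
    }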

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 41ec701..f400768 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -43,13 +43,13 @@ class Loss {
 
   /// Compute the loss values for each sample/instance given the prediction
   /// and the target.
-  virtual Tensor Forward(const Tensor& prediction, const T& target) = 0;
+  virtual Tensor Forward(int flag, const Tensor& prediction, const T& target) = 0;
 
   /// Average loss values for all samples in the mini-batch
   /// It calls Forward() internally. The calling pattern should be
   /// [Evaluate|Forward] Backward.
-  float Evaluate(const Tensor& prediction, const T& target) {
-    Tensor loss = Forward(prediction, target);
+  float Evaluate(int flag, const Tensor& prediction, const T& target) {
+    Tensor loss = Forward(flag, prediction, target);
     loss.ToHost();
     return Sum<float>(loss) / (1.0f * loss.Size());
   }
@@ -68,7 +68,7 @@ class MSE : public Loss<Tensor> {
   /// and the target, which is 0.5/||prediction-target||^2
   /// Users can call Average(const Tensor&) to get the average
   /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+  Tensor Forward(int flag, const Tensor& prediction, const Tensor& target) override;
 
   /// Compute the gradients of the loss values w.r.t. the prediction,
   /// which is (prediction-target)/batchsize
@@ -90,7 +90,7 @@ class SoftmaxCrossEntropy : public Loss<Tensor> {
   /// from Softmax(prediction).
   /// Users can call Average(const Tensor&) to get the average
   /// loss value over all samples in the batch.
-  Tensor Forward(const Tensor& prediction, const Tensor& target) override;
+  Tensor Forward(int flag, const Tensor& prediction, const Tensor& target) override;
 
   /// Compute the gradients of the loss values w.r.t. the prediction,
   /// which is: p[idx] - 1 if idx is the truth category's index; else,
@@ -106,5 +106,3 @@ class SoftmaxCrossEntropy : public Loss<Tensor> {
 }  // namespace singa
 
 #endif  // SINGA_MODEL_LOSS_H_
-
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/model/metric.h
----------------------------------------------------------------------
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
index d013fa4..b100435 100644
--- a/include/singa/model/metric.h
+++ b/include/singa/model/metric.h
@@ -48,7 +48,7 @@ class Metric {
 
   /// Comptue the metric value averaged over all samples (in a batch)
   float Evaluate(const Tensor& prediction, const T& target) {
-    const Tensor& metric = Forward(prediction, target);
+    const Tensor metric = Forward(prediction, target);
     return Sum<float>(metric) / (1.0f * metric.Size());
   }
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
index a268126..2ec68fe 100644
--- a/include/singa/model/optimizer.h
+++ b/include/singa/model/optimizer.h
@@ -41,7 +41,7 @@ class Regularizer;
 class Optimizer {
  public:
   Optimizer() = default;
-
+  virtual ~Optimizer();
   /// Setup the optimzier using configurations from serialized string (for
   /// binding languages).
   void Setup(const string& str) {
@@ -51,7 +51,7 @@ class Optimizer {
   }
 
   /// Setup the meta fields of the optimizer
-  virtual void Setup(const OptimizerConf& conf) {}
+  virtual void Setup(const OptimizerConf& conf);
   /// Register the parameter, e.g., create Constraint and Regularizers.
   /// If there is no constraint or regularizer, then no need to register the
   /// parameter.
@@ -76,15 +76,21 @@ class Optimizer {
   void SetLearningRateGenerator(function<float(int)> func) {
     learning_rate_generator_ = func;
   }
-  /// Since Optimizer base layer has pure virtual function, a virtual
-  /// deconstructor is needed.
-  virtual ~Optimizer() = default;
+  float GetLearningRate(int step) {
+    if (learning_rate_generator_)
+      return learning_rate_generator_(step);
+    else
+      return 0;
+  }
 
  protected:
   function<float(int)> learning_rate_generator_;
   std::unordered_map<std::string, float> learning_rate_multplier_;
+  std::unordered_map<std::string, float> weight_decay_multplier_;
   std::unordered_map<std::string, Constraint*> constraints_;
   std::unordered_map<std::string, Regularizer*> regularizers_;
+  Constraint* constraint_ = nullptr;
+  Regularizer* regularizer_ = nullptr;
 };
 
 /// Apply constraints for parameters (gradient).
@@ -141,7 +147,7 @@ class Regularizer {
   /// e.g., clip each gradient if it is too large w.r.t the threshold,
   /// \ref
   /// https://www.reddit.com/r/MachineLearning/comments/31b6x8/gradient_clipping_rnns/
-  void Apply(int step, Tensor* grad, Tensor* value);
+  void Apply(int step, Tensor* grad, Tensor* value, float scale = 1.0f);
   /// Apply the regularizer for multiple parameter objects together.
   /// \ref https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
   void Apply(int step, const vector<Tensor*>& grads,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/include/singa/utils/string.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/string.h b/include/singa/utils/string.h
index cbfb28b..b4c7c24 100644
--- a/include/singa/utils/string.h
+++ b/include/singa/utils/string.h
@@ -51,6 +51,17 @@ inline int ArgPos(int argc, char** arglist, const char* arg) {
   return -1;
 }
 
+template<typename T>
+inline std::string VecToStr(const std::vector<T> & in) {
+  std::string out = "(";
+
+  for (auto x : in) {
+    out += std::to_string(x) + ", ";
+  }
+  out += ")";
+  return out;
+}
+
 /**
  * Tokenize a string.
  *

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index 4135ab8..13005af 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -265,6 +265,19 @@ __global__ void KernelLT(const size_t num, const float *in, const float x,
     out[idx] = in[idx] < x ? 1.0f : 0.0f;
   }
 }
+
+__global__ void KernelRowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr) {
+  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nrow;
+       idx += blockDim.x * gridDim.x) {
+    int offset = idx * ncol;
+    float maxval = inPtr[offset];
+    for (size_t k = 1; k < ncol; k++) {
+      maxval = max(maxval, inPtr[offset + k]);
+    }
+    outPtr[idx] = maxval;
+  }
+}
 __global__ void KernelComputeCrossEntropy(const size_t batchsize,
                                           const size_t dim, const float *p,
                                           const int *t, float *loss) {
@@ -286,6 +299,9 @@ __global__ void KernelSoftmaxCrossEntropyBwd(const size_t batchsize,
     grad[pos] = p[pos] - 1.0f;  // TODO(wangwei) Consider p and grad are diff
   }
 }
+
+
+
 // ********************************
 // Functions call kernels
 // ********************************
@@ -421,6 +437,12 @@ void SoftmaxCrossEntropyBwd(size_t batchsize, const size_t dim, const float *p,
   KernelSoftmaxCrossEntropyBwd <<<ceil(batchsize / CU1DBLOCKF), CU1DBLOCKF>>>
       (batchsize, dim, p, t, grad);
 }
+
+void RowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr, cudaStream_t stream) {
+  KernelRowMax <<<ceil(nrow / CU1DBLOCKF), CU1DBLOCKF>>>(nrow, ncol, inPtr, outPtr);
+}
+
 /*
 void square_grad(int n, const float *in, float *out, cudaStream_t s) {
   kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index d4087e5..63b0d82 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -98,6 +98,8 @@ void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
                             const float *p, const int *t, float *grad,
                             cudaStream_t stream);
 
+void RowMax(const size_t nrow, const size_t ncol, const float *inPtr,
+    float *outPtr, cudaStream_t stream);
 }  // cuda
 
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 898cdc6..b07a23c 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -42,7 +42,8 @@ Tensor::Tensor(Shape &&shape, DataType dtype)
   device_ = defaultDevice;
   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
-Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype)
+Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
+               DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_));
 }
@@ -68,11 +69,10 @@ Tensor::Tensor(Tensor &&in)
   in.block_ = nullptr;
 }
 
-void Tensor::SetBlock(Block* block) {
+void Tensor::SetBlock(Block *block) {
   LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
   if (block_ != nullptr)
-    if (block_->DecRefCount())
-      device_->FreeBlock(block_);
+    if (block_->DecRefCount()) device_->FreeBlock(block_);
   block_ = block;
 }
 
@@ -118,8 +118,7 @@ void Tensor::ToDevice(std::shared_ptr<Device> dst) {
   // TODO(wangwei) the comparison is very strict. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
-    if (block_ != nullptr && Size())
-      tmp.CopyData(*this);
+    if (block_ != nullptr && Size()) tmp.CopyData(*this);
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = tmp.block_;
@@ -132,13 +131,13 @@ void Tensor::ToHost() { ToDevice(device_->host()); }
 
 template <typename DType>
 void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
-    const size_t offset) {
+                                 const size_t offset) {
   CHECK_EQ(sizeof(DType), SizeOf(data_type_))
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
   if (src != nullptr) {
     device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num,
-        sizeof(DType) * offset);
+                                 sizeof(DType) * offset);
   } else {
     LOG(WARNING) << "Copy data from null host ptr";
   }
@@ -161,8 +160,7 @@ void Tensor::CopyData(const Tensor &src) {
 }
 
 Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
-  if (device == nullptr)
-    device = device_;
+  if (device == nullptr) device = device_;
   Tensor t(shape_, device_, data_type_);
   t.transpose_ = transpose_;
   t.CopyData(*this);
@@ -244,8 +242,6 @@ GenUnaryScalarArgMemberFn(operator+=, Add);
 GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
 GenUnaryScalarArgMemberFn(operator/=, Div);
 
-
-
 // ====================Tensor Operations=======================================
 void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                     const size_t dst_offset, const size_t src_offset) {
@@ -336,6 +332,18 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   } while (0)
 
 // =============Element-wise operations====================================
+float Tensor::L1() const {
+  float nrm = 0.0f;
+  TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+    device_->Exec([&nrm, this](Context *ctx) {
+      DType ret;
+      Asum<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+      nrm = TypeCast<DType, float>(ret);
+    }, {this->block()}, {});
+  });
+  return nrm / Size();
+}
+
 /// L2 norm, Do not use Nrm2 (name conflict).
 float Tensor::L2() const {
   float nrm = 0.0f;
@@ -346,8 +354,10 @@ float Tensor::L2() const {
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
   });
-  return nrm;
+  return nrm / Size();
 }
+
+
 template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
@@ -525,18 +535,35 @@ Tensor SoftMax(const Tensor &in) {
   return out;
 }
 
+Tensor RowMax(const Tensor &in) {
+  Tensor ret({in.shape(0)}, in.device(), in.data_type());
+  TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+    in.device()->Exec([in, ret](Context *ctx) {
+      size_t nrow = 1;
+      if (in.nDim() > 1) nrow = in.shape(0);
+      size_t ncol = in.Size() / nrow;
+      RowMax<DType, Lang>(nrow, ncol, in.block(), ret.block(), ctx);
+    }, {in.block()}, {ret.block()});
+  });
+  return ret;
+}
+
 void SoftMax(const Tensor &in, Tensor *out) {
   CHECK_LE(in.nDim(), 2u);
-  Exp(in, out);
+  out->CopyData(in);
   size_t nrow = 1, ncol = in.Size(), size = ncol;
   if (in.nDim() == 2u) {
     nrow = in.shape(0);
     ncol = size / nrow;
     out->Reshape(Shape{nrow, ncol});
   }
-  Tensor sum(Shape{nrow}, in.device(), in.data_type());
-  SumColumns(*out, &sum);
-  DivColumn(sum, out);
+  Tensor tmp = RowMax(*out);
+  SubColumn(tmp, out);
+  Exp(*out, out);
+
+  SumColumns(*out, &tmp);
+  DivColumn(tmp, out);
+  out->Reshape(in.shape());
 }
 
 void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
@@ -582,8 +609,8 @@ void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
     Mult(alpha, one, vmat, beta, M);
   }
 }
-template
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
+template void AddRow(const float alpha, const float beta, const Tensor &v,
+                     Tensor *M);
 
 /// Divide column 'v' by each column of matrix M; write results into 'out'
 void DivColumn(const Tensor &v, Tensor *M) {
@@ -699,7 +726,7 @@ void MultRow(const Tensor &v, Tensor *M) {
   });
 }
 
-Tensor SliceRows(const Tensor& in, const size_t start, const size_t end) {
+Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) {
   LOG(FATAL) << "Tensor::SliceRows is not implemented";
   Tensor ret;
   /*
@@ -788,6 +815,7 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
 template void Gaussian<float>(const float mean, const float std, Tensor *out);
 
 // ================Blas operations============================================
+
 template <typename SType>
 void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
@@ -869,5 +897,4 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
   });
 }
 
-
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 57ccb88..7732dd2 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -339,6 +339,11 @@ void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim,
   LOG(FATAL) << "Not Implemented";
 }
 
+template <typename DType, typename Lang>
+void RowMax(const size_t nrow, const size_t ncol, const Block *in,
+    const Block *ret, Context* ctx) {
+  LOG(FATAL) << "Not Implemented";
+}
 // **************************************
 // Matrix functions
 // **************************************

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 4717b5f..3e0c8ad 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -549,6 +549,20 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize,
   }
 }
 
+template <>
+void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+                              const Block *in, const Block *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->data());
+  float *outPtr = static_cast<float *>(out->mutable_data());
+  for (size_t r = 0; r < nrow; r++) {
+    int offset = r * ncol;
+    float maxval = inPtr[offset];
+    for (size_t c = 1; c < ncol; c++)
+      maxval = std::max(maxval, inPtr[offset + c]);
+    outPtr[r] = maxval;
+  }
+}
+
 // =========Matrix operations ================================================
 /*
 template <>

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 67ee861..43bfa1b 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -421,6 +421,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize,
   cuda::SoftmaxCrossEntropyBwd(batchsize, dim, pPtr, tPtr, gradPtr,
                                ctx->stream);
 }
+
+template <>
+void RowMax<float, lang::Cuda>(const size_t nrow, const size_t ncol,
+                               const Block* in, const Block* out,
+                               Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->data());
+  float* outPtr = static_cast<float*>(out->mutable_data());
+  cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+}
 }  // namespace singa
 
 #endif  // USE_CUDA

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/feed_forward_net.cc
----------------------------------------------------------------------
diff --git a/src/model/feed_forward_net.cc b/src/model/feed_forward_net.cc
index a24d36a..e682918 100644
--- a/src/model/feed_forward_net.cc
+++ b/src/model/feed_forward_net.cc
@@ -1,22 +1,26 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
 
 #include "singa/model/feed_forward_net.h"
+#include "singa/model/initializer.h"
 #include "singa/utils/logging.h"
 #include "singa/utils/channel.h"
 namespace singa {
@@ -37,12 +41,15 @@ Layer* FeedForwardNet::Add(const LayerConf& conf, const Shape* sample_shape) {
   return layer;
 }
 
-Layer* FeedForwardNet::Add(Layer* layer, const LayerConf& conf, const Shape* sample_shape) {
+Layer* FeedForwardNet::Add(Layer* layer, const LayerConf& conf,
+                           const Shape* sample_shape) {
+  CHECK(conf.has_name()) << "Must set layer name";
   if (sample_shape == nullptr)
     layer->Setup(layers_.back()->GetOutputSampleShape(), conf);
   else
     layer->Setup(*sample_shape, conf);
   Add(layer);
+  LOG(INFO) << layer->name() << VecToStr(layer->GetOutputSampleShape());
   return layer;
 }
 
@@ -75,12 +82,19 @@ void FeedForwardNet::Compile(bool shuffle, Optimizer* opt, Loss<Tensor>* loss,
   opt_ = opt;
   loss_ = loss;
   metric_ = metric;
-  // init params and register them to sgd
+  const auto specs = GetParamSpecs();
+  const auto params = GetParamValues();
+  CHECK_EQ(specs.size(), params.size());
+  for (size_t k = 0; k < specs.size(); k++) {
+    opt_->Register(specs[k].name(), specs[k]);
+    auto init = CreateInitializer(specs[k].filler());
+    init->Fill(params[k]);
+    LOG(INFO) << specs[k].name() << " : " << params[k]->L1();
+  }
 }
 
 void FeedForwardNet::ToDevice(std::shared_ptr<Device> device) {
-  for (auto layer: layers_)
-    layer->ToDevice(device);
+  for (auto layer : layers_) layer->ToDevice(device);
   /*
   opt_->ToDevice(device);
   loss_->ToDevice(device);
@@ -129,7 +143,6 @@ void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
 void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
                            const Tensor& y, const Tensor& val_x,
                            const Tensor& val_y) {
-  InitNetParams();
   CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of sampels in x and y";
   int num_extra_samples = x.shape(0) % batchsize;
   if (num_extra_samples != 0)
@@ -137,13 +150,18 @@ void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
   Channel* train_ch = GetChannel("train_perf");
   train_ch->EnableDestStderr(true);
   Channel* val_ch = GetChannel("val_perf");
+  val_ch->EnableDestStderr(true);
+  std::vector<size_t> index;
+  for (size_t i = 0; i < x.shape(0) / batchsize; i++) index.push_back(i);
   for (int epoch = 0; epoch < nb_epoch; epoch++) {
+    if (shuffle_) std::random_shuffle(index.begin(), index.end());
     float loss = 0.0f, metric = 0.0f;
     size_t b = 0;
     for (; b < x.shape(0) / batchsize; b++) {
-      const Tensor bx = CopyRows(x, b * batchsize, b * batchsize + batchsize);
-      const Tensor by = CopyRows(y, b * batchsize, b * batchsize + batchsize);
-      const auto ret = TrainOnBatch(bx, by);
+      size_t idx = index[b];
+      const Tensor bx = CopyRows(x, idx * batchsize, (idx + 1) * batchsize);
+      const Tensor by = CopyRows(y, idx * batchsize, (idx + 1) * batchsize);
+      const auto ret = TrainOnBatch(epoch, bx, by);
       loss += ret.first;
       metric += ret.second;
     }
@@ -151,7 +169,8 @@ void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
     metric /= b;
     train_ch->Send("Epoch " + std::to_string(epoch) + ", training loss = " +
                    std::to_string(loss) + ", accuracy = " +
-                   std::to_string(metric));
+                   std::to_string(metric) + ", lr = " +
+                   std::to_string(opt_->GetLearningRate(epoch)));
     if (val_x.Size() && val_y.Size()) {
       const auto val_perf = Evaluate(val_x, val_y, batchsize);
       val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
@@ -162,22 +181,28 @@ void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
   }
 }
 
-const std::pair<float, float> FeedForwardNet::TrainOnBatch(const Tensor& x,
+const std::pair<float, float> FeedForwardNet::TrainOnBatch(int epoch,
+                                                           const Tensor& x,
                                                            const Tensor& y) {
   int flag = kTrain;
   const Tensor fea = Forward(flag, x);
-  float loss = loss_->Evaluate(fea, y);
+  float loss = loss_->Evaluate(flag, fea, y);
   float metric = metric_->Evaluate(fea, y);
   const Tensor grad = loss_->Backward();
-  const auto grads = Backward(kTrain, grad);
+  auto grads = Backward(kTrain, grad / static_cast<float>(x.shape(0)));
+  auto names = GetParamNames();
+  auto values = GetParamValues();
+  for (size_t k = 0; k < grads.size(); k++) {
+    opt_->Apply(epoch, names[k], &grads[k], values.at(k));
+  }
   return std::make_pair(loss, metric);
 }
 
 const Tensor FeedForwardNet::Forward(int flag, const Tensor& data) {
   Tensor input = data, output;
   for (auto layer : layers_) {
-//    LOG(INFO) << layer->name();
     output = layer->Forward(flag, input);
+    // LOG(INFO) << layer->name() << ": " << output.L2();
     input = output;
   }
   return output;
@@ -185,13 +210,22 @@ const Tensor FeedForwardNet::Forward(int flag, const Tensor& data) {
 
 const vector<Tensor> FeedForwardNet::Backward(int flag, const Tensor& grad) {
   vector<Tensor> param_grads;
+  std::stack<Tensor> buf;
   Tensor tmp = grad;
   for (int i = layers_.size() - 1; i >= 0; i--) {
- //   LOG(INFO) << layers_.at(i)->name();
+    // LOG(INFO) << layers_.at(i)->name() << " : " << tmp.L2();
     auto ret = layers_.at(i)->Backward(flag, tmp);
     tmp = ret.first;
-    if (ret.second.size())
-      for (const auto x : ret.second) param_grads.push_back(x);
+    if (ret.second.size()) {
+      for (int k = ret.second.size() - 1; k >= 0; k--) {
+        buf.push(ret.second[k]);
+        // LOG(INFO) <<  "      " << buf.top().L2();
+      }
+    }
+  }
+  while (!buf.empty()) {
+    param_grads.push_back(buf.top());
+    buf.pop();
   }
   return param_grads;
 }
@@ -230,8 +264,8 @@ std::pair<Tensor, Tensor> FeedForwardNet::EvaluateOnBatch(const Tensor& x,
   int flag = kEval;
   const Tensor fea = Forward(flag, x);
   const Tensor m = metric_->Forward(fea, y);
-  const Tensor l = loss_->Forward(fea, y);
-  return std::make_pair(m, l);
+  const Tensor l = loss_->Forward(flag, fea, y);
+  return std::make_pair(l, m);
 }
 
 const Tensor FeedForwardNet::Predict(const Tensor& x, size_t batchsize) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index eb507b2..3dca28a 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -72,8 +72,8 @@ void CudnnConvolution::InitCudnn(const Tensor &input) {
       num_filters_, conv_height_, conv_width_));
   if (bias_term_)
     CUDNN_CHECK(cudnnSetTensor4dDescriptor(bias_desc_, CUDNN_TENSOR_NCHW,
-                                           GetCudnnDataType(dtype), 1, 1,
-                                           num_filters_, 1));
+                                           GetCudnnDataType(dtype), 1, 1, 1,
+                                           num_filters_));
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(conv_desc_, pad_h_, pad_w_,
                                               stride_h_, stride_w_, 1, 1,
                                               CUDNN_CROSS_CORRELATION));
@@ -244,6 +244,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnConvolution::Backward(
   }, {grad.block(), weight_.block()}, {dx.block(), workspace_.block()});
   param_grad.push_back(dw);
   param_grad.push_back(db);
+  LOG(INFO) << "bias nrm " << db.L1();
   return std::make_pair(dx, param_grad);
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index f9b9dbf..ab83226 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -108,7 +108,7 @@ const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
 }
 void CudnnDropout::ToDevice(std::shared_ptr<Device> device) {
   Dropout::ToDevice(device);
-  state.ToDevice(device);
+  state_.ToDevice(device);
 }
 }  // namespace singa
 #endif  // CUDNN_VERSION_MAJOR>=5

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index c6a9f8a..338409c 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -41,13 +41,15 @@ void Dense::Setup(const Shape& in_sample, const LayerConf &conf) {
   bias_.Reshape(Shape{hdim_});
   param_values_.push_back(&weight_);
   param_values_.push_back(&bias_);
+  for (auto specs: conf.param())
+    param_specs_.push_back(specs);
 }
 
 /// \copydoc Layer::Forward(int flag, const Tensor&)
 const Tensor Dense::Forward(int flag, const Tensor &input) {
   CHECK(buf_.empty());
   Tensor output;
-  CHECK_EQ(input.nDim(), 2);
+  CHECK_EQ(input.nDim(), 2u);
   if (transpose_)  // use the transposed version of weight_ for computing
     output = Mult(input, weight_);
   else

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/loss/mse.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/mse.cc b/src/model/loss/mse.cc
index a4bbb72..6e19059 100644
--- a/src/model/loss/mse.cc
+++ b/src/model/loss/mse.cc
@@ -20,7 +20,7 @@
 
 namespace singa {
 
-Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
+Tensor MSE::Forward(int flag, const Tensor& prediction, const Tensor& target) {
   CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
                       << " The calling pattern is [Forward|Evaluate] Backward";
   Tensor t = prediction - target;
@@ -28,7 +28,8 @@ Tensor MSE::Forward(const Tensor& prediction, const Tensor& target) {
   if (t.nDim() > 1) batchsize = t.shape().at(0);
   size_t dim = t.Size() / batchsize;
   t.Reshape(Shape{batchsize, dim});
-  buf_.push(t);
+  if (kTrain & flag)
+    buf_.push(t);
   // TODO(wangwei) use CastType for operator/
   return Sum(Square(t), 1) * 0.5f;
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/loss/softmax_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/src/model/loss/softmax_cross_entropy.cc b/src/model/loss/softmax_cross_entropy.cc
index bed3348..3411fbe 100644
--- a/src/model/loss/softmax_cross_entropy.cc
+++ b/src/model/loss/softmax_cross_entropy.cc
@@ -21,7 +21,7 @@
 
 namespace singa {
 
-Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
+Tensor SoftmaxCrossEntropy::Forward(int flag, const Tensor& prediction,
                                     const Tensor& target) {
   CHECK(buf_.empty()) << "Do not call Forward successively for more than twice."
                       << " The calling pattern is [Forward|Evaluate] Backward";
@@ -30,13 +30,17 @@ Tensor SoftmaxCrossEntropy::Forward(const Tensor& prediction,
   size_t dim = prediction.Size() / batchsize;
   const Tensor& input = Reshape(prediction, Shape{batchsize, dim});
   Tensor prob = SoftMax(input);
+  // LOG(INFO) << "prob: " << prob.L2();
 
   // buffer intermediate data
-  buf_.push(prob);
-  buf_.push(target);
+  if (flag & kTrain) {
+    buf_.push(prob);
+    buf_.push(target);
+  }
   Tensor loss(Shape{batchsize}, prob.device(), prob.data_type());
 
   ComputeCrossEntropy(prob, target, &loss);
+
   return loss;
 }
 
@@ -50,4 +54,3 @@ Tensor SoftmaxCrossEntropy::Backward() {
 }
 }  // namespace singa
 
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/metric/accuracy.cc
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.cc b/src/model/metric/accuracy.cc
index 1b667b1..ffda938 100644
--- a/src/model/metric/accuracy.cc
+++ b/src/model/metric/accuracy.cc
@@ -30,6 +30,7 @@ Tensor Accuracy::Match(const Tensor& predict, const vector<int>& target) {
   // TODO(wangwei) CloneToDevice(host);
   const float* prob = prediction.data<float>();
   float* score = new float[batchsize];
+  memset(score, 0, batchsize * sizeof(float));
   for (size_t b = 0; b < batchsize; b++) {
     vector<std::pair<float, int>> prob_class;
     for (size_t c = 0; c < nb_classes; c++) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/optimizer/optimizer.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/optimizer.cc b/src/model/optimizer/optimizer.cc
index c9e7a72..9be47c8 100644
--- a/src/model/optimizer/optimizer.cc
+++ b/src/model/optimizer/optimizer.cc
@@ -21,6 +21,17 @@
 
 namespace singa {
 
+Optimizer::~Optimizer() {
+  for (auto entry : regularizers_) delete entry.second;
+  for (auto entry : constraints_) delete entry.second;
+  if (constraint_ != nullptr) delete constraint_;
+  if (regularizer_ != nullptr) delete regularizer_;
+}
+void Optimizer::Setup(const OptimizerConf& conf) {
+  if (conf.has_regularizer())
+    regularizer_ = new Regularizer(conf.regularizer());
+  if (conf.has_constraint()) constraint_ = new Constraint(conf.constraint());
+}
 void Optimizer::Register(const string& name, const ParamSpec& specs) {
   if (specs.has_constraint()) {
     CHECK(constraints_.find(name) == constraints_.end())
@@ -32,6 +43,11 @@ void Optimizer::Register(const string& name, const ParamSpec& specs) {
         << "Parameter with name = " << name << " has already registered";
     regularizers_[name] = new Regularizer(specs.regularizer());
   }
+  if (specs.has_decay_mult()) {
+    CHECK(weight_decay_multplier_.find(name) == weight_decay_multplier_.end())
+        << "Parameter with name = " << name << " has already registered";
+    weight_decay_multplier_[name] = specs.decay_mult();
+  }
   if (specs.has_lr_mult()) {
     CHECK(learning_rate_multplier_.find(name) == learning_rate_multplier_.end())
         << "Parameter with name = " << name << " has already registered";
@@ -47,10 +63,18 @@ void Optimizer::Register(const string& name, const ParamSpec& specs) {
 void Optimizer::Apply(int step, const string& name, Tensor* grad,
                       Tensor* param) {
   // TODO(wangwei) need to consider the order of constraint and regularizer
-  if (regularizers_.find(name) != regularizers_.end())
+  if (regularizers_.find(name) != regularizers_.end()) {
     regularizers_.at(name)->Apply(step, param, grad);
+  } else if (regularizer_ != nullptr) {
+    float scale = 1.0f;
+    if (weight_decay_multplier_.find(name) != weight_decay_multplier_.end())
+      scale = weight_decay_multplier_.at(name);
+    regularizer_->Apply(step, param, grad, scale);
+  }
   if (constraints_.find(name) != constraints_.end())
     constraints_.at(name)->Apply(step, param, grad);
+  else if (constraint_ != nullptr)
+    constraint_->Apply(step, param, grad);
   float lr = learning_rate_generator_(step);
   if (learning_rate_multplier_.find(name) != learning_rate_multplier_.end())
     lr *= learning_rate_multplier_.at(name);
@@ -62,9 +86,9 @@ void Regularizer::Setup(const RegularizerConf& conf) {
   coefficient_ = conf.coefficient();
 }
 
-void Regularizer::Apply(int step, Tensor* value, Tensor* grad) {
+void Regularizer::Apply(int step, Tensor* value, Tensor* grad, float scale) {
   if (type_ == "L2" || type_ == "l2") {
-    (*grad) -= (*value) * coefficient_;
+    Axpy(coefficient_ * scale, *value, grad);
   } else {
     CHECK(type_ == "NotSet") << "Unknown regularizer type = " << type_;
   }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
index a5c66a1..71071ff 100644
--- a/src/model/optimizer/sgd.cc
+++ b/src/model/optimizer/sgd.cc
@@ -22,6 +22,7 @@
 namespace singa {
 
 void SGD::Setup(const OptimizerConf& conf) {
+  Optimizer::Setup(conf);
   if (conf.has_momentum()) {
     float m = conf.momentum();
     SetMomentumGenerator([m](int step) { return m; });

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index c06deec..b1318d9 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -89,6 +89,11 @@ message OptimizerConf {
 
   // delta is used to avoid dividing zero
   optional float delta = 6 [default = 1e-8];
+
+  // global regularizer lower priority than ParamSpec regularizer
+  optional RegularizerConf regularizer = 10;
+  // global constraint lower priority than ParamSpec constraint
+  optional ConstraintConf constraint = 11;
 }
 
 message ConstraintConf {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/test/singa/test_cross_entropy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cross_entropy.cc b/test/singa/test_cross_entropy.cc
index d73591f..c7fa2fb 100644
--- a/test/singa/test_cross_entropy.cc
+++ b/test/singa/test_cross_entropy.cc
@@ -44,7 +44,7 @@ TEST_F(TestSoftmaxCrossEntropy, CppForward) {
   t.CopyDataFromHostPtr(tdat, 2);
 
   singa::SoftmaxCrossEntropy cross_entropy;
-  const Tensor& loss = cross_entropy.Forward(p, t);
+  const Tensor& loss = cross_entropy.Forward(singa::kEval, p, t);
   auto ldat = loss.data<float>();
 
   const float result_test = -log(0.25);
@@ -58,7 +58,7 @@ TEST_F(TestSoftmaxCrossEntropy, CppBackward) {
   t.CopyDataFromHostPtr(tdat, 2);
 
   singa::SoftmaxCrossEntropy cross_entropy;
-  cross_entropy.Forward(p, t);
+  cross_entropy.Forward(singa::kTrain, p, t);
   const Tensor& grad = cross_entropy.Backward();
 
   auto gdat = grad.data<float>();
@@ -82,7 +82,7 @@ TEST_F(TestSoftmaxCrossEntropy, CudaForward) {
   p.CopyDataFromHostPtr(pdat, 8);
   t.CopyDataFromHostPtr(tdat, 2);
 
-  Tensor loss = cross_entropy.Forward(p, t);
+  Tensor loss = cross_entropy.Forward(singa::kEval, p, t);
   loss.ToHost();
   auto ldat = loss.data<float>();
 
@@ -99,7 +99,7 @@ TEST_F(TestSoftmaxCrossEntropy, CudaBackward) {
   p.CopyDataFromHostPtr(pdat, 8);
   t.CopyDataFromHostPtr(tdat, 2);
 
-  cross_entropy.Forward(p, t);
+  cross_entropy.Forward(singa::kTrain, p, t);
   Tensor grad = cross_entropy.Backward();
 
   grad.ToHost();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_dense.cc b/test/singa/test_dense.cc
index 363fb6e..e80384f 100644
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@ -207,7 +207,7 @@ TEST(Dense, BackwardCuda) {
   singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
   grad.CopyDataFromHostPtr(dy, batchsize * hdim);
 
-  const auto ret = dense.Backward(singa::kTrain, grad);
+  auto ret = dense.Backward(singa::kTrain, grad);
   singa::Tensor in_grad = ret.first;
   singa::Tensor dweight = ret.second.at(0);
   singa::Tensor dbias = ret.second.at(1);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_mse.cc b/test/singa/test_mse.cc
index 788652f..640caf4 100644
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@ -42,7 +42,7 @@ class TestMSE : public ::testing::Test {
 #ifdef USE_CBLAS
 TEST_F(TestMSE, CppForward) {
   singa::MSE mse;
-  const Tensor& loss = mse.Forward(p, t);
+  const Tensor& loss = mse.Forward(singa::kEval, p, t);
   auto ldat = loss.data<float>();
 
   for (size_t i = 0, k = 0; i < loss.Size(); i++) {
@@ -57,7 +57,7 @@ TEST_F(TestMSE, CppForward) {
 
 TEST_F(TestMSE, CppBackward) {
   singa::MSE mse;
-  mse.Forward(p, t);
+  mse.Forward(singa::kTrain, p, t);
   const Tensor& grad = mse.Backward();
 
   auto gdat = grad.data<float>();
@@ -72,7 +72,7 @@ TEST_F(TestMSE, CudaForward) {
   auto dev = std::make_shared<singa::CudaGPU>();
   p.ToDevice(dev);
   t.ToDevice(dev);
-  Tensor loss = mse->Forward(p, t);
+  Tensor loss = mse->Forward(singa::kEval, p, t);
 
   loss.ToHost();
   auto ldat = loss.data<float>();
@@ -94,7 +94,7 @@ TEST_F(TestMSE, CudaBackward) {
   auto dev = std::make_shared<singa::CudaGPU>();
   p.ToDevice(dev);
   t.ToDevice(dev);
-  mse.Forward(p, t);
+  mse.Forward(singa::kTrain, p, t);
   Tensor grad = mse.Backward();
   grad.ToHost();
   auto gdat = grad.data<float>();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/71eb059c/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index f8d0351..2a0df0d 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -346,7 +346,7 @@ TEST_F(TestTensorMath, L2Cpp) {
   float l2 = a.L2();
   float target = 0.0f;
   for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
-  EXPECT_FLOAT_EQ(l2, sqrt(target));
+  EXPECT_FLOAT_EQ(l2, sqrt(target) / a.Size());
 }
 TEST_F(TestTensorMath, MultCpp) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
@@ -514,7 +514,7 @@ TEST_F(TestTensorMath, L2Cuda) {
   float l2 = t.L2();
   float target = 0.0f;
   for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
-  EXPECT_FLOAT_EQ(l2, sqrt(target));
+  EXPECT_FLOAT_EQ(l2, sqrt(target) / t.Size());
 }
 TEST_F(TestTensorMath, MultCuda) {
   const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};