Posted to commits@singa.apache.org by ka...@apache.org on 2016/06/27 14:11:49 UTC

[2/6] incubator-singa git commit: SINGA-204 Support the training of feed-forward neural nets

SINGA-204 Support the training of feed-forward neural nets

Fix the bug caused by post- vs pre-increment in Block's IncRefCount/DecRefCount
reference counting, which resulted in out-of-memory errors. Cifar10 data loading works.
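
For readers of the fix in include/singa/core/common.h below: a minimal standalone sketch
(not SINGA code) of why the post-decrement form leaked. Post-decrement returns the count
before the update, so the caller's check DecRefCount() == 0 (as used e.g. in
Tensor::ToDevice when freeing a block) never fires for the last owner and blocks are never
released; pre-decrement returns the updated count. The RefCounted type is a stand-in for
the real Block class.

  #include <atomic>
  #include <cassert>

  // Stand-in for the reference-counted Block; starts with one owner.
  struct RefCounted {
    std::atomic<int> ref_count{1};
    // Buggy form: post-decrement returns the old value, so releasing the
    // last reference returns 1 and the owner never frees the memory.
    int DecRefCountBuggy() { return ref_count--; }
    // Fixed form: pre-decrement returns the updated value, so releasing the
    // last reference returns 0 and the owner frees the memory.
    int DecRefCountFixed() { return --ref_count; }
  };

  int main() {
    RefCounted buggy;
    assert(buggy.DecRefCountBuggy() != 0);  // last owner gone, yet the free check fails -> leak/OOM
    RefCounted fixed;
    assert(fixed.DecRefCountFixed() == 0);  // last owner gone, block gets freed
    return 0;
  }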


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/cf1d8418
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/cf1d8418
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/cf1d8418

Branch: refs/heads/dev
Commit: cf1d841890842c6cf1573491f4fc9d7e1eca30f4
Parents: d826b2e
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sat Jun 25 19:34:13 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Mon Jun 27 15:27:19 2016 +0800

----------------------------------------------------------------------
 examples/cifar10/alexnet.cc            | 105 +++++++++++------
 examples/cifar10/cifar10.cc            |  98 ----------------
 examples/cifar10/cifar10.h             |  99 ++++++++++++++++
 examples/cifar10/make.sh               |   1 +
 include/singa/core/common.h            |  16 +--
 include/singa/core/tensor.h            |   4 +-
 include/singa/model/feed_forward_net.h |  63 +++++-----
 include/singa/model/layer.h            |   2 +-
 include/singa/model/loss.h             |   4 +-
 include/singa/model/metric.h           |  23 ++++
 include/singa/model/optimizer.h        |   2 +-
 src/CMakeLists.txt                     |   3 +-
 src/core/device/cuda_gpu.cc            |   4 +-
 src/core/tensor/tensor.cc              |  14 ++-
 src/model/feed_forward_net.cc          | 176 ++++++++++++++--------------
 src/model/layer/convolution.cc         |   6 +-
 src/model/layer/convolution.h          |   1 +
 src/model/layer/cudnn_activation.cc    |   1 +
 src/model/layer/cudnn_convolution.cc   |   1 +
 src/model/layer/cudnn_dropout.cc       |   4 +
 src/model/layer/cudnn_dropout.h        |   1 +
 src/model/layer/cudnn_pooling.cc       |   1 +
 src/model/layer/cudnn_softmax.cc       |   1 +
 src/model/layer/dense.cc               |   3 +
 src/model/metric/accuracy.cc           |  62 ++++++++++
 src/model/metric/accuracy.h            |  84 -------------
 test/singa/test_accuracy.cc            |   2 +-
 27 files changed, 419 insertions(+), 362 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/examples/cifar10/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.cc b/examples/cifar10/alexnet.cc
index 2917dd2..45d8571 100644
--- a/examples/cifar10/alexnet.cc
+++ b/examples/cifar10/alexnet.cc
@@ -22,7 +22,14 @@
 #include "singa/model/feed_forward_net.h"
 #include "singa/model/optimizer.h"
 #include "singa/model/initializer.h"
-
+#include "singa/model/metric.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "../../src/model/layer/cudnn_convolution.h"
+#include "../../src/model/layer/cudnn_activation.h"
+#include "../../src/model/layer/cudnn_pooling.h"
+#include "../../src/model/layer/dense.h"
+#include "../../src/model/layer/flatten.h"
 namespace singa {
 
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
@@ -32,9 +39,9 @@ LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
   conf.set_type("CudnnConvolution");
   ConvolutionConf *conv = conf.mutable_convolution_conf();
   conv->set_num_output(nb_filter);
-  conv->set_kernel_size(kernel);
-  conv->set_stride(stride);
-  conv->set_pad(pad);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
 
   FillerConf *weight = conv->mutable_weight_filler();
   weight->set_type("Xavier");
@@ -50,7 +57,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride, int
   pool->set_stride(stride);
   pool->set_pad(pad);
   if (!max_pool)
-    pool->set_pool(PoolingConf_AVE);
+    pool->set_pool(PoolingConf_PoolMethod_AVE);
   return conf;
 }
 
@@ -65,9 +72,9 @@ LayerConf GenDenseConf(string name, int num_output) {
   LayerConf conf;
   conf.set_name(name);
   conf.set_type("Dense");
-  DenseConf *dense = conf->mutable_dense_conf();
+  DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
-  FillerConf *weight = conv->mutable_weight_filler();
+  FillerConf *weight = dense->mutable_weight_filler();
   weight->set_type("Xavier");
   return conf;
 }
@@ -79,22 +86,27 @@ LayerConf GenSoftmaxConf(string name) {
   return conf;
 }
 
-
-FeedForwordNet CreateNet(Optimizer* opt, Loss* loss, Metric* metric) {
-  FeedForwordNet net;
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("Flatten");
+  return conf;
+}
+FeedForwardNet CreateNet(Optimizer* opt, Loss<Tensor>* loss, Metric<Tensor>* metric) {
+  FeedForwardNet net;
   Shape s{3, 32, 32};
-  net.AddLayer(GenConvConf("conv1", 32, 5, 1, 2), &s);
-  net.AddLayer(GenReLUConf("relu1"));
-  net.AddLayer(GenConvConf("pool1", 3, 2, 0));
-  net.AddLayer(GenConvConf("conv2", 32, 5, 1, 2));
-  net.AddLayer(GenReLUConf("relu2"));
-  net.AddLayer(GenConvConf("pool2", 3, 2, 0));
-  net.AddLayer(GenConvConf("conv3", 64, 5, 1, 2));
-  net.AddLayer(GenReLUConf("relu3"));
-  net.AddLayer(GenConvConf("pool3", 3, 2, 0));
-  net.AddLayer(GenDenseConf("ip1", 10));
-  net.AddLayer(GenSoftmaxConf("softmax"));
 
+  net.Add(new CudnnConvolution(), GenConvConf("conv1", 32, 5, 1, 2), &s);
+  net.Add(new CudnnActivation(), GenReLUConf("relu1"));
+  net.Add(new CudnnPooling, GenPoolingConf("pool1", true, 3, 2, 0));
+  net.Add(new CudnnConvolution(), GenConvConf("conv2", 32, 5, 1, 2));
+  net.Add(new CudnnActivation(), GenReLUConf("relu2"));
+  net.Add(new CudnnPooling(), GenPoolingConf("pool2", true, 3, 2, 0));
+  net.Add(new CudnnConvolution, GenConvConf("conv3", 64, 5, 1, 2));
+  net.Add(new CudnnActivation(), GenReLUConf("relu3"));
+  net.Add(new CudnnConvolution(), GenConvConf("pool3", true, 3, 2, 0));
+  net.Add(new Flatten(), GenFlattenConf("flat"));
+  net.Add(new Dense(), GenDenseConf("ip1", 10));
   OptimizerConf opt_conf;
   opt_conf.set_momentum(0.9);
   opt->Setup(opt_conf);
@@ -103,42 +115,57 @@ FeedForwordNet CreateNet(Optimizer* opt, Loss* loss, Metric* metric) {
 }
 
 void Train(float lr, int num_epoch, string data_dir) {
-  SoftmaxCrossEntropy loss;
-  Accuracy acc;
-  SGD sgd;
-  sgd.SetLearningRate([lr](int step) {return lr;});
-  auto net = CreateNet(&opt, &loss, &metric);
   Cifar10 data(data_dir);
-  Tensor train_x, tain_y, test_x, test_y;
+  Tensor train_x, train_y, test_x, test_y;
   {
     auto train = data.ReadTrainData();
-    const auto mean = Average(train.first, 0);
-    train_x = SubRow(train.first, mean);
-    auto test = data.ReadTestData();
-    test_x = SubRow(test.first, mean);
+    size_t nsamples = train.first.shape(0);
+    auto matx = Reshape(train.first, Shape{nsamples, train.first.Size() / nsamples});
+    const auto mean = Average(matx, 0);
+    SubRow(mean, &matx);
+    train_x = Reshape(matx, train.first.shape());
     train_y = train.second;
+    auto test = data.ReadTestData();
+    nsamples = test.first.shape(0);
+    auto maty = Reshape(test.first, Shape{nsamples, test.first.Size() / nsamples});
+    SubRow(mean, &maty);
+    test_x = Reshape(maty, test.first.shape());
     test_y = test.second;
   }
-  net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
+  LOG(ERROR) << "creating net";
+  SoftmaxCrossEntropy loss;
+  Accuracy acc;
+  SGD sgd;
+  sgd.SetLearningRateGenerator([lr](int step) {return lr;});
+  auto net = CreateNet(&sgd, &loss, &acc);
+
+  auto cuda = std::make_shared<CudaGPU>();
+  net.ToDevice(cuda);
+
+  train_x.ToDevice(cuda);
+  train_y.ToDevice(cuda);
+  net.Train(50, num_epoch, train_x, train_y); // test_x, test_y);
+}
+
+
 }
 
 int main(int argc, char** argv) {
-  InitChannel();
-  int pos = ArgPos(argc, argv, "-epoch");
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-epoch");
   int nEpoch = 5;
   if (pos != -1)
     nEpoch = atoi(argv[pos + 1]);
-  pos = ArgPos(argc, argv, "-lr");
+  pos = singa::ArgPos(argc, argv, "-lr");
   float lr = 0.01;
   if (pos != -1)
     lr = atof(argv[pos + 1]);
-  pos = ArgPos(argc, argv, "-data");
-  string data = "cifar-10-batch-bin";
+  pos = singa::ArgPos(argc, argv, "-data");
+  string data = "cifar-10-batches-bin";
   if (pos != -1)
     data = argv[pos + 1];
 
   LOG(INFO) << "Start training";
-  Train(lr, nEpoch, data);
+  singa::Train(lr, nEpoch, data);
   LOG(INFO) << "End training";
 }
-}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/examples/cifar10/cifar10.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/cifar10.cc b/examples/cifar10/cifar10.cc
deleted file mode 100644
index 7efc18f..0000000
--- a/examples/cifar10/cifar10.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include <fstream>
-#include <string>
-#include <cstdint>
-#include <iostream>
-
-using std::string;
-namespace singa {
-/// For reading cifar10 binary data as tensors.
-class Cifar10 {
- public:
-  /// 'dir_path': path to the folder including the *.bin files
-  Cifar10(string dir_path, bool normalize = true)
-      : dir_path_(dir_path), normalize_(normalize) {}
-
-  /// read all training data into an image Tensor and a label Tensor
-  const std::pair<Tensor, Tensor> ReadTrainData(bool shuffle = false);
-  /// read all test data into an image Tensor and a label Tensor
-  const std::pair<Tensor, Tensor> ReadTestData();
-  /// read data from one file into an image Tensor and a label Tensor
-  const std::pair<Tensor, Tensor> ReadFile(string file, bool shuffle = false);
-
- private:
-  const int kImageSize = 32;
-  const int kImageVol = 3072;
-  const int kBatchSize = 10000;
-  const int kTrainFiles = 5;
-
-  string dir_path_;
-  bool normalize_;
-};
-
-void read_image(std::ifstream* file, int* label, char* buffer) {
-  char label_char;
-  file->read(&label_char, 1);
-  *label = label_char;
-  file->read(buffer, kImageVol);
-  return;
-}
-const std::pair<Tensor, Tensor> Cifar10::ReadFile(string file,
-                                                  bool shuffle = false) {
-  Tensor images(Shape{kTrainFiles, 3, kImageSize, kImageSize});
-  Tensor labels(Shape{kTrainFiles}, kInt);
-  if (dir_path_.back() != '/') dir_path_.push_back('/');
-  LOG(INFO) << "Reading file " << dir_path_ + file;
-  std::ifstream data_file((dir_path_ + file).c_str(),
-                          std::ios::in | std::ios::binary);
-  CHECK(data_file.is_open()) << "Unable to open file " << file;
-  int label;
-  char image[kImageVol];
-  float float_image[kImageVol];
-  int tmplabels[kBatchSize];
-  for (int itemid = 0; itemid < kBatchSize; ++itemid) {
-    read_image(&data_file, &label, image);
-    for (int i = 0; i < kImageVol; i++)
-      float_image[i] = static_cast<float>(static_cast<int>(image[i]));
-    images.CopyDataFromHostPtr(float_image, kImageVol, itemid * kImageVol);
-    tmplabels[itemid] = label;
-  }
-  labels.CopyDataFromHostPtr(tmplabels, kBatchSize);
-  return std::make_pair(images, labels);
-}
-
-const std::pair<Tensor, Tensor> Cifar10::ReadTrainData(bool shuffle = false) {
-  Tensor images(Shape{kBatchSize * kTrainFiles, 3, kImageSize, kImageSize});
-  Tensor labels(Shape{kBatchSize * kTrainFiles, 3, kImageSize, kImageSize});
-  for (int fileid = 0; fileid < kTrainFiles; ++fileid) {
-    string file = "data_batch_" + std::to_string(fileid + 1) + ".bin";
-    const auto ret = ReadFile(file);
-    CopyDataToFrom(&images, ret.first, ret.first.Size(),
-                   fileid * ret.first.Size());
-    CopyDataToFrom(&labels, ret.second, kBatchSize, fileid * kBatchSize);
-  }
-  return std::make_pair(images, labels);
-}
-const std::pair<Tensor, Tensor> Cifar10::ReadTrainData() {
-  return ReadFile("test_batch.bin");
-}
-}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/examples/cifar10/cifar10.h
----------------------------------------------------------------------
diff --git a/examples/cifar10/cifar10.h b/examples/cifar10/cifar10.h
new file mode 100644
index 0000000..261c048
--- /dev/null
+++ b/examples/cifar10/cifar10.h
@@ -0,0 +1,99 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include <fstream>
+#include <string>
+#include <cstdint>
+#include <iostream>
+#include "singa/core/tensor.h"
+using std::string;
+namespace singa {
+/// For reading cifar10 binary data as tensors.
+class Cifar10 {
+ public:
+  /// 'dir_path': path to the folder including the *.bin files
+  Cifar10(string dir_path, bool normalize = true)
+      : dir_path_(dir_path), normalize_(normalize) {}
+
+  /// read all training data into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadTrainData(bool shuffle = false);
+  /// read all test data into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadTestData();
+  /// read data from one file into an image Tensor and a label Tensor
+  const std::pair<Tensor, Tensor> ReadFile(string file, bool shuffle = false);
+
+  void ReadImage(std::ifstream* file, int* label, char* buffer);
+ private:
+  const size_t kImageSize = 32;
+  const size_t kImageVol = 3072;
+  const size_t kBatchSize = 10000;
+  const size_t kTrainFiles = 1;
+
+  string dir_path_;
+  bool normalize_;
+};
+
+void Cifar10::ReadImage(std::ifstream* file, int* label, char* buffer) {
+  char label_char;
+  file->read(&label_char, 1);
+  *label = static_cast<int>(label_char);
+  file->read(buffer, kImageVol);
+  return;
+}
+const std::pair<Tensor, Tensor> Cifar10::ReadFile(string file, bool shuffle) {
+  Tensor images(Shape{kBatchSize, 3, kImageSize, kImageSize});
+  Tensor labels(Shape{kBatchSize}, kInt);
+  if (dir_path_.back() != '/') dir_path_.push_back('/');
+  LOG(INFO) << "Reading file " << dir_path_ + file;
+  std::ifstream data_file((dir_path_ + file).c_str(),
+                          std::ios::in | std::ios::binary);
+  CHECK(data_file.is_open()) << "Unable to open file " << dir_path_ + file;
+  int label;
+  char image[kImageVol];
+  float float_image[kImageVol];
+  int tmplabels[kBatchSize];
+  for (int itemid = 0; itemid < kBatchSize; ++itemid) {
+    // LOG(INFO) << "reading " << itemid << "-th image";
+    ReadImage(&data_file, &label, image);
+    for (int i = 0; i < kImageVol; i++)
+      float_image[i] = static_cast<float>(static_cast<int>(image[i]));
+    images.CopyDataFromHostPtr(float_image, kImageVol, itemid * kImageVol);
+    tmplabels[itemid] = label;
+  }
+  labels.CopyDataFromHostPtr(tmplabels, kBatchSize);
+  return std::make_pair(images, labels);
+}
+
+const std::pair<Tensor, Tensor> Cifar10::ReadTrainData(bool shuffle) {
+  Tensor images(Shape{kBatchSize * kTrainFiles, 3, kImageSize, kImageSize});
+  Tensor labels(Shape{kBatchSize * kTrainFiles}, kInt);
+  for (int fileid = 0; fileid < kTrainFiles; ++fileid) {
+    string file = "data_batch_" + std::to_string(fileid + 1) + ".bin";
+    const auto ret = ReadFile(file);
+    CopyDataToFrom(&images, ret.first, ret.first.Size(),
+                   fileid * ret.first.Size());
+    CopyDataToFrom(&labels, ret.second, kBatchSize, fileid * kBatchSize);
+  }
+  return std::make_pair(images, labels);
+}
+const std::pair<Tensor, Tensor> Cifar10::ReadTestData() {
+  return ReadFile("test_batch.bin");
+}
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/examples/cifar10/make.sh
----------------------------------------------------------------------
diff --git a/examples/cifar10/make.sh b/examples/cifar10/make.sh
new file mode 100755
index 0000000..17e4b39
--- /dev/null
+++ b/examples/cifar10/make.sh
@@ -0,0 +1 @@
+g++ -g --std=c++11 alexnet.cc -o alexnet -I../../include -I../../build/include -I/home/wangwei/local/cudnn4/include -I/home/wangwei/local/include -I/usr/local/cuda/include/ -I../../lib/cnmem/include -L../../build/lib/ -lsinga_core -lsinga_model  -lsinga_utils -lcudart -lcublas -lcurand -lcudnn -L/usr/local/cuda/lib64 -L/home/wangwei/local/cudnn4/lib64 ../../build/lib/libproto.a -lprotobuf

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/core/common.h
----------------------------------------------------------------------
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index cb1bdca..691d7d4 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -49,27 +49,29 @@ class Block {
  public:
   Block(void* ptr, size_t size, size_t offset = 0)
       : data_(ptr), size_(size), offset_(offset) {
-    ref_count_ = std::make_shared<std::atomic<int>>(1);
+    ref_count_ = 1; //std::make_shared<std::atomic<int>>(1);
   }
-  Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>> ref)
-      : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
+//  Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>> ref)
+//      : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
   void* mutable_data() const { return static_cast<char*>(data_) + offset_; }
   const void* data() const { return static_cast<char*>(data_) + offset_; }
   size_t size() const { return size_; }
   size_t offset() const { return offset_; }
   int IncRefCount() {
-    return (*ref_count_)++;
+    return ++ref_count_;  //(*ref_count_)++;
   }
   int DecRefCount() {
-    return  (*ref_count_)--;
+    return --ref_count_; // (*ref_count_)--;
   }
-  int ref_count() const { return ref_count_->load(); }
+  int ref_count() const { return ref_count_.load(); }
 
  private:
+  Block() {}
   void* data_ = nullptr;
   size_t size_ = 0;
   size_t offset_ = 0;
-  std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
+  // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
+  std::atomic<int> ref_count_;
 };
 
 typedef struct _Context {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6de5c0c..3b496d9 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -96,6 +96,8 @@ class Tensor {
 
   /// return number of total elements
   size_t Size() const {
+    if (block_ == nullptr)
+      return 0u;
     CHECK_EQ(block_->size() % SizeOf(data_type_), 0u);
     return block_->size() / SizeOf(data_type_);
   }
@@ -315,7 +317,7 @@ Tensor Div(const SType x, const Tensor &in);
 template <typename SType>
 void Div(const SType x, const Tensor &in, Tensor *out);
 
-template <typename SType>
+template <typename SType = float>
 SType Sum(const Tensor &in);
 // ============Matrix (row/column) operations==================================
 /// Average elements in the Tensor, currently only support vector and matrix.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/model/feed_forward_net.h
----------------------------------------------------------------------
diff --git a/include/singa/model/feed_forward_net.h b/include/singa/model/feed_forward_net.h
index 173600b..9beeb7a 100644
--- a/include/singa/model/feed_forward_net.h
+++ b/include/singa/model/feed_forward_net.h
@@ -18,7 +18,9 @@
 #ifndef SINGA_MODEL_FEED_FORWARD_NET_H_
 #define SINGA_MODEL_FEED_FORWARD_NET_H_
 #include "singa/model/layer.h"
-
+#include "singa/model/loss.h"
+#include "singa/model/metric.h"
+#include "singa/model/optimizer.h"
 namespace singa {
 
 /// The feed-forward neural net.
@@ -26,14 +28,14 @@ namespace singa {
 /// and conducting training, evaluation and prediction.
 class FeedForwardNet {
  public:
-  FeedForwardNet() = explicit;
+  FeedForwardNet() = default;
   ~FeedForwardNet();
 
   /// Add a layer with the assumption that
   /// 1. this function is called in correct order, i.e., the layers are added
   ///    following the topological order.
   /// 2. this layer has already been setup (Setup function is called outside).
-  void Add(Layer *layer);
+  Layer* Add(Layer* layer);
 
   // TODO(wangwei) add ConcatenateLayer and SliceLayer
   // AddConcatenateLayer(vector<Layer*> src, Layer *dst);
@@ -43,36 +45,34 @@ class FeedForwardNet {
   /// Assume the layer is added in corret order.
   /// For the first layer, 'sample_shape' (the input sample shape) is necessary
   /// for calling Setup().
-  void Add(const LayerConf &conf, const Shape *sample_shape = nullptr);
+  Layer* Add(const LayerConf& conf, const Shape* sample_shape = nullptr);
 
+  Layer* Add(Layer* layer, const LayerConf& conf, const Shape* sample_shape = nullptr);
   /// Set some fields used for training and evaluating the neural net.
   /// If the neural net is constructed for evaluation only, then 'opt' is not
   /// necessary; But for training, both 'opt' and 'loss' are necessary.
   /// 'shuffle' indicates shuffling training samples within one epoch it is
   /// valid using Train();
-  void Compile(bool shuffle, Optimizer *opt, Loss *loss, Metric *metric);
+  void Compile(bool shuffle, Optimizer* opt, Loss<Tensor>* loss,
+               Metric<Tensor>* metric);
 
   /// Conduct the training giving the training data 'x' and label 'y'.
-  /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
-  /// typically used for small training datasets, e.g., cifar10 and MNIST which
-  /// can be stored in main memory.
-  void Train(int batchsize, int nb_epoch, Tensor x, Tensor y);
-  /// Conduct the training giving the training data 'x' and label 'y'.
-  /// 'val_split' is a ratio for splitting (1-'val_split') of training data for
+  /// 'val_split' of training data is used for
   /// validation. Validation is performance before every epoch.
   /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
   /// typically used for small training datasets, e.g., cifar10 and MNIST which
   /// can be stored in main memory.
-  void Train(int batchsize, int nb_epoch, float val_split, Tensor x, Tensor y);
+  void Train(size_t batchsize, int nb_epoch, const Tensor& x, const Tensor& y,
+             float val_split = 0.0f);
   /// Conduct the training given the training and validation data.
   /// Validation is performance before every epoch.
   /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
   /// typically used for small training datasets, e.g., cifar10 and MNIST which
   /// can be stored in main memory.
-  void Train(int batchsize, int nb_epoch, Tensor x, Tensor y, Tensor val_x,
-             Tensor val_y);
+  void Train(size_t batchsize, int nb_epoch, const Tensor& x, const Tensor& y,
+             const Tensor& val_x, const Tensor& val_y);
   /// Train the neural net over one batch of training data.
-  Tensor TrainOnBatch(Tensor x, Tensor y);
+  const std::pair<float, float> TrainOnBatch(const Tensor& x, const Tensor& y);
 
   /// Evaluate the neural net with given data.
   /// Returns one tensor for loss values and one tensor for metric values;
@@ -82,9 +82,10 @@ class FeedForwardNet {
   /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
   /// typically used for small training datasets, e.g., cifar10 and MNIST which
   /// can be stored in main memory.
-  std::pair<Tensor, Tensor> Evaluate(Tensor x, Tensor y, int batchsize = 128);
+  std::pair<Tensor, Tensor> Evaluate(const Tensor& x, const Tensor& y,
+                                     size_t batchsize = 128);
   /// Evaluate the neural net for one batch of data
-  std::pair<Tensor, Tensor> EvaluateOnBatch(Tensor x, Tensor y);
+  std::pair<Tensor, Tensor> EvaluateOnBatch(const Tensor& x, const Tensor& y);
 
   /// Predict the probability distributation over candicate classes for each
   /// data sample. 'batchsize' is used for controlling the memory footprint.
@@ -92,35 +93,37 @@ class FeedForwardNet {
   /// Due to memory limit, 'x' and 'y' could not be very large. Hence, it is
   /// typically used for small training datasets, e.g., cifar10 and MNIST which
   /// can be stored in main memory.
-  Tensor Predict(const Tensor &x, int batchsize = 128);
+  const Tensor Predict(const Tensor& x, size_t batchsize = 128);
   /// Predict for one batch data.
-  Tensor PredictOnBatch(const Tensor &x);
+  const Tensor PredictOnBatch(const Tensor& x);
 
   /// Forward layers one by one using the data batch 'x'.
   /// Returns the prediction results (from the last layer).
-  Tensor Forward(const Tensor& x);
+  const Tensor Forward(int flag, const Tensor& x);
   /// Backward layers one by one using the gradient batch 'grad'.
   /// Returns the parameter gradients.
-  const vector<Tensor> Backward(const Tensor& grad);
+  const vector<Tensor> Backward(int flag, const Tensor& grad);
 
   /// Clone the neuaral net by cloning every layer to the given device.
   /// If 'device' is nullptr, then clone it one the current device.
-  FeedForwardNet Clone(std::shared_ptr<Device> device = nullptr);
+  FeedForwardNet Clone(std::shared_ptr<Device> device);
   /// Move the layer data to the given device.
-  void ToDevice(Device *device);
+  void ToDevice(std::shared_ptr<Device> device);
+  void ToHost() { ToDevice(defaultDevice); }
   /// Set the data type of each layer.
   void AsType(DataType dtype);
 
-  const vector<Layer *> layers() const { return layers_; }
+  const vector<Layer*> layers() const { return layers_; }
   const vector<string> GetParamNames() const;
-  const vector<Tensor *> GetParamValues() const;
-  const vector<Tensor *> GetParamGrads() const;
+  const vector<ParamSpec> GetParamSpecs() const;
+  const vector<Tensor*> GetParamValues() const;
+  const vector<Tensor*> GetParamGrads() const;
 
  protected:
-  vector<Layer *> layers_;
-  Optimizer *opt_;
-  Loss *loss_;
-  Metric *metric_;
+  vector<Layer*> layers_;
+  Optimizer* opt_;
+  Loss<Tensor>* loss_;
+  Metric<Tensor>* metric_;
 
   bool shuffle_ = true;
   Device* device_ = nullptr;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index 79eb069..ce8007c 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -151,7 +151,7 @@ class Layer {
 
   /// Clone the layer to the given device. Layer data (e.g., parameters) are
   /// deep copied. If 'device' is nullptr, then clone it one the current device.
-  virtual Layer* Clone(std::shared_ptr<Device> device);
+  // virtual Layer* Clone(std::shared_ptr<Device> device);
   /// Move the layer (including its parameters and other internal Tensor) onto
   /// the given device
   virtual void ToDevice(std::shared_ptr<Device> device) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/model/loss.h
----------------------------------------------------------------------
diff --git a/include/singa/model/loss.h b/include/singa/model/loss.h
index 79abace..41ec701 100644
--- a/include/singa/model/loss.h
+++ b/include/singa/model/loss.h
@@ -37,6 +37,7 @@ class Loss {
     Setup(loss);
   }
 	virtual ~Loss(){};
+  virtual void ToDevice(std::shared_ptr<Device> device) {}
   /// Set meta fields from user configurations.
   virtual void Setup(const LossConf& conf) {}
 
@@ -48,7 +49,8 @@ class Loss {
   /// It calls Forward() internally. The calling pattern should be
   /// [Evaluate|Forward] Backward.
   float Evaluate(const Tensor& prediction, const T& target) {
-    const Tensor& loss = Forward(prediction, target);
+    Tensor loss = Forward(prediction, target);
+    loss.ToHost();
     return Sum<float>(loss) / (1.0f * loss.Size());
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/model/metric.h
----------------------------------------------------------------------
diff --git a/include/singa/model/metric.h b/include/singa/model/metric.h
index b99ff0d..d013fa4 100644
--- a/include/singa/model/metric.h
+++ b/include/singa/model/metric.h
@@ -33,6 +33,7 @@ class Metric {
  public:
   // TODO(wangwei) call Setup using a default MetricConf.
   Metric() = default;
+  virtual void ToDevice(std::shared_ptr<Device> device) {}
   void Setup(const string& conf) {
     MetricConf metric;
     metric.ParseFromString(conf);
@@ -51,6 +52,28 @@ class Metric {
     return Sum<float>(metric) / (1.0f * metric.Size());
   }
 };
+/// Compute the accuray of the prediction, which is matched against the
+/// ground truth labels.
+/// TODO(wangwei) consider multi-label cases.
+class Accuracy : public Metric<Tensor> {
+ public:
+  /// Set meta fields from user configurations.
+  void Setup(const MetricConf& conf) override { top_k_ = conf.top_k(); }
+
+  /// Check the prediction against the target (ground truth) for each data
+  /// sample. The returned Tensor has a float value for each sample, 0 for wrong
+  /// and 1 for correct. Users can call Sum(const Tensor&) / Tensor::Size() to
+  /// get the accuracy.
+  Tensor Forward(const Tensor& prediction, const Tensor& target);
+
+ private:
+  /// \copydoc Match(const Tensor&, const Tensor&);
+  Tensor Match(const Tensor& prediction, const vector<int>& target);
+  /// If the ground truth label is in the top k predicted labels, then the
+  /// prediction is correct.
+  size_t top_k_ = 1;
+};
+
 
 }  // namespace singa
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/include/singa/model/optimizer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/optimizer.h b/include/singa/model/optimizer.h
index f912668..a268126 100644
--- a/include/singa/model/optimizer.h
+++ b/include/singa/model/optimizer.h
@@ -155,7 +155,7 @@ class Regularizer {
 };
 
 // =============Vallina SGD with Momentum=====================================
-class SGD : Optimizer {
+class SGD : public Optimizer {
  public:
   void Setup(const OptimizerConf& conf);
   /// Apply the updating algorithm.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c51d454..af09799 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -36,7 +36,7 @@ IF (USE_CUDA)
     SET(FLAGS_BACKUP ${CMAKE_CXX_FLAGS})
     SET(CMAKE_CXX_FLAGS "")
     IF (CMAKE_BUILD_TYPE MATCHES DEBUG)
-        CUDA_COMPILE(cuda_objs SHARED ${cuda_source} 
+        CUDA_COMPILE(cuda_objs SHARED ${cuda_source}
             OPTIONS "-Xcompiler -fPIC -G -g")
     ELSE (CMAKE_BUILD_TYPE MATCHES  DEBUG)
         CUDA_COMPILE(cuda_objs SHARED ${cuda_source} OPTIONS "-Xcompiler -fPIC")
@@ -57,6 +57,7 @@ AUX_SOURCE_DIRECTORY(model model_source)
 AUX_SOURCE_DIRECTORY(model/layer model_source)
 AUX_SOURCE_DIRECTORY(model/optimizer model_source)
 AUX_SOURCE_DIRECTORY(model/loss model_source)
+AUX_SOURCE_DIRECTORY(model/metric model_source)
 #MESSAGE(STATUS "MODEL ${model_source}")
 ADD_LIBRARY(singa_model SHARED ${model_source})
 TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/core/device/cuda_gpu.cc
----------------------------------------------------------------------
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 5879c58..5f6ac17 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -121,7 +121,7 @@ void CudaGPU::CopyToFrom(void* dst, const void* src, size_t nBytes,
   // cudaMemcpyAsync(dst, src, nBytes,cudaMemcpyDefault, ctx_.stream);
 }
 
-/// Allocate cpu memory.
+/// Allocate gpu memory.
 void* CudaGPU::Malloc(int size) {
   void* ptr = nullptr;
   if (size > 0) {
@@ -132,7 +132,7 @@ void* CudaGPU::Malloc(int size) {
   return ptr;
 }
 
-/// Free cpu memory.
+/// Free gpu memory.
 void CudaGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     // CUDA_CHECK(cudaFree(ptr));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index ec59aaa..898cdc6 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -118,7 +118,8 @@ void Tensor::ToDevice(std::shared_ptr<Device> dst) {
   // TODO(wangwei) the comparison is very strict. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
-    tmp.CopyData(*this);
+    if (block_ != nullptr && Size())
+      tmp.CopyData(*this);
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = tmp.block_;
@@ -136,7 +137,8 @@ void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
       << "data_type is " << DataType_Name(data_type_)
       << " user given type is of size " << sizeof(DType);
   if (src != nullptr) {
-    device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num, offset);
+    device_->CopyDataFromHostPtr(block(), src, sizeof(DType) * num,
+        sizeof(DType) * offset);
   } else {
     LOG(WARNING) << "Copy data from null host ptr";
   }
@@ -637,13 +639,13 @@ Tensor ConcatenateColumns(const vector<Tensor> &in) {
   return out;
 }
 Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
-  CHECK_EQ(in.nDim(), 2u);
   CHECK_LT(start, end);
   CHECK_GE(in.shape(0), end);
-  Shape s;
-  s = Shape{end - start, in.shape(1)};
+  Shape s = in.shape();
+  s[0] = end - start;
+  size_t sample_size = in.Size() / in.shape(0);
   Tensor out(s, in.device(), in.data_type());
-  CopyDataToFrom(&out, in, out.Size(), 0, start * out.shape(1));
+  CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size);
   return out;
 }
 Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/feed_forward_net.cc
----------------------------------------------------------------------
diff --git a/src/model/feed_forward_net.cc b/src/model/feed_forward_net.cc
index f9e6480..a24d36a 100644
--- a/src/model/feed_forward_net.cc
+++ b/src/model/feed_forward_net.cc
@@ -18,11 +18,11 @@
 
 #include "singa/model/feed_forward_net.h"
 #include "singa/utils/logging.h"
+#include "singa/utils/channel.h"
 namespace singa {
 
-~FeedForwardNet::FeedForwardNet() {
-  for (auto layer : layers_)
-    delete layer;
+FeedForwardNet::~FeedForwardNet() {
+  for (auto layer : layers_) delete layer;
 }
 Layer* FeedForwardNet::Add(Layer* layer) {
   layers_.push_back(layer);
@@ -32,7 +32,12 @@ Layer* FeedForwardNet::Add(Layer* layer) {
 Layer* FeedForwardNet::Add(const LayerConf& conf, const Shape* sample_shape) {
   CHECK(sample_shape != nullptr || layers_.size())
       << "Must provide the input sample shape for the first layer";
-  Layer* layer = CreateLayer(conf.type());
+  Layer* layer = nullptr;  // TODO(wangwei) use CreateLayer(conf.type());
+  Add(layer, conf, sample_shape);
+  return layer;
+}
+
+Layer* FeedForwardNet::Add(Layer* layer, const LayerConf& conf, const Shape* sample_shape) {
   if (sample_shape == nullptr)
     layer->Setup(layers_.back()->GetOutputSampleShape(), conf);
   else
@@ -44,28 +49,25 @@ Layer* FeedForwardNet::Add(const LayerConf& conf, const Shape* sample_shape) {
 const vector<string> FeedForwardNet::GetParamNames() const {
   vector<string> names;
   for (auto layer : layers_)
-    for (const auto name : layer->param_names())
-      names.push_back(name);
+    for (const auto name : layer->param_names()) names.push_back(name);
   return names;
 }
-const vector<Tensor *> FeedForwardNet::GetParamValues() const {
-  vector<Tensor *> values;
+const vector<Tensor*> FeedForwardNet::GetParamValues() const {
+  vector<Tensor*> values;
   for (auto layer : layers_)
-    for (const auto value : layer->param_values())
-      values.push_back(value);
+    for (const auto value : layer->param_values()) values.push_back(value);
   return values;
 }
 
-const vector<Tensor *> FeedForwardNet::GetParamSpecs() const {
-  vector<ParamSpec *> specs;
+const vector<ParamSpec> FeedForwardNet::GetParamSpecs() const {
+  vector<ParamSpec> specs;
   for (auto layer : layers_)
-    for (const auto spec : layer->param_specs())
-      specs.push_back(spec);
+    for (const auto spec : layer->param_specs()) specs.push_back(spec);
   return specs;
 }
 
-void FeedForwardNet::Compile(bool shuffle, Optimizer* opt, Loss* loss,
-                             Metric* metric) {
+void FeedForwardNet::Compile(bool shuffle, Optimizer* opt, Loss<Tensor>* loss,
+                             Metric<Tensor>* metric) {
   shuffle_ = shuffle;
   bool train = (opt != nullptr) && (loss != nullptr);
   bool test = metric != nullptr;
@@ -73,14 +75,17 @@ void FeedForwardNet::Compile(bool shuffle, Optimizer* opt, Loss* loss,
   opt_ = opt;
   loss_ = loss;
   metric_ = metric;
+  // init params and register them to sgd
 }
 
 void FeedForwardNet::ToDevice(std::shared_ptr<Device> device) {
   for (auto layer: layers_)
     layer->ToDevice(device);
+  /*
   opt_->ToDevice(device);
   loss_->ToDevice(device);
   metric_->ToDevice(device);
+  */
 }
 
 FeedForwardNet FeedForwardNet::Clone(std::shared_ptr<Device> device) {
@@ -98,118 +103,110 @@ FeedForwardNet FeedForwardNet::Clone(std::shared_ptr<Device> device) {
   net.device_ = device;
   net.dtype_ = dtype;
   */
+  return net;
 }
 
 void FeedForwardNet::AsType(DataType dtype) {
   LOG(FATAL) << "FeedForwardNet::AsType not implemented";
 }
 
-void FeedForwardNet::Train(int batchsize, int nb_epoch, Tensor x, Tensor y) {
-  CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of sampels in x and y";
-  int num_extra_samples = x.shape(0) % batchsize;
-  if (num_extra_samples != 0)
-    LOG(WARNING) << "The last " << num_extra_samples << " would not be used";
-  Channel *ch = GetChannel("perf");
-  for (int epoch = 0; epoch < nb_epoch; epoch++) {
-    float loss = 0.0f, metric = 0.0f;
-    int batch = 0;
-    for (; batch < x.shape(0) / batchsize; batch++) {
-      Tesnor bx = x.Slice(batch * batchsize, batch * batchsize + batchsize);
-      Tesnor by = y.Slice(batch * batchsize, batch * batchsize + batchsize);
-      const auto ret = TrainOnBatch(bx, by);
-      loss += ret.first;
-      metric += ret.second;
-    }
-    loss /= batch;
-    metric /= batch;
-    ch->Send("Epoch " + std::to_string(epoch) + ", training loss = " +
-             std::to_string(loss) + ", accuracy = " + std::to_string(metric));
-  }
-}
-
-void FeedForwardNet::Train(int batchsize, int nb_epoch, Tensor x, Tensor y,
-   float val_split) {
+void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
+                           const Tensor& y, float val_split) {
   CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of sampels in x and y";
   size_t num_train = x.shape(0) * val_split;
-  const Tensor train_x = CopyRows(x, 0, num_train);
-  const Tensor train_y = CopyRows(y, 0, num_train);
-  const Tensor val_x = CopyRows(x, num_train, x.shape(0));
-  const Tensor val_y = CopyRows(y, num_train, x.shape(0));
-  Train(batchsize, nb_epoch, train_x, train_y, val_x, val_y);
+  if (val_split == 0.0f) {
+    Tensor dummy;
+    Train(batchsize, nb_epoch, x, y, dummy, dummy);
+  } else {
+    const Tensor train_x = CopyRows(x, 0, num_train);
+    const Tensor train_y = CopyRows(y, 0, num_train);
+    const Tensor test_x = CopyRows(x, num_train, x.shape(0));
+    const Tensor test_y = CopyRows(y, num_train, y.shape(0));
+    Train(batchsize, nb_epoch, train_x, train_y, test_x, test_y);
+  }
 }
 
-
-void FeedForwardNet::Train(int batchsize, int nb_epoch, Tensor x, Tensor y,
-    const Tensor & val_x, const Tensor &val_y) {
+void FeedForwardNet::Train(size_t batchsize, int nb_epoch, const Tensor& x,
+                           const Tensor& y, const Tensor& val_x,
+                           const Tensor& val_y) {
+  InitNetParams();
   CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of sampels in x and y";
   int num_extra_samples = x.shape(0) % batchsize;
   if (num_extra_samples != 0)
     LOG(WARNING) << "The last " << num_extra_samples << " would not be used";
-  Channel *train_ch = GetChannel("train_perf");
-  Channel *test_ch = GetChannel("test_perf");
+  Channel* train_ch = GetChannel("train_perf");
+  train_ch->EnableDestStderr(true);
+  Channel* val_ch = GetChannel("val_perf");
   for (int epoch = 0; epoch < nb_epoch; epoch++) {
     float loss = 0.0f, metric = 0.0f;
-    int b = 0;
-    for (;b < x.shape(0) / batchsize; b++) {
-      Tesnor bx = CopyRows(x, b * batchsize, b * batchsize + batchsize);
-      Tesnor by = CopyRows(y, b * batchsize, b * batchsize + batchsize);
+    size_t b = 0;
+    for (; b < x.shape(0) / batchsize; b++) {
+      const Tensor bx = CopyRows(x, b * batchsize, b * batchsize + batchsize);
+      const Tensor by = CopyRows(y, b * batchsize, b * batchsize + batchsize);
       const auto ret = TrainOnBatch(bx, by);
       loss += ret.first;
       metric += ret.second;
     }
-    loss /= batch;
-    metric /= batch;
+    loss /= b;
+    metric /= b;
     train_ch->Send("Epoch " + std::to_string(epoch) + ", training loss = " +
-             std::to_string(loss) + ", accuracy = " + std::to_string(metric));
-    const auto val_perf = Evaluate(val_x, val_y, batchsize);
-    test_ch->Send("Epoch " + std::to_string(epoch)
-        + ", test loss = " + std::to_string(Average(val_perf.first))
-        + ", metric = " + std::to_string(Average(val_perf.second)));
+                   std::to_string(loss) + ", accuracy = " +
+                   std::to_string(metric));
+    if (val_x.Size() && val_y.Size()) {
+      const auto val_perf = Evaluate(val_x, val_y, batchsize);
+      val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
+                   std::to_string(Sum(val_perf.first) / val_y.Size()) +
+                   ", metric = " +
+                   std::to_string(Sum(val_perf.second) / val_y.Size()));
+    }
   }
 }
 
-const std::pair<float, float> FeedForwardNet::TrainOnBatch(const Tensor x,
-                                                           const Tensor y) {
-  const Tensor fea = Forward(kTrain, bx);
-  float loss = loss->Evaluate(fea, fy);
-  float metric = metric->Evaluate(fea, by);
-  const Tensor grad = loss->Backward();
-  Backward(kTrain, grad);
+const std::pair<float, float> FeedForwardNet::TrainOnBatch(const Tensor& x,
+                                                           const Tensor& y) {
+  int flag = kTrain;
+  const Tensor fea = Forward(flag, x);
+  float loss = loss_->Evaluate(fea, y);
+  float metric = metric_->Evaluate(fea, y);
+  const Tensor grad = loss_->Backward();
+  const auto grads = Backward(kTrain, grad);
   return std::make_pair(loss, metric);
 }
 
-const Tensor FeedForwardNet::Forward(int flag, const Tensor data) {
-  Tensor tmp = data;
+const Tensor FeedForwardNet::Forward(int flag, const Tensor& data) {
+  Tensor input = data, output;
   for (auto layer : layers_) {
-    tmp = layer->Forward(flag, tmp);
+//    LOG(INFO) << layer->name();
+    output = layer->Forward(flag, input);
+    input = output;
   }
-  return tmp;
+  return output;
 }
 
-cons vector<Tensor> FeedForwardNet::Backward(int flag, const Tensor grad) {
+const vector<Tensor> FeedForwardNet::Backward(int flag, const Tensor& grad) {
   vector<Tensor> param_grads;
   Tensor tmp = grad;
-  for (size_t i = layers_.size() - 1; i >= 0; i--) {
+  for (int i = layers_.size() - 1; i >= 0; i--) {
+ //   LOG(INFO) << layers_.at(i)->name();
     auto ret = layers_.at(i)->Backward(flag, tmp);
-    tmp =ret.first;
+    tmp = ret.first;
     if (ret.second.size())
-      for (const auto x: ret.second)
-        param_grads.push_back(x);
+      for (const auto x : ret.second) param_grads.push_back(x);
   }
   return param_grads;
 }
 
-std::pair<Tensor, Tensor> Evaluate(Tensor x, Tensor y, int batchsize) {
+std::pair<Tensor, Tensor> FeedForwardNet::Evaluate(const Tensor& x,
+                                                   const Tensor& y,
+                                                   size_t batchsize) {
   CHECK_EQ(x.shape(0), y.shape(0)) << "Diff num of sampels in x and y";
   CHECK_GE(x.shape(0), batchsize);
   int num_extra_samples = x.shape(0) % batchsize;
-  int b = 0;
   Tensor loss(Shape{x.shape(0)}), metric(Shape{x.shape(0)});
-  for (; b < x.shape(0) / batchsize; b++) {
+  for (size_t b = 0; b < x.shape(0) / batchsize; b++) {
     int start = b * batchsize, end = start + batchsize;
     const Tensor bx = CopyRows(x, start, end);
     const Tensor by = CopyRows(y, start, end);
-    const Tensor fea = Forward(kEval, bx);
     const auto ret = EvaluateOnBatch(bx, by);
     CopyDataToFrom(&loss, ret.first, batchsize, start, 0);
     CopyDataToFrom(&metric, ret.second, batchsize, start, 0);
@@ -230,18 +227,19 @@ std::pair<Tensor, Tensor> Evaluate(Tensor x, Tensor y, int batchsize) {
 
 std::pair<Tensor, Tensor> FeedForwardNet::EvaluateOnBatch(const Tensor& x,
                                                           const Tensor& y) {
-  const Tensor fea = Forward(kEval, bx);
-  const Tensor m = metric_->Forward(fea, by);
-  const Tensor l = loss_->Forward(fea, by);
+  int flag = kEval;
+  const Tensor fea = Forward(flag, x);
+  const Tensor m = metric_->Forward(fea, y);
+  const Tensor l = loss_->Forward(fea, y);
   return std::make_pair(m, l);
 }
 
-const Tensor FeedForwardNet::Predict(const Tensor& x, int batchsize) {
+const Tensor FeedForwardNet::Predict(const Tensor& x, size_t batchsize) {
   CHECK_GE(x.shape(0), batchsize);
   int num_extra_samples = x.shape(0) % batchsize;
-  const auto outshape = layers_.back().GetOutputSampleShape();
+  const auto outshape = layers_.back()->GetOutputSampleShape();
   Tensor y(Shape{x.shape(0), Product(outshape)}, x.device());
-  for (int b = 0; b < x.shape(0) / batchsize; b++) {
+  for (size_t b = 0; b < x.shape(0) / batchsize; b++) {
     int start = b * batchsize, end = start + batchsize;
     const Tensor bx = CopyRows(x, start, end);
     CopyDataToFrom(&y, PredictOnBatch(bx), batchsize * y.shape(1),
@@ -258,6 +256,6 @@ const Tensor FeedForwardNet::Predict(const Tensor& x, int batchsize) {
 }
 
 const Tensor FeedForwardNet::PredictOnBatch(const Tensor& x) {
-  return Foward(kEval, x);
+  return Forward(kEval, x);
 }
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
index c27960d..e4991a4 100644
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -112,5 +112,9 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
 
   return std::make_pair(input_grad, param_grad);
 }
-
+void Convolution::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
+  weight_.ToDevice(device);
+  bias_.ToDevice(device);
+}
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
index 7ea5712..0e0b160 100644
--- a/src/model/layer/convolution.h
+++ b/src/model/layer/convolution.h
@@ -44,6 +44,7 @@ class Convolution : public Layer {
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                    const Tensor &grad) override;
 
+  void ToDevice(std::shared_ptr<Device> device) override;
   size_t kernel_w() const { return kernel_w_; }
   size_t kernel_h() const { return kernel_h_; }
   size_t pad_w() const { return pad_w_; }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index 4216fea..df728ce 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -54,6 +54,7 @@ void CudnnActivation::InitCudnn(size_t size, DataType dtype) {
 }
 
 const Tensor CudnnActivation::Forward(int flag, const Tensor& input) {
+  CHECK(buf_.empty());
   auto size = input.Size();
   DataType dtype = input.data_type();
   if (!has_init_cudnn_) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index d5ac2a3..eb507b2 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -156,6 +156,7 @@ void CudnnConvolution::InitCudnn(const Tensor &input) {
 }
 
 const Tensor CudnnConvolution::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
   CHECK_EQ(input.device()->lang(), kCuda);
   CHECK_EQ(input.nDim(), 4u);
   if (flag & kTrain) buf_.push(input);  // buffer the input for backward

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index 2e2e12b..f9b9dbf 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -106,6 +106,10 @@ const std::pair<Tensor, vector<Tensor>> CudnnDropout::Backward(
   }
   return std::make_pair(dx, param_grad);
 }
+void CudnnDropout::ToDevice(std::shared_ptr<Device> device) {
+  Dropout::ToDevice(device);
+  state.ToDevice(device);
+}
 }  // namespace singa
 #endif  // CUDNN_VERSION_MAJOR>=5
 #endif  // USE_CUDNN

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 6809653..9e0cb9e 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -42,6 +42,7 @@ class CudnnDropout : public Dropout {
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,
                                                    const Tensor& grad) override;
 
+  void ToDevice(std::shared_ptr<Device> device) override;
  private:
   /// Init cudnn related data structures.
   void InitCudnn(int size, DataType dtype, std::shared_ptr<Device> dev,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
index 6d7a5b1..e49a1ec 100644
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@ -78,6 +78,7 @@ void CudnnPooling::InitCudnn(const Tensor &input) {
 }
 
 const Tensor CudnnPooling::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
   CHECK_EQ(input.device()->lang(), kCuda);
   CHECK_EQ(input.nDim(), 4u);
   size_t batchsize = input.shape(0);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index 77eab98..1d9e0b8 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -57,6 +57,7 @@ void CudnnSoftmax::InitCudnn(Shape shape, DataType dtype) {
 }
 
 const Tensor CudnnSoftmax::Forward(int flag, const Tensor& input) {
+  CHECK(buf_.empty());
   auto shape = input.shape();
   DataType dtype = input.data_type();
   if (!has_init_cudnn_) {

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index bad26a8..c6a9f8a 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -45,7 +45,9 @@ void Dense::Setup(const Shape& in_sample, const LayerConf &conf) {
 
 /// \copydoc Layer::Forward(int flag, const Tensor&)
 const Tensor Dense::Forward(int flag, const Tensor &input) {
+  CHECK(buf_.empty());
   Tensor output;
+  CHECK_EQ(input.nDim(), 2);
   if (transpose_)  // use the transposed version of weight_ for computing
     output = Mult(input, weight_);
   else
@@ -81,6 +83,7 @@ const std::pair<Tensor, vector<Tensor>> Dense::Backward(int flag,
 }
 
 void Dense::ToDevice(std::shared_ptr<Device> device) {
+  Layer::ToDevice(device);
   weight_.ToDevice(device);
   bias_.ToDevice(device);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/metric/accuracy.cc
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.cc b/src/model/metric/accuracy.cc
new file mode 100644
index 0000000..1b667b1
--- /dev/null
+++ b/src/model/metric/accuracy.cc
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "singa/model/metric.h"
+#include <algorithm>
+namespace singa {
+
+Tensor Accuracy::Match(const Tensor& predict, const vector<int>& target) {
+  Tensor prediction(predict.shape());
+  prediction.CopyData(predict);
+  size_t batchsize = target.size();
+  size_t nb_classes = prediction.Size() / batchsize;
+  // each row of prediction is the prob distribution for one sample
+  CHECK_EQ(prediction.shape().at(0), batchsize);
+  // TODO(wangwei) CloneToDevice(host);
+  const float* prob = prediction.data<float>();
+  float* score = new float[batchsize];
+  for (size_t b = 0; b < batchsize; b++) {
+    vector<std::pair<float, int>> prob_class;
+    for (size_t c = 0; c < nb_classes; c++) {
+      prob_class.push_back(std::make_pair(prob[b * nb_classes + c], c));
+    }
+    std::partial_sort(prob_class.begin(), prob_class.begin() + top_k_,
+                      prob_class.end(), std::greater<std::pair<float, int>>());
+
+    for (size_t k = 0; k < top_k_; k++)
+      if (prob_class.at(k).second == target.at(b)) score[b] = 1;
+  }
+  Tensor ret(Shape{batchsize});
+  ret.CopyDataFromHostPtr(score, batchsize);
+  return ret;
+}
+
+// TODO(wangwei) consider multi-label cases, where target is of shape
+// nb_samples * nb_classes
+Tensor Accuracy::Forward(const Tensor& prediction, const Tensor& t) {
+  Tensor target(t.shape(), t.data_type());
+  target.CopyData(t);
+  vector<int> target_vec;
+  // TODO(wangwei) copy target to host.
+  const int* target_value = target.data<int>();
+  for (size_t i = 0; i < target.Size(); i++)
+    target_vec.push_back(target_value[i]);
+  return Match(prediction, target_vec);
+}
+
+}  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/src/model/metric/accuracy.h
----------------------------------------------------------------------
diff --git a/src/model/metric/accuracy.h b/src/model/metric/accuracy.h
deleted file mode 100644
index 69bd96b..0000000
--- a/src/model/metric/accuracy.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SINGA_MODEL_METRIC_ACCURACY_H_
-#define SINGA_MODEL_METRIC_ACCURACY_H_
-#include "singa/model/metric.h"
-#include <algorithm>
-namespace singa {
-
-/// Compute the accuray of the prediction, which is matched against the
-/// ground truth labels.
-/// TODO(wangwei) consider multi-label cases.
-class Accuracy : public Metric<Tensor> {
- public:
-  /// Set meta fields from user configurations.
-  void Setup(const MetricConf& conf) override { top_k_ = conf.top_k(); }
-
-  /// Check the prediction against the target (ground truth) for each data
-  /// sample. The returned Tensor has a float value for each sample, 0 for wrong
-  /// and 1 for correct. Users can call Sum(const Tensor&) / Tensor::Size() to
-  /// get the accuracy.
-  Tensor Forward(const Tensor& prediction, const Tensor& target);
-
- private:
-  /// \copydoc Match(const Tensor&, const Tensor&);
-  Tensor Match(const Tensor& prediction, const vector<int>& target);
-  /// If the ground truth label is in the top k predicted labels, then the
-  /// prediction is correct.
-  size_t top_k_ = 1;
-};
-
-Tensor Accuracy::Match(const Tensor& prediction, const vector<int>& target) {
-  size_t batchsize = target.size();
-  size_t nb_classes = prediction.Size() / batchsize;
-  // each row of prediction is the prob distribution for one sample
-  CHECK_EQ(prediction.shape().at(0), batchsize);
-  // TODO(wangwei) CloneToDevice(host);
-  const float* prob = prediction.data<float>();
-  float* score = new float[batchsize];
-  for (size_t b = 0; b < batchsize; b++) {
-    vector<std::pair<float, int>> prob_class;
-    for (size_t c = 0; c < nb_classes; c++) {
-      prob_class.push_back(std::make_pair(prob[b * nb_classes + c], c));
-    }
-    std::partial_sort(prob_class.begin(), prob_class.begin() + top_k_,
-                      prob_class.end(), std::greater<std::pair<float, int>>());
-
-    for (size_t k = 0; k < top_k_; k++)
-      if (prob_class.at(k).second == target.at(b)) score[b] = 1;
-  }
-  Tensor ret(Shape{batchsize});
-  ret.CopyDataFromHostPtr(score, batchsize);
-  return ret;
-}
-
-// TODO(wangwei) consider multi-label cases, where target is of shape
-// nb_samples * nb_classes
-Tensor Accuracy::Forward(const Tensor& prediction, const Tensor& target) {
-  vector<int> target_vec;
-  // TODO(wangwei) copy target to host.
-  const int* target_value = target.data<int>();
-  for (size_t i = 0; i < target.Size(); i++)
-    target_vec.push_back(target_value[i]);
-  return Match(prediction, target_vec);
-}
-
-}  // namespace singa
-
-#endif  // SINGA_MODEL_METRIC_ACCURACY_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/cf1d8418/test/singa/test_accuracy.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_accuracy.cc b/test/singa/test_accuracy.cc
index dc7719b..4ff14c0 100644
--- a/test/singa/test_accuracy.cc
+++ b/test/singa/test_accuracy.cc
@@ -20,7 +20,7 @@
 *************************************************************/
 
 #include "gtest/gtest.h"
-#include "../src/model/metric/accuracy.h"
+#include "singa/model/metric.h"
 
 TEST(Accuracy, Compute) {
   singa::Accuracy acc;