Posted to commits@singa.apache.org by wa...@apache.org on 2016/08/17 18:02:23 UTC

[02/51] [abbrv] incubator-singa git commit: SINGA-235 - Unify the engines for cudnn and singa layers

SINGA-235 - Unify the engines for cudnn and singa layers

For most layers there are multiple implementations, e.g., using
cudnn for NVIDIA GPUs, cpp for CPUs, and opencl for other GPUs.

These implementations are separate classes registered with different
identifiers. This ticket unifies the layer identifiers for each
engine (a registration sketch follows the list):
1. cudnn layers are registered with identifier = cudnn_xxx, e.g.,
cudnn_convolution for the CudnnConvolution layer.
2. singa layers are registered with identifier = singa_xxx, e.g.,
singa_convolution for the Convolution layer.
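
For reference, the updated RegisterLayerClass macro (see the change to
include/singa/model/layer.h in this diff) takes the identifier and the class
separately, so one class can be registered under several identifiers. A
minimal sketch of registration and lookup; the main() wrapper is only for
illustration:

    #include <memory>
    #include "singa/model/layer.h"  // RegisterLayerClass, CreateLayer

    // Each implementation registers itself under an engine-prefixed
    // identifier; these lines live in the corresponding .cc files:
    //   RegisterLayerClass(singa_convolution, Convolution);
    //   RegisterLayerClass(cudnn_convolution, CudnnConvolution);

    int main() {
      // The factory resolves the unified identifier to the concrete class.
      std::shared_ptr<singa::Layer> conv =
          singa::CreateLayer("cudnn_convolution");
      return conv == nullptr ? 1 : 0;
    }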

The cudnn engine must run on CUDA devices, while the singa engine can run on
a cuda-gpu device or a cpp-cpu device depending on the layer type. For
instance, the Convolution layer must run on a cpp-cpu device, whereas the
Dense layer can run on both devices and selects the correct implementation
automatically. Users need to make sure that the engine matches the device of
the tensors.
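
For example (a sketch following examples/cifar10/alexnet.cc in this diff; the
tensor and device include paths are assumed here), with the cudnn engine the
net and the data tensors are all placed on the same CUDA device:

    #include <memory>
    #include "singa/core/device.h"
    #include "singa/core/tensor.h"
    #include "singa/model/feed_forward_net.h"

    // Move the net and its data onto one CudaGPU so that the cudnn layers
    // and the tensors they operate on live on the same device.
    void ToCuda(singa::FeedForwardNet& net, singa::Tensor& train_x,
                singa::Tensor& train_y) {
      auto dev = std::make_shared<singa::CudaGPU>();
      net.ToDevice(dev);
      train_x.ToDevice(dev);
      train_y.ToDevice(dev);
    }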

Both the CPP and the Python code are updated. In the CPP version, users
compose the layer identifier manually; in the Python version, users can set
layer.engine = 'cudnn' or 'singa'.
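
A minimal CPP sketch of composing an identifier, following the pattern used
by the updated examples (cf. GenConvConf in examples/cifar10/alexnet.cc; the
include path is assumed):

    #include <string>
    #include "singa/model/layer.h"  // LayerConf

    // Switch between "cudnn" and "singa" in one place.
    const std::string engine = "cudnn";

    singa::LayerConf GenConvConf(const std::string& name, int nb_filter) {
      singa::LayerConf conf;
      conf.set_name(name);
      conf.set_type(engine + "_convolution");  // e.g. "cudnn_convolution"
      conf.mutable_convolution_conf()->set_num_output(nb_filter);
      return conf;
    }

The Python equivalent is the layer.engine switch used in
examples/cifar10/alexnet.py and vgg.py (set to 'singa' when use_cpu is true).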

All identifiers are case insensitive.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/05720c21
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/05720c21
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/05720c21

Branch: refs/heads/master
Commit: 05720c21636c0fd55770206176a60a9ab20ae16c
Parents: 53639b7
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Wed Aug 10 21:05:22 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Thu Aug 11 10:52:01 2016 +0800

----------------------------------------------------------------------
 examples/char-rnn/README.md            |  17 +--
 examples/char-rnn/train.py             |   2 +-
 examples/cifar10/alexnet-parallel.cc   |  84 ++++-------
 examples/cifar10/alexnet.cc            |  58 ++++---
 examples/cifar10/alexnet.py            |   5 +-
 examples/cifar10/train.py              |  33 ++--
 examples/cifar10/vgg-parallel.cc       |  91 ++++++-----
 examples/cifar10/vgg.py                |  24 +--
 examples/imagenet/alexnet.cc           |  22 +--
 include/singa/core/device.h            |  30 ++--
 include/singa/model/feed_forward_net.h |  12 +-
 include/singa/model/layer.h            |   4 +-
 include/singa/utils/integer.h          |  73 +++++++++
 src/core/device/platform.cc            |   4 +-
 src/core/tensor/tensor.cc              |   4 +-
 src/core/tensor/tensor_math_cpp.h      |   2 +-
 src/model/feed_forward_net.cc          |  17 +--
 src/model/layer/activation.cc          |  27 ++--
 src/model/layer/activation.h           |   2 +-
 src/model/layer/batchnorm.cc           |   6 +-
 src/model/layer/batchnorm.h            |   2 +-
 src/model/layer/convolution.cc         |   2 +-
 src/model/layer/convolution.h          |   2 +-
 src/model/layer/cudnn_activation.cc    |  10 +-
 src/model/layer/cudnn_activation.h     |   2 +-
 src/model/layer/cudnn_batchnorm.cc     |   2 +-
 src/model/layer/cudnn_batchnorm.h      |   2 +-
 src/model/layer/cudnn_convolution.cc   |   2 +-
 src/model/layer/cudnn_convolution.h    |   2 +-
 src/model/layer/cudnn_dropout.cc       |   2 +-
 src/model/layer/cudnn_dropout.h        |   2 +-
 src/model/layer/cudnn_lrn.cc           |   2 +-
 src/model/layer/cudnn_lrn.h            |   2 +-
 src/model/layer/cudnn_pooling.cc       |   2 +-
 src/model/layer/cudnn_pooling.h        |   2 +-
 src/model/layer/cudnn_rnn.cc           |  17 +--
 src/model/layer/cudnn_rnn.h            |   2 +-
 src/model/layer/cudnn_softmax.cc       |   2 +-
 src/model/layer/cudnn_softmax.h        |   2 +-
 src/model/layer/cudnn_utils.h          |   2 +-
 src/model/layer/dense.cc               |   2 +-
 src/model/layer/dense.h                |   2 +-
 src/model/layer/dropout.cc             |   2 +-
 src/model/layer/dropout.h              |   2 +-
 src/model/layer/flatten.cc             |   2 +-
 src/model/layer/flatten.h              |   2 +-
 src/model/layer/lrn.cc                 |   2 +-
 src/model/layer/lrn.h                  |   4 +-
 src/model/layer/pooling.cc             |   2 +-
 src/model/layer/pooling.h              |   2 +-
 src/model/layer/prelu.cc               |   4 +-
 src/model/layer/prelu.h                |   2 +-
 src/model/layer/rnn.cc                 |   2 +-
 src/model/layer/rnn.h                  |   2 +-
 src/model/layer/softmax.cc             |   2 +-
 src/model/layer/softmax.h              |   2 +-
 src/python/singa/device.py             |  13 ++
 src/python/singa/layer.py              | 226 ++++++++++++++++++----------
 src/python/singa/net.py                |   6 +-
 src/python/singa/tensor.py             |   7 +-
 src/python/swig/core_device.i          |   4 +
 src/python/swig/model_layer.i          |   3 -
 test/singa/test_activation.cc          |  26 ++--
 test/singa/test_batchnorm.cc           |  26 ++--
 test/singa/test_convolution.cc         |   2 +-
 test/singa/test_cudnn_activation.cc    |  28 ++--
 test/singa/test_cudnn_batchnorm.cc     |   2 +-
 test/singa/test_cudnn_convolution.cc   |   4 +-
 test/singa/test_cudnn_dropout.cc       |   2 +-
 test/singa/test_cudnn_lrn.cc           |   2 +-
 test/singa/test_cudnn_pooling.cc       |   2 +-
 test/singa/test_cudnn_rnn.cc           |   2 +-
 test/singa/test_cudnn_softmax.cc       |   2 +-
 test/singa/test_dense.cc               |   2 +-
 test/singa/test_dropout.cc             |   2 +-
 test/singa/test_flatten.cc             |   2 +-
 test/singa/test_layer.cc               |  19 ++-
 test/singa/test_lrn.cc                 |   2 +-
 test/singa/test_pooling.cc             |   2 +-
 test/singa/test_prelu.cc               |   2 +-
 test/singa/test_softmax.cc             |   2 +-
 81 files changed, 573 insertions(+), 433 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/char-rnn/README.md
----------------------------------------------------------------------
diff --git a/examples/char-rnn/README.md b/examples/char-rnn/README.md
index c5cdbb8..d4cfa30 100644
--- a/examples/char-rnn/README.md
+++ b/examples/char-rnn/README.md
@@ -1,10 +1,10 @@
 # Train Char-RNN using SINGA
 
 Recurrent neural networks (RNN) are widely used for modelling sequential data,
-e.g., natural language sentences. This example describe how to implement a RNN
+e.g., natural language sentences. This example describes how to implement a RNN
 application (or model) using SINGA's RNN layers.
-We will use the [char-rnn](https://github.com/karpathy/char-rnn) modle as an
-example, which trains over setences or
+We will use the [char-rnn](https://github.com/karpathy/char-rnn) model as an
+example, which trains over sentences or
 source code, with each character as an input unit. Particularly, we will train
 a RNN using GRU over Linux kernel source code. After training, we expect to
 generate meaningful code from the model.
@@ -12,20 +12,19 @@ generate meaningful code from the model.
 
 ## Instructions
 
-* Compile and install SINGA. Currently the RNN implmentation depends on Cudnn V5.
+* Compile and install SINGA. Currently the RNN implementation depends on Cudnn with version >= 5.05.
 
 * Prepare the dataset. Download the [kernel source code](http://cs.stanford.edu/people/karpathy/char-rnn/).
 Other plain text files can also be used.
 
 * Start the training,
 
-    python train.py input_linux.txt
+        python train.py input_linux.txt
 
   Some hyper-parameters could be set through command line,
 
-    python train.py -h
+        python train.py -h
 
+* Sample characters from the model by providing the number of characters to sample and the seed string.
 
-* Sample characters from the model by providing num of characters and the seed string.
-
-    python sample.py 100 --seed '#include <std'
+        python sample.py 100 --seed '#include <std'

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/char-rnn/train.py
----------------------------------------------------------------------
diff --git a/examples/char-rnn/train.py b/examples/char-rnn/train.py
index 22fdc82..3dfa0d9 100644
--- a/examples/char-rnn/train.py
+++ b/examples/char-rnn/train.py
@@ -195,7 +195,7 @@ def train(data, max_epoch, hidden_size =100, seq_length=100, batch_size=16,
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Train multi-stack LSTM for '\
             'modeling  character sequence from plain text files')
-    parser.add_argument('data', type=string, help='training file')
+    parser.add_argument('data', type=str, help='training file')
     parser.add_argument('-b', type=int, default=32, help='batch_size')
     parser.add_argument('-l', type=int, default=64, help='sequence length')
     parser.add_argument('-d', type=int, default=128, help='hidden size')

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/alexnet-parallel.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet-parallel.cc b/examples/cifar10/alexnet-parallel.cc
index 15ef58e..8cc3352 100644
--- a/examples/cifar10/alexnet-parallel.cc
+++ b/examples/cifar10/alexnet-parallel.cc
@@ -28,21 +28,17 @@
 #include "singa/utils/channel.h"
 #include "singa/utils/string.h"
 #include "singa/core/memory.h"
-#include "../../src/model/layer/cudnn_convolution.h"
-#include "../../src/model/layer/cudnn_activation.h"
-#include "../../src/model/layer/cudnn_pooling.h"
-#include "../../src/model/layer/cudnn_lrn.h"
-#include "../../src/model/layer/dense.h"
-#include "../../src/model/layer/flatten.h"
 #include <thread>
 #include <memory>
+
 namespace singa {
+const std::string engine = "cudnn";
 
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
                       int pad, float std) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnConvolution");
+  conf.set_type(engine + "_convolution");
   ConvolutionConf *conv = conf.mutable_convolution_conf();
   conv->set_num_output(nb_filter);
   conv->add_kernel_size(kernel);
@@ -67,7 +63,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
                          int pad) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnPooling");
+  conf.set_type(engine + "_pooling");
   PoolingConf *pool = conf.mutable_pooling_conf();
   pool->set_kernel_size(kernel);
   pool->set_stride(stride);
@@ -79,14 +75,14 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
 LayerConf GenReLUConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("RELU");
+  conf.set_type(engine + "_relu");
   return conf;
 }
 
 LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dense");
+  conf.set_type("singa_dense");
   DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
 
@@ -108,7 +104,7 @@ LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
 LayerConf GenLRNConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnLRN");
+  conf.set_type(engine + "_lrn");
   LRNConf *lrn = conf.mutable_lrn_conf();
   lrn->set_local_size(3);
   lrn->set_alpha(5e-05);
@@ -119,7 +115,7 @@ LayerConf GenLRNConf(string name) {
 LayerConf GenFlattenConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Flatten");
+  conf.set_type("singa_flatten");
   return conf;
 }
 
@@ -127,20 +123,19 @@ FeedForwardNet CreateNet() {
   FeedForwardNet net;
   Shape s{3, 32, 32};
 
-  net.Add(new CudnnConvolution(), GenConvConf("conv1", 32, 5, 1, 2, 0.0001),
-          &s);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool1", true, 3, 2, 1));
-  net.Add(new CudnnActivation(), GenReLUConf("relu1"));
-  net.Add(new CudnnLRN(), GenLRNConf("lrn1"));
-  net.Add(new CudnnConvolution(), GenConvConf("conv2", 32, 5, 1, 2, 0.01));
-  net.Add(new CudnnActivation(), GenReLUConf("relu2"));
-  net.Add(new CudnnPooling(), GenPoolingConf("pool2", false, 3, 2, 1));
-  net.Add(new CudnnLRN(), GenLRNConf("lrn2"));
-  net.Add(new CudnnConvolution, GenConvConf("conv3", 64, 5, 1, 2, 0.01));
-  net.Add(new CudnnActivation(), GenReLUConf("relu3"));
-  net.Add(new CudnnPooling(), GenPoolingConf("pool3", false, 3, 2, 1));
-  net.Add(new Flatten(), GenFlattenConf("flat"));
-  net.Add(new Dense(), GenDenseConf("ip", 10, 0.01, 250));
+  net.Add(GenConvConf("conv1", 32, 5, 1, 2, 0.0001), &s);
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 1));
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 32, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", false, 3, 2, 1));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 64, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenPoolingConf("pool3", false, 3, 2, 1));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip", 10, 0.01, 250));
   return net;
 }
 
@@ -228,35 +223,18 @@ void Train(float lr, int num_epoch, string data_dir) {
   mem_conf.add_device(0);
   mem_conf.add_device(1);
   std::shared_ptr<DeviceMemPool> mem_pool(new CnMemPool(mem_conf));
-  std::shared_ptr<CudaGPU> cuda_1(new CudaGPU(0, mem_pool));
-  std::shared_ptr<CudaGPU> cuda_2(new CudaGPU(1, mem_pool));
-  net_1.ToDevice(cuda_1);
-  net_2.ToDevice(cuda_2);
-
-  /*
-  // this does not work for net_2
-  train_x_2.ResetLike(train_x);
-  train_y_2.ResetLike(train_y);
-  test_x_2.ResetLike(test_x);
-  test_y_2.ResetLike(test_y);
-
-  train_x.ToDevice(cuda_1);
-  train_y.ToDevice(cuda_1);
-  test_x.ToDevice(cuda_1);
-  test_y.ToDevice(cuda_1);
+  std::shared_ptr<CudaGPU> dev_1(new CudaGPU(0, mem_pool));
+  std::shared_ptr<CudaGPU> dev_2(new CudaGPU(1, mem_pool));
 
-  train_x_2.ToDevice(cuda_2);
-  train_y_2.ToDevice(cuda_2);
-  test_x_2.ToDevice(cuda_2);
-  test_y_2.ToDevice(cuda_2);
-  */
+  net_1.ToDevice(dev_1);
+  net_2.ToDevice(dev_2);
 
-  train_x_1.ToDevice(cuda_1);
-  train_y_1.ToDevice(cuda_1);
-  test_x.ToDevice(cuda_1);
-  test_y.ToDevice(cuda_1);
-  train_x_2.ToDevice(cuda_2);
-  train_y_2.ToDevice(cuda_2);
+  train_x_1.ToDevice(dev_1);
+  train_y_1.ToDevice(dev_1);
+  test_x.ToDevice(dev_1);
+  test_y.ToDevice(dev_1);
+  train_x_2.ToDevice(dev_2);
+  train_y_2.ToDevice(dev_2);
 
   // net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.cc b/examples/cifar10/alexnet.cc
index 6480557..e1363e4 100644
--- a/examples/cifar10/alexnet.cc
+++ b/examples/cifar10/alexnet.cc
@@ -26,19 +26,14 @@
 #include "singa/model/metric.h"
 #include "singa/utils/channel.h"
 #include "singa/utils/string.h"
-#include "../../src/model/layer/cudnn_convolution.h"
-#include "../../src/model/layer/cudnn_activation.h"
-#include "../../src/model/layer/cudnn_pooling.h"
-#include "../../src/model/layer/cudnn_lrn.h"
-#include "../../src/model/layer/dense.h"
-#include "../../src/model/layer/flatten.h"
 namespace singa {
 
+const std::string engine = "cudnn";
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
                       int pad, float std) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnConvolution");
+  conf.set_type(engine + "_convolution");
   ConvolutionConf *conv = conf.mutable_convolution_conf();
   conv->set_num_output(nb_filter);
   conv->add_kernel_size(kernel);
@@ -63,7 +58,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
                          int pad) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnPooling");
+  conf.set_type(engine + "_pooling");
   PoolingConf *pool = conf.mutable_pooling_conf();
   pool->set_kernel_size(kernel);
   pool->set_stride(stride);
@@ -75,14 +70,14 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
 LayerConf GenReLUConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("RELU");
+  conf.set_type(engine + "_relu");
   return conf;
 }
 
 LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dense");
+  conf.set_type("singa_dense");
   DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
 
@@ -104,7 +99,7 @@ LayerConf GenDenseConf(string name, int num_output, float std, float wd) {
 LayerConf GenLRNConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnLRN");
+  conf.set_type(engine + "_lrn");
   LRNConf *lrn = conf.mutable_lrn_conf();
   lrn->set_local_size(3);
   lrn->set_alpha(5e-05);
@@ -115,7 +110,7 @@ LayerConf GenLRNConf(string name) {
 LayerConf GenFlattenConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Flatten");
+  conf.set_type("singa_flatten");
   return conf;
 }
 
@@ -123,20 +118,19 @@ FeedForwardNet CreateNet() {
   FeedForwardNet net;
   Shape s{3, 32, 32};
 
-  net.Add(new CudnnConvolution(), GenConvConf("conv1", 32, 5, 1, 2, 0.0001),
-          &s);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool1", true, 3, 2, 1));
-  net.Add(new CudnnActivation(), GenReLUConf("relu1"));
-  net.Add(new CudnnLRN(), GenLRNConf("lrn1"));
-  net.Add(new CudnnConvolution(), GenConvConf("conv2", 32, 5, 1, 2, 0.01));
-  net.Add(new CudnnActivation(), GenReLUConf("relu2"));
-  net.Add(new CudnnPooling(), GenPoolingConf("pool2", false, 3, 2, 1));
-  net.Add(new CudnnLRN(), GenLRNConf("lrn2"));
-  net.Add(new CudnnConvolution, GenConvConf("conv3", 64, 5, 1, 2, 0.01));
-  net.Add(new CudnnActivation(), GenReLUConf("relu3"));
-  net.Add(new CudnnPooling(), GenPoolingConf("pool3", false, 3, 2, 1));
-  net.Add(new Flatten(), GenFlattenConf("flat"));
-  net.Add(new Dense(), GenDenseConf("ip", 10, 0.01, 250));
+  net.Add(GenConvConf("conv1", 32, 5, 1, 2, 0.0001), &s);
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 1));
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 32, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", false, 3, 2, 1));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 64, 5, 1, 2, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenPoolingConf("pool3", false, 3, 2, 1));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip", 10, 0.01, 250));
   return net;
 }
 
@@ -184,12 +178,12 @@ void Train(float lr, int num_epoch, string data_dir) {
   Accuracy acc;
   net.Compile(true, &sgd, &loss, &acc);
 
-  auto cuda = std::make_shared<CudaGPU>();
-  net.ToDevice(cuda);
-  train_x.ToDevice(cuda);
-  train_y.ToDevice(cuda);
-  test_x.ToDevice(cuda);
-  test_y.ToDevice(cuda);
+  auto dev = std::make_shared<CudaGPU>();
+  net.ToDevice(dev);
+  train_x.ToDevice(dev);
+  train_y.ToDevice(dev);
+  test_x.ToDevice(dev);
+  test_y.ToDevice(dev);
   net.Train(100, num_epoch, train_x, train_y, test_x, test_y);
 }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/alexnet.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/alexnet.py
index 96c339a..9ed5599 100644
--- a/examples/cifar10/alexnet.py
+++ b/examples/cifar10/alexnet.py
@@ -31,7 +31,10 @@ from singa import loss
 from singa import net as ffnet
 
 
-def create_net():
+def create_net(use_cpu=False):
+    if use_cpu:
+        layer.engine = 'singa'
+
     net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
     W0_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.0001}
     W1_specs = {'init': 'gaussian', 'mean': 0, 'std': 0.01}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/train.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index cb4110d..3285651 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -96,16 +96,23 @@ def alexnet_lr(epoch):
         return 0.00001
 
 
-def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100):
+def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100,
+          use_cpu=False):
     print 'Start intialization............'
-    cuda = device.create_cuda_gpu()
-    net.to_device(cuda)
+    if use_cpu:
+        print 'Using CPU'
+        dev = device.get_default_device()
+    else:
+        print 'Using GPU'
+        dev = device.create_cuda_gpu()
+
+    net.to_device(dev)
     opt = optimizer.SGD(momentum=0.9, weight_decay=0.004)
     for (p, specs) in zip(net.param_values(), net.param_specs()):
         opt.register(p, specs)
 
-    tx = tensor.Tensor((batch_size, 3, 32, 32), cuda)
-    ty = tensor.Tensor((batch_size,), cuda, core_pb2.kInt)
+    tx = tensor.Tensor((batch_size, 3, 32, 32), dev)
+    ty = tensor.Tensor((batch_size,), dev, core_pb2.kInt)
     train_x, train_y, test_x, test_y = data
     num_train_batch = train_x.shape[0] / batch_size
     num_test_batch = test_x.shape[0] / batch_size
@@ -127,7 +134,7 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100):
             # update progress bar
             utils.update_progress(b * 1.0 / num_train_batch,
                                   'training loss = %f, accuracy = %f' % (l, a))
-        info = 'training loss = %f, training accuracy = %f' \
+        info = '\ntraining loss = %f, training accuracy = %f' \
             % (loss / num_train_batch, acc / num_train_batch)
         print info
 
@@ -146,9 +153,11 @@ def train(data, net, max_epoch, get_lr, weight_decay, batch_size=100):
     net.save('model.bin')  # save model params into checkpoint file
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Train vgg/alexnet for cifar10')
+    parser = argparse.ArgumentParser(description='Train vgg/alexnet for '
+                                     'cifar10 dataset')
     parser.add_argument('model', choices=['vgg', 'alexnet'], default='alexnet')
     parser.add_argument('data', default='cifar-10-batches-py')
+    parser.add_argument('--use_cpu', action='store_true')
     args = parser.parse_args()
     assert os.path.exists(args.data), \
         'Pls download the cifar10 dataset via "download_data.py py"'
@@ -157,9 +166,11 @@ if __name__ == '__main__':
     test_x, test_y = load_test_data(args.data)
     if args.model == 'alexnet':
         train_x, test_x = normalize_for_alexnet(train_x, test_x)
-        net = alexnet.create_net()
-        train((train_x, train_y, test_x, test_y), net, 140, alexnet_lr, 0.004)
+        net = alexnet.create_net(args.use_cpu)
+        train((train_x, train_y, test_x, test_y), net, 140, alexnet_lr, 0.004,
+              use_cpu=args.use_cpu)
     else:
         train_x, test_x = normalize_for_vgg(train_x, test_x)
-        net = vgg.create_net()
-        train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005)
+        net = vgg.create_net(args.use_cpu)
+        train((train_x, train_y, test_x, test_y), net, 250, vgg_lr, 0.0005,
+              use_cpu=args.use_cpu)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/vgg-parallel.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/vgg-parallel.cc b/examples/cifar10/vgg-parallel.cc
index c6b7fa1..149cb21 100644
--- a/examples/cifar10/vgg-parallel.cc
+++ b/examples/cifar10/vgg-parallel.cc
@@ -28,27 +28,20 @@
 #include "singa/utils/channel.h"
 #include "singa/utils/string.h"
 #include "singa/core/memory.h"
-#include "../../src/model/layer/cudnn_convolution.h"
-#include "../../src/model/layer/cudnn_activation.h"
-#include "../../src/model/layer/cudnn_pooling.h"
-#include "../../src/model/layer/cudnn_lrn.h"
-#include "../../src/model/layer/dropout.h"
-#include "../../src/model/layer/cudnn_batchnorm.h"
-#include "../../src/model/layer/dense.h"
-#include "../../src/model/layer/flatten.h"
 #include <thread>
 #include <memory>
 #include <cmath>
 
 namespace singa {
 
+const std::string engine = "cudnn";
 const float default_wd  = 0.0005f;
 
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
                       int pad, float std = .02f, float bias = .0f) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnConvolution");
+  conf.set_type(engine + "_convolution");
   ConvolutionConf *conv = conf.mutable_convolution_conf();
   conv->set_num_output(nb_filter);
   conv->add_kernel_size(kernel);
@@ -75,7 +68,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
                          int pad) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnPooling");
+  conf.set_type(engine + "_pooling");
   PoolingConf *pool = conf.mutable_pooling_conf();
   pool->set_kernel_size(kernel);
   pool->set_stride(stride);
@@ -87,14 +80,14 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
 LayerConf GenReLUConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("RELU");
+  conf.set_type(engine + "_relu");
   return conf;
 }
 
 LayerConf GenDenseConf(string name, int num_output, float std, float wd = default_wd) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dense");
+  conf.set_type("singa_dense");
   DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
 
@@ -116,14 +109,14 @@ LayerConf GenDenseConf(string name, int num_output, float std, float wd = defaul
 LayerConf GenFlattenConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Flatten");
+  conf.set_type("singa_flatten");
   return conf;
 }
 
 LayerConf GenBatchNormConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnBatchNorm");
+  conf.set_type(engine + "_batchnorm");
   ParamSpec *gammaspec = conf.add_param();
   gammaspec->set_name(name + "_gamma");
   auto gammafill = gammaspec->mutable_filler();
@@ -155,7 +148,7 @@ LayerConf GenBatchNormConf(string name) {
 LayerConf GenDropoutConf(string name, float dropout_ratio) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dropout");
+  conf.set_type(engine + "_dropout");
   DropoutConf *dropout = conf.mutable_dropout_conf();
   dropout->set_dropout_ratio(dropout_ratio);
 
@@ -163,47 +156,47 @@ LayerConf GenDropoutConf(string name, float dropout_ratio) {
 }
 
 void ConvBNReLU(FeedForwardNet& net, string name, int nb_filter, Shape* shape = nullptr) {
-  net.Add(new CudnnConvolution(), GenConvConf(name+"_conv", nb_filter, 3, 1, 1), shape);
-  net.Add(new CudnnBatchNorm(), GenBatchNormConf(name+"_bn"));
-  net.Add(new CudnnActivation(), GenReLUConf(name+"_relu"));
+  net.Add(GenConvConf(name+"_conv", nb_filter, 3, 1, 1), shape);
+  net.Add(GenBatchNormConf(name+"_bn"));
+  net.Add(GenReLUConf(name+"_relu"));
 }
 
 FeedForwardNet CreateNet() {
   FeedForwardNet net;
   Shape s{3, 32, 32};
   ConvBNReLU(net, "conv1_1", 64, &s);
-  net.Add(new Dropout(), GenDropoutConf("drop1", 0.3));
+  net.Add(GenDropoutConf("drop1", 0.3));
   ConvBNReLU(net, "conv1_2", 64);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool1", true, 2, 2, 0));
+  net.Add(GenPoolingConf("pool1", true, 2, 2, 0));
   ConvBNReLU(net, "conv2_1", 128);
-  net.Add(new Dropout(), GenDropoutConf("drop2", 0.4));
+  net.Add(GenDropoutConf("drop2", 0.4));
   ConvBNReLU(net, "conv2_2", 128);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool2", true, 2, 2, 0));
+  net.Add(GenPoolingConf("pool2", true, 2, 2, 0));
   ConvBNReLU(net, "conv3_1", 256);
-  net.Add(new Dropout(), GenDropoutConf("drop3_1", 0.4));
+  net.Add(GenDropoutConf("drop3_1", 0.4));
   ConvBNReLU(net, "conv3_2", 256);
-  net.Add(new Dropout(), GenDropoutConf("drop3_2", 0.4));
+  net.Add(GenDropoutConf("drop3_2", 0.4));
   ConvBNReLU(net, "conv3_3", 256);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool3", true, 2, 2, 0));
+  net.Add(GenPoolingConf("pool3", true, 2, 2, 0));
   ConvBNReLU(net, "conv4_1", 512);
-  net.Add(new Dropout(), GenDropoutConf("drop4_1", 0.4));
+  net.Add(GenDropoutConf("drop4_1", 0.4));
   ConvBNReLU(net, "conv4_2", 512);
-  net.Add(new Dropout(), GenDropoutConf("drop4_2", 0.4));
+  net.Add(GenDropoutConf("drop4_2", 0.4));
   ConvBNReLU(net, "conv4_3", 512);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool4", true, 2, 2, 0));
+  net.Add(GenPoolingConf("pool4", true, 2, 2, 0));
   ConvBNReLU(net, "conv5_1", 512);
-  net.Add(new Dropout(), GenDropoutConf("drop5_1", 0.4));
+  net.Add(GenDropoutConf("drop5_1", 0.4));
   ConvBNReLU(net, "conv5_2", 512);
-  net.Add(new Dropout(), GenDropoutConf("drop5_2", 0.4));
+  net.Add(GenDropoutConf("drop5_2", 0.4));
   ConvBNReLU(net, "conv5_3", 512);
-  net.Add(new CudnnPooling(), GenPoolingConf("pool5", true, 2, 2, 0));
-  net.Add(new Flatten(), GenFlattenConf("flat"));
-  net.Add(new Dropout(), GenDropoutConf("flat_drop", 0.5));
-  net.Add(new Dense(), GenDenseConf("ip1", 512, 0.02));
-  net.Add(new CudnnBatchNorm(), GenBatchNormConf("ip1_bn"));
-  net.Add(new CudnnActivation(), GenReLUConf("ip1_relu"));
-  net.Add(new Dropout(), GenDropoutConf("ip1_drop", 0.5));
-  net.Add(new Dense(), GenDenseConf("ip2", 10, 0.02));
+  net.Add(GenPoolingConf("pool5", true, 2, 2, 0));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDropoutConf("flat_drop", 0.5));
+  net.Add(GenDenseConf("ip1", 512, 0.02));
+  net.Add(GenBatchNormConf("ip1_bn"));
+  net.Add(GenReLUConf("ip1_relu"));
+  net.Add(GenDropoutConf("ip1_drop", 0.5));
+  net.Add(GenDenseConf("ip2", 10, 0.02));
 
   return net;
 }
@@ -294,17 +287,17 @@ void Train(float lr, int num_epoch, string data_dir) {
   mem_conf.add_device(0);
   mem_conf.add_device(1);
   std::shared_ptr<DeviceMemPool> mem_pool(new CnMemPool(mem_conf));
-  std::shared_ptr<CudaGPU> cuda_1(new CudaGPU(0, mem_pool));
-  std::shared_ptr<CudaGPU> cuda_2(new CudaGPU(1, mem_pool));
-  net_1.ToDevice(cuda_1);
-  net_2.ToDevice(cuda_2);
-
-  train_x_1.ToDevice(cuda_1);
-  train_y_1.ToDevice(cuda_1);
-  test_x.ToDevice(cuda_1);
-  test_y.ToDevice(cuda_1);
-  train_x_2.ToDevice(cuda_2);
-  train_y_2.ToDevice(cuda_2);
+  std::shared_ptr<CudaGPU> dev_1(new CudaGPU(0, mem_pool));
+  std::shared_ptr<CudaGPU> dev_2(new CudaGPU(1, mem_pool));
+  net_1.ToDevice(dev_1);
+  net_2.ToDevice(dev_2);
+
+  train_x_1.ToDevice(dev_1);
+  train_y_1.ToDevice(dev_1);
+  test_x.ToDevice(dev_1);
+  test_y.ToDevice(dev_1);
+  train_x_2.ToDevice(dev_2);
+  train_y_2.ToDevice(dev_2);
 
   LOG(INFO) << "Launching thread...";
   std::thread t1 =

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/cifar10/vgg.py
----------------------------------------------------------------------
diff --git a/examples/cifar10/vgg.py b/examples/cifar10/vgg.py
index 0b9bb56..97e690c 100644
--- a/examples/cifar10/vgg.py
+++ b/examples/cifar10/vgg.py
@@ -40,40 +40,42 @@ def ConvBnReLU(net, name, nb_filers, sample_shape=None):
     net.add(layer.Activation(name + '_3'))
 
 
-def create_net():
+def create_net(use_cpu=False):
+    if use_cpu:
+        layer.engine = 'singa'
     net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy())
     ConvBnReLU(net, 'conv1_1', 64, (3, 32, 32))
-    net.add(layer.Dropout('drop1', 0.3, engine='cuda'))
+    net.add(layer.Dropout('drop1', 0.3))
     ConvBnReLU(net, 'conv1_2', 64)
     net.add(layer.MaxPooling2D('pool1', 2, 2, border_mode='valid'))
     ConvBnReLU(net, 'conv2_1', 128)
-    net.add(layer.Dropout('drop2_1', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop2_1', 0.4))
     ConvBnReLU(net, 'conv2_2', 128)
     net.add(layer.MaxPooling2D('pool2', 2, 2, border_mode='valid'))
     ConvBnReLU(net, 'conv3_1', 256)
-    net.add(layer.Dropout('drop3_1', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop3_1', 0.4))
     ConvBnReLU(net, 'conv3_2', 256)
-    net.add(layer.Dropout('drop3_2', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop3_2', 0.4))
     ConvBnReLU(net, 'conv3_3', 256)
     net.add(layer.MaxPooling2D('pool3', 2, 2, border_mode='valid'))
     ConvBnReLU(net, 'conv4_1', 512)
-    net.add(layer.Dropout('drop4_1', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop4_1', 0.4))
     ConvBnReLU(net, 'conv4_2', 512)
-    net.add(layer.Dropout('drop4_2', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop4_2', 0.4))
     ConvBnReLU(net, 'conv4_3', 512)
     net.add(layer.MaxPooling2D('pool4', 2, 2, border_mode='valid'))
     ConvBnReLU(net, 'conv5_1', 512)
-    net.add(layer.Dropout('drop5_1', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop5_1', 0.4))
     ConvBnReLU(net, 'conv5_2', 512)
-    net.add(layer.Dropout('drop5_2', 0.4, engine='cuda'))
+    net.add(layer.Dropout('drop5_2', 0.4))
     ConvBnReLU(net, 'conv5_3', 512)
     net.add(layer.MaxPooling2D('pool5', 2, 2, border_mode='valid'))
     net.add(layer.Flatten('flat'))
-    net.add(layer.Dropout('drop_flat', 0.5, engine='cuda'))
+    net.add(layer.Dropout('drop_flat', 0.5))
     net.add(layer.Dense('ip1', 512))
     net.add(layer.BatchNormalization('batchnorm_ip1'))
     net.add(layer.Activation('relu_ip1'))
-    net.add(layer.Dropout('drop_ip2', 0.5, engine='cuda'))
+    net.add(layer.Dropout('drop_ip2', 0.5))
     net.add(layer.Dense('ip2', 10))
     print 'Start intialization............'
     for (p, name) in zip(net.param_values(), net.param_names()):

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/examples/imagenet/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet.cc b/examples/imagenet/alexnet.cc
index 270312c..3fb5d04 100644
--- a/examples/imagenet/alexnet.cc
+++ b/examples/imagenet/alexnet.cc
@@ -22,13 +22,6 @@
 #include "singa/singa_config.h"
 #ifdef USE_OPENCV
 #include <cmath>
-#include "../../src/model/layer/cudnn_activation.h"
-#include "../../src/model/layer/cudnn_convolution.h"
-#include "../../src/model/layer/dropout.h"
-#include "../../src/model/layer/cudnn_lrn.h"
-#include "../../src/model/layer/cudnn_pooling.h"
-#include "../../src/model/layer/dense.h"
-#include "../../src/model/layer/flatten.h"
 #include "./ilsvrc12.h"
 #include "singa/io/snapshot.h"
 #include "singa/model/feed_forward_net.h"
@@ -40,11 +33,12 @@
 #include "singa/utils/timer.h"
 namespace singa {
 
+const std::string engine = "cudnn";
 LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
                       int pad, float std, float bias = .0f) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnConvolution");
+  conf.set_type(engine + "_convolution");
   ConvolutionConf *conv = conf.mutable_convolution_conf();
   conv->set_num_output(nb_filter);
   conv->add_kernel_size(kernel);
@@ -71,7 +65,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
                          int pad) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnPooling");
+  conf.set_type(engine + "_pooling");
   PoolingConf *pool = conf.mutable_pooling_conf();
   pool->set_kernel_size(kernel);
   pool->set_stride(stride);
@@ -83,7 +77,7 @@ LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
 LayerConf GenReLUConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("RELU");
+  conf.set_type(engine + "_relu");
   return conf;
 }
 
@@ -91,7 +85,7 @@ LayerConf GenDenseConf(string name, int num_output, float std, float wd,
                        float bias = .0f) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dense");
+  conf.set_type("singa_dense");
   DenseConf *dense = conf.mutable_dense_conf();
   dense->set_num_output(num_output);
 
@@ -115,7 +109,7 @@ LayerConf GenDenseConf(string name, int num_output, float std, float wd,
 LayerConf GenLRNConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("CudnnLRN");
+  conf.set_type(engine + "_lrn");
   LRNConf *lrn = conf.mutable_lrn_conf();
   lrn->set_local_size(5);
   lrn->set_alpha(1e-04);
@@ -126,14 +120,14 @@ LayerConf GenLRNConf(string name) {
 LayerConf GenFlattenConf(string name) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Flatten");
+  conf.set_type("singa_flatten");
   return conf;
 }
 
 LayerConf GenDropoutConf(string name, float dropout_ratio) {
   LayerConf conf;
   conf.set_name(name);
-  conf.set_type("Dropout");
+  conf.set_type(engine + "_dropout");
   DropoutConf *dropout = conf.mutable_dropout_conf();
   dropout->set_dropout_ratio(dropout_ratio);
   return conf;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 778a130..4c46114 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -321,23 +321,33 @@ public:
   /// Return a string containing all hardware info, e.g., version, memory size.
   static const std::string DeviceQuery(int id, bool verbose = false);
 
+  /// Return the defualt host device
+  static std::shared_ptr<Device> GetDefaultDevice() {
+    return defaultDevice;
+  }
+
   /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
   static const std::vector<std::shared_ptr<Device>>
   CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
 
   /// Create a set of CudaGPU Device using given GPU IDs.
   static const std::vector<std::shared_ptr<Device>>
-  CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0);
-
-  /// Create a \p num_devices set of valid OpenCL devices, regardless of platforms.
-  /// If there are fewer valid devices than requested, then this method will return as many as possible.
-  /// If OpenCL is not in use, this method will return an empty array.
-  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t num_devices);
-
-  /// Create a set of valid OpenCL devices, regardless of platforms, assigning \p id to each device in sequence.
-  /// If there are fewer valid devices than requested, then this method will return as many as possible.
+  CreateCudaGPUsOn(const std::vector<int> &devices, size_t init_size = 0);
+
+  /// Create a \p num_devices set of valid OpenCL devices, regardless of
+  /// platforms.  If there are fewer valid devices than requested, then this
+  /// method will return as many as possible.If OpenCL is not in use, this
+  /// method will return an empty array.
+  const std::vector<std::shared_ptr<Device> > CreateOpenclDevices(
+             const size_t num_devices);
+
+  /// Create a set of valid OpenCL devices, regardless of platforms, assigning
+  /// \p id to each device in sequence.
+  /// If there are fewer valid devices than requested, then this method will
+  /// return as many as possible.
   /// If OpenCL is not in use, this method will return an empty array.
-  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const vector<int>& id);
+  const std::vector<std::shared_ptr<Device> >
+  CreateOpenclDevices(const vector<int> &id);
 
   /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/).
   /// This function checks the availability of GPU #device_id.

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/include/singa/model/feed_forward_net.h
----------------------------------------------------------------------
diff --git a/include/singa/model/feed_forward_net.h b/include/singa/model/feed_forward_net.h
index 8adc259..1bf112c 100644
--- a/include/singa/model/feed_forward_net.h
+++ b/include/singa/model/feed_forward_net.h
@@ -39,7 +39,7 @@ class FeedForwardNet {
   ///    following the topological order.
   /// 2. this layer has already been setup (Setup function is called outside).
   /// The layer will be freed in the destructor of FeedForwardNet.
-  Layer* Add(Layer* layer);
+  std::shared_ptr<Layer> Add(std::shared_ptr<Layer> layer);
 
   // TODO(wangwei) add ConcatenateLayer and SliceLayer
   // AddConcatenateLayer(vector<Layer*> src, Layer *dst);
@@ -49,11 +49,9 @@ class FeedForwardNet {
   /// Assume the layer is added in corret order.
   /// For the first layer, 'sample_shape' (the input sample shape) is necessary
   /// for calling Setup().
-  Layer* Add(const LayerConf& conf, const Shape* sample_shape = nullptr);
+  std::shared_ptr<Layer> Add(const LayerConf& conf,
+      const Shape* sample_shape = nullptr);
 
-  /// Add a layer, and call its Setup function.
-  Layer* Add(Layer* layer, const LayerConf& conf,
-             const Shape* sample_shape = nullptr);
   /// Set some fields used for training and evaluating the neural net.
   /// This method will instantiate an Updater ,then wrap the Optimier into
   /// Updater and always register the parameters of the net instance.
@@ -147,13 +145,13 @@ class FeedForwardNet {
     return std::thread([=]() { Train(batchsize, nb_epoch, x, y); });
   }
 
-  const vector<Layer*> layers() const { return layers_; }
+  const vector<std::shared_ptr<Layer>> layers() const { return layers_; }
   const vector<string> GetParamNames() const;
   const vector<ParamSpec> GetParamSpecs() const;
   const vector<Tensor> GetParamValues() const;
 
  protected:
-  vector<Layer*> layers_;
+  vector<std::shared_ptr<Layer>> layers_;
   std::shared_ptr<Updater> updater_;
   Loss* loss_;
   Metric* metric_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/include/singa/model/layer.h
----------------------------------------------------------------------
diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h
index d31bd95..58f0f4b 100644
--- a/include/singa/model/layer.h
+++ b/include/singa/model/layer.h
@@ -222,8 +222,8 @@ class Layer {
   vector<ParamSpec> param_specs_;
 };
 
-#define RegisterLayerClass(SubLayer) \
-  static Registra<Layer, SubLayer> _##SubLayer##Layer(#SubLayer);
+#define RegisterLayerClass(Name, SubLayer) \
+  static Registra<Layer, SubLayer> Name##SubLayer(#Name);
 
 inline std::shared_ptr<Layer> CreateLayer(const std::string type) {
   std::shared_ptr<Layer> layer(Factory<Layer>::Create(type));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/include/singa/utils/integer.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/integer.h b/include/singa/utils/integer.h
new file mode 100644
index 0000000..9c2799d
--- /dev/null
+++ b/include/singa/utils/integer.h
@@ -0,0 +1,73 @@
+/************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+
+#ifndef INTEGER_H_
+#define INTEGER_H_
+
+#include <cstdint>
+
+namespace singa{
+static bool isNetworkOrder() {
+    int test = 1;
+    return (1 != *(uint8_t*)&test);
+}
+
+template <typename T>
+static inline T byteSwap(const T& v) {
+    int size = sizeof(v);
+    T ret;
+    uint8_t *dest = reinterpret_cast<uint8_t *>(&ret);
+    uint8_t *src = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(&v));
+    for (int i = 0; i < size; ++i) {
+        dest[i] = src[size - i - 1];
+    }
+    return ret;
+}
+
+template <typename T>
+static inline T hton(const T& v)
+{
+    return isNetworkOrder() ? v : byteSwap(v);
+}
+
+template <typename T>
+static inline T ntoh(const T& v) 
+{
+    return hton(v);
+}
+
+static inline int appendInteger(char* buf) {return 0;}
+static inline int readInteger(char* buf) {return 0;}
+
+template<typename Type, typename... Types>
+static int appendInteger(char* buf, Type value, Types... values) {
+    *(Type*)buf = hton(value);
+    return sizeof(Type) + appendInteger(buf + sizeof(Type), values...);
+}
+
+template<typename Type, typename... Types>
+static int readInteger(char* buf, Type& value, Types&... values) {
+    value = ntoh(*(Type*)buf);
+    return sizeof(Type) + readInteger(buf + sizeof(Type), values...);
+}
+
+}
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/core/device/platform.cc
----------------------------------------------------------------------
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
index a4561de..a3661f2 100644
--- a/src/core/device/platform.cc
+++ b/src/core/device/platform.cc
@@ -113,11 +113,11 @@ Platform::CreateCudaGPUs(const size_t num_devices, size_t init_size) {
   const vector<int> gpus = GetGPUIDs();
   CHECK_LE(num_devices, gpus.size());
   vector<int> use_gpus(gpus.begin(), gpus.begin() + num_devices);
-  return CreateCudaGPUs(use_gpus, init_size);
+  return CreateCudaGPUsOn(use_gpus, init_size);
 }
 
 const vector<shared_ptr<Device> >
-Platform::CreateCudaGPUs(const vector<int> &devices, size_t init_size) {
+Platform::CreateCudaGPUsOn(const vector<int> &devices, size_t init_size) {
   MemPoolConf conf;
   if (init_size > 0)
     conf.set_init_size(init_size);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index e260f9e..dfb1eb2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -452,7 +452,7 @@ float Tensor::L1() const {
   float nrm = 0.0f;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
-      DType ret;
+      DType ret = DType(0);
       Asum<DType, Lang>(this->Size(), this->block(), &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
@@ -465,7 +465,7 @@ float Tensor::L2() const {
   float nrm = 0.0f;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
-      DType ret;
+      DType ret = DType(0);
       Nrm2<DType, Lang>(this->Size(), this->block(), &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 941931d..a2802d5 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -239,7 +239,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
+    CHECK_GE(inPtr[i], 0.f);
     outPtr[i] = sqrt(inPtr[i]);
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/feed_forward_net.cc
----------------------------------------------------------------------
diff --git a/src/model/feed_forward_net.cc b/src/model/feed_forward_net.cc
index 9450c9e..514d6e2 100644
--- a/src/model/feed_forward_net.cc
+++ b/src/model/feed_forward_net.cc
@@ -26,23 +26,16 @@
 namespace singa {
 
 FeedForwardNet::~FeedForwardNet() {
-  for (auto layer : layers_) delete layer;
-}
-Layer* FeedForwardNet::Add(Layer* layer) {
-  layers_.push_back(layer);
-  return layer;
 }
 
-Layer* FeedForwardNet::Add(const LayerConf& conf, const Shape* sample_shape) {
-  CHECK(sample_shape != nullptr || layers_.size())
-      << "Must provide the input sample shape for the first layer";
-  Layer* layer = nullptr;  // TODO(wangwei) use CreateLayer(conf.type());
-  Add(layer, conf, sample_shape);
+std::shared_ptr<Layer> FeedForwardNet::Add(std::shared_ptr<Layer> layer) {
+  layers_.push_back(layer);
   return layer;
 }
 
-Layer* FeedForwardNet::Add(Layer* layer, const LayerConf& conf,
-                           const Shape* sample_shape) {
+std::shared_ptr<Layer> FeedForwardNet::Add(const LayerConf& conf,
+    const Shape* sample_shape) {
+  std::shared_ptr<Layer> layer(CreateLayer(conf.type()));
   CHECK(conf.has_name()) << "Must set layer name";
   if (sample_shape == nullptr)
     layer->Setup(layers_.back()->GetOutputSampleShape(), conf);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/activation.cc b/src/model/layer/activation.cc
index 2497c31..aa40edb 100644
--- a/src/model/layer/activation.cc
+++ b/src/model/layer/activation.cc
@@ -18,14 +18,23 @@
 
 #include "singa/model/layer.h"
 #include "./activation.h"
+#include "singa/utils/string.h"
 namespace singa {
 
-RegisterLayerClass(Activation);
+RegisterLayerClass(singa_relu, Activation);
+RegisterLayerClass(singa_sigmoid, Activation);
+RegisterLayerClass(singa_tanh, Activation);
 
 void Activation::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
-  mode_ = conf.type();
-  if (mode_ == "RELU") {
+  auto pos = conf.type().find_first_of('_');
+  CHECK_NE(pos, string::npos) << "There should be a '_' in the laye type "
+    << conf.type();
+  mode_ = ToLowerCase(conf.type().substr(pos + 1));
+  if (mode_ != "relu" && mode_ != "sigmoid" && mode_ != "tanh")
+    LOG(FATAL) << "Unkown activation type: " << conf.type() << " " << mode_
+      << ". Please use singa_relu, singa_sigmoid, or singa_tanh";
+  if (mode_ == "relu") {
     neg_slope_ = conf.relu_conf().negative_slope();
   }
   out_sample_shape_ = in_sample;
@@ -33,13 +42,13 @@ void Activation::Setup(const Shape& in_sample, const LayerConf& conf) {
 
 const Tensor Activation::Forward(int flag, const Tensor& input) {
   Tensor output;
-  if (mode_ == "SIGMOID") {
+  if (mode_ == "sigmoid") {
     output = Sigmoid(input);
     if (flag & kTrain) buf_.push(output);
-  } else if (mode_ == "TANH") {
+  } else if (mode_ == "tanh") {
     output = Tanh(input);
     if (flag & kTrain) buf_.push(output);
-  } else if (mode_ == "RELU") {
+  } else if (mode_ == "relu") {
     output = ReLU(input);
     if (flag & kTrain) buf_.push(input);
   } else
@@ -55,11 +64,11 @@ const std::pair<Tensor, vector<Tensor>> Activation::Backward(
   // activation.
   Tensor input_grad, inout = buf_.top();
   buf_.pop();
-  if (mode_ == "SIGMOID")
+  if (mode_ == "sigmoid")
     input_grad = grad * inout * (inout * (-1.f) + 1.f);
-  else if (mode_ == "TANH")
+  else if (mode_ == "tanh")
     input_grad = grad * (inout * inout * (-1.f) + 1.f);
-  else if (mode_ == "RELU")
+  else if (mode_ == "relu")
     input_grad = grad * (inout > 0.f) + (inout <= 0.f) * neg_slope_;
   else LOG(FATAL) << "Unkown activation: " << mode_;
   return std::make_pair(input_grad, param_grad);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/activation.h b/src/model/layer/activation.h
index e3fb657..7d15979 100644
--- a/src/model/layer/activation.h
+++ b/src/model/layer/activation.h
@@ -26,7 +26,7 @@ namespace singa {
 class Activation : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Activation"; }
+  // const std::string layer_type() const override { return "Activation"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.cc b/src/model/layer/batchnorm.cc
index 6ea9f2a..f348661 100644
--- a/src/model/layer/batchnorm.cc
+++ b/src/model/layer/batchnorm.cc
@@ -21,7 +21,7 @@
 #include "batchnorm.h"
 
 namespace singa {
-RegisterLayerClass(BatchNorm);
+RegisterLayerClass(singa_batchnorm, BatchNorm);
 void BatchNorm::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
   out_sample_shape_ = in_sample;
@@ -78,8 +78,8 @@ const Tensor BatchNorm::Forward(int flag, const Tensor& input) {
     runningVariance_ *= 1.0f - factor_;
     Axpy(factor_, var, &runningVariance_);
     Tensor tmp = var.Clone();
-    tmp += 1e-6f;
     tmp = Sqrt(tmp);
+    tmp += 1e-6f;
     xnorm = x.Clone();
     SubRow(mean, &xnorm);
     DivRow(tmp, &xnorm);
@@ -94,8 +94,8 @@ const Tensor BatchNorm::Forward(int flag, const Tensor& input) {
     xnorm = x.Clone();
     SubRow(runningMean_, &xnorm);
     Tensor tmp = runningVariance_.Clone();
-    tmp += 1e-6f;
     tmp = Sqrt(tmp);
+    tmp += 1e-6f;
     DivRow(tmp, &xnorm);
     output = xnorm.Clone();
     MultRow(bnScale_, &output);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/batchnorm.h b/src/model/layer/batchnorm.h
index f3d83ab..c2cfde9 100644
--- a/src/model/layer/batchnorm.h
+++ b/src/model/layer/batchnorm.h
@@ -29,7 +29,7 @@ namespace singa {
 class BatchNorm : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "BatchNorm"; }
+  // const std::string layer_type() const override { return "BatchNorm"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc
index 1bf6b39..4fc209f 100644
--- a/src/model/layer/convolution.cc
+++ b/src/model/layer/convolution.cc
@@ -23,7 +23,7 @@
 namespace singa {
 using std::vector;
 
-RegisterLayerClass(Convolution);
+RegisterLayerClass(singa_convolution, Convolution);
 void Convolution::Setup(const Shape &in_sample, const LayerConf &conf) {
   Layer::Setup(in_sample, conf);
   ConvolutionConf conv_conf = conf.convolution_conf();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h
index 1383a66..d85a17b 100644
--- a/src/model/layer/convolution.h
+++ b/src/model/layer/convolution.h
@@ -27,7 +27,7 @@ namespace singa {
 class Convolution : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Convolution"; }
+  // const std::string layer_type() const override { return "Convolution"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const vector<size_t>& in_shape, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_activation.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.cc b/src/model/layer/cudnn_activation.cc
index c86539d..4ecb375 100644
--- a/src/model/layer/cudnn_activation.cc
+++ b/src/model/layer/cudnn_activation.cc
@@ -25,7 +25,9 @@
 #include "singa/utils/logging.h"
 
 namespace singa {
-RegisterLayerClass(CudnnActivation);
+RegisterLayerClass(cudnn_relu, CudnnActivation);
+RegisterLayerClass(cudnn_sigmoid, CudnnActivation);
+RegisterLayerClass(cudnn_tanh, CudnnActivation);
 CudnnActivation::~CudnnActivation() {
   if (acti_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyActivationDescriptor(acti_desc_));
@@ -40,11 +42,11 @@ void CudnnActivation::InitCudnn(size_t size, DataType dtype) {
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(
       desc_, CUDNN_TENSOR_NCHW, GetCudnnDataType(dtype), 1, 1, 1, size));
 
-  if (mode_ == "SIGMOID")
+  if (mode_ == "sigmoid")
     cudnn_mode_ = CUDNN_ACTIVATION_SIGMOID;
-  else if (mode_ == "TANH")
+  else if (mode_ == "tanh")
     cudnn_mode_ = CUDNN_ACTIVATION_TANH;
-  else if (mode_ == "RELU")
+  else if (mode_ == "relu")
     cudnn_mode_ = CUDNN_ACTIVATION_RELU;
   else
     LOG(FATAL) << "Unkown activation: " << mode_;

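Two things change in cudnn_activation.cc: the single CudnnActivation class is registered under three identifiers (cudnn_relu, cudnn_sigmoid, cudnn_tanh), and the mode strings it matches are now lowercase. A small sketch of that lowercase string-to-enum mapping with a hard failure on anything else (stand-in enum values, not the real cudnnActivationMode_t):

    #include <stdexcept>
    #include <string>

    enum ActMode { ACT_SIGMOID, ACT_TANH, ACT_RELU };  // stand-ins for cudnnActivationMode_t

    ActMode ToActivationMode(const std::string& mode) {
      if (mode == "sigmoid") return ACT_SIGMOID;
      if (mode == "tanh") return ACT_TANH;
      if (mode == "relu") return ACT_RELU;
      // The layer uses LOG(FATAL) here; a plain exception keeps the sketch self-contained.
      throw std::runtime_error("Unknown activation: " + mode);
    }

    int main() { return ToActivationMode("relu") == ACT_RELU ? 0 : 1; }
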
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_activation.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_activation.h b/src/model/layer/cudnn_activation.h
index 526e03f..c69d157 100644
--- a/src/model/layer/cudnn_activation.h
+++ b/src/model/layer/cudnn_activation.h
@@ -35,7 +35,7 @@ class CudnnActivation : public Activation {
  public:
   ~CudnnActivation();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnActivation"; }
+  // const std::string layer_type() const override { return "CudnnActivation"; }
 
   const Tensor Forward(int flag, const Tensor& input) override;
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_batchnorm.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.cc b/src/model/layer/cudnn_batchnorm.cc
index 461f1b6..01682b7 100644
--- a/src/model/layer/cudnn_batchnorm.cc
+++ b/src/model/layer/cudnn_batchnorm.cc
@@ -23,7 +23,7 @@
 
 namespace singa {
 
-RegisterLayerClass(CudnnBatchNorm);
+RegisterLayerClass(cudnn_batchnorm, CudnnBatchNorm);
 CudnnBatchNorm::~CudnnBatchNorm() {
   if (has_init_cudnn_) {
     CUDNN_CHECK(cudnnDestroyTensorDescriptor(shape_desc_));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_batchnorm.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_batchnorm.h b/src/model/layer/cudnn_batchnorm.h
index 4f46452..c4390a1 100644
--- a/src/model/layer/cudnn_batchnorm.h
+++ b/src/model/layer/cudnn_batchnorm.h
@@ -31,7 +31,7 @@ class CudnnBatchNorm : public BatchNorm {
  public:
   ~CudnnBatchNorm();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnBatchNorm"; }
+  // const std::string layer_type() const override { return "CudnnBatchNorm"; }
 
   void Setup(const Shape& in_sample, const LayerConf& conf) override;
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index e5efec0..ffd2ab7 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -23,7 +23,7 @@
 #include "singa/utils/logging.h"
 
 namespace singa {
-RegisterLayerClass(CudnnConvolution);
+RegisterLayerClass(cudnn_convolution, CudnnConvolution);
 CudnnConvolution::~CudnnConvolution() {
   if (bias_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyTensorDescriptor(bias_desc_));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_convolution.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.h b/src/model/layer/cudnn_convolution.h
index cd0471f..545fd5c 100644
--- a/src/model/layer/cudnn_convolution.h
+++ b/src/model/layer/cudnn_convolution.h
@@ -34,7 +34,7 @@ class CudnnConvolution : public Convolution {
  public:
   ~CudnnConvolution();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnConvolution"; }
+  // const std::string layer_type() const override { return "CudnnConvolution";}
 
   const Tensor Forward(int flag, const Tensor &input) override;
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.cc b/src/model/layer/cudnn_dropout.cc
index e6950ca..c5b62cf 100644
--- a/src/model/layer/cudnn_dropout.cc
+++ b/src/model/layer/cudnn_dropout.cc
@@ -27,7 +27,7 @@
 #include "singa/utils/logging.h"
 
 namespace singa {
-RegisterLayerClass(CudnnDropout);
+RegisterLayerClass(cudnn_dropout, CudnnDropout);
 CudnnDropout::~CudnnDropout() {
   if (drop_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyDropoutDescriptor(drop_desc_));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_dropout.h b/src/model/layer/cudnn_dropout.h
index 9e0cb9e..1241911 100644
--- a/src/model/layer/cudnn_dropout.h
+++ b/src/model/layer/cudnn_dropout.h
@@ -36,7 +36,7 @@ class CudnnDropout : public Dropout {
  public:
   ~CudnnDropout();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnDropout"; }
+  // const std::string layer_type() const override { return "CudnnDropout"; }
 
   const Tensor Forward(int flag, const Tensor& input) override;
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.cc b/src/model/layer/cudnn_lrn.cc
index 540beb1..ac7645e 100644
--- a/src/model/layer/cudnn_lrn.cc
+++ b/src/model/layer/cudnn_lrn.cc
@@ -23,7 +23,7 @@
 #include "cudnn_utils.h"
 
 namespace singa {
-RegisterLayerClass(CudnnLRN);
+RegisterLayerClass(cudnn_lrn, CudnnLRN);
 CudnnLRN::~CudnnLRN() {
   if (has_init_cudnn_) {
     CUDNN_CHECK(cudnnDestroyLRNDescriptor(lrn_desc_));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_lrn.h b/src/model/layer/cudnn_lrn.h
index e2a5e54..c48571d 100644
--- a/src/model/layer/cudnn_lrn.h
+++ b/src/model/layer/cudnn_lrn.h
@@ -31,7 +31,7 @@ class CudnnLRN : public LRN {
  public:
   ~CudnnLRN();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnLRN"; }
+  // const std::string layer_type() const override { return "CudnnLRN"; }
 
   const Tensor Forward(int flag, const Tensor& input) override;
   const std::pair<Tensor, vector<Tensor>> Backward(int flag,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.cc b/src/model/layer/cudnn_pooling.cc
index 984427c..895ce3c 100644
--- a/src/model/layer/cudnn_pooling.cc
+++ b/src/model/layer/cudnn_pooling.cc
@@ -25,7 +25,7 @@
 #include "singa/utils/logging.h"
 
 namespace singa {
-RegisterLayerClass(CudnnPooling);
+RegisterLayerClass(cudnn_pooling, CudnnPooling);
 CudnnPooling::~CudnnPooling() {
   if (pool_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyPoolingDescriptor(pool_desc_));

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_pooling.h b/src/model/layer/cudnn_pooling.h
index 90779f5..2080db3 100644
--- a/src/model/layer/cudnn_pooling.h
+++ b/src/model/layer/cudnn_pooling.h
@@ -35,7 +35,7 @@ class CudnnPooling : public Pooling {
  public:
   ~CudnnPooling();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnPooling"; }
+  // const std::string layer_type() const override { return "CudnnPooling"; }
 
   void Setup(const Shape& in_sample, const LayerConf &conf) override;
   const Tensor Forward(int flag, const Tensor &input) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_rnn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_rnn.cc b/src/model/layer/cudnn_rnn.cc
index bfbfa48..9961df2 100644
--- a/src/model/layer/cudnn_rnn.cc
+++ b/src/model/layer/cudnn_rnn.cc
@@ -24,6 +24,7 @@
 #include "singa/utils/logging.h"
 
 namespace singa {
+RegisterLayerClass(cudnn_rnn, CudnnRNN);
 CudnnRNN::~CudnnRNN() {
   if (weight_desc_ != nullptr)
     CUDNN_CHECK(cudnnDestroyFilterDescriptor(weight_desc_));
@@ -126,25 +127,19 @@ void CudnnRNN::SetRNNDescriptor(shared_ptr<Device> dev) {
       dropout_state_.block()->mutable_data(), state_size, seed_));
 
   CUDNN_CHECK(cudnnCreateRNNDescriptor(&rnn_desc_));
-  cudnnRNNInputMode_t input_mode;
-  if (input_mode_ == "linear")
-    input_mode = CUDNN_LINEAR_INPUT;
-  else if (input_mode_ == "skip")
+  cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT;
+  if (input_mode_ == "skip")
     input_mode = CUDNN_SKIP_INPUT;
 
-  cudnnDirectionMode_t direction;
-  if (direction_ == "unidirectional")
-    direction = CUDNN_UNIDIRECTIONAL;
-  else if (direction_ == "bidirectional")
+  cudnnDirectionMode_t direction = CUDNN_UNIDIRECTIONAL;
+  if (direction_ == "bidirectional")
     direction = CUDNN_BIDIRECTIONAL;
 
-  cudnnRNNMode_t rnn_mode;
+  cudnnRNNMode_t rnn_mode = CUDNN_LSTM;
   if (rnn_mode_ == "relu")
     rnn_mode = CUDNN_RNN_RELU;
   else if (rnn_mode_ == "tanh")
     rnn_mode = CUDNN_RNN_TANH;
-  else if (rnn_mode_ == "lstm")
-    rnn_mode = CUDNN_LSTM;
   else if (rnn_mode_ == "gru")
     rnn_mode = CUDNN_GRU;
   CUDNN_CHECK(cudnnSetRNNDescriptor(rnn_desc_, hidden_size_, num_stacks_,

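In SetRNNDescriptor the three local enums are now initialized to defaults (linear input, unidirectional, LSTM) and only overridden when the configured string selects something else, so the values passed to cudnnSetRNNDescriptor are always defined. A minimal sketch of the default-then-override pattern (stand-in enums rather than the cuDNN types):

    #include <string>

    enum InputMode { LINEAR_INPUT, SKIP_INPUT };
    enum Direction { UNIDIRECTIONAL, BIDIRECTIONAL };
    enum RnnMode { RNN_RELU, RNN_TANH, LSTM, GRU };

    struct RnnChoices { InputMode input; Direction dir; RnnMode mode; };

    RnnChoices Resolve(const std::string& input_mode, const std::string& direction,
                       const std::string& rnn_mode) {
      // Defaults cover the branches the old if/else chains left unhandled.
      RnnChoices c{LINEAR_INPUT, UNIDIRECTIONAL, LSTM};
      if (input_mode == "skip") c.input = SKIP_INPUT;
      if (direction == "bidirectional") c.dir = BIDIRECTIONAL;
      if (rnn_mode == "relu") c.mode = RNN_RELU;
      else if (rnn_mode == "tanh") c.mode = RNN_TANH;
      else if (rnn_mode == "gru") c.mode = GRU;
      return c;
    }

    int main() { return Resolve("linear", "unidirectional", "lstm").mode == LSTM ? 0 : 1; }
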
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_rnn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_rnn.h b/src/model/layer/cudnn_rnn.h
index cfb8aac..82c68b0 100644
--- a/src/model/layer/cudnn_rnn.h
+++ b/src/model/layer/cudnn_rnn.h
@@ -39,7 +39,7 @@ class CudnnRNN : public RNN {
  public:
   ~CudnnRNN();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnRNN"; }
+  // const std::string layer_type() const override { return "CudnnRNN"; }
 
   const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) override;
   const std::pair<vector<Tensor>, vector<Tensor>> Backward(

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.cc b/src/model/layer/cudnn_softmax.cc
index 6dce68f..f1a4a5b 100644
--- a/src/model/layer/cudnn_softmax.cc
+++ b/src/model/layer/cudnn_softmax.cc
@@ -23,7 +23,7 @@
 #include "singa/utils/logging.h"
 namespace singa {
 
-RegisterLayerClass(CudnnSoftmax);
+RegisterLayerClass(cudnn_softmax, CudnnSoftmax);
 CudnnSoftmax::~CudnnSoftmax() {
   if (desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_softmax.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_softmax.h b/src/model/layer/cudnn_softmax.h
index aca3729..532a643 100644
--- a/src/model/layer/cudnn_softmax.h
+++ b/src/model/layer/cudnn_softmax.h
@@ -34,7 +34,7 @@ class CudnnSoftmax : public Softmax {
  public:
   ~CudnnSoftmax();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "CudnnSoftmax"; }
+  // const std::string layer_type() const override { return "CudnnSoftmax"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample_shape, const LayerConf &conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/cudnn_utils.h
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_utils.h b/src/model/layer/cudnn_utils.h
index 19c72ec..64ee758 100644
--- a/src/model/layer/cudnn_utils.h
+++ b/src/model/layer/cudnn_utils.h
@@ -26,7 +26,7 @@
 #include "singa/utils/logging.h"
 namespace singa {
 inline cudnnDataType_t GetCudnnDataType(DataType dtype) {
-  cudnnDataType_t ret;
+  cudnnDataType_t ret = CUDNN_DATA_FLOAT;
   switch (dtype) {
     case kFloat32:
       ret = CUDNN_DATA_FLOAT;

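GetCudnnDataType gets the same treatment as the RNN descriptor hunk above: ret now starts out as CUDNN_DATA_FLOAT, so the function returns a defined value even when the switch does not handle a given DataType.
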
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/dense.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.cc b/src/model/layer/dense.cc
index 557d8bd..1a2d16e 100644
--- a/src/model/layer/dense.cc
+++ b/src/model/layer/dense.cc
@@ -23,7 +23,7 @@
 namespace singa {
 using std::vector;
 
-RegisterLayerClass(Dense);
+RegisterLayerClass(singa_dense, Dense);
 Dense::~Dense() {
   // delete weight_;
   // delete bias_;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/dense.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dense.h b/src/model/layer/dense.h
index bb5db66..8a149a5 100644
--- a/src/model/layer/dense.h
+++ b/src/model/layer/dense.h
@@ -28,7 +28,7 @@ class Dense : public Layer {
  public:
   ~Dense();
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Dense"; }
+  // const std::string layer_type() const override { return "Dense"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/dropout.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.cc b/src/model/layer/dropout.cc
index 0a4b1df..35801b4 100644
--- a/src/model/layer/dropout.cc
+++ b/src/model/layer/dropout.cc
@@ -20,7 +20,7 @@
 #include "./dropout.h"
 namespace singa {
 
-RegisterLayerClass(Dropout);
+RegisterLayerClass(singa_dropout, Dropout);
 void Dropout::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
   dropout_ratio_ = conf.dropout_conf().dropout_ratio();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/dropout.h
----------------------------------------------------------------------
diff --git a/src/model/layer/dropout.h b/src/model/layer/dropout.h
index 1a4bdbf..711c86b 100644
--- a/src/model/layer/dropout.h
+++ b/src/model/layer/dropout.h
@@ -26,7 +26,7 @@ namespace singa {
 class Dropout : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Dropout"; }
+  // const std::string layer_type() const override { return "Dropout"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/flatten.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.cc b/src/model/layer/flatten.cc
index e7d8fa0..d89361e 100644
--- a/src/model/layer/flatten.cc
+++ b/src/model/layer/flatten.cc
@@ -20,7 +20,7 @@
 #include "./flatten.h"
 namespace singa {
 
-RegisterLayerClass(Flatten);
+RegisterLayerClass(singa_flatten, Flatten);
 void Flatten::Setup(const Shape& in_sample, const LayerConf &conf) {
   Layer::Setup(in_sample, conf);
   axis_ = conf.flatten_conf().axis();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/flatten.h
----------------------------------------------------------------------
diff --git a/src/model/layer/flatten.h b/src/model/layer/flatten.h
index 6ac90c2..8bbf481 100644
--- a/src/model/layer/flatten.h
+++ b/src/model/layer/flatten.h
@@ -26,7 +26,7 @@ namespace singa {
 class Flatten : public Layer {
  public:
   /// \copydoc Layer::layer_type();
-  const std::string layer_type() const override { return "Flatten"; }
+  // const std::string layer_type() const override { return "Flatten"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/lrn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/lrn.cc b/src/model/layer/lrn.cc
index a624147..6b5a618 100644
--- a/src/model/layer/lrn.cc
+++ b/src/model/layer/lrn.cc
@@ -22,7 +22,7 @@
 #include <vector>
 
 namespace singa {
-RegisterLayerClass(LRN);
+RegisterLayerClass(singa_lrn, LRN);
 void LRN::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
   out_sample_shape_ = in_sample;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/lrn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/lrn.h b/src/model/layer/lrn.h
index 0632f8c..57e26ba 100644
--- a/src/model/layer/lrn.h
+++ b/src/model/layer/lrn.h
@@ -27,9 +27,7 @@ namespace singa {
 class LRN : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override {
-    return "LRN";
-  }
+  // const std::string layer_type() const override { return "LRN"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc
index 943f9b2..5e7ba1d 100644
--- a/src/model/layer/pooling.cc
+++ b/src/model/layer/pooling.cc
@@ -20,7 +20,7 @@
 #include "singa/model/layer.h"
 namespace singa {
 
-RegisterLayerClass(Pooling);
+RegisterLayerClass(singa_pooling, Pooling);
 void Pooling::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
   PoolingConf pool_conf = conf.pooling_conf();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/pooling.h
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h
index 6df292a..f844799 100644
--- a/src/model/layer/pooling.h
+++ b/src/model/layer/pooling.h
@@ -28,7 +28,7 @@ namespace singa {
 class Pooling : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Pooling"; }
+  // const std::string layer_type() const override { return "Pooling"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/prelu.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.cc b/src/model/layer/prelu.cc
index 421bcaa..a20972c 100644
--- a/src/model/layer/prelu.cc
+++ b/src/model/layer/prelu.cc
@@ -20,7 +20,7 @@
 #include "./prelu.h"
 namespace singa {
 
-RegisterLayerClass(PReLU);
+RegisterLayerClass(singa_prelu, PReLU);
 void PReLU::Setup(const Shape& in_sample, const LayerConf &conf) {
   Layer::Setup(in_sample, conf);
   out_sample_shape_ = in_sample;
@@ -82,7 +82,7 @@ const std::pair<Tensor, vector<Tensor> > PReLU::Backward(int flag,
   Tensor da;
   da.ResetLike(a_);
   if (!channel_shared_) {
-    size_t n, c, h, w;
+    size_t n = 0, c = 0, h = 0, w = 0;
     Tensor temp1 = (input <= 0.f);
     if (temp1.nDim() == 4) {
       if (format_ == "NCHW") {

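In PReLU::Backward the n, c, h, w sizes are now zero-initialized instead of left indeterminate, since they are only assigned inside the NCHW branch shown here and its NHWC counterpart. A small sketch of the guarded, zero-defaulted assignment (the Dims helper and shape layout are illustrative only, not SINGA code):

    #include <cstddef>
    #include <string>
    #include <vector>

    struct Dims { size_t n = 0, c = 0, h = 0, w = 0; };

    Dims GetDims(const std::vector<size_t>& shape, const std::string& format) {
      Dims d;  // stays all-zero unless a recognized 4-D format fills it in
      if (shape.size() == 4) {
        if (format == "NCHW") {
          d.n = shape[0]; d.c = shape[1]; d.h = shape[2]; d.w = shape[3];
        } else if (format == "NHWC") {
          d.n = shape[0]; d.h = shape[1]; d.w = shape[2]; d.c = shape[3];
        }
      }
      return d;
    }

    int main() { return GetDims({2, 3, 4, 5}, "NCHW").c == 3 ? 0 : 1; }
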
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/prelu.h
----------------------------------------------------------------------
diff --git a/src/model/layer/prelu.h b/src/model/layer/prelu.h
index 70a9dcf..3041d1e 100644
--- a/src/model/layer/prelu.h
+++ b/src/model/layer/prelu.h
@@ -27,7 +27,7 @@ namespace singa {
 class PReLU : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "PReLU"; }
+  //  const std::string layer_type() const override { return "PReLU"; }
 
 
   /// \copydoc Layer::Setup(const LayerConf&);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/rnn.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/rnn.cc b/src/model/layer/rnn.cc
index 424c20b..524b462 100644
--- a/src/model/layer/rnn.cc
+++ b/src/model/layer/rnn.cc
@@ -22,7 +22,7 @@
 #include "singa/utils/string.h"
 
 namespace singa {
-
+RegisterLayerClass(singa_rnn, RNN);
 void RNN::Setup(const Shape& in_sample, const LayerConf &conf) {
   Layer::Setup(in_sample, conf);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/rnn.h
----------------------------------------------------------------------
diff --git a/src/model/layer/rnn.h b/src/model/layer/rnn.h
index 1b5dad7..3369a00 100644
--- a/src/model/layer/rnn.h
+++ b/src/model/layer/rnn.h
@@ -35,7 +35,7 @@ namespace singa {
 class RNN : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "RNN"; }
+  // const std::string layer_type() const override { return "RNN"; }
 
   /// Setup the RNN layer.
   /// in_shape is the shape of a single training instance from one timestep,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/softmax.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.cc b/src/model/layer/softmax.cc
index 6b1785c..6a49131 100644
--- a/src/model/layer/softmax.cc
+++ b/src/model/layer/softmax.cc
@@ -19,7 +19,7 @@
 #include "./softmax.h"
 namespace singa {
 
-RegisterLayerClass(Softmax);
+RegisterLayerClass(singa_softmax, Softmax);
 void Softmax::Setup(const Shape& in_sample, const LayerConf& conf) {
   Layer::Setup(in_sample, conf);
   CHECK_EQ(in_sample.size(), 1u);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/model/layer/softmax.h
----------------------------------------------------------------------
diff --git a/src/model/layer/softmax.h b/src/model/layer/softmax.h
index 837b23a..cf71587 100644
--- a/src/model/layer/softmax.h
+++ b/src/model/layer/softmax.h
@@ -24,7 +24,7 @@ namespace singa {
 class Softmax : public Layer {
  public:
   /// \copydoc Layer::layer_type()
-  const std::string layer_type() const override { return "Softmax"; }
+  // const std::string layer_type() const override { return "Softmax"; }
 
   /// \copydoc Layer::Setup(const LayerConf&);
   void Setup(const Shape& in_sample, const LayerConf& conf) override;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05720c21/src/python/singa/device.py
----------------------------------------------------------------------
diff --git a/src/python/singa/device.py b/src/python/singa/device.py
index 3db90bf..aff3587 100644
--- a/src/python/singa/device.py
+++ b/src/python/singa/device.py
@@ -73,3 +73,16 @@ def create_cuda_gpus(num):
 
 def create_cuda_gpu():
     return singa.Platform.CreateCudaGPUs(1)[0]
+
+
+def create_cuda_gpus_on(device_ids):
+    return singa.Platform.CreateCudaGPUsOn(device_ids)
+
+
+def create_cuda_gpu_on(device_id):
+    devices = create_cuda_gpus_on([device_id])
+    return devices[0]
+
+
+def get_default_device():
+    return singa.Platform.GetDefaultDevice()
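
The three new Python helpers wrap Platform calls for picking specific CUDA devices and for the platform's default (host) device. A short usage sketch, assuming a CUDA-enabled SINGA build and that GPU 0 exists:

    # Usage sketch only; assumes a CUDA-enabled build and that GPU 0 is present.
    from singa import device

    host = device.get_default_device()         # platform default device
    gpu0 = device.create_cuda_gpu_on(0)        # one GPU, selected by id
    gpus = device.create_cuda_gpus_on([0])     # list of GPUs, selected by id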