Posted to commits@singa.apache.org by wa...@apache.org on 2015/08/28 11:28:47 UTC
[1/2] incubator-singa git commit: SINGA-9 Add Support for Restricted Boltzmann Machine (RBM) model
Repository: incubator-singa
Updated Branches:
refs/heads/master 6afa895b8 -> fbbcaafdb
SINGA-9 Add Support for Restricted Boltzmann Machine (RBM) model
This is to implement RBM in SINGA.
To train RBM models, the Contrastive Divergence (CD) algorithm is implemented.
We had already implemented a BPWorker to run the Back-Propagation algorithm. The CD algorithm is implemented in the same way,
with a CDWorker whose RunOneBatch function controls the logic of the CD algorithm, including the positive phase,
the negative phase and the gradient-computation phase (a minimal sketch of this control flow is given below). RBM layers differ from the layers of feed-forward neural networks,
hence new layers for RBM models are added.
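For readers skimming the patch below, the control flow described above can be summarised by the following minimal sketch. It is illustrative only: the member name train_net_ is an assumption (the phase functions in this patch take the network as a shared_ptr<NeuralNet> argument), and the method is declared as TrainOneBatch in include/trainer/worker.h (the description above calls it RunOneBatch); the authoritative code is in src/trainer/worker.cc further down.

  // Hedged sketch of one CD training iteration; see CDWorker in
  // include/trainer/worker.h and src/trainer/worker.cc for the real code.
  void CDWorker::TrainOneBatch(int step, Metric* perf) {
    // positive phase: clamp a data mini-batch and compute hidden activations
    PositivePhase(step, train_net_, perf);
    // negative phase: Gibbs sampling restricted to the RBMVis/RBMHid layers
    NegativePhase(step, train_net_, perf);
    // gradient phase: positive statistics minus negative statistics,
    // then update the shared weight and the two bias parameters
    GradientPhase(step, train_net_);
    // loss phase: report the reconstruction error for monitoring
    LossPhase(step, train_net_, perf);
  }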
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ef4de796
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ef4de796
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ef4de796
Branch: refs/heads/master
Commit: ef4de796303550b1f3e31fd2fddd9eb831db2b06
Parents: 6afa895
Author: zhaojing <zh...@comp.nus.edu.sg>
Authored: Sun Aug 16 15:42:20 2015 +0800
Committer: zhaojing <zh...@comp.nus.edu.sg>
Committed: Thu Aug 20 16:56:03 2015 +0800
----------------------------------------------------------------------
examples/rbm/autoencoder.conf | 299 +++++++++++++++++++++++++++++++++++++
examples/rbm/rbm0.conf | 103 +++++++++++++
examples/rbm/rbm1.conf | 135 +++++++++++++++++
examples/rbm/rbm2.conf | 167 +++++++++++++++++++++
examples/rbm/rbm3.conf | 198 ++++++++++++++++++++++++
include/neuralnet/layer.h | 57 ++++++-
include/trainer/worker.h | 6 +-
src/driver.cc | 6 +-
src/neuralnet/layer.cc | 191 +++++++++++++++++------
src/proto/job.proto | 21 ++-
src/trainer/worker.cc | 22 ++-
11 files changed, 1133 insertions(+), 72 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/examples/rbm/autoencoder.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf
new file mode 100644
index 0000000..9575323
--- /dev/null
+++ b/examples/rbm/autoencoder.conf
@@ -0,0 +1,299 @@
+name: "deep-big-simple-mlp"
+train_steps: 12200
+test_steps:100
+test_freq:100
+disp_freq:20
+checkpoint_after: 1000
+checkpoint_freq: 1000
+checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/checkpoint/rbm3/checkpoint/step6000-worker0.bin"
+alg: kBP
+updater{
+ type: kAdaGrad
+ learning_rate{
+ base_lr: 0.01
+ type: kFixed
+ }
+}
+
+neuralnet {
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_train_shard"
+ batchsize: 1000
+ }
+ exclude: kTest
+ }
+
+ layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_test_shard"
+ batchsize: 1000
+ }
+ exclude: kTrain
+ }
+
+ layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a: 255
+ norm_b: 0
+ }
+ }
+
+ layer{
+ name: "label"
+ type: kLabel
+ srclayers: "data"
+ }
+
+ layer{
+ name: "fc1"
+ type: kInnerProduct
+ srclayers:"mnist"
+ innerproduct_conf{
+ num_output: 1000
+ }
+ param{
+ name: "w1"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb12"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid1"
+ type: kSigmoid
+ srclayers:"fc1"
+ }
+ layer{
+ name: "fc2"
+ type: kInnerProduct
+ srclayers:"sigmoid1"
+ innerproduct_conf{
+ num_output: 500
+ }
+ param{
+ name: "w2"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb22"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid2"
+ type: kSigmoid
+ srclayers:"fc2"
+ }
+
+ layer{
+ name: "fc3"
+ type: kInnerProduct
+ srclayers:"sigmoid2"
+ innerproduct_conf{
+ num_output: 250
+ }
+ param{
+ name: "w3"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb32"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid3"
+ type: kSigmoid
+ srclayers:"fc3"
+ }
+
+ layer{
+ name: "fc4"
+ type: kInnerProduct
+ srclayers:"sigmoid3"
+ innerproduct_conf{
+ num_output: 30
+ }
+ param{
+ name: "w4"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb42"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "fc5"
+ type: kInnerProduct
+ #srclayers:"sigmoid4"
+ srclayers:"fc4"
+ innerproduct_conf{
+ num_output: 250
+ transpose: true
+ }
+ param{
+ name: "w5"
+ share_from: "w4"
+ }
+ param{
+ name: "rb41"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid5"
+ type: kSigmoid
+ srclayers:"fc5"
+ }
+ layer{
+ name: "fc6"
+ type: kInnerProduct
+ srclayers:"sigmoid5"
+ innerproduct_conf{
+ num_output: 500
+ transpose: true
+ }
+ param{
+ name: "w6"
+ share_from: "w3"
+ }
+ param{
+ name: "rb31"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+
+ }
+
+ layer{
+ name: "sigmoid6"
+ type: kSigmoid
+ srclayers:"fc6"
+ }
+ layer{
+ name: "fc7"
+ type: kInnerProduct
+ srclayers:"sigmoid6"
+ innerproduct_conf{
+ num_output: 1000
+ transpose: true
+ }
+ param{
+ name: "w7"
+ share_from: "w2"
+ }
+ param{
+ name: "rb21"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+
+ }
+
+ layer{
+ name: "sigmoid7"
+ type: kSigmoid
+ srclayers:"fc7"
+ }
+ layer{
+ name: "fc8"
+ type: kInnerProduct
+ srclayers:"sigmoid7"
+ innerproduct_conf{
+ num_output: 784
+ transpose: true
+ }
+ param{
+ name: "w8"
+ share_from: "w1"
+ }
+ param{
+ name: "rb11"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+
+ }
+
+ layer{
+ name: "sigmoid8"
+ type: kSigmoid
+ srclayers:"fc8"
+ }
+
+ layer{
+ name: "loss"
+ type:kEuclideanLoss
+ srclayers:"sigmoid8"
+ srclayers:"mnist"
+ }
+}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ workspace: "examples/rbm/checkpoint/autoencoder/"
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/examples/rbm/rbm0.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm0.conf b/examples/rbm/rbm0.conf
new file mode 100644
index 0000000..ef8653f
--- /dev/null
+++ b/examples/rbm/rbm0.conf
@@ -0,0 +1,103 @@
+name: "deep-big-simple-dbm"
+train_steps: 6000
+test_steps:100
+test_freq:100
+disp_freq: 100
+alg: kCD
+checkpoint_after: 500
+checkpoint_freq: 1000
+updater{
+ type: kSGD
+ momentum: 0.9
+ weight_decay: 0.0002
+ learning_rate{
+ base_lr: 0.1
+ type: kFixed
+ }
+}
+
+neuralnet {
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_train_shard"
+ batchsize: 100
+ }
+ exclude: kTest
+}
+
+
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
+}
+
+
+layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a: 255
+ norm_b: 0
+ }
+}
+
+layer{
+ name: "RBMVis"
+ type: kRBMVis
+ srclayers:"mnist"
+ srclayers:"RBMHid"
+ rbmvis_conf{
+ num_output: 1000
+ }
+ param{
+ name: "w1"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
+ }
+ param{
+ name: "rb11"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+
+layer{
+ name: "RBMHid"
+ type: kRBMHid
+ srclayers:"RBMVis"
+ rbmhid_conf{
+ hid_dim: 1000
+ }
+ param{
+ name: "w1_1"
+ share_from: "w1"
+ }
+ param{
+ name: "rb12"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ nservers_per_group: 1
+ nworkers_per_group: 1
+ workspace: "examples/rbm/checkpoint/rbm0/"
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/examples/rbm/rbm1.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf
new file mode 100644
index 0000000..f9b4974
--- /dev/null
+++ b/examples/rbm/rbm1.conf
@@ -0,0 +1,135 @@
+name: "deep-big-simple-dbm"
+train_steps: 6000
+test_steps:100
+test_freq:500
+disp_freq: 100
+alg: kCD
+checkpoint_after: 500
+checkpoint_freq: 1000
+checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin"
+updater{
+ type: kSGD
+ momentum: 0.9
+ weight_decay: 0.0002
+ learning_rate{
+ base_lr: 0.1
+ type: kFixed
+ }
+}
+
+neuralnet {
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_train_shard"
+ batchsize: 100
+ }
+ exclude: kTest
+}
+
+
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
+}
+
+
+layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a: 255
+ norm_b: 0
+ }
+}
+
+layer{
+ name: "fc1"
+ type: kInnerProduct
+ srclayers:"mnist"
+ innerproduct_conf{
+ num_output: 1000
+ }
+ param{
+ name: "w1"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb12"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid1"
+ type: kSigmoid
+ srclayers:"fc1"
+ }
+
+layer{
+ name: "RBMVis"
+ type: kRBMVis
+ srclayers:"sigmoid1"
+ srclayers:"RBMHid"
+ rbmvis_conf{
+ num_output: 500
+ }
+ param{
+ name: "w2"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
+ }
+ param{
+ name: "rb21"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+
+layer{
+ name: "RBMHid"
+ type: kRBMHid
+ srclayers:"RBMVis"
+ rbmhid_conf{
+ hid_dim: 500
+ }
+ param{
+ name: "w2_1"
+ share_from: "w2"
+ }
+ param{
+ name: "rb22"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ nservers_per_group: 1
+ nworkers_per_group: 1
+ workspace: "examples/rbm/checkpoint/rbm1/"
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/examples/rbm/rbm2.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf
new file mode 100644
index 0000000..6629481
--- /dev/null
+++ b/examples/rbm/rbm2.conf
@@ -0,0 +1,167 @@
+name: "deep-big-simple-dbm"
+train_steps: 6000
+test_steps:100
+test_freq:100
+disp_freq: 100
+alg: kCD
+checkpoint_after: 500
+checkpoint_freq: 1000
+checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin"
+
+updater{
+ type: kSGD
+ momentum: 0.9
+ weight_decay: 0.0002
+ learning_rate{
+ base_lr: 0.1
+ type: kFixed
+ }
+}
+
+
+neuralnet {
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_train_shard"
+ batchsize: 100
+ }
+ exclude: kTest
+}
+
+
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
+}
+
+
+layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a: 255
+ norm_b: 0
+ }
+}
+
+layer{
+ name: "fc1"
+ type: kInnerProduct
+ srclayers:"mnist"
+ innerproduct_conf{
+ num_output: 1000
+ }
+ param{
+ name: "w1"
+ init {
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb12"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid1"
+ type: kSigmoid
+ srclayers:"fc1"
+ }
+
+layer{
+ name: "fc2"
+ type: kInnerProduct
+ srclayers:"sigmoid1"
+ innerproduct_conf{
+ num_output: 500
+ }
+ param{
+ name: "w2"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb22"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid2"
+ type: kSigmoid
+ srclayers:"fc2"
+ }
+layer{
+ name: "RBMVis"
+ type: kRBMVis
+ srclayers:"sigmoid2"
+ srclayers:"RBMHid"
+ rbmvis_conf{
+ num_output: 250
+ }
+ param{
+ name: "w3"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
+ }
+ param{
+ name: "rb31"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+
+layer{
+ name: "RBMHid"
+ type: kRBMHid
+ srclayers:"RBMVis"
+ rbmhid_conf{
+ hid_dim: 250
+ }
+ param{
+ name: "w3_1"
+ share_from: "w3"
+ }
+ param{
+ name: "rb32"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ nservers_per_group: 1
+ nworkers_per_group: 1
+ workspace: "examples/rbm/checkpoint/rbm2/"
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/examples/rbm/rbm3.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf
new file mode 100644
index 0000000..482c5e7
--- /dev/null
+++ b/examples/rbm/rbm3.conf
@@ -0,0 +1,198 @@
+name: "deep-big-simple-dbm"
+train_steps: 6000
+test_steps: 100
+test_freq: 100
+disp_freq: 100
+alg: kCD
+checkpoint_after: 500
+checkpoint_freq: 1000
+checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin"
+updater{
+ type: kSGD
+ momentum: 0.9
+ weight_decay: 0.0002
+ learning_rate{
+ base_lr: 0.001
+ type: kFixed
+ }
+}
+
+neuralnet {
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_train_shard"
+ batchsize: 100
+ }
+ exclude: kTest
+}
+
+
+layer {
+ name: "data"
+ type: kShardData
+ sharddata_conf {
+ path: "examples/rbm/mnist_test_shard"
+ batchsize: 100
+ }
+ exclude: kTrain
+}
+
+
+layer{
+ name:"mnist"
+ type: kMnist
+ srclayers: "data"
+ mnist_conf {
+ norm_a: 255
+ norm_b: 0
+ }
+}
+
+layer{
+ name: "fc1"
+ type: kInnerProduct
+ srclayers:"mnist"
+ innerproduct_conf{
+ num_output: 1000
+ }
+ param{
+ name: "w1"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb12"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid1"
+ type: kSigmoid
+ srclayers:"fc1"
+ }
+
+layer{
+ name: "fc2"
+ type: kInnerProduct
+ srclayers:"sigmoid1"
+ innerproduct_conf{
+ num_output: 500
+ }
+ param{
+ name: "w2"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb22"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid2"
+ type: kSigmoid
+ srclayers:"fc2"
+ }
+
+layer{
+ name: "fc3"
+ type: kInnerProduct
+ srclayers:"sigmoid2"
+ innerproduct_conf{
+ num_output: 250
+ }
+ param{
+ name: "w3"
+ init{
+ type: kUniform
+ low:-0.05
+ high:0.05
+ }
+ }
+ param{
+ name: "rb32"
+ init{
+ type: kUniform
+ low: -0.05
+ high:0.05
+ }
+ }
+ }
+
+ layer{
+ name: "sigmoid3"
+ type: kSigmoid
+ srclayers:"fc3"
+ }
+
+layer{
+ name: "RBMVis"
+ type: kRBMVis
+ srclayers:"sigmoid3"
+ srclayers:"RBMHid"
+ rbmvis_conf{
+ num_output: 30
+ }
+ param{
+ name: "w4"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
+ }
+ param{
+ name: "rb41"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+
+layer{
+ name: "RBMHid"
+ type: kRBMHid
+ srclayers:"RBMVis"
+ rbmhid_conf{
+ hid_dim: 30
+ gaussian: true
+ }
+ param{
+ name: "w4_1"
+ share_from: "w4"
+ }
+ param{
+ name: "rb42"
+ init{
+ type: kConstant
+ value: 0.0
+ }
+ }
+}
+}
+cluster {
+ nworker_groups: 1
+ nserver_groups: 1
+ nservers_per_group: 1
+ nworkers_per_group: 1
+ workspace: "examples/rbm/checkpoint/rbm3/"
+}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 118da56..b1fbbb0 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -69,8 +69,8 @@ class DropoutLayer: public Layer {
Blob<float> mask_;
};
/**
- * RBM visible layer
- */
+ * RBM visible layer
+ */
class RBMVisLayer: public Layer {
public:
using Layer::ComputeFeature;
@@ -108,6 +108,7 @@ class RBMVisLayer: public Layer {
~RBMVisLayer();
+
private:
//! dimension of the hidden layer
int hdim_;
@@ -127,8 +128,8 @@ class RBMVisLayer: public Layer {
// in order to implement Persistent Contrastive Divergence,
};
/**
- * RBM hidden layer
- */
+ * RBM hidden layer
+ */
class RBMHidLayer: public Layer {
public:
using Layer::ComputeFeature;
@@ -154,7 +155,7 @@ class RBMHidLayer: public Layer {
return data_;
else
return hid_sample_;
- }
+ }
const vector<Param*> GetParams() const override {
vector<Param*> params{weight_, bias_};
return params;
@@ -169,6 +170,8 @@ class RBMHidLayer: public Layer {
// batchsize of negative phase
int neg_batchsize_;
float scale_;
+ // whether use gaussian sampling
+ bool gaussian_;
Blob<float> hid_sample_;
Param* weight_, *bias_;
};
@@ -184,7 +187,6 @@ class InnerProductLayer: public Layer {
void ComputeFeature(Phase phase, Metric *perf) override;
void ComputeGradient(Phase phase) override;
-
ConnectionType src_neuron_connection(int k) const override {
// CHECK_LT(k, srclayers_.size());
return kOneToAll;
@@ -201,6 +203,7 @@ class InnerProductLayer: public Layer {
//! dimension of the visible layer
int vdim_;
int batchsize_;
+ bool transpose_;
Param* weight_, *bias_;
};
@@ -246,7 +249,9 @@ class MnistLayer: public ParserLayer {
void Setup(const LayerProto& proto, int npartitions) override;
void ParseRecords(Phase phase, const vector<Record>& records,
Blob<float>* blob) override;
-
+ ConnectionType dst_layer_connection() const override {
+ return kOneToMany;
+ }
protected:
// height and width of the image after deformation
// kernel size for elastic distortion
@@ -283,6 +288,29 @@ class ReLULayer: public Layer {
void ComputeGradient(Phase phase) override;
};
+class EuclideanLossLayer: public LossLayer {
+ public:
+ using Layer::ComputeFeature;
+ using Layer::ComputeGradient;
+
+ void Setup(const LayerProto& proto, int npartitions) override;
+ void ComputeFeature(Phase phase, Metric *perf) override;
+ void ComputeGradient(Phase phase) override;
+
+
+ int partition_dim() const override {
+ CHECK_LE(layer_proto_.partition_dim(), 1);
+ return layer_proto_.partition_dim();
+ }
+ ConnectionType src_neuron_connection(int k) const override {
+ // CHECK_LT(k, srclayers_.size());
+ return kOneToAll;
+ }
+
+ private:
+ int batchsize_;
+ int dim_;
+};
class SoftmaxLossLayer: public LossLayer {
/*
@@ -344,6 +372,21 @@ class ShardDataLayer: public DataLayer{
};
/**
+ * This layer apply Sigmoid function to neuron activations.
+ * f(x)=1/(1+exp(-x))
+ * f'(x)=f(x)*(1-f(x))
+ */
+class SigmoidLayer: public Layer {
+ public:
+ using Layer::ComputeFeature;
+ using Layer::ComputeGradient;
+
+ void Setup(const LayerProto& proto, int npartitions) override;
+ void ComputeFeature(Phase phase, Metric *perf) override;
+ void ComputeGradient(Phase phase) override;
+};
+
+/**
* This layer apply Tan function to neuron activations.
* f(x)=A tanh(Bx)
* f'(x)=B/A (A*A-f(x)*f(x))
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
index cc5a745..86b1c90 100644
--- a/include/trainer/worker.h
+++ b/include/trainer/worker.h
@@ -193,10 +193,10 @@ class BPWorker: public Worker{
class CDWorker: public Worker{
public:
~CDWorker() {}
- void Init(int thread_id, int group_id, int worker_id) override;
+ void Init(int thread_id, int grp_id, int id) override;
void TrainOneBatch(int step, Metric* perf) override;
- void TestOneBatch(int step, Phase phase,
- shared_ptr<NeuralNet> net, Metric* perf) override;
+ void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
+ Metric* perf) override;
void PositivePhase(int step, shared_ptr<NeuralNet> net, Metric* perf);
void NegativePhase(int step, shared_ptr<NeuralNet> net, Metric* perf);
void GradientPhase(int step, shared_ptr<NeuralNet> net);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index 1bc712d..e5045a3 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -31,21 +31,23 @@ void Driver::Init(int argc, char **argv) {
RegisterLayer<ConvolutionLayer, int>(kConvolution);
RegisterLayer<ConcateLayer, int>(kConcate);
RegisterLayer<DropoutLayer, int>(kDropout);
+ RegisterLayer<EuclideanLossLayer, int>(kEuclideanLoss);
RegisterLayer<InnerProductLayer, int>(kInnerProduct);
RegisterLayer<LabelLayer, int>(kLabel);
RegisterLayer<LRNLayer, int>(kLRN);
RegisterLayer<MnistLayer, int>(kMnist);
RegisterLayer<PrefetchLayer, int>(kPrefetch);
RegisterLayer<PoolingLayer, int>(kPooling);
+ RegisterLayer<RBMHidLayer, int>(kRBMHid);
+ RegisterLayer<RBMVisLayer, int>(kRBMVis);
RegisterLayer<RGBImageLayer, int>(kRGBImage);
RegisterLayer<ReLULayer, int>(kReLU);
RegisterLayer<ShardDataLayer, int>(kShardData);
+ RegisterLayer<SigmoidLayer, int>(kSigmoid);
RegisterLayer<SliceLayer, int>(kSlice);
RegisterLayer<SoftmaxLossLayer, int>(kSoftmaxLoss);
RegisterLayer<SplitLayer, int>(kSplit);
RegisterLayer<TanhLayer, int>(kTanh);
- RegisterLayer<RBMVisLayer, int>(kRBMVis);
- RegisterLayer<RBMHidLayer, int>(kRBMHid);
#ifdef USE_LMDB
RegisterLayer<LMDBDataLayer, int>(kLMDBData);
#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index ae45ae8..b5c986e 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -163,6 +163,7 @@ RBMVisLayer::~RBMVisLayer() {
delete weight_;
delete bias_;
}
+
void RBMVisLayer::Setup(const LayerProto& proto,
int npartitions) {
Layer::Setup(proto, npartitions);
@@ -188,7 +189,7 @@ void RBMVisLayer::Setup(const LayerProto& proto,
vis_sample_.Reshape(vector<int>{neg_batchsize_, vdim_});
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
+ weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
bias_->Setup(proto.param(1), vector<int>{vdim_});
}
@@ -199,24 +200,15 @@ void RBMVisLayer::ComputeFeature(Phase phase, Metric* perf) {
auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this));
Copy(data, src);
} else if (phase == kNegative) { /*negative phase*/
- if (is_first_iteration_vis_) {
- CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_);
- auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this));
- auto vis_sample = Tensor2(&vis_sample_);
- Copy(vis_sample, src);
- is_first_iteration_vis_ = false;
- } else {
- auto hid_sample =
- Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative));
- // fetch sampling results from hidden layer
- auto vis_sample = Tensor2(&vis_sample_);
- auto weight = Tensor2(weight_->mutable_data());
- auto bias = Tensor1(bias_->mutable_data());
- vis_sample = dot(hid_sample, weight.T());
- vis_sample+=repmat(bias, neg_batchsize_);
- vis_sample = F<op::sigmoid>(vis_sample);
- TSingleton<Random<cpu>>::Instance()->SampleBinary(vis_sample);
- }
+ auto hid_sample =
+ Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative));
+ // fetch sampling results from hidden layer
+ auto vis_sample = Tensor2(&vis_sample_);
+ auto weight = Tensor2(weight_->mutable_data());
+ auto bias = Tensor1(bias_->mutable_data());
+ vis_sample = dot(hid_sample, weight);
+ vis_sample+=repmat(bias, neg_batchsize_);
+ vis_sample = F<op::sigmoid>(vis_sample);
}
}
@@ -231,14 +223,14 @@ void RBMVisLayer::ComputeGradient(Phase phase) {
auto gbias = Tensor1(bias_->mutable_grad());
gbias = sum_rows(vis_sample);
gbias -= sum_rows(data);
- gweight = dot(vis_sample.T(), hid_sample);
- gweight -= dot(data.T(), hid_data);
+ gweight = dot(hid_sample.T(), vis_sample);
+ gweight -= dot(hid_data.T(), data);
gbias*=(1.0f)/(1.0f*batchsize_);
gweight*=(1.0f)/(1.0f*batchsize_);
}
void RBMVisLayer::ComputeLoss(Metric* perf) {
- float loss = (0.0f);
+ float loss_sqr = (0.0f);
CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_);
auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this));
auto hid_data = Tensor2(srclayers_[hid_idx_]->mutable_data(this, kPositive));
@@ -247,24 +239,26 @@ void RBMVisLayer::ComputeLoss(Metric* perf) {
auto bias = Tensor1(bias_->mutable_data());
Tensor<cpu, 2> reconstruct(Shape2(batchsize_, vdim_)); /*reconstruct error*/
AllocSpace(reconstruct);
- reconstruct = dot(hid_data, weight.T());
+ reconstruct = dot(hid_data, weight);
reconstruct+=repmat(bias, batchsize_);
reconstruct = F<op::sigmoid>(reconstruct);
float *src_dptr = src.dptr;
- float *reconstruct_dptr = reconstruct.dptr;
- for (int i = 0; i < vdim_*batchsize_; i++)
- loss += -(src_dptr[i]*log(reconstruct_dptr[i])
- +(1-src_dptr[i])*log(1-reconstruct_dptr[i]));
- loss/=batchsize_;
+ for (int i = 0; i < vdim_*batchsize_; i++) {
+ int recon_row = i / vdim_;
+ int recon_col = i - recon_row * vdim_;
+ loss_sqr += (src_dptr[i] - reconstruct[recon_row][recon_col]) *
+ (src_dptr[i] - reconstruct[recon_row][recon_col]);
+ }
FreeSpace(reconstruct);
perf->Reset();
- perf->Add("reconstruct_error", loss);
+ perf->Add("sqr_reconstruct_error", loss_sqr);
}
/**************** Implementation for RBMHidLayer********************/
RBMHidLayer::~RBMHidLayer() {
delete weight_;
delete bias_;
}
+
void RBMHidLayer::Setup(const LayerProto& proto,
int npartitions) {
Layer::Setup(proto, npartitions);
@@ -276,24 +270,44 @@ void RBMHidLayer::Setup(const LayerProto& proto,
neg_batchsize_ = src_sample.shape()[0];
vdim_ = src_data.count()/batchsize_;
hdim_ = proto.rbmhid_conf().hid_dim();
+ gaussian_ = proto.rbmhid_conf().gaussian();
data_.Reshape(vector<int>{batchsize_, hdim_});
hid_sample_.Reshape(vector<int>{neg_batchsize_, hdim_});
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
bias_->Setup(proto.param(1), vector<int>{hdim_});
+ weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
}
void RBMHidLayer::ComputeFeature(Phase phase, Metric* perf) {
if (phase == kPositive) { /*postive phase*/
auto data = Tensor2(&data_);
+
+ auto hid_sample = Tensor2(&hid_sample_);
+
CHECK_EQ(srclayers_[0]->data(this, kPositive).count(), batchsize_*vdim_);
auto src = Tensor2(srclayers_[0]->mutable_data(this, kPositive));
auto weight = Tensor2(weight_->mutable_data());
auto bias = Tensor1(bias_->mutable_data());
- data = dot(src, weight);
+ data = dot(src, weight.T());
data += repmat(bias, batchsize_);
- data = F<op::sigmoid>(data);
+
+ if (!gaussian_)
+ data = F<op::sigmoid>(data);
+
+ Copy(hid_sample, data);
+
+ if (gaussian_) { // first gibbs
+ Tensor<cpu, 2> gaussian_sample(Shape2(batchsize_, hdim_));
+ AllocSpace(gaussian_sample);
+ auto random = TSingleton<Random<cpu>>::Instance();
+ random->SampleGaussian(gaussian_sample, 0.0f, 1.0f);
+ hid_sample += gaussian_sample;
+ FreeSpace(gaussian_sample);
+ } else {
+ TSingleton<Random<cpu>>::Instance()->SampleBinary(hid_sample);
+ }
+
} else if (phase == kNegative) { /*negative phase*/
CHECK_EQ(srclayers_[0]->data(this, kNegative).count(),
neg_batchsize_*vdim_);
@@ -301,15 +315,25 @@ void RBMHidLayer::ComputeFeature(Phase phase, Metric* perf) {
auto hid_sample = Tensor2(&hid_sample_);
auto bias = Tensor1(bias_->mutable_data());
auto weight = Tensor2(weight_->mutable_data());
- hid_sample = dot(src_sample, weight);
+ hid_sample = dot(src_sample, weight.T());
hid_sample += repmat(bias, neg_batchsize_);
- hid_sample = F<op::sigmoid>(hid_sample);
- TSingleton<Random<cpu>>::Instance()->SampleBinary(hid_sample);
+ if (!gaussian_)
+ hid_sample = F<op::sigmoid>(hid_sample);
} else if (phase == kLoss) { /*test phase*/
- auto data = Tensor2(&data_); // data: sigmoid(Wv+b)
- TSingleton<Random<cpu>>::Instance()->SampleBinary(data);
+ auto data = Tensor2(&data_); // data: sigmoid(Wv+b)
+ if (gaussian_) {
+ Tensor<cpu, 2> gaussian_sample(Shape2(batchsize_, hdim_));
+ AllocSpace(gaussian_sample);
+ auto random = TSingleton<Random<cpu>>::Instance();
+ random->SampleGaussian(gaussian_sample, 0.0f, 1.0f);
+ data += gaussian_sample;
+ FreeSpace(gaussian_sample);
+ }
+ else
+ TSingleton<Random<cpu>>::Instance()->SampleBinary(data);
}
}
+
void RBMHidLayer::ComputeGradient(Phase phase) {
auto data = Tensor2(&data_);
auto hid_sample = Tensor2(&hid_sample_);
@@ -326,17 +350,21 @@ InnerProductLayer::~InnerProductLayer() {
void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
Layer::Setup(proto, npartitions);
CHECK_EQ(srclayers_.size(), 1);
- const auto& src=srclayers_[0]->data(this);
- batchsize_=src.shape()[0];
- vdim_=src.count()/batchsize_;
- hdim_=proto.innerproduct_conf().num_output();
- if(partition_dim()>0)
+ const auto& src = srclayers_[0]->data(this);
+ batchsize_ = src.shape()[0];
+ vdim_ = src.count()/batchsize_;
+ hdim_ = proto.innerproduct_conf().num_output();
+ transpose_ = proto.innerproduct_conf().transpose();
+ if (partition_dim() > 0)
hdim_ /= npartitions;
data_.Reshape(vector<int>{batchsize_, hdim_});
grad_.ReshapeLike(data_);
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
+ if (transpose_)
+ weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
+ else
+ weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
bias_->Setup(proto.param(1), vector<int>{hdim_});
}
@@ -345,7 +373,10 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
auto src = Tensor2(srclayers_[0]->mutable_data(this));
auto weight = Tensor2(weight_->mutable_data());
auto bias = Tensor1(bias_->mutable_data());
- data=dot(src, weight.T());
+ if (transpose_)
+ data = dot(src, weight);
+ else
+ data = dot(src, weight.T());
// repmat: repeat bias vector into batchsize rows
data+=repmat(bias, batchsize_);
}
@@ -357,11 +388,17 @@ void InnerProductLayer::ComputeGradient(Phase phas) {
auto gweight = Tensor2(weight_->mutable_grad());
auto gbias = Tensor1(bias_->mutable_grad());
- gbias=sum_rows(grad);
- gweight=dot(grad.T(), src);
- if(srclayers_[0]->mutable_grad(this)!=nullptr){
+ gbias = sum_rows(grad);
+ if (transpose_)
+ gweight = dot(src.T(), grad);
+ else
+ gweight = dot(grad.T(), src);
+ if (srclayers_[0]->mutable_grad(this) != nullptr) {
auto gsrc = Tensor2(srclayers_[0]->mutable_grad(this));
- gsrc=dot(grad, weight);
+ if (transpose_)
+ gsrc = dot(grad, weight.T());
+ else
+ gsrc = dot(grad, weight);
}
}
/*****************************************************************************
@@ -703,6 +740,25 @@ ShardDataLayer::~ShardDataLayer() {
delete shard_;
shard_ = nullptr;
}
+/*******************Implementation of SigmoidLayer***************************/
+void SigmoidLayer::Setup(const LayerProto& proto, int npartitions) {
+ Layer::Setup(proto, npartitions);
+ data_.ReshapeLike(srclayers_[0]->data(this));
+ grad_.ReshapeLike(srclayers_[0]->grad(this));
+}
+
+void SigmoidLayer::ComputeFeature(Phase phase, Metric* perf) {
+ auto data = Tensor1(&data_);
+ auto src = Tensor1(srclayers_[0]->mutable_data(this));
+ data = F<op::sigmoid>(src);
+}
+
+void SigmoidLayer::ComputeGradient(Phase phase) {
+ auto data = Tensor1(&data_);
+ auto grad = Tensor1(&grad_);
+ auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
+ gsrc = F<op::sigmoid_grad>(data)*grad;
+}
/*******************Implementation of TanLayer***************************/
void TanhLayer::Setup(const LayerProto& proto, int npartitions){
Layer::Setup(proto, npartitions);
@@ -722,6 +778,45 @@ void TanhLayer::ComputeGradient(Phase phase) {
auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
gsrc=F<op::stanh_grad>(data)*grad;
}
+/********** * Implementation for EuclideanLossLayer*************************/
+void EuclideanLossLayer::Setup(const LayerProto& proto, int npartitions) {
+ LossLayer::Setup(proto, npartitions);
+ CHECK_EQ(srclayers_.size(), 2);
+ data_.Reshape(srclayers_[0]->data(this).shape());
+ batchsize_ = data_.shape()[0];
+ dim_ = data_.count()/batchsize_;
+ metric_.Reshape(vector<int>{1});
+}
+void EuclideanLossLayer::ComputeFeature(Phase phase, Metric* perf) {
+ const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
+ const float* input_dptr = srclayers_[1]->data(this).cpu_data();
+ float loss = 0;
+ for (int n = 0; n < batchsize_; n++) {
+ for (int j = 0; j < dim_; ++j) {
+ loss += (input_dptr[j] - reconstruct_dptr[j]) *
+ (input_dptr[j] - reconstruct_dptr[j]);
+ }
+ reconstruct_dptr+=dim_;
+ input_dptr+=dim_;
+ }
+ CHECK_EQ(reconstruct_dptr,
+ srclayers_[0]->data(this).cpu_data() + (batchsize_*dim_));
+ CHECK_EQ(input_dptr,
+ srclayers_[1]->data(this).cpu_data() + (batchsize_*dim_));
+ perf->Add("loss", loss/(1.0f*batchsize_));
+}
+void EuclideanLossLayer::ComputeGradient(Phase phase) {
+ const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
+ const float* input_dptr = srclayers_[1]->data(this).cpu_data();
+ Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
+ float* gsrcptr = gsrcblob->mutable_cpu_data();
+ for (int n = 0; n < batchsize_; n++) {
+ for (int j = 0; j < dim_; j++)
+ gsrcptr[n*dim_+j]= 2 * (reconstruct_dptr[n*dim_+j]-input_dptr[n*dim_+j]);
+ }
+ Tensor<cpu, 1> gsrc(gsrcptr, Shape1(gsrcblob->count()));
+ gsrc*=1.0f/(1.0f*batchsize_);
+}
/********** * Implementation for SoftmaxLossLayer*************************/
void SoftmaxLossLayer::Setup(const LayerProto& proto, int npartitions) {
LossLayer::Setup(proto, npartitions);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index b4abe68..7f030ac 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -166,6 +166,8 @@ message LayerProto {
optional ConcateProto concate_conf = 31;
// configuration for dropout layer
optional DropoutProto dropout_conf = 33;
+ // configuration for euclideanloss layer
+ optional EuclideanLossProto euclideanloss_conf = 50;
// configuration for inner product layer
optional InnerProductProto innerproduct_conf = 34;
// configuration for local response normalization layer
@@ -178,6 +180,10 @@ message LayerProto {
optional PoolingProto pooling_conf = 37;
// configuration for prefetch layer
optional PrefetchProto prefetch_conf = 44;
+ // configuration for rbmhid layer
+ optional RBMHidProto rbmhid_conf = 49;
+ // configuration for rbmvis layer
+ optional RBMVisProto rbmvis_conf = 48;
// configuration for rectified linear unit layer
optional ReLUProto relu_conf = 38;
// configuration for rgb image parser layer
@@ -192,10 +198,7 @@ message LayerProto {
optional SplitProto split_conf = 42;
// configuration for tanh layer
optional TanhProto tanh_conf = 43;
- // configuration for rbmvis layer
- optional RBMVisProto rbmvis_conf = 48;
- // configuration for rbmhid layer
- optional RBMHidProto rbmhid_conf = 49;
+
// overrides the partition dimension for neural net
optional int32 partition_dim = 60 [default = -1];
@@ -299,6 +302,9 @@ message TanhProto {
optional float inner_scale = 2 [default = 1.0];
}
+message EuclideanLossProto {
+}
+
message SoftmaxLossProto {
// computing accuracy against topk results
optional int32 topk = 1 [default = 1];
@@ -367,6 +373,7 @@ message RBMVisProto {
message RBMHidProto {
optional int32 hid_dim = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
+ optional bool gaussian = 3 [default = false]; // use gaussian sampling or not
}
// Message that stores parameters used by InnerProductLayer
@@ -375,6 +382,8 @@ message InnerProductProto {
required int32 num_output = 1;
// use bias vector or not
optional bool bias_term = 30 [default = true];
+ // transpose or not
+ optional bool transpose = 31 [default = false];
}
message LRNProto {
@@ -524,12 +533,14 @@ enum LayerType {
kLRN = 6;
kPooling = 8;
kReLU = 9;
- kRBMHid = 24;
kRBMVis = 23;
+ kRBMHid = 24;
+ kSigmoid = 26;
kTanh = 14;
// Loss layers
// - Compute objective loss
kSoftmaxLoss = 11;
+ kEuclideanLoss = 25;
// Other layers
// - Connect layers when neural net is partitioned
kBridgeDst = 16;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ef4de796/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index e047367..f112b17 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -380,7 +380,6 @@ void BPWorker::TestOneBatch(int step, Phase phase,
shared_ptr<NeuralNet> net, Metric* perf) {
Forward(step, phase, net, perf);
}
-
/****************************CDWorker**********************************/
void CDWorker::Init(int thread_id, int group_id, int worker_id) {
Worker::Init(thread_id, group_id, worker_id);
@@ -389,8 +388,11 @@ void CDWorker::Init(int thread_id, int group_id, int worker_id) {
void CDWorker::PositivePhase(int step,
shared_ptr<NeuralNet> net, Metric* perf) {
auto& layers = net->layers();
+ // LOG(ERROR)<<"Positive Phase";
for (auto& layer : layers) {
- // clock_t s=clock();
+ for (Param* p : layer->GetParams()) { // wait until param is updated
+ Collect(p, step);
+ }
layer->ComputeFeature(kPositive, perf);
}
}
@@ -399,33 +401,39 @@ void CDWorker::NegativePhase(int step,
shared_ptr<NeuralNet> net, Metric* perf) {
// for negative phase, gibbs sampling only concerns RBM bottom and top layer
auto& layers = net->layers();
- for (int i = 0; i < job_conf_.cd_conf().pcd_k(); i++) {
+ // LOG(ERROR)<<"Negative Phase";
for (auto& layer : layers) {
- if (layer->is_vislayer() || layer->is_hidlayer())
+ if (layer->is_vislayer() || layer->is_hidlayer()) {
layer->ComputeFeature(kNegative, perf);
+ }
}
- }
}
void CDWorker::GradientPhase(int step, shared_ptr<NeuralNet> net) {
auto& layers = net->layers();
+ // LOG(ERROR)<<"Gradient Phase";
for (auto& layer : layers) {
+ if (layer->is_vislayer() || layer->is_hidlayer()) {
layer->ComputeGradient(kTrain);
for (Param* p : layer->GetParams()) {
Update(p, step);
}
+ }
}
}
void CDWorker::LossPhase(int step, shared_ptr<NeuralNet> net, Metric* perf) {
auto& layers = net->layers();
+ // LOG(ERROR)<<"Loss Phase";
for (auto& layer : layers) {
- if (layer->is_hidlayer())
+ if (layer->is_hidlayer()) {
layer->ComputeFeature(kLoss, perf);
+ }
}
for (auto& layer : layers) {
- if (layer->is_vislayer())
+ if (layer->is_vislayer()) {
layer->ComputeLoss(perf);
+ }
}
}
[2/2] incubator-singa git commit: SINGA-9 Add Support for Restricted Boltzmann Machine (RBM) model
Posted by wa...@apache.org.
SINGA-9 Add Support for Restricted Boltzmann Machine (RBM) model
* Refactor CDWorker::TrainOneBatch.
* Replace Phase with an int flag in the ComputeFeature and ComputeGradient
  functions. The flag can be a combination of multiple phases, e.g.,
  kTrain|kForward, where each phase is set to have only one non-zero bit,
  e.g., 1, 2, 4, 8, etc. (see the sketch after this list).
* Remove the compilation dependency on OpenCV.
* Refactor JobProto to create an AlgProto for TrainOneBatch.
* Create an RBMLayer as the base layer for RBM layers.
* Update the configurations of all examples.
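The bit-flag change mentioned above follows the usual bit-mask idiom: each phase owns one bit, so several phases can be OR-ed into a single int and tested independently. The sketch below is illustrative only; the enum values are placeholders, not the exact constants defined in src/proto/job.proto.

  // Illustrative sketch of combinable phase flags (placeholder values).
  enum Phase {
    kTrain    = 1 << 0,
    kTest     = 1 << 1,
    kForward  = 1 << 2,
    kBackward = 1 << 3
  };

  void ComputeFeatureExample(int flag) {
    // a call such as ComputeFeature(kTrain | kForward, perf) sets both bits
    if ((flag & kTrain) && (flag & kForward)) {
      // training-time forward pass
    }
  }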
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/fbbcaafd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/fbbcaafd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/fbbcaafd
Branch: refs/heads/master
Commit: fbbcaafdba3e885eab44c0dcfd23829c2c80f732
Parents: ef4de79
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Thu Aug 27 21:50:08 2015 +0800
Committer: wangwei <wa...@comp.nus.edu.sg>
Committed: Fri Aug 28 17:18:38 2015 +0800
----------------------------------------------------------------------
Makefile.am | 3 -
configure.ac | 18 +-
examples/cifar10/job.conf | 5 +-
examples/mnist/conv.conf | 4 +-
examples/mnist/job.conf | 28 ++-
examples/rbm/autoencoder.conf | 89 ++--------
examples/rbm/rbm0.conf | 41 ++---
examples/rbm/rbm1.conf | 85 ++++-----
examples/rbm/rbm2.conf | 57 ++----
examples/rbm/rbm3.conf | 68 ++------
include/mshadow/tensor_random.h | 17 +-
include/neuralnet/base_layer.h | 91 ++++++----
include/neuralnet/layer.h | 146 ++++------------
include/trainer/worker.h | 6 -
include/utils/param.h | 3 +-
src/driver.cc | 8 +-
src/neuralnet/base_layer.cc | 49 +++---
src/neuralnet/layer.cc | 324 ++++++++++++++++-------------------
src/proto/job.proto | 47 +++--
src/trainer/worker.cc | 96 ++++-------
src/utils/common.cc | 2 +-
src/utils/param.cc | 10 +-
22 files changed, 497 insertions(+), 700 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/Makefile.am
----------------------------------------------------------------------
diff --git a/Makefile.am b/Makefile.am
index fa28848..ae8a9dd 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -95,9 +95,6 @@ singa_LDFLAGS = -I./include \
-lglog \
-lprotobuf \
-lrt \
- -lopencv_highgui \
- -lopencv_imgproc \
- -lopencv_core \
-lopenblas \
-lzmq \
-lczmq \
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/configure.ac
----------------------------------------------------------------------
diff --git a/configure.ac b/configure.ac
index 35c6d61..569c406 100644
--- a/configure.ac
+++ b/configure.ac
@@ -44,15 +44,15 @@ if test x"$enable_lmdb" = x"yes"; then
AC_DEFINE(LMDB, 1, [Enable Option layer])
fi
-AC_CHECK_LIB([opencv_imgproc], [main], [], [
- AC_MSG_ERROR([unable to find opencv_imgproc lib])
- ])
-AC_CHECK_LIB([opencv_highgui], [main], [], [
- AC_MSG_ERROR([unable to find opencv_highgui lib])
- ])
-AC_CHECK_LIB([opencv_core], [main], [], [
- AC_MSG_ERROR([unable to find opencv_core lib])
- ])
+#AC_CHECK_LIB([opencv_imgproc], [main], [], [
+# AC_MSG_ERROR([unable to find opencv_imgproc lib])
+# ])
+#AC_CHECK_LIB([opencv_highgui], [main], [], [
+# AC_MSG_ERROR([unable to find opencv_highgui lib])
+# ])
+#AC_CHECK_LIB([opencv_core], [main], [], [
+# AC_MSG_ERROR([unable to find opencv_core lib])
+# ])
AC_CHECK_LIB([zookeeper_mt], [main], [], [
AC_MSG_ERROR([unable to find zookeeper])
])
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/cifar10/job.conf
----------------------------------------------------------------------
diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf
index f44ca50..9d25904 100644
--- a/examples/cifar10/job.conf
+++ b/examples/cifar10/job.conf
@@ -3,7 +3,10 @@ train_steps: 1000
test_steps: 100
test_freq:300
disp_freq:30
-alg: kBP
+debug: true
+train_one_batch {
+ alg: kBP
+}
updater{
type: kSGD
weight_decay:0.004
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/mnist/conv.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf
index 1d4d740..aaf34f2 100644
--- a/examples/mnist/conv.conf
+++ b/examples/mnist/conv.conf
@@ -3,7 +3,9 @@ train_steps: 10000
test_steps:100
test_freq:500
disp_freq:50
-alg: kBP
+train_one_batch {
+ alg: kBP
+}
updater {
momentum:0.9
weight_decay:0.0005
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/mnist/job.conf
----------------------------------------------------------------------
diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf
index 360e1ec..b8d14e8 100644
--- a/examples/mnist/job.conf
+++ b/examples/mnist/job.conf
@@ -3,7 +3,9 @@ train_steps: 1000
test_steps:10
test_freq:60
disp_freq:10
-alg: kBP
+train_one_batch {
+ alg: kBP
+}
updater{
type: kSGD
learning_rate{
@@ -82,6 +84,10 @@ neuralnet {
layer{
name: "tanh1"
type: kTanh
+ tanh_conf {
+ outer_scale: 1.7159047
+ inner_scale: 0.6666667
+ }
srclayers:"fc1"
}
layer{
@@ -112,6 +118,11 @@ neuralnet {
layer{
name: "tanh2"
type: kTanh
+ tanh_conf {
+ outer_scale: 1.7159047
+ inner_scale: 0.6666667
+ }
+
srclayers:"fc2"
}
layer{
@@ -143,6 +154,11 @@ neuralnet {
layer{
name: "tanh3"
type: kTanh
+ tanh_conf {
+ outer_scale: 1.7159047
+ inner_scale: 0.6666667
+ }
+
srclayers:"fc3"
}
layer{
@@ -174,6 +190,11 @@ neuralnet {
layer{
name: "tanh4"
type: kTanh
+ tanh_conf {
+ outer_scale: 1.7159047
+ inner_scale: 0.6666667
+ }
+
srclayers:"fc4"
}
layer{
@@ -205,6 +226,11 @@ neuralnet {
layer{
name: "tanh5"
type: kTanh
+ tanh_conf {
+ outer_scale: 1.7159047
+ inner_scale: 0.6666667
+ }
+
srclayers:"fc5"
}
layer{
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/rbm/autoencoder.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf
index 9575323..bc32cc7 100644
--- a/examples/rbm/autoencoder.conf
+++ b/examples/rbm/autoencoder.conf
@@ -1,15 +1,15 @@
-name: "deep-big-simple-mlp"
+name: "auto-encoder"
train_steps: 12200
test_steps:100
-test_freq:100
-disp_freq:20
-checkpoint_after: 1000
-checkpoint_freq: 1000
-checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin"
-checkpoint_path: "examples/rbm/checkpoint/rbm3/checkpoint/step6000-worker0.bin"
-alg: kBP
+test_freq:1000
+disp_freq:100
+checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
+checkpoint_path: "examples/rbm/rbm3/checkpoint/step6000-worker0.bin"
+train_one_batch{
+ alg: kBP
+}
updater{
type: kAdaGrad
learning_rate{
@@ -23,7 +23,7 @@ neuralnet {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_train_shard"
+ path: "examples/mnist/mnist_train_shard"
batchsize: 1000
}
exclude: kTest
@@ -33,7 +33,7 @@ neuralnet {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_test_shard"
+ path: "examples/mnist/mnist_test_shard"
batchsize: 1000
}
exclude: kTrain
@@ -64,19 +64,9 @@ neuralnet {
}
param{
name: "w1"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb12"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -94,19 +84,9 @@ neuralnet {
}
param{
name: "w2"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb22"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -125,19 +105,9 @@ neuralnet {
}
param{
name: "w3"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb32"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -156,19 +126,10 @@ neuralnet {
}
param{
name: "w4"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb42"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
+
}
}
@@ -187,11 +148,6 @@ neuralnet {
}
param{
name: "rb41"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -214,13 +170,7 @@ neuralnet {
}
param{
name: "rb31"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
-
}
layer{
@@ -242,11 +192,6 @@ neuralnet {
}
param{
name: "rb21"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -270,13 +215,7 @@ neuralnet {
}
param{
name: "rb11"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
-
}
layer{
@@ -295,5 +234,5 @@ neuralnet {
cluster {
nworker_groups: 1
nserver_groups: 1
- workspace: "examples/rbm/checkpoint/autoencoder/"
+ workspace: "examples/rbm/autoencoder/"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/rbm/rbm0.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm0.conf b/examples/rbm/rbm0.conf
index ef8653f..a7e503b 100644
--- a/examples/rbm/rbm0.conf
+++ b/examples/rbm/rbm0.conf
@@ -1,11 +1,11 @@
-name: "deep-big-simple-dbm"
+name: "rbm0"
train_steps: 6000
test_steps:100
test_freq:100
disp_freq: 100
-alg: kCD
-checkpoint_after: 500
-checkpoint_freq: 1000
+train_one_batch{
+ alg: kCD
+}
updater{
type: kSGD
momentum: 0.9
@@ -21,7 +21,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_train_shard"
+ path: "examples/mnist/mnist_train_shard"
batchsize: 100
}
exclude: kTest
@@ -32,7 +32,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_test_shard"
+ path: "examples/mnist/mnist_test_shard"
batchsize: 100
}
exclude: kTrain
@@ -54,22 +54,15 @@ layer{
type: kRBMVis
srclayers:"mnist"
srclayers:"RBMHid"
- rbmvis_conf{
- num_output: 1000
- }
param{
- name: "w1"
- init{
- type: kGaussian
- mean: 0.0
- std: 0.1
- }
+ name: "w1_"
+ share_from: "w1"
}
param{
name: "rb11"
init{
- type: kConstant
- value: 0.0
+ type: kConstant
+ value: 0.0
}
}
}
@@ -82,14 +75,18 @@ layer{
hid_dim: 1000
}
param{
- name: "w1_1"
- share_from: "w1"
+ name: "w1"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
}
param{
name: "rb12"
init{
- type: kConstant
- value: 0.0
+ type: kConstant
+ value: 0.0
}
}
}
@@ -99,5 +96,5 @@ cluster {
nserver_groups: 1
nservers_per_group: 1
nworkers_per_group: 1
- workspace: "examples/rbm/checkpoint/rbm0/"
+ workspace: "examples/rbm/rbm0/"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/rbm/rbm1.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf
index f9b4974..db27d3a 100644
--- a/examples/rbm/rbm1.conf
+++ b/examples/rbm/rbm1.conf
@@ -1,12 +1,12 @@
-name: "deep-big-simple-dbm"
+name: "rbm1"
train_steps: 6000
test_steps:100
-test_freq:500
+test_freq:1000
disp_freq: 100
-alg: kCD
-checkpoint_after: 500
-checkpoint_freq: 1000
-checkpoint_path: "examples/rbm/checkpoint/rbm0/checkpoint/step6000-worker0.bin"
+train_one_batch{
+ alg: kCD
+}
+checkpoint_path: "examples/rbm/rbm0/checkpoint/step6000-worker0.bin"
updater{
type: kSGD
momentum: 0.9
@@ -22,7 +22,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_train_shard"
+ path: "examples/mnist/mnist_train_shard"
batchsize: 100
}
exclude: kTest
@@ -33,7 +33,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_test_shard"
+ path: "examples/mnist/mnist_test_shard"
batchsize: 100
}
exclude: kTrain
@@ -51,51 +51,34 @@ layer{
}
layer{
- name: "fc1"
- type: kInnerProduct
- srclayers:"mnist"
- innerproduct_conf{
- num_output: 1000
- }
- param{
- name: "w1"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
- }
- param{
- name: "rb12"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
- }
+ name: "fc1"
+ type: kInnerProduct
+ srclayers:"mnist"
+ innerproduct_conf{
+ num_output: 1000
}
-
- layer{
- name: "sigmoid1"
- type: kSigmoid
- srclayers:"fc1"
+ param{
+ name: "w1"
}
+ param{
+ name: "rb12"
+ }
+}
+
+layer{
+ name: "sigmoid1"
+ type: kSigmoid
+ srclayers:"fc1"
+}
layer{
name: "RBMVis"
type: kRBMVis
srclayers:"sigmoid1"
srclayers:"RBMHid"
- rbmvis_conf{
- num_output: 500
- }
param{
- name: "w2"
- init{
- type: kGaussian
- mean: 0.0
- std: 0.1
- }
+ name: "w2_"
+ share_from: "w2"
}
param{
name: "rb21"
@@ -114,14 +97,18 @@ layer{
hid_dim: 500
}
param{
- name: "w2_1"
- share_from: "w2"
+ name: "w2"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
}
param{
name: "rb22"
init{
- type: kConstant
- value: 0.0
+ type: kConstant
+ value: 0.0
}
}
}
@@ -131,5 +118,5 @@ cluster {
nserver_groups: 1
nservers_per_group: 1
nworkers_per_group: 1
- workspace: "examples/rbm/checkpoint/rbm1/"
+ workspace: "examples/rbm/rbm1/"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/rbm/rbm2.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf
index 6629481..fd08907 100644
--- a/examples/rbm/rbm2.conf
+++ b/examples/rbm/rbm2.conf
@@ -1,12 +1,12 @@
-name: "deep-big-simple-dbm"
+name: "rbm2"
train_steps: 6000
test_steps:100
-test_freq:100
+test_freq:1000
disp_freq: 100
-alg: kCD
-checkpoint_after: 500
-checkpoint_freq: 1000
-checkpoint_path: "examples/rbm/checkpoint/rbm1/checkpoint/step6000-worker0.bin"
+train_one_batch{
+ alg: kCD
+}
+checkpoint_path: "examples/rbm/rbm1/checkpoint/step6000-worker0.bin"
updater{
type: kSGD
@@ -24,7 +24,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_train_shard"
+ path: "examples/mnist/mnist_train_shard"
batchsize: 100
}
exclude: kTest
@@ -35,7 +35,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_test_shard"
+ path: "examples/mnist/mnist_test_shard"
batchsize: 100
}
exclude: kTrain
@@ -61,19 +61,9 @@ layer{
}
param{
name: "w1"
- init {
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb12"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -92,19 +82,9 @@ layer{
}
param{
name: "w2"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb22"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -118,16 +98,9 @@ layer{
type: kRBMVis
srclayers:"sigmoid2"
srclayers:"RBMHid"
- rbmvis_conf{
- num_output: 250
- }
param{
- name: "w3"
- init{
- type: kGaussian
- mean: 0.0
- std: 0.1
- }
+ name: "w3_"
+ share_from: "w3"
}
param{
name: "rb31"
@@ -146,8 +119,12 @@ layer{
hid_dim: 250
}
param{
- name: "w3_1"
- share_from: "w3"
+ name: "w3"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
}
param{
name: "rb32"
@@ -163,5 +140,5 @@ cluster {
nserver_groups: 1
nservers_per_group: 1
nworkers_per_group: 1
- workspace: "examples/rbm/checkpoint/rbm2/"
+ workspace: "examples/rbm/rbm2/"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/examples/rbm/rbm3.conf
----------------------------------------------------------------------
diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf
index 482c5e7..fe7cc1f 100644
--- a/examples/rbm/rbm3.conf
+++ b/examples/rbm/rbm3.conf
@@ -1,12 +1,12 @@
-name: "deep-big-simple-dbm"
+name: "rbm3"
train_steps: 6000
test_steps: 100
-test_freq: 100
+test_freq: 1000
disp_freq: 100
-alg: kCD
-checkpoint_after: 500
-checkpoint_freq: 1000
-checkpoint_path: "examples/rbm/checkpoint/rbm2/checkpoint/step6000-worker0.bin"
+train_one_batch{
+ alg: kCD
+}
+checkpoint_path: "examples/rbm/rbm2/checkpoint/step6000-worker0.bin"
updater{
type: kSGD
momentum: 0.9
@@ -22,7 +22,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_train_shard"
+ path: "examples/mnist/mnist_train_shard"
batchsize: 100
}
exclude: kTest
@@ -33,7 +33,7 @@ layer {
name: "data"
type: kShardData
sharddata_conf {
- path: "examples/rbm/mnist_test_shard"
+ path: "examples/mnist/mnist_test_shard"
batchsize: 100
}
exclude: kTrain
@@ -59,19 +59,9 @@ layer{
}
param{
name: "w1"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb12"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -90,19 +80,9 @@ layer{
}
param{
name: "w2"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb22"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -121,19 +101,9 @@ layer{
}
param{
name: "w3"
- init{
- type: kUniform
- low:-0.05
- high:0.05
- }
}
param{
name: "rb32"
- init{
- type: kUniform
- low: -0.05
- high:0.05
- }
}
}
@@ -148,16 +118,10 @@ layer{
type: kRBMVis
srclayers:"sigmoid3"
srclayers:"RBMHid"
- rbmvis_conf{
- num_output: 30
- }
param{
- name: "w4"
- init{
- type: kGaussian
- mean: 0.0
- std: 0.1
- }
+ name: "w4_"
+ share_from: "w4"
+
}
param{
name: "rb41"
@@ -177,8 +141,12 @@ layer{
gaussian: true
}
param{
- name: "w4_1"
- share_from: "w4"
+ name: "w4"
+ init{
+ type: kGaussian
+ mean: 0.0
+ std: 0.1
+ }
}
param{
name: "rb42"
@@ -194,5 +162,5 @@ cluster {
nserver_groups: 1
nservers_per_group: 1
nworkers_per_group: 1
- workspace: "examples/rbm/checkpoint/rbm3/"
+ workspace: "examples/rbm/rbm3/"
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/include/mshadow/tensor_random.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_random.h b/include/mshadow/tensor_random.h
index 72164a8..59ef082 100644
--- a/include/mshadow/tensor_random.h
+++ b/include/mshadow/tensor_random.h
@@ -68,20 +68,27 @@ namespace mshadow {
gen_.seed(seed);
#endif
}
+ template<int dim>
+ inline void SampleBinary(Tensor<cpu, dim> &src) {
+ SampleBinary(src, src);
+ }
+
/*!
* \brief generate binary data according to a probability matrix
+ * \param src source
* \param dst destination
* \param a lower bound of uniform
* \param b upper bound of uniform
* \tparam dim dimension of tensor
*/
template<int dim>
- inline void SampleBinary( Tensor<cpu, dim> &dst) {
+ inline void SampleBinary(Tensor<cpu, dim> &dst, Tensor<cpu, dim> &src) {
real_t a=0.0f;
real_t b=1.0f;
- Tensor<cpu, 2> mat = dst.FlatTo2D();
+ Tensor<cpu, 2> dmat = dst.FlatTo2D();
+ Tensor<cpu, 2> smat = src.FlatTo2D();
std::uniform_real_distribution<real_t> distribution (a,b);
- for ( index_t i = 0; i < mat.shape[1]; ++i ) {
+ for ( index_t i = 0; i < dmat.shape[1]; ++i ) {
#if MSHADOW_USE_MKL
#if MSHADOW_SINGLE_PRECISION
int status = vsRngUniform( 0, vStream_, mat.shape[0], mat[i].dptr, a, b );
@@ -96,8 +103,8 @@ namespace mshadow {
mat[i][j] = this->RandNext()*(b-a) + a;
}
*/
- for ( index_t j = 0; j < mat.shape[0]; ++j ) {
- mat[i][j] = distribution(gen_) > mat[i][j] ? 0.0f: 1.0f;
+ for ( index_t j = 0; j < dmat.shape[0]; ++j ) {
+ dmat[i][j] = distribution(gen_) > smat[i][j] ? 0.0f: 1.0f;
}
#endif
}
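
The non-MKL branch above draws one uniform number per element and thresholds it
against the corresponding source probability, so every output element is an
independent Bernoulli sample. A minimal standalone sketch of the same rule using
plain C++11 <random> instead of mshadow (the function and variable names here are
illustrative only):

#include <random>
#include <vector>

// out[i] is 1 with probability prob[i], else 0 -- the same rule as the
// dmat/smat loop in SampleBinary above.
std::vector<float> SampleBernoulli(const std::vector<float>& prob,
                                   std::mt19937& gen) {
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  std::vector<float> out(prob.size());
  for (size_t i = 0; i < prob.size(); ++i)
    out[i] = uniform(gen) > prob[i] ? 0.0f : 1.0f;
  return out;
}

The new one-argument overload simply samples a tensor in place by passing it as
both source and destination.
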
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/include/neuralnet/base_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/base_layer.h b/include/neuralnet/base_layer.h
index 5575fc7..9aa207d 100644
--- a/include/neuralnet/base_layer.h
+++ b/include/neuralnet/base_layer.h
@@ -49,25 +49,25 @@ class Layer {
/**
* Compute features of this layer based on connected layers.
*
- * @param phase kTrain, kTest, kPositive, etc.
+ * @param flag kTrain, kTest, kPositive, etc.
*/
- virtual void ComputeFeature(Phase phase, Metric* perf) = 0;
+ virtual void ComputeFeature(int flag, Metric* perf) = 0;
/**
* Compute gradients for parameters and connected layers.
*
- * @param phase kTrain, kTest, kPositive, etc.
+ * @param flag kTrain, kTest, kPositive, etc.
*/
virtual void ComputeLoss(Metric* perf) {}
- virtual void ComputeGradient(Phase phase) = 0;
+ virtual void ComputeGradient(int flag) = 0;
/**
* For print debug info about each layer, e.g., norm of feature vector,
* norm of parameters.
*
* @param step training/test/validation step
- * @param phase forward/backward/positive/negative...
+ * @param flag forward/backward/positive/negative...
* @return debug info about this layer.
*/
- const string DebugString(int step, Phase phase);
+ const string DebugString(int step, int flag);
/**
* Layers that have paramters must override this function.
*
@@ -141,10 +141,10 @@ class Layer {
/**
* @return a const ref for Blob storing neuron values of this layer for BP
*/
- virtual const Blob<float>& data(const Layer* from, Phase = kPositive) const {
+ virtual const Blob<float>& data(const Layer* from) const {
return data_;
}
- virtual Blob<float>* mutable_data(const Layer* from, Phase = kPositive) {
+ virtual Blob<float>* mutable_data(const Layer* from) {
return &data_;
}
@@ -246,15 +246,15 @@ class BridgeSrcLayer: public BridgeLayer {
using Layer::ComputeFeature;
using Layer::ComputeGradient;
- void ComputeFeature(Phase phase, Metric* perf) override {}
- void ComputeGradient(Phase phase) override {
+ void ComputeFeature(int flag, Metric* perf) override {}
+ void ComputeGradient(int flag) override {
ready_ = false;
}
- const Blob<float>& data(const Layer* from, Phase phase) const override {
+ const Blob<float>& data(const Layer* from) const override {
return srclayers_[0]->data(this);
}
- Blob<float>* mutable_data(const Layer* from, Phase phase) override {
+ Blob<float>* mutable_data(const Layer* from) override {
return srclayers_[0]->mutable_data(this);
}
const Blob<float>& grad(const Layer* from) const override {
@@ -278,11 +278,11 @@ class BridgeDstLayer: public BridgeLayer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric* perf) override {
+ void ComputeFeature(int flag, Metric* perf) override {
// reset ready_ for next iteration.
ready_ = false;
}
- void ComputeGradient(Phase phase) override {}
+ void ComputeGradient(int flag) override {}
bool is_bridgedstlayer() const {
return true;
}
@@ -297,8 +297,8 @@ class ConcateLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric* perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric* perf) override;
+ void ComputeGradient(int flag) override;
};
/**
@@ -311,11 +311,11 @@ class DataLayer: public Layer{
using Layer::mutable_grad;
using Layer::dst_layer_connection;
- void ComputeGradient(Phase phase) override {}
+ void ComputeGradient(int flag) override {}
bool is_datalayer() const override {
return true;
}
- Blob<float>* mutable_data(const Layer* layer, Phase phase) override {
+ Blob<float>* mutable_data(const Layer* layer) override {
return nullptr;
}
Blob<float>* mutable_grad(const Layer* layer) override {
@@ -357,11 +357,11 @@ class PrefetchLayer : public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric* perf) override;
- void ComputeGradient(Phase phase) override {};
+ void ComputeFeature(int flag, Metric* perf) override;
+ void ComputeGradient(int flag) override {};
- const Blob<float>& data(const Layer* from, Phase phase) const override;
- Blob<float>* mutable_data(const Layer* layer, Phase phase) override;
+ const Blob<float>& data(const Layer* from) const override;
+ Blob<float>* mutable_data(const Layer* layer) override;
Blob<float>* mutable_grad(const Layer* layer) override {
return nullptr;
@@ -371,7 +371,7 @@ class PrefetchLayer : public Layer {
return grad_;
}
- void Prefetch(Phase phase);
+ void Prefetch(int flag);
virtual ~PrefetchLayer();
protected:
@@ -389,14 +389,14 @@ class SliceLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric* perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric* perf) override;
+ void ComputeGradient(int flag) override;
ConnectionType dst_layer_connection() const override {
return kOneToMany;
}
- const Blob<float>& data(const Layer* layer, Phase phase) const override;
+ const Blob<float>& data(const Layer* layer) const override;
const Blob<float>& grad(const Layer* layer) const override;
- Blob<float>* mutable_data(const Layer* layer, Phase phase) override;
+ Blob<float>* mutable_data(const Layer* layer) override;
Blob<float>* mutable_grad(const Layer* layer) override;
protected:
@@ -418,8 +418,8 @@ class SplitLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric* perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric* perf) override;
+ void ComputeGradient(int flag) override;
ConnectionType dst_layer_connection() const override {
return kOneToMany;
}
@@ -462,12 +462,12 @@ class ParserLayer: public Layer {
using Layer::mutable_grad;
using Layer::grad;
- void ComputeFeature(Phase phase, Metric* perf) override;
- void ComputeGradient(Phase phase) override {};
+ void ComputeFeature(int flag, Metric* perf) override;
+ void ComputeGradient(int flag) override {};
/**
* Parse records from DataLayer into blob.
*/
- virtual void ParseRecords(Phase phase, const vector<Record>& records,
+ virtual void ParseRecords(int flag, const vector<Record>& records,
Blob<float>* blob) = 0;
bool is_parserlayer() const override {
return true;
@@ -480,6 +480,33 @@ class ParserLayer: public Layer {
return grad_;
}
};
+
+class RBMLayer: public Layer {
+ public:
+ const Blob<float>& neg_data(const Layer* layer) {
+ return neg_data_;
+ }
+ Blob<float>* mutable_neg_data(const Layer* layer) {
+ return &neg_data_;
+ }
+ const vector<Param*> GetParams() const override {
+ vector<Param*> params{weight_, bias_};
+ return params;
+ }
+  virtual Blob<float>* Sample(int flag) = 0;
+
+ protected:
+ //! dimension of the hidden layer
+ int hdim_;
+ //! dimension of the visible layer
+ int vdim_;
+ int batchsize_;
+ Param* weight_, *bias_;
+
+ Blob<float> neg_data_;
+ Blob<float> neg_sample_;
+ Blob<float> sample_;
+};
} // namespace singa
#endif // SINGA_NEURALNET_BASE_LAYER_H_
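
The new RBMLayer base class gives every RBM layer a positive-phase feature blob
(data_) and a negative-phase blob (neg_data_), each paired with a sampled
counterpart, plus the weight/bias Params returned by GetParams(). A hypothetical
subclass skeleton, only to show the contract that concrete layers fill in (the
real implementations are RBMVisLayer and RBMHidLayer below; the class name,
include path and comments here are illustrative, not part of the patch):

#include "neuralnet/base_layer.h"

class ToyRBMLayer : public singa::RBMLayer {
 public:
  void Setup(const singa::LayerProto& proto, int npartitions) override {
    singa::Layer::Setup(proto, npartitions);
    // reshape data_/neg_data_/sample_/neg_sample_ and create weight_, bias_
  }
  void ComputeFeature(int flag, singa::Metric* perf) override {
    // fill data_ when (flag & kPositive), neg_data_ when (flag & kNegative)
  }
  void ComputeGradient(int flag) override {
    // CD gradient computed from the positive and negative blobs
  }
  singa::Blob<float>* Sample(int flag) override {
    // hand back the sampled blob matching the phase bit in flag
    return (flag & singa::kPositive) ? &sample_ : &neg_sample_;
  }
};
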
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index b1fbbb0..435d854 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -31,8 +31,8 @@ class ConvolutionLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
const vector<Param*> GetParams() const override {
vector<Param*> params{weight_, bias_};
return params;
@@ -57,8 +57,8 @@ class DropoutLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
protected:
// drop probability
@@ -68,112 +68,42 @@ class DropoutLayer: public Layer {
*/
Blob<float> mask_;
};
+
/**
* RBM visible layer
*/
-class RBMVisLayer: public Layer {
+class RBMVisLayer: public RBMLayer {
public:
using Layer::ComputeFeature;
using Layer::ComputeGradient;
- void Setup(const LayerProto& proto,
- int npartitions) override;
- virtual bool is_vislayer() const {
- return true;
- }
-
- void ComputeFeature(Phase phase,
- Metric *perf) override;
- void ComputeGradient(Phase phase) override;
- virtual void ComputeLoss(Metric* perf);
- virtual Blob<float>* mutable_data(const Layer* from, Phase phase) {
- if (phase == kPositive) {
- return &data_;
- } else {
- return &vis_sample_;
- }
- }
- virtual const Blob<float>& data(const Layer* from, Phase phase) const {
- if (phase == kPositive) {
- return data_;
- } else {
- return vis_sample_;
- }
- }
- // virtual void ToProto(LayerProto *layer_proto, bool copyData);
- const vector<Param*> GetParams() const override {
- vector<Param*> params{weight_, bias_};
- return params;
- }
~RBMVisLayer();
-
-
+ void Setup(const LayerProto& proto, int npartitions) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
+  Blob<float>* Sample(int flag) override;
private:
- //! dimension of the hidden layer
- int hdim_;
- //! dimension of the visible layer
- int vdim_;
- int batchsize_;
- // batchsize of negative phase
- int neg_batchsize_;
- bool is_first_iteration_vis_;
- float scale_;
- // srclayer index
- int data_idx_;
- int hid_idx_;
- Param* weight_, *bias_;
- // data to store sampling result
- Blob<float> vis_sample_;
- // in order to implement Persistent Contrastive Divergence,
+ RBMLayer* hid_layer_;
+ Layer* input_layer_;
};
/**
* RBM hidden layer
*/
-class RBMHidLayer: public Layer {
+class RBMHidLayer: public RBMLayer {
public:
using Layer::ComputeFeature;
using Layer::ComputeGradient;
- void Setup(const LayerProto& proto,
- int npartitions) override;
- virtual bool is_hidlayer() const {
- return true;
- }
-
- void ComputeFeature(Phase phase,
- Metric *perf) override;
- void ComputeGradient(Phase phase) override;
- virtual Blob<float>* mutable_data(const Layer* from, Phase phase) {
- if (phase == kPositive)
- return &data_;
- else
- return &hid_sample_;
- }
- virtual const Blob<float>& data(const Layer* from, Phase phase) const {
- if (phase == kPositive)
- return data_;
- else
- return hid_sample_;
- }
- const vector<Param*> GetParams() const override {
- vector<Param*> params{weight_, bias_};
- return params;
- }
~RBMHidLayer();
-
+ void Setup(const LayerProto& proto, int npartitions) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
+  Blob<float>* Sample(int flag) override;
private:
- //! dimension of the hidden layer
- int hdim_;
- int vdim_; // dimension of visible layer
- int batchsize_;
- // batchsize of negative phase
- int neg_batchsize_;
- float scale_;
// whether use gaussian sampling
bool gaussian_;
- Blob<float> hid_sample_;
- Param* weight_, *bias_;
+ RBMLayer *vis_layer_;
};
/**
* fully connected layer
@@ -184,8 +114,8 @@ class InnerProductLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
ConnectionType src_neuron_connection(int k) const override {
// CHECK_LT(k, srclayers_.size());
@@ -212,7 +142,7 @@ class LabelLayer: public ParserLayer {
using ParserLayer::ParseRecords;
void Setup(const LayerProto& proto, int npartitions) override;
- void ParseRecords(Phase phase, const vector<Record>& records,
+ void ParseRecords(int flag, const vector<Record>& records,
Blob<float>* blob) override;
};
@@ -229,8 +159,8 @@ class LRNLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
protected:
//! shape of the bottom layer feature
@@ -247,7 +177,7 @@ class MnistLayer: public ParserLayer {
using ParserLayer::ParseRecords;
void Setup(const LayerProto& proto, int npartitions) override;
- void ParseRecords(Phase phase, const vector<Record>& records,
+ void ParseRecords(int flag, const vector<Record>& records,
Blob<float>* blob) override;
ConnectionType dst_layer_connection() const override {
return kOneToMany;
@@ -269,8 +199,8 @@ class PoolingLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
protected:
int kernel_, pad_, stride_;
@@ -284,8 +214,8 @@ class ReLULayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions = 1) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
};
class EuclideanLossLayer: public LossLayer {
@@ -294,8 +224,8 @@ class EuclideanLossLayer: public LossLayer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
int partition_dim() const override {
@@ -321,8 +251,8 @@ class SoftmaxLossLayer: public LossLayer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
/**
* softmax is not recommended for partition because it requires the whole
@@ -349,7 +279,7 @@ class RGBImageLayer: public ParserLayer {
using ParserLayer::ParseRecords;
void Setup(const LayerProto& proto, int npartitions) override;
- void ParseRecords(Phase phase, const vector<Record>& records,
+ void ParseRecords(int flag, const vector<Record>& records,
Blob<float>* blob) override;
private:
@@ -365,7 +295,7 @@ class ShardDataLayer: public DataLayer{
~ShardDataLayer();
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
+ void ComputeFeature(int flag, Metric *perf) override;
private:
DataShard* shard_;
@@ -382,8 +312,8 @@ class SigmoidLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
};
/**
@@ -397,8 +327,8 @@ class TanhLayer: public Layer {
using Layer::ComputeGradient;
void Setup(const LayerProto& proto, int npartitions) override;
- void ComputeFeature(Phase phase, Metric *perf) override;
- void ComputeGradient(Phase phase) override;
+ void ComputeFeature(int flag, Metric *perf) override;
+ void ComputeGradient(int flag) override;
private:
float outer_scale_, inner_scale_;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/include/trainer/worker.h
----------------------------------------------------------------------
diff --git a/include/trainer/worker.h b/include/trainer/worker.h
index 86b1c90..35ce77e 100644
--- a/include/trainer/worker.h
+++ b/include/trainer/worker.h
@@ -192,15 +192,9 @@ class BPWorker: public Worker{
class CDWorker: public Worker{
public:
- ~CDWorker() {}
- void Init(int thread_id, int grp_id, int id) override;
void TrainOneBatch(int step, Metric* perf) override;
void TestOneBatch(int step, Phase phase, shared_ptr<NeuralNet> net,
Metric* perf) override;
- void PositivePhase(int step, shared_ptr<NeuralNet> net, Metric* perf);
- void NegativePhase(int step, shared_ptr<NeuralNet> net, Metric* perf);
- void GradientPhase(int step, shared_ptr<NeuralNet> net);
- void LossPhase(int step, shared_ptr<NeuralNet> net, Metric* perf);
};
inline int BlobTrgt(int grp, int layer) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/include/utils/param.h
----------------------------------------------------------------------
diff --git a/include/utils/param.h b/include/utils/param.h
index f7a0982..0d24e95 100644
--- a/include/utils/param.h
+++ b/include/utils/param.h
@@ -74,13 +74,14 @@ class Param {
static Param* Create(const ParamProto& proto);
Param();
virtual ~Param() {}
+ void Init(const ParamProto& proto) { proto_ = proto; }
/**
* Setup param object
*
* @param conf param configuration, include learning rate multiplier etc.
* @param shape one value per dimension
*/
- virtual void Setup(const ParamProto& conf, const std::vector<int>& shape);
+ virtual void Setup(const std::vector<int>& shape);
/*
* Fill the values according to init method, e.g., gaussian distribution.
*
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/driver.cc
----------------------------------------------------------------------
diff --git a/src/driver.cc b/src/driver.cc
index e5045a3..9fa4b86 100644
--- a/src/driver.cc
+++ b/src/driver.cc
@@ -1,9 +1,12 @@
-#include "singa.h"
#include <cblas.h>
#include <glog/logging.h>
#include <string>
+#include "singa.h"
+
+#include "utils/tinydir.h"
+
namespace singa {
void Driver::Init(int argc, char **argv) {
@@ -89,6 +92,9 @@ void Driver::Submit(bool resume, const JobProto& jobConf) {
if (singa_conf_.has_log_dir())
SetupLog(singa_conf_.log_dir(), std::to_string(job_id_)
+ "-" + jobConf.name());
+ tinydir_dir workspace;
+ if (tinydir_open(&workspace, jobConf.cluster().workspace().c_str()) == -1)
+ LOG(FATAL) << "workspace does not exist: " << jobConf.cluster().workspace();
if (jobConf.num_openblas_threads() != 1)
LOG(WARNING) << "openblas with "
<< jobConf.num_openblas_threads() << " threads";
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/neuralnet/base_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/base_layer.cc b/src/neuralnet/base_layer.cc
index f995353..46f8b57 100644
--- a/src/neuralnet/base_layer.cc
+++ b/src/neuralnet/base_layer.cc
@@ -1,5 +1,3 @@
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
#include <cblas.h>
#include <math.h>
#include <cfloat>
@@ -24,14 +22,13 @@ void Layer::Setup(const LayerProto& proto, int npartitions) {
layer_proto_ = proto;
}
-const string Layer::DebugString(int step, Phase phase) {
+const string Layer::DebugString(int step, int flag) {
string ret =StringPrintf("Layer %10s ", name().c_str());
- if(data_.count() != 0)
- return ret;
- if(phase == kForward) {
- ret += StringPrintf("data %10s data norm1 %13.9f", data_.asum_data());
- }else if(phase == kBackward) {
- ret += StringPrintf("grad norm1 %13.9f\n", grad_.asum_data());
+ if ((flag & kForward) == kForward && data_.count() !=0) {
+ ret += StringPrintf("data norm1 %13.9f", data_.asum_data());
+ } else if ((flag & kBackward) == kBackward) {
+ if (grad_.count() != 0)
+ ret += StringPrintf("grad norm1 %13.9f\n", grad_.asum_data());
for(Param* p: GetParams())
ret += StringPrintf("param id %2d, name %10s,\
value norm1 %13.9f, grad norm1 %13.9f\n",
@@ -68,41 +65,41 @@ void ConcateLayer::Setup(const LayerProto& proto, int npartitions) {
grad_.Reshape(shape);
}
-void ConcateLayer::ComputeFeature(Phase phase, Metric *perf){
+void ConcateLayer::ComputeFeature(int flag, Metric *perf){
LOG(FATAL) << "Not implemented for Concate Layer";
}
-void ConcateLayer::ComputeGradient(Phase phase){
+void ConcateLayer::ComputeGradient(int flag){
LOG(FATAL) << "Not implemented for Concate Layer";
}
/************* Implementation for ParserLayer ***********/
-void ParserLayer::ComputeFeature(Phase phase, Metric *perf){
+void ParserLayer::ComputeFeature(int flag, Metric *perf){
CHECK_EQ(srclayers_.size(),1);
auto datalayer=static_cast<DataLayer*>(*srclayers_.begin());
- ParseRecords(phase, datalayer->records(), &data_);
+ ParseRecords(flag, datalayer->records(), &data_);
}
/************* Implementation for PrefetchLayer ***********/
-void PrefetchLayer::Prefetch(Phase phase){
+void PrefetchLayer::Prefetch(int flag){
//clock_t s=clock();
for(auto layer: sublayers_)
- layer->ComputeFeature(phase, nullptr);
+ layer->ComputeFeature(flag, nullptr);
//LOG(ERROR)<<(clock()-s)*1.0/CLOCKS_PER_SEC;
}
-void PrefetchLayer::ComputeFeature(Phase phase, Metric* perf){
+void PrefetchLayer::ComputeFeature(int flag, Metric* perf){
if(thread_.joinable())
thread_.join();
else{
- Prefetch(phase);
+ Prefetch(flag);
}
for(auto layer: sublayers_){
if(layer->is_parserlayer())
// TODO replace CopyFrom with Swap?
datablobs_.at(layer->name()).CopyFrom(layer->data(this));
}
- thread_=std::thread(&PrefetchLayer::Prefetch, this, phase);
+ thread_=std::thread(&PrefetchLayer::Prefetch, this, flag);
}
void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) {
@@ -133,7 +130,7 @@ void PrefetchLayer::Setup(const LayerProto& proto, int npartitions) {
datablobs_[layer->name()]=Blob<float>(layer->data(this).shape());
}
-const Blob<float>& PrefetchLayer::data(const Layer* from, Phase phase) const {
+const Blob<float>& PrefetchLayer::data(const Layer* from) const {
LOG(FATAL) << " needs update";
if(from != nullptr) {
return datablobs_.at("");
@@ -143,7 +140,7 @@ const Blob<float>& PrefetchLayer::data(const Layer* from, Phase phase) const {
}
}
-Blob<float>* PrefetchLayer::mutable_data(const Layer* from, Phase phase) {
+Blob<float>* PrefetchLayer::mutable_data(const Layer* from) {
LOG(FATAL) << " needs update";
if(from!=nullptr){
return &(datablobs_.at(""));
@@ -194,7 +191,7 @@ int SliceLayer::SliceID(const Layer* layer) const {
return -1;
}
-const Blob<float>& SliceLayer::data(const Layer* layer, Phase phase) const {
+const Blob<float>& SliceLayer::data(const Layer* layer) const {
if(layer==nullptr)
return data_;
return datavec_[SliceID(layer)];
@@ -204,7 +201,7 @@ const Blob<float>& SliceLayer::grad(const Layer* layer) const {
return grad_;
return gradvec_[SliceID(layer)];
}
-Blob<float>* SliceLayer::mutable_data(const Layer* layer, Phase phase) {
+Blob<float>* SliceLayer::mutable_data(const Layer* layer) {
if(layer==nullptr)
return &data_;
return &datavec_[SliceID(layer)];
@@ -214,7 +211,7 @@ Blob<float>* SliceLayer::mutable_grad(const Layer* layer){
return &grad_;
return &gradvec_[SliceID(layer)];
}
-void SliceLayer::ComputeFeature(Phase phase, Metric *perf) {
+void SliceLayer::ComputeFeature(int flag, Metric *perf) {
CHECK_EQ(srclayers_.size(),1);
if(slice_dim_==0){
const auto& blob=srclayers_.at(0)->data(this);
@@ -226,7 +223,7 @@ void SliceLayer::ComputeFeature(Phase phase, Metric *perf) {
}
}
}
-void SliceLayer::ComputeGradient(Phase phase) {
+void SliceLayer::ComputeGradient(int flag) {
// LOG(FATAL) << "Not implemented";
}
@@ -240,11 +237,11 @@ void SplitLayer::Setup(const LayerProto& proto, int npartitions) {
grad_.Reshape(srclayers_[0]->data(this).shape());
}
-void SplitLayer::ComputeFeature(Phase phase, Metric *perf) {
+void SplitLayer::ComputeFeature(int flag, Metric *perf) {
LOG(FATAL) << "Not implemented";
}
-void SplitLayer::ComputeGradient(Phase phase) {
+void SplitLayer::ComputeGradient(int flag) {
LOG(FATAL) << "Not implemented";
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index b5c986e..29a2312 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -72,11 +72,11 @@ void ConvolutionLayer::Setup(const LayerProto& proto, int npartitions) {
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- weight_->Setup(proto.param(0), vector<int>{num_filters_, col_height_});
- bias_->Setup(proto.param(1), vector<int>{num_filters_});
+ weight_->Setup(vector<int>{num_filters_, col_height_});
+ bias_->Setup(vector<int>{num_filters_});
}
-void ConvolutionLayer::ComputeFeature(Phase phase, Metric* perf){
+void ConvolutionLayer::ComputeFeature(int flag, Metric* perf){
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto data = Tensor3(&data_);
auto col = Tensor2(&col_data_);
@@ -93,7 +93,7 @@ void ConvolutionLayer::ComputeFeature(Phase phase, Metric* perf){
data+=broadcast<1>(bias, data.shape);
}
-void ConvolutionLayer::ComputeGradient(Phase phase) {
+void ConvolutionLayer::ComputeGradient(int flag) {
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto col = Tensor2(&col_data_);
auto weight = Tensor2(weight_->mutable_data());
@@ -137,9 +137,9 @@ void DropoutLayer::Setup(const LayerProto& proto, int npartitions) {
pdrop_ = proto.dropout_conf().dropout_ratio();
}
-void DropoutLayer::ComputeFeature(Phase phase, Metric* perf) {
+void DropoutLayer::ComputeFeature(int flag, Metric* perf) {
// check training
- if(phase != kTrain){//!training){
+ if((flag & kTrain) != kTrain) {
data_.CopyFrom(srclayers_[0]->data(this));
return;
}
@@ -152,7 +152,7 @@ void DropoutLayer::ComputeFeature(Phase phase, Metric* perf) {
data = src * mask;
}
-void DropoutLayer::ComputeGradient(Phase phase) {
+void DropoutLayer::ComputeGradient(int flag) {
auto mask = Tensor1(&mask_);
auto grad = Tensor1(&grad_);
auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
@@ -164,94 +164,69 @@ RBMVisLayer::~RBMVisLayer() {
delete bias_;
}
-void RBMVisLayer::Setup(const LayerProto& proto,
- int npartitions) {
+void RBMVisLayer::Setup(const LayerProto& proto, int npartitions) {
Layer::Setup(proto, npartitions);
CHECK_EQ(srclayers_.size(), 2);
- // hid_idx_: index indicating which srclayer is is hidden layer
- // data_idx_: index indicating which srclayer is data layer
- for (unsigned int i = 0; i < srclayers_.size(); i++)
- for (unsigned int j = 0; j < (srclayers_[i]-> dstlayers()).size(); j++)
- if (strcmp(((srclayers_[i]->dstlayers()).at(j)->name().c_str()),
- (this->name()).c_str()) == 0)
- hid_idx_ = i;
- for (unsigned int i = 0; i < srclayers_.size(); i++)
- if (i != static_cast<unsigned int>(hid_idx_) )
- data_idx_ = i;
- const auto& src = srclayers_[data_idx_]->data(this);
- is_first_iteration_vis_ = true;
+ hid_layer_ = nullptr;
+ for (auto src : srclayers_) {
+ for (auto dst : src->srclayers()) {
+ if (dst->name() == name()) {
+ CHECK(hid_layer_ == nullptr);
+ hid_layer_ = static_cast<RBMHidLayer*>(src);
+ }
+ }
+ }
+ input_layer_ = srclayers_[0] != hid_layer_ ? srclayers_[0]: srclayers_[1];
+ const auto& src = input_layer_->data(this);
batchsize_ = src.shape()[0];
- neg_batchsize_ = batchsize_;
- /*gibbs sampling size and input have the same size*/
- vdim_ = src.count()/batchsize_;
- hdim_ = proto.rbmvis_conf().num_output();
- data_.Reshape(vector<int>{batchsize_, vdim_}); // this is visible dimension
- vis_sample_.Reshape(vector<int>{neg_batchsize_, vdim_});
+ data_.ReshapeLike(src); // this is visible dimension
+ neg_data_.ReshapeLike(data_);
+ neg_sample_.ReshapeLike(data_);
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
- bias_->Setup(proto.param(1), vector<int>{vdim_});
-}
-
-void RBMVisLayer::ComputeFeature(Phase phase, Metric* perf) {
- if (phase == kPositive) { /*positive phase*/
- auto data = Tensor2(&data_);
- CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_);
- auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this));
- Copy(data, src);
- } else if (phase == kNegative) { /*negative phase*/
- auto hid_sample =
- Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative));
- // fetch sampling results from hidden layer
- auto vis_sample = Tensor2(&vis_sample_);
- auto weight = Tensor2(weight_->mutable_data());
- auto bias = Tensor1(bias_->mutable_data());
- vis_sample = dot(hid_sample, weight);
- vis_sample+=repmat(bias, neg_batchsize_);
- vis_sample = F<op::sigmoid>(vis_sample);
+ bias_->Setup(vector<int>{src.count() / batchsize_});
+}
+Blob<float>* RBMVisLayer::Sample(int flag) {
+ Tensor<cpu, 2> sample, data;
+ if ((flag & kPositive) == kPositive) {
+ LOG(FATAL) << "RBMVisLayer can not be sampled for positive flag";
+ } else {
+ data = Tensor2(&neg_data_);
+ sample = Tensor2(&neg_sample_);
+ }
+ auto random = TSingleton<Random<cpu>>::Instance();
+ random->SampleBinary(sample, data);
+ return &neg_sample_;
+}
+void RBMVisLayer::ComputeFeature(int flag, Metric* perf) {
+ if ((flag & kPositive) == kPositive) { /*positive flag*/
+ data_.CopyFrom(input_layer_->data(this), true);
+ } else if ((flag & kNegative) == kNegative) { /*negative flag*/
+ auto hid_sample = Tensor2(hid_layer_->Sample(flag));
+ // fetch sampling results from hidden layer
+ auto data = Tensor2(&neg_data_);
+ auto weight = Tensor2(weight_->mutable_data());
+ auto bias = Tensor1(bias_->mutable_data());
+ data = dot(hid_sample, weight);
+ data += repmat(bias, batchsize_);
+ data = F<op::sigmoid>(data);
+ if ((flag & kTest) == kTest) {
+ const float *dptr = data_.cpu_data(), *rcns = neg_data_.cpu_data();
+ float err = 0.f;
+ for (int i = 0; i < data_.count(); i++) {
+ err += (dptr[i] - rcns[i]) * (dptr[i] - rcns[i]);
+ }
+ perf->Add("Squared Error", err / batchsize_);
}
+ }
}
-void RBMVisLayer::ComputeGradient(Phase phase) {
- auto data = Tensor2(&data_);
- auto hid_data = Tensor2(srclayers_[hid_idx_]->mutable_data(this, kPositive));
- auto vis_sample = Tensor2(&vis_sample_);
- auto hid_sample =
- Tensor2(srclayers_[hid_idx_]->mutable_data(this, kNegative));
- // fetch sampling results from hidden layer
- auto gweight = Tensor2(weight_->mutable_grad());
- auto gbias = Tensor1(bias_->mutable_grad());
- gbias = sum_rows(vis_sample);
- gbias -= sum_rows(data);
- gweight = dot(hid_sample.T(), vis_sample);
- gweight -= dot(hid_data.T(), data);
- gbias*=(1.0f)/(1.0f*batchsize_);
- gweight*=(1.0f)/(1.0f*batchsize_);
-}
-
-void RBMVisLayer::ComputeLoss(Metric* perf) {
- float loss_sqr = (0.0f);
- CHECK_EQ(srclayers_[data_idx_]->data(this).count(), batchsize_*vdim_);
- auto src = Tensor2(srclayers_[data_idx_]->mutable_data(this));
- auto hid_data = Tensor2(srclayers_[hid_idx_]->mutable_data(this, kPositive));
- // gibbs using u
- auto weight = Tensor2(weight_->mutable_data());
- auto bias = Tensor1(bias_->mutable_data());
- Tensor<cpu, 2> reconstruct(Shape2(batchsize_, vdim_)); /*reconstruct error*/
- AllocSpace(reconstruct);
- reconstruct = dot(hid_data, weight);
- reconstruct+=repmat(bias, batchsize_);
- reconstruct = F<op::sigmoid>(reconstruct);
- float *src_dptr = src.dptr;
- for (int i = 0; i < vdim_*batchsize_; i++) {
- int recon_row = i / vdim_;
- int recon_col = i - recon_row * vdim_;
- loss_sqr += (src_dptr[i] - reconstruct[recon_row][recon_col]) *
- (src_dptr[i] - reconstruct[recon_row][recon_col]);
- }
- FreeSpace(reconstruct);
- perf->Reset();
- perf->Add("sqr_reconstruct_error", loss_sqr);
+void RBMVisLayer::ComputeGradient(int flag) {
+ auto vis_pos = Tensor2(&data_);
+ auto vis_neg = Tensor2(&neg_data_);
+ auto gbias = Tensor1(bias_->mutable_grad());
+ gbias = sum_rows(vis_neg);
+ gbias -= sum_rows(vis_pos);
}
/**************** Implementation for RBMHidLayer********************/
RBMHidLayer::~RBMHidLayer() {
@@ -263,84 +238,75 @@ void RBMHidLayer::Setup(const LayerProto& proto,
int npartitions) {
Layer::Setup(proto, npartitions);
CHECK_EQ(srclayers_.size(), 1);
- const auto& src_data = srclayers_[0]->data(this, kPositive);
- const auto& src_sample = srclayers_[0]->data(this, kNegative);
- scale_ = static_cast<float> (1.0f);
+ const auto& src_data = srclayers_[0]->data(this);
batchsize_ = src_data.shape()[0];
- neg_batchsize_ = src_sample.shape()[0];
vdim_ = src_data.count()/batchsize_;
hdim_ = proto.rbmhid_conf().hid_dim();
gaussian_ = proto.rbmhid_conf().gaussian();
data_.Reshape(vector<int>{batchsize_, hdim_});
- hid_sample_.Reshape(vector<int>{neg_batchsize_, hdim_});
+ neg_data_.ReshapeLike(data_);
+ sample_.ReshapeLike(data_);
+ neg_sample_.ReshapeLike(data_);
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
- bias_->Setup(proto.param(1), vector<int>{hdim_});
- weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
+ bias_->Setup(vector<int>{hdim_});
+ weight_->Setup(vector<int>{hdim_, vdim_});
+ vis_layer_ = static_cast<RBMVisLayer*> (srclayers_[0]);
}
-void RBMHidLayer::ComputeFeature(Phase phase, Metric* perf) {
- if (phase == kPositive) { /*postive phase*/
- auto data = Tensor2(&data_);
-
- auto hid_sample = Tensor2(&hid_sample_);
-
- CHECK_EQ(srclayers_[0]->data(this, kPositive).count(), batchsize_*vdim_);
- auto src = Tensor2(srclayers_[0]->mutable_data(this, kPositive));
- auto weight = Tensor2(weight_->mutable_data());
- auto bias = Tensor1(bias_->mutable_data());
- data = dot(src, weight.T());
- data += repmat(bias, batchsize_);
-
- if (!gaussian_)
- data = F<op::sigmoid>(data);
+Blob<float>* RBMHidLayer::Sample(int flag) {
+ Tensor<cpu, 2> sample, data;
+ if ((flag & kPositive) == kPositive) {
+ data = Tensor2(&data_);
+ sample = Tensor2(&sample_);
+ } else {
+ data = Tensor2(&neg_data_);
+ sample = Tensor2(&neg_sample_);
+ }
+ auto random = TSingleton<Random<cpu>>::Instance();
+ if (gaussian_) { // first gibbs
+ random->SampleGaussian(sample, 0.0f, 1.0f);
+ sample += data;
+ } else {
+ random->SampleBinary(sample, data);
+ }
+ return (flag & kPositive) == kPositive ? &sample_ : &neg_sample_;
+}
- Copy(hid_sample, data);
+void RBMHidLayer::ComputeFeature(int flag, Metric* perf) {
+ auto weight = Tensor2(weight_->mutable_data());
+ auto bias = Tensor1(bias_->mutable_data());
- if (gaussian_) { // first gibbs
- Tensor<cpu, 2> gaussian_sample(Shape2(batchsize_, hdim_));
- AllocSpace(gaussian_sample);
- auto random = TSingleton<Random<cpu>>::Instance();
- random->SampleGaussian(gaussian_sample, 0.0f, 1.0f);
- hid_sample += gaussian_sample;
- FreeSpace(gaussian_sample);
- } else {
- TSingleton<Random<cpu>>::Instance()->SampleBinary(hid_sample);
- }
+ Tensor<cpu, 2> data, src;
+  if ((flag & kPositive) == kPositive) { /*positive flag*/
+ data = Tensor2(&data_);
+ src = Tensor2(vis_layer_->mutable_data(this));
+ } else {
+ data = Tensor2(&neg_data_);
+ src = Tensor2(vis_layer_->Sample(flag));
+ }
+ data = dot(src, weight.T());
+ data += repmat(bias, batchsize_);
- } else if (phase == kNegative) { /*negative phase*/
- CHECK_EQ(srclayers_[0]->data(this, kNegative).count(),
- neg_batchsize_*vdim_);
- auto src_sample = Tensor2(srclayers_[0]->mutable_data(this, kNegative));
- auto hid_sample = Tensor2(&hid_sample_);
- auto bias = Tensor1(bias_->mutable_data());
- auto weight = Tensor2(weight_->mutable_data());
- hid_sample = dot(src_sample, weight.T());
- hid_sample += repmat(bias, neg_batchsize_);
- if (!gaussian_)
- hid_sample = F<op::sigmoid>(hid_sample);
- } else if (phase == kLoss) { /*test phase*/
- auto data = Tensor2(&data_); // data: sigmoid(Wv+b)
- if (gaussian_) {
- Tensor<cpu, 2> gaussian_sample(Shape2(batchsize_, hdim_));
- AllocSpace(gaussian_sample);
- auto random = TSingleton<Random<cpu>>::Instance();
- random->SampleGaussian(gaussian_sample, 0.0f, 1.0f);
- data += gaussian_sample;
- FreeSpace(gaussian_sample);
- }
- else
- TSingleton<Random<cpu>>::Instance()->SampleBinary(data);
- }
+ if (!gaussian_)
+ data = F<op::sigmoid>(data);
}
-void RBMHidLayer::ComputeGradient(Phase phase) {
- auto data = Tensor2(&data_);
- auto hid_sample = Tensor2(&hid_sample_);
+void RBMHidLayer::ComputeGradient(int flag) {
+ auto hid_pos = Tensor2(&data_);
+ auto hid_neg = Tensor2(&neg_data_);
+ auto vis_pos = Tensor2(vis_layer_->mutable_data(this));
+ auto vis_neg = Tensor2(vis_layer_->mutable_data(this));
+
auto gbias = Tensor1(bias_->mutable_grad());
- gbias = sum_rows(hid_sample);
- gbias -= sum_rows(data);
- gbias *= scale_/(1.0f*batchsize_);
+ gbias = sum_rows(hid_neg);
+ gbias -= sum_rows(hid_pos);
+ gbias /= batchsize_;
+
+ auto gweight = Tensor2(weight_->mutable_grad());
+ gweight = dot(hid_neg.T(), vis_neg);
+ gweight -= dot(hid_pos.T(), vis_pos);
+ gweight /= batchsize_;
}
/*********** Implementation for InnerProductLayer**********/
InnerProductLayer::~InnerProductLayer() {
@@ -362,13 +328,13 @@ void InnerProductLayer::Setup(const LayerProto& proto, int npartitions) {
weight_ = Param::Create(proto.param(0));
bias_ = Param::Create(proto.param(1));
if (transpose_)
- weight_->Setup(proto.param(0), vector<int>{vdim_, hdim_});
+ weight_->Setup(vector<int>{vdim_, hdim_});
else
- weight_->Setup(proto.param(0), vector<int>{hdim_, vdim_});
- bias_->Setup(proto.param(1), vector<int>{hdim_});
+ weight_->Setup(vector<int>{hdim_, vdim_});
+ bias_->Setup(vector<int>{hdim_});
}
-void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
+void InnerProductLayer::ComputeFeature(int flag, Metric* perf) {
auto data = Tensor2(&data_);
auto src = Tensor2(srclayers_[0]->mutable_data(this));
auto weight = Tensor2(weight_->mutable_data());
@@ -381,7 +347,7 @@ void InnerProductLayer::ComputeFeature(Phase phase, Metric* perf) {
data+=repmat(bias, batchsize_);
}
-void InnerProductLayer::ComputeGradient(Phase phas) {
+void InnerProductLayer::ComputeGradient(int flag) {
auto src = Tensor2(srclayers_[0]->mutable_data(this));
auto grad = Tensor2(&grad_);
auto weight = Tensor2(weight_->mutable_data());
@@ -411,7 +377,7 @@ void LabelLayer::Setup(const LayerProto& proto, int npartitions){
data_.Reshape(vector<int>{batchsize});
}
-void LabelLayer::ParseRecords(Phase phase, const vector<Record>& records,
+void LabelLayer::ParseRecords(int flag, const vector<Record>& records,
Blob<float>* blob){
int rid=0;
float *label= blob->mutable_cpu_data() ;
@@ -442,7 +408,7 @@ void LRNLayer::Setup(const LayerProto& proto, int npartitions) {
width_=s[3];
}
-void LRNLayer::ComputeFeature(Phase phase, Metric* perf) {
+void LRNLayer::ComputeFeature(int flag, Metric* perf) {
const float salpha = alpha_ / lsize_;
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto data = Tensor4(&data_);
@@ -452,7 +418,7 @@ void LRNLayer::ComputeFeature(Phase phase, Metric* perf) {
data = src * F<op::power>(norm, -beta_ );
}
-void LRNLayer::ComputeGradient(Phase phase) {
+void LRNLayer::ComputeGradient(int flag) {
const float salpha = alpha_ / lsize_;
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto norm = Tensor4(&norm_);
@@ -466,8 +432,10 @@ void LRNLayer::ComputeGradient(Phase phase) {
/**************** Implementation for MnistImageLayer******************/
-void MnistLayer::ParseRecords(Phase phase,
+void MnistLayer::ParseRecords(int flag,
const vector<Record>& records, Blob<float>* blob){
+ if ((flag & kForward) == 0)
+ return;
LOG_IF(ERROR, records.size()==0)<<"Empty records to parse";
int ndim=records.at(0).image().shape_size();
int inputsize =records.at(0).image().shape(ndim-1);
@@ -554,7 +522,7 @@ void PoolingLayer::Setup(const LayerProto& proto, int npartitions) {
grad_.ReshapeLike(data_);
}
-void PoolingLayer::ComputeFeature(Phase phase, Metric* perf) {
+void PoolingLayer::ComputeFeature(int flag, Metric* perf) {
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto data = Tensor4(&data_);
if(pool_ == PoolingProto_PoolMethod_MAX)
@@ -567,7 +535,7 @@ void PoolingLayer::ComputeFeature(Phase phase, Metric* perf) {
* partition only on num/channel dim
* assume grad and data have the same paritition
*/
-void PoolingLayer::ComputeGradient(Phase phase) {
+void PoolingLayer::ComputeGradient(int flag) {
auto src = Tensor4(srclayers_[0]->mutable_data(this));
auto gsrc = Tensor4(srclayers_[0]->mutable_grad(this));
auto data = Tensor4(&data_);
@@ -587,13 +555,13 @@ void ReLULayer::Setup(const LayerProto& proto, int npartitions) {
grad_.ReshapeLike(*(srclayers_[0]->mutable_grad(this)));
}
-void ReLULayer::ComputeFeature(Phase phase, Metric* perf) {
+void ReLULayer::ComputeFeature(int flag, Metric* perf) {
auto data = Tensor1(&data_);
auto src = Tensor1(srclayers_[0]->mutable_data(this));
data=F<op::relu>(src);
}
-void ReLULayer::ComputeGradient(Phase phase) {
+void ReLULayer::ComputeGradient(int flag) {
auto data = Tensor1(&data_);
auto grad = Tensor1(&grad_);
auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
@@ -602,8 +570,11 @@ void ReLULayer::ComputeGradient(Phase phase) {
/*************** Implementation for RGBImageLayer *************************/
-void RGBImageLayer::ParseRecords(Phase phase,
+void RGBImageLayer::ParseRecords(int flag,
const vector<Record>& records, Blob<float>* blob){
+ if ((flag & kForward) == 0)
+ return;
+
const vector<int>& s=blob->shape();
auto images = Tensor4(&data_);
const SingleLabelImageRecord& r=records.at(0).image();
@@ -617,8 +588,8 @@ void RGBImageLayer::ParseRecords(Phase phase,
const float* meandptr=mean_.cpu_data();
for(const Record& record: records){
auto image=images[rid];
- bool do_crop=cropsize_>0&&(phase == kTrain);
- bool do_mirror=mirror_&&rand()%2&&(phase == kTrain);
+ bool do_crop = cropsize_ > 0 && ((flag & kTrain) == kTrain);
+ bool do_mirror = mirror_ && rand() % 2 && ((flag & kTrain) == kTrain);
float* dptr=nullptr;
if(do_crop||do_mirror)
dptr=raw_image.dptr;
@@ -697,7 +668,10 @@ void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
}
/***************Implementation for ShardDataLayer**************************/
-void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
+void ShardDataLayer::ComputeFeature(int flag, Metric* perf){
+ if ((flag & kForward) == 0)
+ return;
+
if (shard_ == nullptr)
shard_ = new DataShard(layer_proto_.sharddata_conf().path(),
DataShard::kRead);
@@ -747,13 +721,13 @@ void SigmoidLayer::Setup(const LayerProto& proto, int npartitions) {
grad_.ReshapeLike(srclayers_[0]->grad(this));
}
-void SigmoidLayer::ComputeFeature(Phase phase, Metric* perf) {
+void SigmoidLayer::ComputeFeature(int flag, Metric* perf) {
auto data = Tensor1(&data_);
auto src = Tensor1(srclayers_[0]->mutable_data(this));
data = F<op::sigmoid>(src);
}
-void SigmoidLayer::ComputeGradient(Phase phase) {
+void SigmoidLayer::ComputeGradient(int flag) {
auto data = Tensor1(&data_);
auto grad = Tensor1(&grad_);
auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
@@ -766,13 +740,13 @@ void TanhLayer::Setup(const LayerProto& proto, int npartitions){
grad_.ReshapeLike(srclayers_[0]->grad(this));
}
-void TanhLayer::ComputeFeature(Phase phase, Metric* perf) {
+void TanhLayer::ComputeFeature(int flag, Metric* perf) {
auto data = Tensor1(&data_);
auto src = Tensor1(srclayers_[0]->mutable_data(this));
data=F<op::stanh>(src);
}
-void TanhLayer::ComputeGradient(Phase phase) {
+void TanhLayer::ComputeGradient(int flag) {
auto data = Tensor1(&data_);
auto grad = Tensor1(&grad_);
auto gsrc = Tensor1(srclayers_[0]->mutable_grad(this));
@@ -787,7 +761,7 @@ void EuclideanLossLayer::Setup(const LayerProto& proto, int npartitions) {
dim_ = data_.count()/batchsize_;
metric_.Reshape(vector<int>{1});
}
-void EuclideanLossLayer::ComputeFeature(Phase phase, Metric* perf) {
+void EuclideanLossLayer::ComputeFeature(int flag, Metric* perf) {
const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
const float* input_dptr = srclayers_[1]->data(this).cpu_data();
float loss = 0;
@@ -805,7 +779,7 @@ void EuclideanLossLayer::ComputeFeature(Phase phase, Metric* perf) {
srclayers_[1]->data(this).cpu_data() + (batchsize_*dim_));
perf->Add("loss", loss/(1.0f*batchsize_));
}
-void EuclideanLossLayer::ComputeGradient(Phase phase) {
+void EuclideanLossLayer::ComputeGradient(int flag) {
const float* reconstruct_dptr = srclayers_[0]->data(this).cpu_data();
const float* input_dptr = srclayers_[1]->data(this).cpu_data();
Blob<float>* gsrcblob = srclayers_[0]->mutable_grad(this);
@@ -828,7 +802,7 @@ void SoftmaxLossLayer::Setup(const LayerProto& proto, int npartitions) {
metric_.Reshape(vector<int>{2});
scale_=proto.softmaxloss_conf().scale();
}
-void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) {
+void SoftmaxLossLayer::ComputeFeature(int flag, Metric* perf) {
Shape<2> s=Shape2(batchsize_, dim_);
Tensor<cpu, 2> prob(data_.mutable_cpu_data(), s);
Tensor<cpu, 2> src(srclayers_[0]->mutable_data(this)->mutable_cpu_data(), s);
@@ -863,7 +837,7 @@ void SoftmaxLossLayer::ComputeFeature(Phase phase, Metric* perf) {
perf->Add("accuracy", precision*scale_/(1.0f*batchsize_));
}
-void SoftmaxLossLayer::ComputeGradient(Phase phase) {
+void SoftmaxLossLayer::ComputeGradient(int flag) {
const float* label=srclayers_[1]->data(this).cpu_data();
Blob<float>* gsrcblob=srclayers_[0]->mutable_grad(this);
gsrcblob->CopyFrom(data_);
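
For reference, the quantities assembled by RBMVisLayer::ComputeGradient and
RBMHidLayer::ComputeGradient above are the usual CD-1 gradient estimates. With
$W \in \mathbb{R}^{hdim \times vdim}$, $(v_n, h_n)$ the positive-phase activations
and $(v'_n, h'_n)$ the negative-phase reconstructions of example $n$ in a
mini-batch of size $m$, they amount to

$$
\nabla_W \approx \frac{1}{m}\sum_{n=1}^{m}\left(h'_n {v'_n}^{\top} - h_n v_n^{\top}\right),\qquad
\nabla_{b_h} \approx \frac{1}{m}\sum_{n=1}^{m}\left(h'_n - h_n\right),\qquad
\nabla_{b_v} \approx \sum_{n=1}^{m}\left(v'_n - v_n\right),
$$

where the visible-bias term is left unaveraged in the snippet above. The SGD
updater later subtracts these gradients, which yields the standard
positive-minus-negative CD update.
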
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/proto/job.proto
----------------------------------------------------------------------
diff --git a/src/proto/job.proto b/src/proto/job.proto
index 7f030ac..1c79aea 100644
--- a/src/proto/job.proto
+++ b/src/proto/job.proto
@@ -21,18 +21,12 @@ message JobProto {
required string name = 1;
// neural net consits of a set of connected layers
required NetProto neuralnet = 3;
- // algorithms calculating gradients for one mini-batch/iteration
- optional TrainOneBatchAlg alg = 5 [default = kUserAlg];
- // user defined algorithm
- optional string user_alg = 6;
+ // algorithm for computing gradients over one mini-batch
+ required AlgProto train_one_batch = 5;
// configuration of SGD updater, including learning rate, etc.
required UpdaterProto updater = 7;
// cluster toplogy conf
required ClusterProto cluster = 9;
-
- // for setting CD fields
- optional CDProto cd_conf = 12;
-
// total num of steps for training
required int32 train_steps = 16;
// frequency of displaying training info
@@ -86,6 +80,16 @@ message JobProto {
// Protos used by JobProto
// -----------------------
+message AlgProto {
+ // algorithms calculating gradients for one mini-batch/iteration
+ optional AlgType alg = 1 [default = kUserAlg];
+ // user defined algorithm
+ optional string user_alg = 2;
+ // for setting CD fields
+ optional CDProto cd_conf = 10;
+
+ extensions 101 to 200;
+}
message NetProto {
repeated LayerProto layer = 1;
// partitioning type for parallelism
@@ -140,7 +144,7 @@ message ClusterProto {
message CDProto {
//number of steps for gibbs sampling
- optional int32 pcd_k = 1 [default = 1];
+ optional int32 cd_k = 1 [default = 1];
}
message LayerProto {
@@ -182,8 +186,6 @@ message LayerProto {
optional PrefetchProto prefetch_conf = 44;
// configuration for rbmhid layer
optional RBMHidProto rbmhid_conf = 49;
- // configuration for rbmvis layer
- optional RBMVisProto rbmvis_conf = 48;
// configuration for rectified linear unit layer
optional ReLUProto relu_conf = 38;
// configuration for rgb image parser layer
@@ -365,11 +367,6 @@ message DropoutProto {
optional float dropout_ratio = 30 [default = 0.5];
}
-message RBMVisProto {
- optional int32 num_output = 1; // The number of outputs for the layer
- optional bool bias_term = 2 [default = true]; // whether to have bias terms
-}
-
message RBMHidProto {
optional int32 hid_dim = 1; // The number of outputs for the layer
optional bool bias_term = 2 [default = true]; // whether to have bias terms
@@ -559,16 +556,16 @@ enum PartitionType {
}
enum Phase {
- kTrain = 0;
- kValidation = 1;
- kTest= 2;
+ kTrain = 1;
+ kValidation = 2;
+ kTest= 4;
// positive phase for contrastive divergence algorithm
- kPositive = 3;
+ kPositive = 8;
// negative phase for contrastive divergence algorithm
- kNegative = 4;
- kForward = 5;
- kBackward = 6;
- kLoss = 7;
+ kNegative = 16;
+ kForward = 32;
+ kBackward = 64;
+ kLoss = 128;
}
enum ParamType {
@@ -578,7 +575,7 @@ enum ParamType {
kUser = 103;
}
-enum TrainOneBatchAlg {
+enum AlgType {
// Back-propagation algorithm for feed-forward models, e.g., CNN and RNN
kBP = 1;
// Contrastive Divergence algorithm for RBM, DBM, etc.
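
Because the Phase values above are now distinct powers of two, several phases can
be packed into one int and tested independently with bit masks; this is exactly
how the int flag argument introduced throughout this patch is used. A small
illustration (assuming the generated enum constants are visible, as in the layer
and worker code):

int flag = kTrain | kForward;                       // combine two phases
bool training = (flag & kTrain) == kTrain;          // true
bool backward = (flag & kBackward) == kBackward;    // false

This is why calls like layer->ComputeFeature(kPositive | kForward, perf) and
checks like (flag & kTest) == kTest can share the same argument.
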
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/trainer/worker.cc
----------------------------------------------------------------------
diff --git a/src/trainer/worker.cc b/src/trainer/worker.cc
index f112b17..a22a8ef 100644
--- a/src/trainer/worker.cc
+++ b/src/trainer/worker.cc
@@ -13,12 +13,14 @@ using std::thread;
Worker* Worker::Create(const JobProto& proto) {
auto factory = Singleton<Factory<singa::Worker>>::Instance();
Worker* worker = nullptr;
- if (proto.has_user_alg())
- worker = factory->Create(proto.user_alg());
+ const auto& conf = proto.train_one_batch();
+ if (conf.has_user_alg())
+ worker = factory->Create(conf.user_alg());
else
- worker = factory->Create(proto.alg());
+ worker = factory->Create(conf.alg());
return worker;
}
+
void Worker::Init(int thread_id, int grp_id, int id) {
thread_id_ = thread_id;
grp_id_ = grp_id;
@@ -63,7 +65,7 @@ void Worker::InitLocalParams() {
// the param from previous checkpoint files will be overwritten by
// the param with the same name in later checkpoint files.
for (const auto checkpoint : job_conf_.checkpoint_path()) {
- LOG(INFO) << "Load from checkpoint file " << checkpoint;
+ LOG(ERROR) << "Load from checkpoint file " << checkpoint;
BlobProtos bps;
ReadProtoFromBinaryFile(checkpoint.c_str(), &bps);
for (int i = 0; i < bps.name_size(); i++) {
@@ -342,11 +344,11 @@ void BPWorker::Forward(
Collect(p, step);
}
}
- layer->ComputeFeature(phase, perf);
+ layer->ComputeFeature(phase | kForward, perf);
if (layer->is_bridgesrclayer()) // send data to other workers
SendBlobs(true, false, static_cast<BridgeLayer*>(layer), net);
if (DisplayDebugInfo(step))
- LOG(INFO) << layer->DebugString(step, kForward);
+ LOG(INFO) << layer->DebugString(step, phase | kForward);
}
}
}
@@ -359,9 +361,9 @@ void BPWorker::Backward(int step, shared_ptr<NeuralNet> net) {
if(layer->is_bridgesrclayer()) {
// ReceiveBlobs(false, true, layer, net);
}
- layer->ComputeGradient(kTrain);
+ layer->ComputeGradient(kTrain | kBackward);
if (DisplayDebugInfo(step))
- LOG(INFO) << layer->DebugString(step, kBackward);
+ LOG(INFO) << layer->DebugString(step, kTrain | kBackward);
for (Param* p : layer->GetParams())
Update(p, step);
if (layer->is_bridgedstlayer()) {
@@ -381,72 +383,34 @@ void BPWorker::TestOneBatch(int step, Phase phase,
Forward(step, phase, net, perf);
}
/****************************CDWorker**********************************/
-void CDWorker::Init(int thread_id, int group_id, int worker_id) {
- Worker::Init(thread_id, group_id, worker_id);
-}
-
-void CDWorker::PositivePhase(int step,
- shared_ptr<NeuralNet> net, Metric* perf) {
- auto& layers = net->layers();
- // LOG(ERROR)<<"Positive Phase";
- for (auto& layer : layers) {
- for (Param* p : layer->GetParams()) { // wait until param is updated
+void CDWorker::TrainOneBatch(int step, Metric* perf) {
+ const auto& layers = train_net_->layers();
+ for (auto* layer : layers) {
+ for (Param* p : layer->GetParams()) // wait until param is updated
Collect(p, step);
- }
- layer->ComputeFeature(kPositive, perf);
+ layer->ComputeFeature(kPositive | kForward, perf);
}
-}
-
-void CDWorker::NegativePhase(int step,
- shared_ptr<NeuralNet> net, Metric* perf) {
-// for negative phase, gibbs sampling only concerns RBM bottom and top layer
- auto& layers = net->layers();
- // LOG(ERROR)<<"Negative Phase";
- for (auto& layer : layers) {
- if (layer->is_vislayer() || layer->is_hidlayer()) {
- layer->ComputeFeature(kNegative, perf);
- }
- }
-}
-
-void CDWorker::GradientPhase(int step, shared_ptr<NeuralNet> net) {
- auto& layers = net->layers();
- // LOG(ERROR)<<"Gradient Phase";
- for (auto& layer : layers) {
- if (layer->is_vislayer() || layer->is_hidlayer()) {
- layer->ComputeGradient(kTrain);
- for (Param* p : layer->GetParams()) {
- Update(p, step);
- }
+ for (auto* layer : layers)
+ layer->ComputeFeature(kNegative | kTest, perf);
+ for (int i = 1; i < job_conf_.train_one_batch().cd_conf().cd_k(); i++) {
+ for (auto* layer : layers) {
+ layer->ComputeFeature(kNegative, perf);
}
}
-}
-
-void CDWorker::LossPhase(int step, shared_ptr<NeuralNet> net, Metric* perf) {
- auto& layers = net->layers();
- // LOG(ERROR)<<"Loss Phase";
- for (auto& layer : layers) {
- if (layer->is_hidlayer()) {
- layer->ComputeFeature(kLoss, perf);
+ for (auto* layer : layers) {
+ layer->ComputeGradient(kTrain);
+ for (Param* p : layer->GetParams()) {
+ Update(p, step);
}
}
- for (auto& layer : layers) {
- if (layer->is_vislayer()) {
- layer->ComputeLoss(perf);
- }
- }
-}
-
-void CDWorker::TrainOneBatch(int step, Metric* perf) {
- PositivePhase(step, train_net_, perf);
- NegativePhase(step, train_net_, perf);
- GradientPhase(step, train_net_);
- LossPhase(step, train_net_, perf);
}
void CDWorker::TestOneBatch(int step, Phase phase,
- shared_ptr<NeuralNet> net, Metric* perf) {
- PositivePhase(step, test_net_, perf);
- LossPhase(step, test_net_, perf);
+ shared_ptr<NeuralNet> net, Metric* perf) {
+ auto& layers = net->layers();
+ for (auto layer : layers)
+ layer->ComputeFeature(kPositive | kForward, perf);
+ for (auto layer : layers)
+ layer->ComputeFeature(kNegative | kTest, perf);
}
} // namespace singa
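
Reading the new CDWorker::TrainOneBatch above: the first loop is the positive
phase (kPositive | kForward), the next sweep is the first Gibbs step of the
negative phase (kNegative | kTest, which also lets RBMVisLayer record the
"Squared Error" metric), and the inner for-loop runs the remaining cd_k - 1 Gibbs
steps. With the default cd_k of 1 from CDProto that loop body never executes, so
each mini-batch does one positive and one negative sweep (plain CD-1) before
gradients are computed and parameters updated.
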
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/utils/common.cc
----------------------------------------------------------------------
diff --git a/src/utils/common.cc b/src/utils/common.cc
index 1888380..d13faea 100644
--- a/src/utils/common.cc
+++ b/src/utils/common.cc
@@ -40,7 +40,7 @@ string IntVecToString(const vector<int>& vec) {
* * Formatted string.
* */
string VStringPrintf(string fmt, va_list l) {
- char buffer[32768];
+ char buffer[4096];
vsnprintf(buffer, sizeof(buffer), fmt.c_str(), l);
return string(buffer);
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/fbbcaafd/src/utils/param.cc
----------------------------------------------------------------------
diff --git a/src/utils/param.cc b/src/utils/param.cc
index 67f14ab..69f697b 100644
--- a/src/utils/param.cc
+++ b/src/utils/param.cc
@@ -73,17 +73,18 @@ Param* Param::Create(const ParamProto& proto) {
p = factory->Create(proto.user_type());
else
p = factory->Create(proto.type());
+ p->Init(proto);
return p;
}
Param::Param():local_version_(-1), slice_start_(0), num_slices_(0),
num_pending_requests_(0), data_(nullptr) {
}
-void Param::Setup(const ParamProto& proto, const vector<int>& shape) {
+
+void Param::Setup(const vector<int>& shape) {
data_ = std::make_shared<Blob<float>>(shape);
grad_.Reshape(shape);
history_.Reshape(shape);
- proto_.CopyFrom(proto);
}
void Param::AddSlice(int slice_id, int size) {
@@ -178,7 +179,8 @@ Msg* Param::HandlePutMsg(Msg** msg, bool reserve) {
proto.set_lr_scale(lr);
proto.set_wd_scale(wc);
vector<int> shape{size};
- Setup(proto, shape);
+ Init(proto);
+ Setup(shape);
if (ptr == nullptr) {
CHECK((*msg)->NextFrame());
CHECK_EQ(size* sizeof(float), (*msg)->FrameSize());
@@ -298,6 +300,8 @@ void Param::ShareFrom(const Param& other) {
other.data_->shape().begin()));
}
data_ = other.data_;
+ if (grad_.count() == 0)
+ grad_.Reshape(data_->shape());
slice_offset_ = other.slice_offset_;
slice_size_ = other.slice_size_;
slice_start_ = other.slice_start_;